Recent PGC MDD GWAS (Wray…Sullivan 2018, Howard…McIntosh 2019) contain summary statistics from UK Biobank (UKBB). Here we create summary statistics that have UK Biobank participants removed.
There are 959 participants that overlap between PGC MDD and UKBB cohorts. In previous analyses, this overlap was handled by removing overlapping individuals from the UKBB analysis. Thus, becuase these individuals were retained in the PGC cohorts, even versions of the Wray or Howard summary statistics that excluded the UKBB summary statistics from the meta-analysis (leave-one-out [LOO] with reference noUKBB
) would overlap with the whole UKBB sample. For this analysis, we removed overlapping individuals from the PGC MDD cohorts before conducting the meta-analysis, and refer to these results as rmUKBB
(remove UK Biobank).
Analysis performed on the LISA cluster using Ricopili. The R code embedded in this document can be run while this document is rendered but bash code is set to not evaluated but can be submitted to the cluster to reproduce this analysis.
DIR
is a stand-in for the location of each data set
mkdir data
# overlap data
ln -s /home/DIR/959_PGC_UKB_overlap.txt data/
# MDD Wave1 data
ln -s /home/DIR/v1/* data/
# BOMA data
ln -s /home/DIR/v1_boma/* data/
# GenRED
ln -s /home/DIR/v1_genred/* data/
Overlap between PGC MDD and UKBB cohorts were matched by genotype checksums (see also Turchin and Hirschhorn 2012).
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(readxl)
library(ggplot2)
overlap <- read_table2('data/959_PGC_UKB_overlap.txt', col_names=c('CS', 'FID', 'IID'))
## Parsed with column specification:
## cols(
## CS = col_character(),
## FID = col_character(),
## IID = col_character()
## )
Tally PGC cohorts with overlap
cohorts_count <-
overlap %>%
filter(CS != 'CheckSum.GenerationScotland.cs') %>%
select(FID) %>%
separate(FID, into=c('FID', 'IID'), sep='\\*') %>%
separate(FID, into=c('status', 'disorder', 'cohort', 'ancestry', 'sr', 'platform'), sep='_') %>%
group_by(cohort) %>%
tally()
cohorts_count
The additional overlap not listed is from with the Generation Scotland cohort.
For cohorts with overlap, create MDD case/control phenotype files where phenotype of overlapping participants is set to missing (-9
).
fams <- lapply(cohorts_count$cohort, function(cohort) {
fam_file <- list.files('data', paste0('mdd_', cohort, '.+\\.fam$'), full.names=TRUE)
if(length(fam_file > 0)) {
fam <- read_table2(fam_file, col_names=c('FID', 'IID', 'father', 'mother', 'sex', 'pheno'), col_types='ccccii')
} else {
warning(paste('No .fam file for cohort', cohort))
fam <- NULL
}
return(fam)
})
## Warning in FUN(X[[i]], ...): No .fam file for cohort jjp2
## Warning in FUN(X[[i]], ...): No .fam file for cohort shp0
fams_pheno <-
bind_rows(fams) %>%
select(FID, IID, pheno)
fams_pheno %>%
group_by(pheno) %>%
tally()
Only retain phenotypes of participants from cohorts with overlap and set the phenotype of participants in the overlap file to -9
rmUKBB_pheno <-
fams_pheno %>%
left_join(overlap, by=c('FID', 'IID')) %>%
mutate(pheno=if_else(is.na(CS), true=pheno, false=-9L))
rmUKBB_pheno %>%
group_by(pheno) %>%
tally()
write_tsv(rmUKBB_pheno, 'data/mdd_rmUKBB.pheno', col_names=F)
Write out datasets_info
to list the cohorts to analyze
datasets_info <- str_replace(unlist(sapply(cohorts_count$cohort, function(cohort) list.files('data', paste0('mdd_', cohort, '.+\\.ch\\.fl$')), simplify=TRUE, USE.NAMES=FALSE)), pattern='dasuqc1_', replacement='')
write(datasets_info, 'data/datasets_info', ncol=1)
Run the Ricopoli pipeline
cd data
postimp_navi --out pgc_MDD13 --addout rmUKBB \
--mds MDD29.0515.nproj.menv.mds_cov \
--coco 1,2,3,4,5,6 --pheno mdd_rmUKBB.pheno \
--popname eur \
--onlymeta --noclump --noldsc --nolahunt
cd ../
Check the sumstats file against the imputation panel
mdd13_rmUKBB_daner <- read_tsv('data/report_pgc_MDD13_rmUKBB/daner_pgc_MDD13_rmUKBB.gz')
## Parsed with column specification:
## cols(
## CHR = col_integer(),
## SNP = col_character(),
## BP = col_integer(),
## A1 = col_character(),
## A2 = col_character(),
## FRQ_A_8461 = col_double(),
## FRQ_U_14006 = col_double(),
## INFO = col_double(),
## OR = col_double(),
## SE = col_double(),
## P = col_double(),
## ngt = col_integer(),
## Direction = col_character(),
## HetISqt = col_double(),
## HetDf = col_integer(),
## HetPVa = col_double(),
## Nca = col_integer(),
## Nco = col_integer(),
## Neff_half = col_double()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 22 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1307227 CHR no trailing ~ " chr1_142651548_I 142~ 'data/report_pgc_M~ file 2 1307227 <NA> 19 columns 1 columns 'data/report_pgc_M~ row 3 1307228 CHR no trailing ~ " rs146171952 14312578~ 'data/report_pgc_M~ col 4 1307228 <NA> 19 columns 1 columns 'data/report_pgc_M~ expected 5 1307229 CHR no trailing ~ " rs72699878 14315459~ 'data/report_pgc_M~
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
daner_snps <-
mdd13_rmUKBB_daner %>%
filter(FRQ_U_14006 >= 0.01 & FRQ_U_14006 <= 0.99) %>%
mutate(MB=floor(BP/1e6)) %>%
group_by(CHR, MB) %>%
summarise(snpObs=n())
sumfrq_eur <- read_table2('/home/gwas/pgc-samples/hapmap_ref/impute2_ref/1KG_Aug12/ALL_1000G_phase1integrated_v3_impute_macGT1/sumfrq.eur', col_names=c('SNP', 'COUNT', 'A1', 'FRQ_A1', 'A2', 'FRQ_A2', 'A1_COUNT', 'CHR', 'BP'))
## Parsed with column specification:
## cols(
## SNP = col_character(),
## COUNT = col_integer(),
## A1 = col_character(),
## FRQ_A1 = col_double(),
## A2 = col_character(),
## FRQ_A2 = col_double(),
## A1_COUNT = col_integer(),
## CHR = col_integer(),
## BP = col_integer()
## )
impute_snps <-
sumfrq_eur %>%
filter(FRQ_A1 >= 0.01) %>%
mutate(MB=floor(BP/1e6)) %>%
group_by(CHR, MB) %>%
summarise(snpRef=n())
output_snps <- daner_snps %>%
left_join(impute_snps, by=c('CHR', 'MB')) %>%
mutate(snpObs=if_else(is.na(snpObs), true=0L, false=snpObs))
ggplot(output_snps, aes(x=snpObs, y=snpRef)) +
geom_point() +
scale_x_continuous('SNP count observed') +
scale_y_continuous('SNP count expected') +
coord_fixed()
Copy anchor cohort single datasets that removed UKBB and link additional datasets for meta-analysis
mkdir -p data/summary_stats_0120_rmUKBB/single_dataset/additional_datasets
# single datasets run here
cp data/report_pgc_MDD13_rmUKBB/daner_mdd_*.gz data/summary_stats_0120_rmUKBB/single_dataset/
# rename to indicate that UKB overlap was removed
## NB: LISA has a PERL version of rename
rename 's/_eur/_rmUKBB_eur/' data/summary_stats_0120_rmUKBB/single_dataset/*.gz
# single datasets run elsewhere
cp $DIR/summary_stats_0517/jjp2_exc.UKBsamples/daner_mdd_jjp2_4UKB_exc.sa.2102.gz data/summary_stats_0120_rmUKBB/single_dataset/daner_mdd_jjp2_rmUKBB_eur_sr-qc.hg19.ch.fl.gz
# additional datasets
# copy GenScot removing UKBB
cp DIR/daner_mdd_genscot_1119a_rmUKBB.aligned.gz data/summary_stats_0120_rmUKBB/single_dataset/additional_datasets/
# symlink other additional datasets except for GenScot, UKBB, and iPsych
ln -s $DIR/summary_stats_0517/single_dataset/additional_datasets/daner_{GERA,mdd_decode}*.gz data/summary_stats_0120_rmUKBB/single_dataset/additional_datasets/
# symlink iPsych
ln -s $DIR/summary_stats_0517_ipsych/daner_mddGWAS_new_ipsych_170220.meta.gz data/summary_stats_0120_rmUKBB/single_dataset/additional_datasets/
Link anchor cohort single datasets that did not need to remove UKBB.
for daner in $DIR/summary_stats_0517/single_dataset/daner_mdd_*.gz; do
# get filename
daner_file=$(basename $daner)
# get cohort name
cohort=$(echo $daner_file | awk -F_ '{print $3}')
# check if there is not already a daner file with this name
if [ ! -f data/summary_stats_0120_rmUKBB/single_dataset/*${cohort}*.gz ]; then
echo "Adding cohort ${cohort}"
ln -s $daner data/summary_stats_0120_rmUKBB/single_dataset/
else
echo "Already have cohort ${cohort}"
fi
done
Meta-analyze PGC MDD anchor cohorts
mkdir -p data/meta
cd data/meta
# symlink all sumstats into the meta analysis working directory
ln -s ../summary_stats_0120_rmUKBB/single_dataset/daner*.gz .
# list of anchor cohort sumstats file
ls daner_mdd_*.gz > anchor_dataset_dir
# check that none of the symlinks are broken
for daner in daner_mdd_*.gz; do
if [ ! -e $daner ]; then
echo "$daner does not exist"
fi
done
# symlink the reference file
ln -s ../reference_info .
postimp_navi --result anchor_dataset_dir --onlymeta --out MDD29.0120a.rmUKBB
cd ../../
Copy meta analysis file
cp data/meta/report_MDD29.0120a.rmUKBB/daner_MDD29.0120a.rmUKBB.gz data/summary_stats_0120_rmUKBB/single_dataset/
Set up full meta analysis
mkdir -p data/full_meta
cd data/full_meta
ln -s ../summary_stats_0120_rmUKBB/single_dataset/daner_MDD29.0120a.rmUKBB.gz .
ln -s ../summary_stats_0120_rmUKBB/single_dataset/additional_datasets/daner_*.gz .
ls daner_*.gz > full_dataset_dir
ln -s ../reference_info .
postimp_navi --result full_dataset_dir --nolahunt --out pgc_mdd_meta_w2_no23andMe_rmUKBB
cd ../../
Copy out sumstats and supporting files
mkdir -p data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta_no23andMe
REPORT=data/full_meta/report_pgc_mdd_meta_w2_no23andMe_rmUKBB
INFIX=pgc_mdd_meta_w2_no23andMe_rmUKBB
cp -L $REPORT/areas.fo.het.${INFIX}.pdf.gz \
$REPORT/areas.fo.${INFIX}.pdf \
$REPORT/areas.${INFIX}.pdf \
$REPORT/basic.${INFIX}.num.xls \
$REPORT/daner_${INFIX}.gz.p3.gz \
$REPORT/daner_${INFIX}.gz.p4.clump.areator.sorted.1mhc \
$REPORT/daner_${INFIX}.gz.p4.clump.areator.sorted.1mhc.xls \
$REPORT/daner_${INFIX}.het.gz.p4.clump.areator.sorted.1mhc \
$REPORT/daner_${INFIX}.het.gz.p4.clump.areator.sorted.1mhc.xls \
$REPORT/manhattan.daner_${INFIX}*.pdf \
$REPORT/${INFIX}*qq.pdf \
data/summary_stats_0120_rmUKBB/pgc_mdd_meta_no23andMe
# retain sumstats rows with the correct number of columns (19)
# this arises from SNPs that are only present in one of the underlying cohorts and
# some of the supplementary columns are not included. This removes an acceptably
# small number of SNPS (< 100)
zcat $REPORT/daner_${INFIX}.gz | awk -F $'\t' '{if(NF == 19) {print $0}}' | gzip -c > data/full_meta/daner_${INFIX}.gz
Check number of excluded SNPs
REPORT=data/full_meta/report_pgc_mdd_meta_w2_no23andMe_rmUKBB
INFIX=pgc_mdd_meta_w2_no23andMe_rmUKBB
zcat $REPORT/daner_${INFIX}.gz | awk -F $'\t' '{if(NF < 19) {print $0}}' | wc -l
## 61
Check for duplicated CPIDs
full_meta_no23am_daner <- read_tsv('data/full_meta/daner_pgc_mdd_meta_w2_no23andMe_rmUKBB.gz')
## Parsed with column specification:
## cols(
## CHR = col_integer(),
## SNP = col_character(),
## BP = col_integer(),
## A1 = col_character(),
## A2 = col_character(),
## FRQ_A_45396 = col_double(),
## FRQ_U_97250 = col_double(),
## INFO = col_double(),
## OR = col_double(),
## SE = col_double(),
## P = col_double(),
## ngt = col_integer(),
## Direction = col_character(),
## HetISqt = col_double(),
## HetDf = col_integer(),
## HetPVa = col_double(),
## Nca = col_integer(),
## Nco = col_integer(),
## Neff_half = col_double()
## )
# CPID
cpid_dups <-
full_meta_no23am_daner %>%
group_by(CHR, BP) %>%
tally() %>%
filter(n > 1)
nrow(cpid_dups)
## [1] 5211
# SNP
snp_dups <-
full_meta_no23am_daner %>%
group_by(SNP) %>%
tally() %>%
filter(n > 1) %>%
arrange(n)
nrow(snp_dups)
## [1] 0
# drop duplicated CPIDs
full_meta_no23am_daner_nodups <- full_meta_no23am_daner %>%
anti_join(cpid_dups, by=c('CHR', 'BP')) %>%
arrange(CHR, BP)
write_tsv(full_meta_no23am_daner_nodups, 'data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta_no23andMe/daner_pgc_mdd_meta_w2_no23andMe_rmUKBB.gz')
read_excel('data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta_no23andMe/daner_pgc_mdd_meta_w2_no23andMe_rmUKBB.gz.p4.clump.areator.sorted.1mhc.xls') %>%
select(-`LD-friends(0.1).p0.001`, -`LD-friends(0.6).p0.001`, -`gwas_catalog_span.6`) %>%
filter(P <= 5e-8)
mkdir -p results
pdftoppm -png -r 100 data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta_no23andMe/manhattan.daner_pgc_mdd_meta_w2_no23andMe_rmUKBB.gz.nog2.pdf results/manhattan.daner_pgc_mdd_meta_w2_no23andMe_rmUKBB.gz.nog2
mkdir -p data/full_meta_23am
cd data/full_meta_23am
ln -s $DIR/23andme_whole_genome/daner_23andMe_v5_170227_pgc_aligned_.assoc.gz .
ln -s ../summary_stats_0120_rmUKBB/single_dataset/daner_MDD29.0120a.rmUKBB.gz .
ln -s ../summary_stats_0120_rmUKBB/single_dataset/additional_datasets/daner_*.gz .
ls daner_*.gz > full_dataset_dir
ln -s ../reference_info .
postimp_navi --result full_dataset_dir --nolahunt --out pgc_mdd_meta_w2_rmUKBB
Copy results files
mkdir -p data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta
REPORT=data/full_meta_23am/report_pgc_mdd_meta_w2_rmUKBB
INFIX=pgc_mdd_meta_w2_rmUKBB
cp -L $REPORT/areas.fo.het.${INFIX}.pdf.gz \
$REPORT/areas.fo.${INFIX}.pdf \
$REPORT/areas.${INFIX}.pdf \
$REPORT/basic.${INFIX}.num.xls \
$REPORT/daner_${INFIX}.gz.p3.gz \
$REPORT/daner_${INFIX}.gz.p4.clump.areator.sorted.1mhc \
$REPORT/daner_${INFIX}.gz.p4.clump.areator.sorted.1mhc.xls \
$REPORT/daner_${INFIX}.het.gz.p4.clump.areator.sorted.1mhc \
$REPORT/daner_${INFIX}.het.gz.p4.clump.areator.sorted.1mhc.xls \
$REPORT/manhattan.daner_${INFIX}*.pdf \
$REPORT/${INFIX}*qq.pdf \
data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta
# retain sumstats rows with the correct number of columns (19)
zcat $REPORT/daner_${INFIX}.gz | awk -F $'\t' '{if(NF == 19) {print $0}}' | gzip -c > data/full_meta_23am/daner_${INFIX}.gz
Check for duplicated CPIDs
full_meta_daner <- read_tsv('data/full_meta_23am/daner_pgc_mdd_meta_w2_rmUKBB.gz')
## Parsed with column specification:
## cols(
## CHR = col_integer(),
## SNP = col_character(),
## BP = col_integer(),
## A1 = col_character(),
## A2 = col_character(),
## FRQ_A_116209 = col_double(),
## FRQ_U_314566 = col_double(),
## INFO = col_double(),
## OR = col_double(),
## SE = col_double(),
## P = col_double(),
## ngt = col_character(),
## Direction = col_character(),
## HetISqt = col_double(),
## HetDf = col_integer(),
## HetPVa = col_double(),
## Nca = col_integer(),
## Nco = col_integer(),
## Neff_half = col_double()
## )
# CPID
cpid_dups <-
full_meta_daner %>%
group_by(CHR, BP) %>%
tally() %>%
filter(n > 1)
nrow(cpid_dups)
## [1] 4794
# SNP
snp_dups <-
full_meta_daner %>%
group_by(SNP) %>%
tally() %>%
filter(n > 1) %>%
arrange(n)
nrow(snp_dups)
## [1] 0
# drop duplicated CPIDs
full_meta_daner_nodups <- full_meta_daner %>%
anti_join(cpid_dups, by=c('CHR', 'BP')) %>%
arrange(CHR, BP)
write_tsv(full_meta_daner_nodups, 'data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta/daner_pgc_mdd_meta_w2_rmUKBB.gz')
read_excel('data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta/daner_pgc_mdd_meta_w2_rmUKBB.gz.p4.clump.areator.sorted.1mhc.xls') %>%
select(-`LD-friends(0.1).p0.001`, -`LD-friends(0.6).p0.001`, -`gwas_catalog_span.6`) %>%
filter(P <= 5e-8)
mkdir -p results
pdftoppm -png -r 100 data/summary_stats_0120_rmUKBB/pgc_mdd_full_meta/manhattan.daner_pgc_mdd_meta_w2_rmUKBB.gz.nog2.pdf results/manhattan.daner_pgc_mdd_meta_w2_rmUKBB.gz.nog2