01 August 2017

Recently, colleagues always ask me what’s the best solution to update hapmap3 from hg18 to hg19 and hg38. Here, I try to give certain solutions.

  1. Update hapmap2 from hg18 to hg19 or hg38 with liftOver and plink
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap_r23a.zip
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap.pop
unzip hapmap_r23a.zip

wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# rebuild plink file to avoid chromsome-miss-order problem
plink --bfile hapmap_r23a --make-bed --out hapmap_r23a.tab

# space to tab to generate bed files for liftOver from hg18 to hg19
plink --bfile hapmap_r23a.sort --recode tab --out hapmap_r23a.tab

# apply liftOverPlink.py to update hg18 to hg19 or hg38
./liftOverPlink.py -m hapmap_r23a.tab.map -p  hapmap_r23a.tab.ped -o hapmap_r23a.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
./liftOverPlink.py -m hapmap_r23a.tab.map -p  hapmap_r23a.tab.ped -o hapmap_r23a.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

plink --file hapmap_r23a.hg19 --make-bed --allow-extra-chr --out hapmap2.hg19
plink --file hapmap_r23a.hg38 --make-bed --allow-extra-chr --out hapmap2.hg38
  1. Update hapmap3 from hg18 to hg19 and hg38 with liftOver and plink
 # download database and script
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# download hapmap3 data in plink format
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/relationships_w_pops_051208.txt
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2

# convert from hg18 to hg19 plink file
plink --bfile hapmap3_r1_b36_fwd_consensus.qc.poly.recode --recode --out hapmap3.hg18
./liftOverPlink.py -m hapmap3.hg18.map -p hapmap3.hg18.ped -o hapmap3.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
./liftOverPlink.py -m hapmap3.hg18.map -p hapmap3.hg18.ped -o hapmap3.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

# update plink to binary mode
plink --file hapmap3.hg19 --make-bed --allow-extra-chr --out hapmap3.hg19
plink --file hapmap3.hg38 --make-bed --allow-extra-chr --out hapmap3.hg38

# hapmap3 data cleaning and filtering
plink --bfile hapmap3.hg19  --missing
plink --file hapmap3.hg19 --maf 0.01 --make-bed --indep 50 5 2 --out hapmap3.hg19
plink --bfile hapmap3.hg19 --extract hapmap3.hg19.prune.in --genome --min 0.185
perl ./run-IBD-QC.pl plink
  1. Merge your data with hapmap2 dataset and prepare the PCA plot
### merge personal data with hapmap2 and hapmap3
/home/guosa/hpc/db/hapmap3/hapmap3.hg19.deking
/home/guosa/hpc/db/hapmap2/hapmap2.hg19.deking
cd /home/guosa/hpc/db/hapmap2/
plink2 --bfile ~/hpc/db/hapmap2/hapmap2.hg19.deking --exclude RA2020hapmap2-merge.missnp --max-alleles 2 --make-bed --out hapmap2.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap2-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap2.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020hapmap2
plink --bfile RA2020hapmap2 --maf 0.01 --geno 0.1 --make-bed --out RA2020hapmap2Update
plink --bfile RA2020hapmap2Update --make-bed --out RA2020hapmap2UpdatePCA
plink2 --bfile RA2020hapmap2UpdatePCA --pca --threads 31
hapmap2<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap2.pop",head=F)
head(hapmap2)
head(hapmap3)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap2[match(eigenvec[,2],hapmap2[,2]),3])
pop[is.na(pop)]<-"Guanghua"
eigenvec$pop=pop
eigenvec$col=as.numeric(as.factor(pop))
head(eigenvec)
pdf("pca.pdf")
plot(eigenvec[,3],eigenvec[,4],pch=16,col=as.factor(eigenvec$pop),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",pch=16,legend=unique(as.factor(pop)),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1.5)
dev.off()
subset(eigenvec,pop=="Guanghua" & V3<0) # RA478
  1. Merge hapmap3 data with your own data and prepare the PCA plot
cd ~/hpc/rheumatology/RA/he2020/hapmap3
plink --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --bmerge ../RA2020-B8.dbsnp --make-bed --out RA2020hapmap3
plink2 --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --exclude RA2020hapmap3-merge.missnp --max-alleles 2 --make-bed --out hapmap3.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap3-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap3.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020.hapmap3
plink --bfile RA2020.hapmap3 --maf 0.01 --geno 0.1 --make-bed --out RA2020.hapmap3.update
plink --bfile RA2020.hapmap3.update --make-bed --out RA2020.hapmap3.update.pca
plink2 --bfile RA2020.hapmap3.update.pca --pca --threads 31
### R
hapmap3<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap3.pop",head=T)
head(hapmap3)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap3[match(eigenvec[,2],hapmap3[,2]),7])
pop[is.na(pop)]<-"SGH"
eigenvec$pop=pop
head(eigenvec)
pdf("Hapmap3.Guanghua.PCA.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0) # RA478
pdf("Hapmap3.Guanghua.PCA.fine.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5,xlim=c(0.006,0.01),ylim=c(-0.0075,0.0025))
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0.0075) # RA478
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075),file="GH.txt",sep="\t",col.names=T,row.names=F,quote=F)
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075)[,1:2],file="pca.iid.remove",sep="\t",col.names=F,row.names=F,quote=F)


blog comments powered by Disqus