Shicheng Guo
All posts
August 1, 2017

How to update hapmap2 and hapmap3 from hg18 to hg19 or hg38


Recently, colleagues always ask me what’s the best solution to update hapmap3 from hg18 to hg19 and hg38. Here, I try to give certain solutions.

  1. Update hapmap2 from hg18 to hg19 or hg38 with liftOver and plink
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap_r23a.zip
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap.pop
unzip hapmap_r23a.zip

wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# rebuild plink file to avoid chromsome-miss-order problem
plink --bfile hapmap_r23a --make-bed --out hapmap_r23a.tab

# space to tab to generate bed files for liftOver from hg18 to hg19
plink --bfile hapmap_r23a.sort --recode tab --out hapmap_r23a.tab

# apply liftOverPlink.py to update hg18 to hg19 or hg38
./liftOverPlink.py -m hapmap_r23a.tab.map -p  hapmap_r23a.tab.ped -o hapmap_r23a.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
./liftOverPlink.py -m hapmap_r23a.tab.map -p  hapmap_r23a.tab.ped -o hapmap_r23a.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

plink --file hapmap_r23a.hg19 --make-bed --allow-extra-chr --out hapmap2.hg19
plink --file hapmap_r23a.hg38 --make-bed --allow-extra-chr --out hapmap2.hg38
  1. Update hapmap3 from hg18 to hg19 and hg38 with liftOver and plink
 # download database and script
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# download hapmap3 data in plink format
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/relationships_w_pops_051208.txt
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2

# convert from hg18 to hg19 plink file
plink --bfile hapmap3_r1_b36_fwd_consensus.qc.poly.recode --recode --out hapmap3.hg18
./liftOverPlink.py -m hapmap3.hg18.map -p hapmap3.hg18.ped -o hapmap3.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
./liftOverPlink.py -m hapmap3.hg18.map -p hapmap3.hg18.ped -o hapmap3.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

# update plink to binary mode
plink --file hapmap3.hg19 --make-bed --allow-extra-chr --out hapmap3.hg19
plink --file hapmap3.hg38 --make-bed --allow-extra-chr --out hapmap3.hg38

# hapmap3 data cleaning and filtering
plink --bfile hapmap3.hg19  --missing
plink --file hapmap3.hg19 --maf 0.01 --make-bed --indep 50 5 2 --out hapmap3.hg19
plink --bfile hapmap3.hg19 --extract hapmap3.hg19.prune.in --genome --min 0.185
perl ./run-IBD-QC.pl plink
  1. Merge your data with hapmap2 dataset and prepare the PCA plot
### merge personal data with hapmap2 and hapmap3
/home/guosa/hpc/db/hapmap3/hapmap3.hg19.deking
/home/guosa/hpc/db/hapmap2/hapmap2.hg19.deking
cd /home/guosa/hpc/db/hapmap2/
plink2 --bfile ~/hpc/db/hapmap2/hapmap2.hg19.deking --exclude RA2020hapmap2-merge.missnp --max-alleles 2 --make-bed --out hapmap2.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap2-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap2.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020hapmap2
plink --bfile RA2020hapmap2 --maf 0.01 --geno 0.1 --make-bed --out RA2020hapmap2Update
plink --bfile RA2020hapmap2Update --make-bed --out RA2020hapmap2UpdatePCA
plink2 --bfile RA2020hapmap2UpdatePCA --pca --threads 31
hapmap2<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap2.pop",head=F)
head(hapmap2)
head(hapmap3)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap2[match(eigenvec[,2],hapmap2[,2]),3])
pop[is.na(pop)]<-"Guanghua"
eigenvec$pop=pop
eigenvec$col=as.numeric(as.factor(pop))
head(eigenvec)
pdf("pca.pdf")
plot(eigenvec[,3],eigenvec[,4],pch=16,col=as.factor(eigenvec$pop),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",pch=16,legend=unique(as.factor(pop)),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1.5)
dev.off()
subset(eigenvec,pop=="Guanghua" & V3<0) # RA478
  1. Merge hapmap3 data with your own data and prepare the PCA plot
cd ~/hpc/rheumatology/RA/he2020/hapmap3
plink --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --bmerge ../RA2020-B8.dbsnp --make-bed --out RA2020hapmap3
plink2 --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --exclude RA2020hapmap3-merge.missnp --max-alleles 2 --make-bed --out hapmap3.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap3-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap3.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020.hapmap3
plink --bfile RA2020.hapmap3 --maf 0.01 --geno 0.1 --make-bed --out RA2020.hapmap3.update
plink --bfile RA2020.hapmap3.update --make-bed --out RA2020.hapmap3.update.pca
plink2 --bfile RA2020.hapmap3.update.pca --pca --threads 31
### R
hapmap3<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap3.pop",head=T)
head(hapmap3)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap3[match(eigenvec[,2],hapmap3[,2]),7])
pop[is.na(pop)]<-"SGH"
eigenvec$pop=pop
head(eigenvec)
pdf("Hapmap3.Guanghua.PCA.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0) # RA478
pdf("Hapmap3.Guanghua.PCA.fine.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5,xlim=c(0.006,0.01),ylim=c(-0.0075,0.0025))
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0.0075) # RA478
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075),file="GH.txt",sep="\t",col.names=T,row.names=F,quote=F)
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075)[,1:2],file="pca.iid.remove",sep="\t",col.names=F,row.names=F,quote=F)