1, download ANNOVAR: http://annovar.openbioinformatics.org/en/latest/user-guide/download/
cd ~/tools/annovar
annotate_variation.pl -buildver hg19 -downdb cytoBand humandb/
annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/
# just for allele frequency
# annotate_variation.pl -downdb -webfrom annovar exac03 humandb -buildver hg38 &
# annotate_variation.pl -downdb -webfrom annovar esp6500siv2 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar esp6500siv2_all humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar gnomad_exome humandb -buildver hg38 &
# whole-exome data
annotate_variation.pl -downdb -webfrom annovar 1000g2015aug humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar kaviar_20150923 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar hrcr1 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar cg69 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar gnomad_genome humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar dbnsfp30a humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar esp6500siv2 humandb -buildver hg38 &
annotate_variation.pl -downdb esp6500siv2 humandb -buildver hg38 &
# whole-genome data
annotate_variation.pl -downdb -webfrom annovar gerp++ humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar cadd humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar cadd13 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar fathmm humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar eigen humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar gwava humandb -buildver hg38 &
# CNV
annotate_variation.pl -downdb -webfrom annovar dbscsnv11 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar spidex humandb -buildver hg38 &
# disease-specific variants
annotate_variation.pl -downdb -webfrom annovar clinvar_20160302 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar cosmic70 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar icgc21 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar nci60 humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar dbnsfp35c humandb -buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar dbnsfp33a humandb -buildver hg19 &
annotate_variation.pl -downdb -webfrom annovar dbnsfp35a humandb -buildver hg19 &
annotate_variation.pl -downdb -webfrom annovar dann humandb --buildver hg38 &
annotate_variation.pl -downdb -webfrom annovar dann humandb --buildver hg19 &
annotate_variation.pl -downdb -webfrom annovar ljb23_all humandb --buildver hg19&
# GWAS
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gwasCatalog humandb/ &
annotate_variation.pl -buildver hg38 -downdb tfbsConsSites humandb/
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar wgRna humandb/ &
annotate_variation.pl -buildver hg38 -downdb targetScanS humandb/
annotate_variation.pl -downdb -webfrom annovar dann humandb --buildver hg38 &
annotate_variation.pl -buildver hg38 -downdb tfbsConsSites humandb/
annotate_variation.pl -buildver hg38 -downdb tfbsConsSites humandb/
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad_genome humandb/
annotate_variation.pl -buildver hg38 -downdb -webfrom ucsc gnomad_genome humandb/
2, ANNOVAR now takes VCF as standard input
bcftools view -G Final.vcf.gz --threads 32 -Ov -o avinput.vcf
table_annovar.pl -vcfinput avinput.vcf ~/tools/annovar/humandb/ --thread 12 -buildver hg19 -out myanno -remove -protocol refGene,dbnsfp33a -operation gx,f -nastring . -otherinfo -polish -xref ~/tools/annovar/humandb/gene_fullxref.txt
3, SNP distribution patterns, how many intergenic? how many exomic?
bcftools view -G Final.vcf.gz --threads 32 -Ov -o avinput.vcf
table_annovar.pl -vcfinput avinput.vcf ~/tools/annovar/humandb/ --thread 12 -buildver hg19 -out myanno -remove -protocol refGene,dbnsfp33a -operation gx,f -nastring . -otherinfo -polish -xref ~/tools/annovar/humandb/gene_fullxref.txt
data<-read.table("c1",as.is=T)
anno<-data[,1]
anno[anno=="ncRNA_UTR5"]<-"ncRNA_exonic"
anno[anno=="ncRNA_exonic;splicing"]<-"ncRNA_exonic"
anno[anno=="exonic;splicing"]<-"splicing"
anno[anno=="UTR5;UTR3"]<-"exonic"
anno[anno=="ncRNA_splicing"]<-"splicing"
anno[anno=="upstream;downstream"]<-"intergenic"
input<-table(anno)
pdf("pie.annovar.pdf")
pie(input,col=rainbow(length(input)),main="N=13,494,289 SNPs")
dev.off()
4, Distance distribution to exome regions?
perl -lane '{print $1 if /dist=(\d+)/}' myanno.refGene.variant_function > dist
data<-read.table("dist",as.is=T)
distance<-data[,1]
pdf("dist.annovar.pdf")
hist(distance,col=rainbow(20),main=paste("N=",length(distance),"SNPs",sep=" "))
dev.off()
distance<-distance[distance<100000]
pdf("dist.100k.annovar.pdf")
hist(distance,col=rainbow(20),main=paste("N=",length(distance),"SNPs",sep=" "),xlab="distance to nearby exome (bp)")
dev.off()
distance<-distance[distance<10000]
pdf("dist.10k.annovar.pdf")
hist(distance,col=rainbow(20),main=paste("N=",length(distance),"SNPs",sep=" "),xlab="distance to nearby exome (bp)")
dev.off()
distance<-distance[distance<1000]
pdf("dist.1k.annovar.pdf")
hist(distance,col=rainbow(20),main=paste("N=",length(distance),"SNPs",sep=" "),xlab="distance to nearby exome (bp)")
dev.off()
5, PCA analysis
awk '{print $4}' hsa.gff3.hg19.bed | sort -u > hsa.gff3.hg19.snp
plink --bfile /gpfs/home/guosa/hpc/db/hg19/1000Genome/plink/G1000plink --extract hsa.gff3.hg19.snp --make-bed --out G1000.miRNA
plink --bfile ROI.RA3000.dbsnp --bmerge G1000.miRNA --allow-no-sex --make-bed --out ./PCA/ROI
plink --bfile ROI --threads 31 --cluster --mds-plot 2
plink --bfile ROI --threads 31 --pca 2 'header' --out ROI
hapmap2<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap2.pop",head=F)
hapmap3<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap3.pop",head=T)
G1000Sam <-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/1000G/1000GenomeSampleInfo.txt",head=F,as.is=T)
G1000Super<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/1000G/superpopulation.txt",head=F,sep="\t")
G1000Sam$superpop<-G1000Super[match(G1000Sam$V3,G1000Super$V1),]$V2
write.table(G1000Sam,file="1000GenomeSampleInfo.txt",quote=F,sep="\t",col.names=F,row.names=F)
eigenvec<-read.table("plink.eigenvec",head=T)
head(eigenvec)
pop<-as.character(G1000Sam[match(eigenvec[,2],G1000Sam[,2]),3])
super<-as.character(G1000Sam[match(eigenvec[,2],G1000Sam[,2]),5])
pop[is.na(pop)]<-"GHRA"
super[is.na(super)]<-"GHRA"
eigenvec$pop=pop
eigenvec$super=super
eigenvec$col=as.numeric(as.factor(super))
eigenvec$pch=as.numeric(as.factor(super))
set<-unique(data.frame(pch=eigenvec$pch,col=eigenvec$col,legend=eigenvec$super))
for(i in 1:15){
jpeg(paste("pca.super",i,".jpg",sep=""))
plot(eigenvec[,3],eigenvec[,4],pch=16,col=eigenvec$col+i,xlab="principle component 1",ylab="principle componment 2",cex.axis=1.5,cex.lab=1.5,cex=1)
legend("topright",pch=16,legend=set$legend,col=set$col+i,bty="n",cex=1)
dev.off()
}