How to update hapmap2 and hapmap3 from hg18 to hg19 or hg38

2017-08-01 00:00:00 +0000

Recently, colleagues have often asked me about the best way to lift HapMap2 and HapMap3 data from hg18 to hg19 or hg38. Here I outline the solutions I use.

  1. Update hapmap2 from hg18 to hg19 or hg38 with liftOver and plink
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap_r23a.zip
wget http://zzz.bwh.harvard.edu/plink/dist/hapmap.pop
unzip hapmap_r23a.zip

wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# rebuild the plink fileset to avoid chromosome mis-ordering problems
plink --bfile hapmap_r23a --make-bed --out hapmap_r23a.sort

# recode to tab-delimited ped/map files for liftOver from hg18 to hg19
plink --bfile hapmap_r23a.sort --recode tab --out hapmap_r23a.tab

# apply liftOverPlink.py to update hg18 to hg19 or hg38
./liftOverPlink.py -m hapmap_r23a.tab.map -p  hapmap_r23a.tab.ped -o hapmap_r23a.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
# for hg38, chain the lifts: take the hg19 output and apply the hg19-to-hg38 chain
./liftOverPlink.py -m hapmap_r23a.hg19.map -p hapmap_r23a.hg19.ped -o hapmap_r23a.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

plink --file hapmap_r23a.hg19 --make-bed --allow-extra-chr --out hapmap2.hg19
plink --file hapmap_r23a.hg38 --make-bed --allow-extra-chr --out hapmap2.hg38
  2. Update hapmap3 from hg18 to hg19 and hg38 with liftOver and plink
# download the liftOver binary, chain files, and the liftOverPlink.py script
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
wget https://raw.githubusercontent.com/Shicheng-Guo/GscPythonUtility/master/liftOverPlink.py

# download hapmap3 data in plink format
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/relationships_w_pops_051208.txt
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.ped.bz2
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2

# recode the hg18 data to ped/map, then lift to hg19 and hg38
plink --file hapmap3_r1_b36_fwd_consensus.qc.poly.recode --recode --out hapmap3.hg18
./liftOverPlink.py -m hapmap3.hg18.map -p hapmap3.hg18.ped -o hapmap3.hg19 -c hg18ToHg19.over.chain.gz -e ./liftOver
# as above, the hg38 lift starts from the hg19 output and uses the hg19-to-hg38 chain
./liftOverPlink.py -m hapmap3.hg19.map -p hapmap3.hg19.ped -o hapmap3.hg38 -c hg19ToHg38.over.chain.gz -e ./liftOver

# convert the lifted ped/map files to binary plink format
plink --file hapmap3.hg19 --make-bed --allow-extra-chr --out hapmap3.hg19
plink --file hapmap3.hg38 --make-bed --allow-extra-chr --out hapmap3.hg38

# hapmap3 data cleaning and filtering
plink --bfile hapmap3.hg19  --missing
plink --bfile hapmap3.hg19 --maf 0.01 --make-bed --indep 50 5 2 --out hapmap3.hg19
plink --bfile hapmap3.hg19 --extract hapmap3.hg19.prune.in --genome --min 0.185
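# run-IBD-QC.pl is the helper script from the Anderson et al. GWAS QC protocol; as far as I recall it reads plink.genome and plink.imiss and flags one member of each related pair (the one with the lower call rate) for removal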
perl ./run-IBD-QC.pl plink
  3. Merge your data with the hapmap2 dataset and prepare the PCA plot
### merge personal data with hapmap2 and hapmap3
# lifted reference panels (local paths):
# /home/guosa/hpc/db/hapmap3/hapmap3.hg19.deking
# /home/guosa/hpc/db/hapmap2/hapmap2.hg19.deking
cd /home/guosa/hpc/db/hapmap2/
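# RA2020hapmap2-merge.missnp is written by an initial --bmerge attempt that fails on variants with 3+ alleles (same pattern as in the hapmap3 section below); exclude those variants from both datasets before re-merging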
plink2 --bfile ~/hpc/db/hapmap2/hapmap2.hg19.deking --exclude RA2020hapmap2-merge.missnp --max-alleles 2 --make-bed --out hapmap2.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap2-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap2.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020hapmap2
plink --bfile RA2020hapmap2 --maf 0.01 --geno 0.1 --make-bed --out RA2020hapmap2Update
plink --bfile RA2020hapmap2Update --make-bed --out RA2020hapmap2UpdatePCA
plink2 --bfile RA2020hapmap2UpdatePCA --pca --threads 31
### R
hapmap2<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap2.pop",head=F)
head(hapmap2)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap2[match(eigenvec[,2],hapmap2[,2]),3])
pop[is.na(pop)]<-"Guanghua"
eigenvec$pop=pop
eigenvec$col=as.numeric(as.factor(pop))
head(eigenvec)
pdf("pca.pdf")
plot(eigenvec[,3],eigenvec[,4],pch=16,col=as.factor(eigenvec$pop),xlab="principal component 1",ylab="principal component 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",pch=16,legend=unique(as.factor(pop)),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1.5)
dev.off()
subset(eigenvec,pop=="Guanghua" & V3<0) # RA478
  4. Merge hapmap3 data with your own data and prepare the PCA plot
cd ~/hpc/rheumatology/RA/he2020/hapmap3
plink --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --bmerge ../RA2020-B8.dbsnp --make-bed --out RA2020hapmap3
plink2 --bfile ~/hpc/db/hapmap3/hapmap3.hg19.deking --exclude RA2020hapmap3-merge.missnp --max-alleles 2 --make-bed --out hapmap3.hg19
plink2 --bfile ../RA2020-B8.dbsnp --max-alleles 2 --exclude RA2020hapmap3-merge.missnp --make-bed --out RA2020.hg19
plink --bfile hapmap3.hg19 --bmerge RA2020.hg19 --make-bed --out RA2020.hapmap3
plink --bfile RA2020.hapmap3 --maf 0.01 --geno 0.1 --make-bed --out RA2020.hapmap3.update
plink --bfile RA2020.hapmap3.update --make-bed --out RA2020.hapmap3.update.pca
plink2 --bfile RA2020.hapmap3.update.pca --pca --threads 31
### R
hapmap3<-read.table("https://raw.githubusercontent.com/Shicheng-Guo/AnnotationDatabase/master/hapmap3.pop",head=T)
head(hapmap3)
eigenvec<-read.table("plink2.eigenvec",head=F)
head(eigenvec)
pop<-as.character(hapmap3[match(eigenvec[,2],hapmap3[,2]),7])
pop[is.na(pop)]<-"SGH"
eigenvec$pop=pop
head(eigenvec)
pdf("Hapmap3.Guanghua.PCA.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principal component 1",ylab="principal component 2",cex.axis=1.5,cex.lab=1.5)
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0) # RA478
pdf("Hapmap3.Guanghua.PCA.fine.pdf")
plot(eigenvec[,3],eigenvec[,4],col=as.factor(eigenvec$pop),pch=as.numeric(as.factor(eigenvec$pop)),xlab="principal component 1",ylab="principal component 2",cex.axis=1.5,cex.lab=1.5,xlim=c(0.006,0.01),ylim=c(-0.0075,0.0025))
legend("topright",legend=unique(as.factor(pop)),pch=unique(as.numeric(as.factor(eigenvec$pop))),col=unique(as.factor(eigenvec$pop)),bty="n",cex=1)
dev.off()
subset(eigenvec,pop=="SGH" & V3<0.0075) # RA478
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075),file="GH.txt",sep="\t",col.names=T,row.names=F,quote=F)
write.table(subset(eigenvec,pop=="SGH" & V3<0.0075)[,1:2],file="pca.iid.remove",sep="\t",col.names=F,row.names=F,quote=F)

Characterizing populations with principal component analysis

2017-06-28 00:00:00 +0000

Here, I characterize populations with principal component analysis.

Data Science in Population Genetics and Medical Genetics

2017-06-13 00:00:00 +0000

Here, I discuss data science in population genetics and medical genetics.

Genetics and epigenetics of rheumatoid arthritis and osteoarthritis

2017-06-10 00:00:00 +0000

Here, I want to summarize the genetic and epigenetic differences between rheumatoid arthritis (RA) and osteoarthritis (OA). OA and RA share certain similarities, such as joint damage… Genetics/Epigenetics

Genetics and epigenetics of pancreatic and cholangiocarcinoma

2017-06-09 00:00:00 +0000

Genetics and epigenetics of esophageal squamous cell carcinoma

2017-05-28 00:00:00 +0000

  • Esophageal squamous cell carcinoma
  • All the figures are used for non-profit education only; please remind me if any infringement happens.

Advances in Genetics and Epigenetics of Hepatocellular Carcinoma

2017-05-02 00:00:00 +0000

  • Advances in genetics and epigenetics of hepatocellular carcinoma
  • All the figures are used for non-profit education only; please remind me if any infringement happens.

How to apply R to prepare fancy figures for manuscripts

2017-05-01 00:00:00 +0000

  • All the figures are used for non-profit education only; please remind me if any infringement happens.

Explaining complex machine learning models with LIME

2017-04-23 00:00:00 +0000

The classification decisions made by machine learning models are usually difficult - if not impossible - to understand by our human brains. The complexity of some of the most accurate classifiers, like neural networks, is what makes them perform so well - often with better results than achieved by humans. But it also makes them inherently hard to explain, especially to non-data scientists.

Especially if we aim to develop machine learning models for medical diagnostics, high accuracies on test samples might not be enough to sell them to clinicians. Doctors and patients alike will be less inclined to trust a decision made by a model that they don’t understand.

Therefore, we would like to be able to explain in concrete terms why a model classified a case with a certain label, e.g. why one breast mass sample was classified as “malignant” and not as “benign”.

Local Interpretable Model-Agnostic Explanations (LIME) is an attempt to make these complex models at least partly understandable. The method has been published in

“Why Should I Trust You?” Explaining the Predictions of Any Classifier. By Marco Tulio Ribeiro, Sameer Singh and Carlos Guestrin from the University of Washington in Seattle

lime is able to explain all models for which we can obtain prediction probabilities (in R, that is every model that works with predict(type = "prob")). It makes use of the fact that linear models are easy to explain because they are based on linear relationships between features and class labels: The complex model function is approximated by locally fitting linear models to permutations of the original training set.

On each permutation, a linear model is fit, with weights chosen so that instances more similar to the original observation carry more influence (positive weights support a decision, negative weights contradict it). This gives an approximation of how much (and in which way) each feature contributed to a decision made by the model.
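
As a rough, hand-rolled illustration of this local-surrogate idea (this is not the lime implementation itself; the toy black-box model, kernel width and sampling scheme below are all assumptions of mine):

# toy black-box model: returns a class probability from two features
black_box <- function(X) plogis(2 * X[, 1] - 3 * X[, 2]^2)

set.seed(42)
x0 <- c(0.5, 0.2)                            # the observation we want to explain
perturbed <- cbind(rnorm(1000, x0[1], 0.5),  # permutations around x0
                   rnorm(1000, x0[2], 0.5))

# proximity weights: permutations closer to x0 count more in the fit
d2 <- rowSums(sweep(perturbed, 2, x0)^2)
w  <- exp(-d2 / 0.25)

# weighted linear surrogate of the black-box predictions around x0
surrogate <- lm(black_box(perturbed) ~ perturbed, weights = w)
coef(surrogate)  # local feature weights, i.e. the "explanation" at x0

The lime package layers feature binning, feature selection and a proper distance kernel on top of this basic recipe and repeats it for every case you ask it to explain.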

The code for lime was originally made available for Python, but the awesome Thomas Lin Pedersen has already created an implementation in R. It is not on CRAN (yet, I assume), but you can install it via GitHub:

devtools::install_github("thomasp85/lime")


The data I am using is the World Happiness Data from my last post. So, let’s train a neural network on this data to predict three classes of the happiness scores: low, medium and high.

load("data_15_16.RData")
# configure multicore
library(doParallel)
cl <- makeCluster(detectCores())
registerDoParallel(cl)

library(caret)
set.seed(42)
index <- createDataPartition(data_15_16$Happiness.Score.l, p = 0.7, list = FALSE)
train_data <- data_15_16[index, ]
test_data  <- data_15_16[-index, ]
set.seed(42)
model_mlp <- caret::train(Happiness.Score.l ~ .,
                         data = train_data,
                         method = "mlp",
                         trControl = trainControl(method = "repeatedcv", 
                                                  number = 10, 
                                                  repeats = 5, 
                                                  verboseIter = FALSE))


The explain function

The central function of lime is lime(). It creates the function that is used in the next step to explain the model’s predictions.

We can give a couple of options. Check the help ?lime for details, but the most important to think about are:

  • Should continuous features be binned? And if so, into how many bins?

Here, I am keeping the default bin_continuous = TRUE but specify 5 instead of 4 (the default) bins with n_bins = 5.

library(lime)

explain <- lime(train_data, model_mlp, bin_continuous = TRUE, n_bins = 5, n_permutations = 1000)


Now, let’s look at how the model is explained. Here, I am not going to look at all test cases but I’m randomly choosing three cases with correct predictions and three with wrong predictions.

pred <- data.frame(sample_id = 1:nrow(test_data),
                   predict(model_mlp, test_data, type = "prob"),
                   actual = test_data$Happiness.Score.l)
pred$prediction <- colnames(pred)[3:5][apply(pred[, 3:5], 1, which.max)]
pred$correct <- ifelse(pred$actual == pred$prediction, "correct", "wrong")

Beware that we need to give our test-set data table row names with the sample names or IDs to be displayed in the header of our explanatory plots below.

library(tidyverse)
pred_cor <- filter(pred, correct == "correct")
pred_wrong <- filter(pred, correct == "wrong")

test_data_cor <- test_data %>%
  mutate(sample_id = 1:nrow(test_data)) %>%
  filter(sample_id %in% pred_cor$sample_id) %>%
  sample_n(size = 3) %>%
  remove_rownames() %>%
  tibble::column_to_rownames(var = "sample_id") %>%
  select(-Happiness.Score.l)

test_data_wrong <- test_data %>%
  mutate(sample_id = 1:nrow(test_data)) %>%
  filter(sample_id %in% pred_wrong$sample_id) %>%
  sample_n(size = 3) %>%
  remove_rownames() %>%
  tibble::column_to_rownames(var = "sample_id") %>%
  select(-Happiness.Score.l)


The explain function from above can now be used with our test samples. Further options we can specify are:

  • How many features do we want to use in the explanatory function?

Let’s say we have a big training set with 100 features. Looking at all features and trying to understand them all could be more confusing than helpful. And very often, a handful of very important features will be enough to predict test samples with a reasonable accuracy (see also my last post on OneR). So, we can choose how many features we want to look at with the n_features option.

  • How do we want to choose these features?

Next, we specify how we want this subset of features to be found. The default, auto, uses forward selection if we choose n_features <= 6 and uses the features with the highest weights otherwise. We can also directly choose feature_select = "forward_selection", feature_select = "highest_weights" or feature_select = "lasso_path". Again, check ?lime for details.

In our example dataset, we only have 7 features and I want to look at the top 5.

I also want to have explanation for all three class labels in the response variable (low, medium and high happiness), so I am choosing n_labels = 3.

explanation_cor <- explain(test_data_cor, n_labels = 3, n_features = 5)
explanation_wrong <- explain(test_data_wrong, n_labels = 3, n_features = 5)

It will return a tidy tibble object that we can plot with plot_features():

plot_features(explanation_cor, ncol = 3)

plot_features(explanation_wrong, ncol = 3)

The information in the output tibble is described in the help function ?lime and can be viewed with

tibble::glimpse(explanation_cor)


So, what does this tell us, now? Let’s look at case 22 (the first row of our plot for correctly predicted classes): This sample has been correctly predicted to come from the medium happiness group because it

  • has a dystopia value between 2.03 & 2.32,
  • a trust/government corruption score below 0.05,
  • a GDP/economy score between 1.06 and 1.28 and
  • a life expectancy score between 0.59 and 0.7.

From the explanation for the label “high” we can also see that this case has a family score bigger than 1.12, which is more representative of high happiness samples.

pred %>%
  filter(sample_id == 22)
##   sample_id        low   medium       high actual prediction correct
## 1        22 0.02906327 0.847562 0.07429938 medium     medium correct

The explanatory function named dystopia as the most strongly supporting feature for this prediction. Dystopia is an imaginary country that has the world’s least-happy people. The purpose in establishing Dystopia is to have a benchmark against which all countries can be favorably compared (no country performs more poorly than Dystopia) in terms of each of the six key variables […]

The explanatory plot tells us for each feature and class label in which range of values a representative data point would fall. If it does, this gets counted as support for this prediction, if it does not, it gets scored as contradictory. For case 22 and the feature dystopia, the data point 2.27 falls within the range for medium happiness (between 2.03 and 2.32) with a high weight.

When we look at where this case falls on the range of values for this feature, we can see that it is indeed very close to the median of medium training cases and further away from the medians for high and low training cases. The other supportive features show us the same trend.

train_data %>%
  gather(x, y, Economy..GDP.per.Capita.:Dystopia.Residual) %>%
  ggplot(aes(x = Happiness.Score.l, y = y)) +
    geom_boxplot(alpha = 0.8, color = "grey") + 
    geom_point(data = gather(test_data[22, ], x, y, Economy..GDP.per.Capita.:Dystopia.Residual), color = "red", size = 3) +
    facet_wrap(~ x, scales = "free", ncol = 4)

An overview of the top 5 explanatory features for case 22 is stored in:

as.data.frame(explanation_cor[1:9]) %>%
  filter(case == "22")
##    case  label label_prob   model_r2 model_intercept
## 1    22 medium 0.84756196 0.05004205       0.5033729
## 2    22 medium 0.84756196 0.05004205       0.5033729
## 3    22 medium 0.84756196 0.05004205       0.5033729
## 4    22 medium 0.84756196 0.05004205       0.5033729
## 5    22 medium 0.84756196 0.05004205       0.5033729
## 6    22   high 0.07429938 0.06265119       0.2293890
## 7    22   high 0.07429938 0.06265119       0.2293890
## 8    22   high 0.07429938 0.06265119       0.2293890
## 9    22   high 0.07429938 0.06265119       0.2293890
## 10   22   high 0.07429938 0.06265119       0.2293890
## 11   22    low 0.02906327 0.07469729       0.3528088
## 12   22    low 0.02906327 0.07469729       0.3528088
## 13   22    low 0.02906327 0.07469729       0.3528088
## 14   22    low 0.02906327 0.07469729       0.3528088
## 15   22    low 0.02906327 0.07469729       0.3528088
##                          feature feature_value feature_weight
## 1              Dystopia.Residual       2.27394     0.14690100
## 2  Trust..Government.Corruption.       0.03005     0.06308598
## 3       Economy..GDP.per.Capita.       1.13764     0.02944832
## 4       Health..Life.Expectancy.       0.66926     0.02477567
## 5                     Generosity       0.00199    -0.01326503
## 6                         Family       1.23617     0.13629781
## 7                     Generosity       0.00199    -0.07514534
## 8  Trust..Government.Corruption.       0.03005    -0.07574480
## 9              Dystopia.Residual       2.27394    -0.07687559
## 10      Economy..GDP.per.Capita.       1.13764     0.07167086
## 11                        Family       1.23617    -0.14932931
## 12      Economy..GDP.per.Capita.       1.13764    -0.12738346
## 13                    Generosity       0.00199     0.09730858
## 14             Dystopia.Residual       2.27394    -0.07464384
## 15 Trust..Government.Corruption.       0.03005     0.06220305
##                                       feature_desc
## 1         2.025072 < Dystopia.Residual <= 2.320632
## 2        Trust..Government.Corruption. <= 0.051198
## 3  1.064792 < Economy..GDP.per.Capita. <= 1.275004
## 4  0.591822 < Health..Life.Expectancy. <= 0.701046
## 5                           Generosity <= 0.123528
## 6                                1.119156 < Family
## 7                           Generosity <= 0.123528
## 8        Trust..Government.Corruption. <= 0.051198
## 9         2.025072 < Dystopia.Residual <= 2.320632
## 10 1.064792 < Economy..GDP.per.Capita. <= 1.275004
## 11                               1.119156 < Family
## 12 1.064792 < Economy..GDP.per.Capita. <= 1.275004
## 13                          Generosity <= 0.123528
## 14        2.025072 < Dystopia.Residual <= 2.320632
## 15       Trust..Government.Corruption. <= 0.051198

In a similar way, we can explore why some predictions were wrong.


If you are interested in more machine learning posts, check out the category listing for machine_learning on my blog.


sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] dplyr_0.5.0       purrr_0.2.2       readr_1.1.0      
##  [4] tidyr_0.6.1       tibble_1.3.0      tidyverse_1.1.1  
##  [7] RSNNS_0.4-9       Rcpp_0.12.10      lime_0.1.0       
## [10] caret_6.0-73      ggplot2_2.2.1     lattice_0.20-35  
## [13] doParallel_1.0.10 iterators_1.0.8   foreach_1.4.3    
## 
## loaded via a namespace (and not attached):
##  [1] lubridate_1.6.0    assertthat_0.2.0   glmnet_2.0-5      
##  [4] rprojroot_1.2      digest_0.6.12      psych_1.7.3.21    
##  [7] R6_2.2.0           plyr_1.8.4         backports_1.0.5   
## [10] MatrixModels_0.4-1 stats4_3.3.3       evaluate_0.10     
## [13] httr_1.2.1         hrbrthemes_0.1.0   lazyeval_0.2.0    
## [16] readxl_0.1.1       minqa_1.2.4        SparseM_1.76      
## [19] extrafontdb_1.0    car_2.1-4          nloptr_1.0.4      
## [22] Matrix_1.2-8       rmarkdown_1.4      labeling_0.3      
## [25] splines_3.3.3      lme4_1.1-12        extrafont_0.17    
## [28] stringr_1.2.0      foreign_0.8-67     munsell_0.4.3     
## [31] hunspell_2.3       broom_0.4.2        modelr_0.1.0      
## [34] mnormt_1.5-5       mgcv_1.8-17        htmltools_0.3.5   
## [37] nnet_7.3-12        codetools_0.2-15   MASS_7.3-45       
## [40] ModelMetrics_1.1.0 grid_3.3.3         nlme_3.1-131      
## [43] jsonlite_1.4       Rttf2pt1_1.3.4     gtable_0.2.0      
## [46] DBI_0.6-1          magrittr_1.5       scales_0.4.1      
## [49] stringi_1.1.5      reshape2_1.4.2     xml2_1.1.1        
## [52] tools_3.3.3        forcats_0.2.0      hms_0.3           
## [55] pbkrtest_0.4-7     yaml_2.1.14        colorspace_1.3-2  
## [58] rvest_0.3.2        knitr_1.15.1       haven_1.0.0       
## [61] quantreg_5.29

How to apply R for Plotting 3D maps and location tracks

2017-04-09 00:00:00 +0000

Recently, I was on Gran Canaria for a vacation. So, what better way to keep up the holiday spirit a while longer than to visualize all the places we went in R!?

I am combining location data collected by

  1. our car GPS,
  2. Google location data from my phone and
  3. the hiking tracks we followed.

Obviously, the data itself is not public this time but the principles for plotting can be used with any .gpx files that contain latitude, longitude and elevation data.


Gran Canaria

Gran Canaria is one of the Spanish Canary Islands off the coast of Morocco. Like its sister islands, it is of volcanic origin.


Car GPS tracks

The GPS tracks in .gpx format were downloaded from the device (Garmin) the same way as I had already done with the tracks from our US/Canada roadtrip last year. Garmin’s .gpx tracks are not in a standard format that could be read with readGPX(), so I had to use htmlTreeParse() from the XML library.

library(XML)

# list of files in directory
myfiles <- list.files(path = "Takeout_2017_March", full.names = TRUE)

# all gpx files in that directory
myfiles <- myfiles[grep(".gpx", myfiles)]

for (i in 1:length(myfiles)) {

    # parse document
    pfile <- htmlTreeParse(myfiles[i], useInternalNodes = T)
  
    # Get all elevations, times and coordinates via the respective xpath
    elevations <- as.numeric(as.character(xpathSApply(pfile, path = "//trkpt/ele", xmlValue)))
    times <- xpathSApply(pfile, path = "//trkpt/time", xmlValue)
    coords <- xpathSApply(pfile, path = "//trkpt", xmlAttrs)
    
    # Extract latitude and longitude from the coordinates
    lats <- as.numeric(as.character(coords["lat",]))
    lons <- as.numeric(as.character(coords["lon",]))
    
    # Put everything in a dataframe
    geodf <- data.frame(lat = lats, lon = lons, ele = elevations, time = times)

    if (i == 1) {
      geodata <- geodf
      } else {
      geodata <- rbind(geodata, geodf)
    }
}

geodata_all <- geodata

# Transforming the time column
geodata_all$time <- as.character(strptime(geodata_all$time, format = "%Y-%m-%dT%H:%M:%SZ"))

# ordering by date
library(dplyr)
geodata_all <- arrange(geodata_all, time)

# keep only data points from Gran Canaria
geodata_gc <- filter(geodata_all, lat < 28.5 & lat > 27.5) %>%
  filter(lon > -16 & lon < -15)


Hiking tracks

The hiking tracks we followed came mostly from a German hiking guide-book, the Rother Wanderführer, 7th edition from 2016. They were in standard .gpx format and could be read with readGPX().

Only one of our hikes did not come from this book, but from Wikiloc. It could be treated the same way as the other hiking tracks, though, so I combined all hiking tracks.

library(plotKML)

# get file names
myfiles <- list.files(path = "hiking", full.names = TRUE)
myfiles <- myfiles[grep(".gpx", myfiles)]

for (i in 1:length(myfiles)) {

  # read in gpx files
  t <- readGPX(myfiles[i])
  
  # extract latitude, longitude and elevation
  geodf <- data.frame(lon = t$tracks[[1]][[1]]$lon,
                      lat = t$tracks[[1]][[1]]$lat,
                      ele = t$tracks[[1]][[1]]$ele,
                      tour = names(t$tracks[[1]]))

    # combine into data frame
    if (i == 1) {
      geodata <- geodf
      } else {
      geodata <- rbind(geodata, geodf)
    }
}

geodata_tracks <- geodata


Google location history

The Google location data from my phone were prepared the same way as in my post about plotting your Google location history.

library(jsonlite)
system.time(x <- fromJSON("Standortverlauf.json"))
# extracting the locations dataframe
loc = x$locations

# converting time column from posix milliseconds into a readable time scale
loc$time = as.POSIXct(as.numeric(x$locations$timestampMs)/1000, origin = "1970-01-01")

# converting longitude and latitude from E7 to GPS coordinates
loc$lat = loc$latitudeE7 / 1e7
loc$lon = loc$longitudeE7 / 1e7

# keep only data from Gran Canaria
loc_gc <- filter(loc, lat < 28.5 & lat > 27.5) %>%
  filter(lon > -16 & lon < -15)


All three datasets had information about the latitude/longitude coordinate pairs and of the elevation of each observation, so I can combine these three attributes from the three location data sources.

geodata_tracks$ele <- as.numeric(as.character(geodata_tracks$ele))
data_combined <- data.frame(lat = c(geodata_gc$lat, loc_gc$lat, geodata_tracks$lat),
                            lon = c(geodata_gc$lon, loc_gc$lon, geodata_tracks$lon),
                            ele = c(geodata_gc$ele, loc_gc$altitude, geodata_tracks$ele),
                            track = c(rep("GPS", nrow(geodata_gc)), rep("Google", nrow(loc_gc)), rep("Hiking", nrow(geodata_tracks))))
data_combined <- data_combined[!duplicated(data_combined), ]


Elevation profile

I downloaded the elevation profile of Gran Canaria with the maptools and raster packages.

library(maptools)
library(raster)
srtm <- getData("SRTM", lon = -15.59972, lat = 27.965)

# crop to Gran Canaria & Tenerife
e1 <- extent(min(data_combined$lon) - 1.2, # xmin
            max(data_combined$lon) + 0.1, # xmax
            min(data_combined$lat) - 0.1, # ymin
            max(data_combined$lat) + 0.5) # ymax

srtm_ct <- crop(srtm, e1)

# plot slope and aspect with hill shades
slope <- terrain(srtm_ct, opt = "slope")
aspect <- terrain(srtm_ct, opt = "aspect")
hill <- hillShade(slope, aspect, angle = 45, direction = 45, normalize = TRUE)
plot(hill, col = grey(0:100/100), legend = FALSE)
plot(srtm_ct, col = rainbow(25, alpha = 0.35), add = TRUE)

Its sister island to the west, Tenerife, is a bit larger and, as can be seen in the image above, its highest point, the Teide, is about 1000 meters higher than Gran Canaria’s highest point.


# crop to Gran Canaria
e2 <- extent(min(data_combined$lon) - 0.2, # xmin
            max(data_combined$lon) + 0.1, # xmax
            min(data_combined$lat) - 0.1, # ymin
            max(data_combined$lat) + 0.1) # ymax

srtm_c <- crop(srtm, e2)

slope <- terrain(srtm_c, opt = "slope")
aspect <- terrain(srtm_c, opt = "aspect")
hill <- hillShade(slope, aspect, angle = 45, direction = 45, normalize = TRUE)
plot(hill, col = grey(0:100/100), legend = FALSE)
plot(srtm_c, col = rainbow(25, alpha = 0.35), add = TRUE)

Looking only at Gran Canaria’s slope and aspect profile, the plots nicely show its highest point (almost 2000 m) in the center of the island and that it slopes down towards sea level at the coast.


ggmap

I centered the map at the midpoint of Gran Canaria, which is given by http://www.travelmath.com/island/Gran+Canaria as 27° 57’ 54” N / 15° 35’ 59” W. Converted to decimal with http://www.rapidtables.com/convert/number/degrees-minutes-seconds-to-degrees.htm that is 27.965 & 15.59972.
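
For reference, the degrees-minutes-seconds conversion is easy to do directly in R as well (a small helper of my own, not part of the original workflow):

# degrees, minutes, seconds to decimal degrees
dms2dec <- function(d, m, s) d + m / 60 + s / 3600
dms2dec(27, 57, 54)  # 27.965    (27° 57' 54" N)
dms2dec(15, 35, 59)  # 15.59972  (15° 35' 59" W, used as -15.59972 below)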

library(ggplot2)
library(ggmap)

map_theme <- list(theme(legend.position = "top",
                        panel.grid.minor = element_blank(),
                        panel.grid.major = element_blank(),
                        panel.background = element_blank(),
                        plot.background = element_rect(fill = "white"),
                        panel.border = element_blank(),
                        axis.line = element_blank(),
                        axis.text.x = element_blank(),
                        axis.text.y = element_blank(),
                        axis.ticks = element_blank(),
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        plot.title = element_text(size = 18)))
map <- get_map(c(lon = -15.59972, lat = 27.965), zoom = 10, maptype = "terrain-background", source = "google")

ggmap(map) + 
  geom_point(data = data_combined, aes(x = lon, y = lat, color = track), alpha = 0.3) +
  scale_color_brewer(palette = "Set1") +
  map_theme

The above plot shows the different location data sources. Our car GPS recorded the most data points along our routes (blue), while Google location data (red) is only recorded occasionally (usually, when I turn on my phone, so it primarily shows places we walked around in). The hiking tracks in green show the 7 hikes we’ve followed.


map <- get_map(c(lon = -15.59972, lat = 27.965), zoom = 10)

ggmap(map) + 
  geom_point(data = na.omit(data_combined), aes(x = lon, y = lat, color = ele)) +
  scale_color_gradient2(low = "lightblue", mid = "blue", high = "red", name = "Elevation") +
  map_theme

This plot shows the elevation of the location points we visited.


3D plots

Because the 2D plot doesn’t show the elevation very clearly, I wanted to plot this in 3D.

library(scatterplot3d)

# http://gis.stackexchange.com/questions/142156/r-how-to-get-latitudes-and-longitudes-from-a-rasterlayer
r.pts <- rasterToPoints(srtm_c, spatial = TRUE)
geo.prj <- proj4string(r.pts)
r.pts <- spTransform(r.pts, CRS(geo.prj)) 
bg_matrix <- data.frame(lon = coordinates(r.pts)[,1],
                         lat = coordinates(r.pts)[,2])

ex_bg <- extract(srtm_c, bg_matrix, cellnumbers = TRUE, df = TRUE)
bg_matrix$ele <- ex_bg$srtm_33_07

ex_points <- extract(srtm_c, cbind(data_combined$lon, data_combined$lat), cellnumbers = TRUE, df = TRUE)

s3d <- scatterplot3d(x = bg_matrix[, 1], 
                     y = bg_matrix[, 2], 
                     z = bg_matrix[, 3], 
                     type = "p", 
                     pch = 16, 
                     highlight.3d = TRUE, 
                     cex.symbols = 0.5, 
                     box = FALSE,
                     grid = FALSE,
                     xlab = "Longitude",
                     ylab = "Latitude",
                     zlab = "Elevation")
s3d$points3d(x = data_combined$lon, y = data_combined$lat, z = ex_points$srtm_33_07, col = "blue", pch = 16, cex = 0.1)

The first 3D plot shows a scatterplot of our location points on the elevation profile of Gran Canaria. It was created with scatterplot3d.


But I also wanted to be able to see the 3D plot from different angles, so I used rgl to make interactive 3D plots.

library(rgdal)
library(rasterVis)
library(rgl)
library(htmlwidgets)
options(rgl.printRglwidget = TRUE)

open3d()
plot3D(srtm_c,  maxpixels = 7e4)


Click to open the interactive 3D image and use the mouse to zoom and turn the plot:

3dplot1

This interactive 3D plot shows the elevation profile of the whole island and you can clearly see the various ravines that have been shaped over the years.

Now, I would have liked to plot our location points directly onto this plot, but I couldn’t figure out how to do this (if someone knows, please let me know). Instead, I plotted only our location points in 3D.


options(rgl.printRglwidget = TRUE)
open3d()
plot3d(data_combined$lon, data_combined$lat, ex_points$srtm_33_07, 
       xlab = "", ylab = "", zlab = "",
       col = "blue", size = 5, alpha = 0.1,
       lit = TRUE,
       box = FALSE, axes = FALSE)


Click to open the interactive 3D image and use the mouse to zoom and turn the plot:

3dplot2

Here as well, I would have liked to have the elevation profile of the island behind it, but the code that worked well within RStudio (see below) did not plot the background points in my html output. I assume that it has to do with having too many points, so if somebody knows a way to reduce the resolution, please let me know!

plot3d(bg_matrix$lon, bg_matrix$lat, bg_matrix$ele, 
       xlab = "", ylab = "", zlab = "",
       col = "grey", size = 5, alpha = 0.1,
       box = FALSE, axes = FALSE, useNULL = TRUE)
plot3d(data_combined$lon, data_combined$lat, ex_points$srtm_33_07, 
       col = "blue", add = TRUE, size = 10, alpha = 0.5, useNULL = TRUE) 


sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scatterplot3d_0.3-38 htmlwidgets_0.8      rgl_0.98.1          
##  [4] rasterVis_0.41       latticeExtra_0.6-28  RColorBrewer_1.1-2  
##  [7] lattice_0.20-34      rgdal_1.2-5          raster_2.5-8        
## [10] maptools_0.9-2       sp_1.2-4            
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.9       knitr_1.15.1      magrittr_1.5     
##  [4] xtable_1.8-2      viridisLite_0.2.0 R6_2.2.0         
##  [7] stringr_1.2.0     tools_3.3.3       parallel_3.3.3   
## [10] grid_3.3.3        htmltools_0.3.5   yaml_2.1.14      
## [13] rprojroot_1.2     digest_0.6.12     shiny_1.0.0      
## [16] mime_0.5          evaluate_0.10     rmarkdown_1.3    
## [19] stringi_1.1.2     backports_1.0.5   jsonlite_1.2     
## [22] httpuv_1.3.3      foreign_0.8-67    hexbin_1.27.1    
## [25] zoo_1.7-14