How to apply LineageHomology on a phylogeny with tips observed in two geographical locations. ================
#Load needed packages
library(LineageHomology)
library(ggplot2)
library(scales)
library(lubridate)
#Loading other packages for simulating data.
library(ape)
library(phytools)
library(phangorn)
library(BactDating)
#Pipe operator
library(dplyr)
Here we use the R packages BactDating and Phytools to simulate time dated geographical data on a phylogeny with two locations. We let the geographical states represent Norway (represented by red colour) and the rest of the World (RoW) (represented by blue colour).
set.seed(400)
tree_test = BactDating::simdatedtree(nsam=300, dateroot=2000) #300 taxa and date of the root at year 2000-
tree_test = ladderize(tree_test) #Reorder the tree to make it look nice
Q=matrix(c(0.5,0.5,0.5,0.5), nrow=2,ncol=2, byrow=F) #Set up a transition matrix for trait simulation
colnames(Q)=c("Norway","RoW") #From state 1 to state 2.
rownames(Q)=c("Norway","RoW") #From state 1 to state 2.
trait = phytools::sim.Mk(tree=tree_test,Q=Q,nsim=1) #Simulate the traits on the phylogeny
#Reconstruct ancestral states using ace.
fit1 = ace(x=trait, phy= tree_test, type="discrete", mod="ARD") #Estimate the ancestral history
plot.phylo(tree_test,lwd=2,label.offset = 0.15, mar=c(0.2,0.2,0.2,0.2),cex=0.3) #Ploy phylogeny
axisPhylo(root.time=2000, backward=F) #Add time axis
nodelabels(pie=fit1$lik.anc,cex=0.2,piecol=c("Red","Blue"))
tips = to.matrix(trait,seq=c("Norway", "RoW"))
tiplabels(pie=tips, cex=0.2,piecol=c("Red","Blue"))
The pie charts on the nodes of the phylogeny represent the probability of the node being in the different geographical locations, where red represents Norway, and blue represents RoW. LineageHomology uses these probabilities to count connected groups of tips, where all the nodes on the paths between them have a probability that > 50% for the same geographical state. LineageHomology also returns the number of tips that are not connected to any other tips in this way. Following du Plessis et al. (2021) (DOI: 10.1126/science.abf2946) we refer to the groups as transmission lineages (TLs) and isolated tips as singletons.
Next we run LineageHomology. The inputs data should take this format.
tree_test
#>
#> Phylogenetic tree with 300 tips and 299 internal nodes.
#>
#> Tip labels:
#> t214, t151, t158, t56, t40, t13, ...
#>
#> Rooted; includes branch lengths.
head(fit1$lik.anc)
#> Norway RoW
#> 301 0.19739205 0.8026080
#> 302 0.09659401 0.9034060
#> 303 0.09487369 0.9051263
#> 304 0.69688117 0.3031188
#> 305 0.43888708 0.5611129
#> 306 0.42001122 0.5799888
head( to.matrix(trait, seq=c("Norway", "RoW")))
#> Norway RoW
#> t214 1 0
#> t151 0 1
#> t158 0 1
#> t56 0 1
#> t40 1 0
#> t13 0 1
Note that the node probabilities of the tips must have row names that match the tips of the phylogeny.
Result = LineageHomology(tree_test, ace_nodes=fit1$lik.anc,
ace_tips = to.matrix(trait, seq=c("Norway", "RoW")), start_time=2000)
LineageHomology returns multiple values
names(Result)
#> [1] "Import_LocalTrans" "Lineage_sizes" "Taxa_names"
#> [4] "MRCA's" "lineage_state" "Halfedge_over_tmrca"
Result$Import_LocalTrans
#> [1] 120 180
Result$lineage_state
#> Norway RoW Norway Norway Norway Norway RoW RoW Norway RoW Norway
#> 1 2 1 1 1 1 2 2 1 2 1
#> RoW Norway RoW RoW RoW RoW Norway Norway Norway Norway Norway
#> 2 1 2 2 2 2 1 1 1 1 1
#> RoW Norway RoW Norway Norway Norway RoW Norway Norway Norway Norway
#> 2 1 2 1 1 1 2 1 1 1 1
#> Norway RoW Norway RoW Norway RoW RoW Norway Norway RoW RoW
#> 1 2 1 2 1 2 2 1 1 2 2
#> Norway RoW Norway Norway Norway Norway RoW Norway Norway Norway Norway
#> 1 2 1 1 1 1 2 1 1 1 1
#> RoW RoW Norway Norway Norway Norway Norway Norway Norway RoW Norway
#> 2 2 1 1 1 1 1 1 1 2 1
#> Norway Norway RoW Norway RoW Norway Norway Norway RoW Norway RoW
#> 1 1 2 1 2 1 1 1 2 1 2
#> Norway RoW Norway Norway RoW Norway Norway RoW RoW RoW Norway
#> 1 2 1 1 2 1 1 2 2 2 1
#> RoW RoW Norway RoW Norway RoW Norway RoW RoW Norway RoW
#> 2 2 1 2 1 2 1 2 2 1 2
#> Norway Norway RoW Norway RoW RoW RoW Norway Norway RoW RoW
#> 1 1 2 1 2 2 2 1 1 2 2
#> RoW RoW Norway RoW Norway RoW RoW Norway Norway RoW
#> 2 2 1 2 1 2 2 1 1 2
Result$Lineage_sizes
#> [1] 1 9 1 2 1 8 18 1 3 1 1 1 2 1 5 7 1 3 1 8 2 1 2 1 1
#> [26] 1 1 2 2 1 2 1 1 1 1 2 1 1 1 1 23 2 1 43 1 3 1 2 2 1
#> [51] 2 1 2 1 8 1 1 3 1 1 1 1 1 4 1 1 1 2 1 1 1 1 2 1 1
#> [76] 2 1 4 2 3 2 4 1 1 1 3 2 1 4 2 2 2 1 1 1 1 2 3 2 1
#> [101] 1 2 2 2 2 4 2 1 1 1 1 1 1 1 2 1 1 1 2 1
head(Result$Taxa_names)
#> $`Lineage no: 1`
#> [1] "t214"
#>
#> $`Lineage no: 2`
#> [1] "t151" "t158" "t56" "t13" "t78" "t67" "t121" "t179" "t283"
#>
#> $`Lineage no: 3`
#> [1] "t40"
#>
#> $`Lineage no: 4`
#> [1] "t173" "t237"
#>
#> $`Lineage no: 5`
#> [1] "t90"
#>
#> $`Lineage no: 6`
#> [1] "t298" "t77" "t142" "t141" "t18" "t51" "t12" "t145"
Result$`MRCA's`
#> [1] 2007.930 2003.575 2008.290 2008.563 2010.003 2002.453 2000.000 2012.839
#> [9] 2008.899 2011.696 2015.350 2018.117 2017.801 2020.377 2016.547 2010.997
#> [17] 2017.068 2014.607 2012.965 2013.090 2012.813 2012.869 2013.422 2014.381
#> [25] 2014.017 2015.072 2012.170 2008.436 2009.600 2008.405 2010.026 2012.277
#> [33] 2011.058 2008.982 2014.212 2007.166 2008.433 2010.349 2009.121 2010.671
#> [41] 2000.831 2002.355 2002.036 2005.762 2015.719 2009.045 2013.681 2008.684
#> [49] 2014.641 2017.424 2018.459 2023.799 2011.636 2013.835 2009.699 2013.235
#> [57] 2014.098 2011.071 2013.331 2013.415 2013.012 2014.263 2021.411 2017.157
#> [65] 2018.212 2009.985 2014.863 2016.249 2017.649 2011.083 2014.290 2016.099
#> [73] 2011.329 2012.356 2013.407 2009.467 2010.733 2012.375 2002.329 2004.124
#> [81] 2004.303 2006.608 2008.731 2009.043 2005.787 2012.925 2014.917 2014.136
#> [89] 2015.214 2018.742 2017.105 2015.514 2013.682 2016.469 2015.963 2015.299
#> [97] 2014.016 2010.969 2013.198 2017.861 2016.219 2009.868 2008.353 2011.624
#> [105] 2002.336 2007.166 2010.421 2008.355 2006.529 2007.034 2010.135 2012.856
#> [113] 2009.975 2009.423 2013.700 2013.159 2008.576 2011.045 2009.292 2012.001
Result$lineage_state
#> Norway RoW Norway Norway Norway Norway RoW RoW Norway RoW Norway
#> 1 2 1 1 1 1 2 2 1 2 1
#> RoW Norway RoW RoW RoW RoW Norway Norway Norway Norway Norway
#> 2 1 2 2 2 2 1 1 1 1 1
#> RoW Norway RoW Norway Norway Norway RoW Norway Norway Norway Norway
#> 2 1 2 1 1 1 2 1 1 1 1
#> Norway RoW Norway RoW Norway RoW RoW Norway Norway RoW RoW
#> 1 2 1 2 1 2 2 1 1 2 2
#> Norway RoW Norway Norway Norway Norway RoW Norway Norway Norway Norway
#> 1 2 1 1 1 1 2 1 1 1 1
#> RoW RoW Norway Norway Norway Norway Norway Norway Norway RoW Norway
#> 2 2 1 1 1 1 1 1 1 2 1
#> Norway Norway RoW Norway RoW Norway Norway Norway RoW Norway RoW
#> 1 1 2 1 2 1 1 1 2 1 2
#> Norway RoW Norway Norway RoW Norway Norway RoW RoW RoW Norway
#> 1 2 1 1 2 1 1 2 2 2 1
#> RoW RoW Norway RoW Norway RoW Norway RoW RoW Norway RoW
#> 2 2 1 2 1 2 1 2 2 1 2
#> Norway Norway RoW Norway RoW RoW RoW Norway Norway RoW RoW
#> 1 1 2 1 2 2 2 1 1 2 2
#> RoW RoW Norway RoW Norway RoW RoW Norway Norway RoW
#> 2 2 1 2 1 2 2 1 1 2
The simplest method to visualize the results from LineageHomology is to use a treemap plot. Treemap_lineagehomology uses the treemap package to visualize transmission lineages as rectangles with areas that reflect their relative sizes. Additionally, the text on each square gives the name of group number of the transmission lineage, the estimated time of the mrca (TMRCA) of the transmission lineage and the number of tips in the group (S)
LineageHomology::treemap_lineagehomology(Result)
The accumulation of tips in a lineage over time can be visualized by using density plots. Before using the plotting methods below, we need to run the processing function lineage_info. lineage_info takes the results from LineageHomology and names and dates observed dates of the tips in the tree. The formats of these should match the ones shown below.
#Set up matrix with taxa info
name_date = data.frame(name = names(trait), dates= BactDating::leafDates(tree_test))
head(name_date)
#> name dates
#> 1 t214 2007.930
#> 2 t151 2005.666
#> 3 t158 2008.698
#> 4 t56 2008.041
#> 5 t40 2008.290
#> 6 t13 2006.730
Result_lineage_info = LineageHomology::lineage_info(Result,name_date)
Ridgeplot_lineagedensities takes the input from the function lineage_info and produces density plots using for each TL over time. The density plots are made using the ggridges R-package. The TLs are sorted from largest to smallest and can be coloured according to the state that defines the TL.
LineageHomology::ridgeplot_lineagedensities(Result_lineage_info=Result_lineage_info,groups_larger_than = 1,datelims=c("2000-01-01","2025-01-01","3 year"),color_by_state = F)
LineageHomology::ridgeplot_lineagedensities(Result_lineage_info=Result_lineage_info,groups_larger_than = 1,datelims=c("2000-01-01","2025-01-01","3 year"),color_by_state = T)
LineageHomology::ridgeplot_lineagedensities(Result_lineage_info=Result_lineage_info,groups_larger_than = 4,datelims=c("2000-01-01","2025-01-01","3 year"),color_by_state = T)
LineageHomology::ridgeplot_lineagedensities(Result_lineage_info=Result_lineage_info,groups_larger_than = 10,datelims=c("2000-01-01","2025-01-01","3 year"),color_by_state = T)
To visualize growth patterns over time, LineageHomology provides a function for plotting the cumulative number of observations in each TL separately.
LineageHomology::lineage_growth_cumulative(Result_lineage_info = Result_lineage_info, datelims=c("2000-01-01","2025-01-01","3 year"))
LineageHomology::lineage_growth_cumulative(Result_lineage_info = Result_lineage_info, datelims=c("2000-01-01","2025-01-01","3 year"),color_by_state = T)
In all the figures above, the transmission lineage division was done by dividing lineages at points where a transition is estimated with more than 50 percent probability. If one wishes to divide the transmission lineages by sampling from the observed probabilities, one can use the function LineageHomology_w_uncertainty (or v2, which is faster). This retains the uncertainty in the phylogeographical mapping. Here we visualize the difference in the assignment of TLs in two runs on the same mapping by using treemap plots.
Result1 = LineageHomology_w_uncertainty_v2(tree_test, ace_nodes=fit1$lik.anc,
ace_tips = to.matrix(trait, seq=c("Norway", "RoW")), start_time=2000)
LineageHomology::treemap_lineagehomology(Result1)
Result2 = LineageHomology_w_uncertainty_v2(tree_test, ace_nodes=fit1$lik.anc,
ace_tips = to.matrix(trait, seq=c("Norway", "RoW")), start_time=2000)
LineageHomology::treemap_lineagehomology(Result2)
As these figures show, the division of lineages will vary according to the sampled states of some of the nodes. This can be useful for producing confidence limits on, e.g. the size of the largest lineage.
Here we use pbreplicate to counts transmission lineages probabilistically many times to obtain confidence intervals, or, in this case, a distribution) for the size of the largest transmission lineage.
library(pbapply)
multi_counts = pbreplicate(
1000,
LineageHomology::LineageHomology_w_uncertainty_v2(
tree=tree_test,
ace_nodes=fit1$lik.anc,
ace_tips = to.matrix(trait, seq=c("Norway", "RoW")),
start_time = 2000
)
)
largest_lineage = c()
for(i in 1:1000) {
largest_lineage = c(largest_lineage, max(multi_counts[,i]$Lineage_sizes))
}
hist(largest_lineage, breaks=100, xlim=c(0,40), probability = F, main="Size of largest transmission lineage",xlab="Size", ylab="Probability")
As shown above, in the given mapping, we are confident that the largest transmission lineage contains no more than 33 tips.
LineageHomology can also be used to estimate importation and local transmission over time from the phylogeographic mapping. Here we focus on Norway and run LineageHomology on the observed tips in Norway before estimating the contribution of importation and local transmission events over time in Norway.
library(pbapply)
Norwegian_tips = names(trait[trait=="Norway"])
head(Norwegian_tips)
#> [1] "t214" "t40" "t173" "t237" "t90" "t298"
multi_counts = pbreplicate(
1000,
LineageHomology::LineageHomology_w_uncertainty_v2(
tree=tree_test,
give_tips = Norwegian_tips,
ace_nodes=fit1$lik.anc,
ace_tips = to.matrix(trait, seq=c("Norway", "RoW")),
start_time = 2000
)
)
#Count number of local transmission and importations in half-year (0.5) intervals.
result_import_local_transmission = import_local_transmission(tree = tree_test,LineageHomology_replicates = multi_counts, start_time = 2000,time_interval_size = 0.5)
We can summarise the number of tips estimated to be a result of importation events and local transmission events.
Summarize_import_local_transmission(LineageHomology_replicates = multi_counts)
#> 2.5% 50% 97.5%
#> Import 77.9750000 86.000000 94.0000000
#> Local transmission 53.0000000 61.000000 69.0250000
#> Import / Total 0.5304422 0.585034 0.6394558
Thus in the phylogeographic mapping above, there seem to be slightly more importation events in Norway (58%) than local transmission events. Lastly, we can plot the relative contribution of importation and local transmission in time intervals spanning half a year.
plot_importation_local_transmission(tree = tree_test, result_import_local_transmission = result_import_local_transmission,start_time=2000, time_interval_size = 0.5, date_breaks = "2 years",importation_or_local = "both") #Use importation_or_local = "importation" or "local" if you do not have the grid package installed.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.