Nothing
# Calculate distance (single number) between two sets of exchangeable unlabeled trees.
# Hence, the distance calculated does not depend in the ordering/labeling of tips in each tree, nor on the order of trees in each tree set.
# The trees may include multifurcations and monofurcations.
# If normalized==TRUE, the distance will always be within 0 and 1. However, it is no longer guaranteed that the normalized distance function satisfies the triangle inequality, as required for actual metrics.
forest_distance = function( treesA, # list of phylo trees
treesB, # list of phylo trees
metric = "WassersteinNodeAges", # which distance function to use
combine = "mean_pairwise", # how the pairwise distances between trees should be combined to form a single distance between the forests.
normalized = FALSE,
NLeigenvalues = 10){ # number of top eigenvalues to consider of the Laplacian Spectrum (e.g., for the metric "WassersteinLaplacianSpectrum"). If <=0, all eigenvalues are considered, which can substantially increase computation time for large trees.
NtreesA = length(treesA)
NtreesB = length(treesB)
distances = matrix(0, ncol=NtreesA, nrow=NtreesB)
intermediatesA = vector(mode="list", NtreesA) # temporary storage for any intermediate results for treesA (will be filled as we go)
intermediatesB = vector(mode="list", NtreesB) # temporary storage for any intermediate results for treesB (will be filled as we go)
for(a in seq_len(NtreesA)){
treeA = treesA[[a]]
NcladesA = length(treeA$tip.label) + treeA$Nnode
for(b in seq_len(NtreesB)){
treeB = treesB[[b]]
NcladesB = length(treeB$tip.label) + treeB$Nnode
if(metric=="WassersteinNodeAges"){
# First Wasserstein ("1-Wasserstein") distance between the empirical distributions of node ages
# This distance depends on branch lengths, but not on tip labeling nor topology (only on the ages of the nodes)
# Hence, this is strictly speaking only a pseudometric, even in the space of unlabeled trees. It is a metric in the space of tree equivalence classes, where two trees are equivalent iff they have the same node ages.
if(is.null(intermediatesA[[a]])){
root_ageA = get_tree_span(treeA)$max_distance
clade_agesA = root_ageA - get_all_distances_to_root(treeA)
intermediatesA[[a]] = list(root_age=root_ageA, clade_ages=clade_agesA)
}else{
root_ageA = intermediatesA[[a]]$root_age
clade_agesA = intermediatesA[[a]]$clade_ages
}
if(is.null(intermediatesB[[b]])){
root_ageB = get_tree_span(treeB)$max_distance
clade_agesB = root_ageB - get_all_distances_to_root(treeB)
intermediatesB[[b]] = list(root_age=root_ageB, clade_ages=clade_agesB)
}else{
root_ageB = intermediatesB[[b]]$root_age
clade_agesB = intermediatesB[[b]]$clade_ages
}
if(normalized){
# rescale time so that it is relative to the oldest root_age, and thus all clade ages are numbers between 0 and 1
# this also means that the 1st-Wasserstein distance will be between 0 and 1
max_root_age = max(root_ageA,root_ageB)
clade_agesA = clade_agesA/max_root_age
clade_agesB = clade_agesB/max_root_age
}
distances[a,b] = first_Wasserstein_distance(clade_agesA[(length(treeA$tip.label)+1):length(clade_agesA)], clade_agesB[(length(treeB$tip.label)+1):length(clade_agesB)])
}else if(metric=="WassersteinLaplacianSpectrum"){
# First Wasserstein ("1-Wasserstein") distance between the eigenspectra of the modified graph Laplacians
# This distance depends on topology and branch lengths, but not on tip labeling nor on the rooting
# Hence, this is strictly speaking a metric in the space of unrooted unlabeled trees, but not a metric in the space of labeled trees.
# This metric is similar to that proposed by Lewitus and Morlon (2016, Systematic Biology. 65:495-507), with the difference that the latter calculates a smoothened version of the spectrum (via a Gaussian convolution kernel) and then calculates the Kullback-Leibler divergence between the smoothened densities.
# Note that if not all eigenvalues are used (i.e. NLeigenvalues>0 and NLeigenvalues<max(NcladesA,NcladesB)), then this is not even a metric on the space of unlabeled unrooted trees.
if(is.null(intermediatesA[[a]])){
LaplacianA = weighted_graph_Laplacian_of_tree(treeA, sparse=((NLeigenvalues>0) && (NLeigenvalues<NcladesA)))
if((NLeigenvalues>0) && (NLeigenvalues<NcladesA)){
spectrumA = RSpectra::eigs(A=LaplacianA, k=NLeigenvalues, which="LM", opts=list(retvec=FALSE))$values
}else{
spectrumA = eigen(x=LaplacianA, symmetric=TRUE, only.values=TRUE)$values
}
intermediatesA[[a]] = list(spectrum=spectrumA)
}else{
spectrumA = intermediatesA[[a]]$spectrum
}
if(is.null(intermediatesB[[b]])){
LaplacianB = weighted_graph_Laplacian_of_tree(treeB, sparse=((NLeigenvalues>0) && (NLeigenvalues<NcladesB)))
if((NLeigenvalues>0) && (NLeigenvalues<NcladesB)){
spectrumB = RSpectra::eigs(A=LaplacianB, k=NLeigenvalues, which="LM", opts=list(retvec=FALSE))$values
}else{
spectrumB = eigen(x=LaplacianB, symmetric=TRUE, only.values=TRUE)$values
}
intermediatesB[[b]] = list(spectrum=spectrumB)
}else{
spectrumB = intermediatesB[[b]]$spectrum
}
if(normalized){
# rescale eigenvalues so that they are relative to the largest eigenvalue of both trees; hence, all rescaled eigenvalues will be within [0,1]
# this also means that the 1st-Wasserstein distance will be between 0 and 1
largest_eigenvalue = max(spectrumA, spectrumB)
spectrumA = spectrumA / largest_eigenvalue
spectrumB = spectrumB / largest_eigenvalue
}
# calculate first Wasserstein distance between the two spectra
distances[a,b] = first_Wasserstein_distance(spectrumA, spectrumB)
}else{
stop(sprintf("Unknown metric '%s'",metric))
}
}
}
# combine pairwise distances
if(combine=="mean_pairwise"){
distance = mean(distances)
}else if(combine=="max_pairwise"){
distance = max(distances)
}else{
return(list(succes=FALSE, error=sprintf("forest_distance: Unknown option '%s' for combine",combine)))
}
return(list(success = TRUE,
distance = distance,
distances = distances))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.