In ms609/TreeDistData: Analysis of Phylogenetic Tree Distance Measures

origPar <- par()
library('TreeDistData')
data("distanceDistribution25", package = "TreeDistData")
# No point in plotting both MAST and MASTI, as they're perfect rank correlates
exclude <- c('nni_t', 'nni_u', 'nni_l', 'nni_L', 'nni_U', 'masti')
distanceDistribution25 <- distanceDistribution25[!rownames(distanceDistribution25) %in% exclude, ]
data("distanceDistribution50", package = "TreeDistData")
distanceDistribution50 <- distanceDistribution50[!rownames(distanceDistribution50) %in% exclude, ]
toCorrelate <- c(
                 "es", "kc",
                 "path",
                 "qd", 
                 "ms", "msid", 
                 "pid", "cid", "nye",
                 "jco2", "jnc2", "jco4", "jnc4",
                 "icrf", "rf",
                 "spr", "tbr_l", "tbr_u", "mast" # Integers
                 )
if (!all(rownames(distanceDistribution50) %in% toCorrelate)) {
  warning("Missing ", rownames(distanceDistribution50)[!rownames(distanceDistribution50) %in% toCorrelate])
}
colScale <- colorspace::sequential_hcl(1000, palette = 'viridis')

CorrPlot <- function (distribution, methodA, methodB) {
  a <- distribution[methodA, ]
  b <- distribution[methodB, ]
  plot(a ~ b, pch = '.', col = '#00000033', axes = FALSE, ylab = '', xlab = '')
  reg <- lm(a ~ b)
  abline(reg, col = '#ff3333')
}

FormatR2 <- function (r2) {
  if (abs(r2) < 1e-3) {
    pow <- floor(log10(abs(r2)))
    paste0(signif(r2, 2) * 10^-pow, "e\U2212", -pow)
  } else {
    format(r2, digits = 2, nsmall = 2)
  }
}

PlotR2 <- function (distribution, methodA, methodB) {
  a <- distribution[methodA, ]
  b <- distribution[methodB, ]
  plot.new()
  r2 <- cor(a, b) ^ 2
  regCol <- colScale[max(ceiling((r2 - 0.5) * 2000), 1L)]
  text(0.5, 0.5, FormatR2(r2), col = regCol, cex = if (r2 < 1e-3) 1.1 else 1.4)
}

PlotCorr <- function (distribution) {
  par(mfrow = length(toCorrelate) + c(0L, 0L), mar = rep(0.2, 4))
  lapply(seq_along(toCorrelate), function(i)
    lapply(seq_along(toCorrelate), function(j) {
      if (i == j) {
        plot.new()
        method <- toCorrelate[i]
        text(0.5, 0.5, tdBoxAbbrevs[method],
             col = TreeDistCol(method), cex = 0.9)
        box(which = 'plot', col = TreeDistCol(method), lwd = 3)
      } else if (i < j) {
        PlotR2(distribution, toCorrelate[i], toCorrelate[j])
      } else {
        CorrPlot(distribution, toCorrelate[i], toCorrelate[j])
      }
    })) -> XX
}

```{R metric-correlation, echo=FALSE, fig.width=7, fig.asp=9/8, out.width='98%', fig.align='center', warning=FALSE} PlotCorr(distanceDistribution25)

**Supplementary Figure 1. Correlation (r²) between distances for 10&nbsp;000 pairs of 25-leaf trees**

```{R metric-correlation-50, echo=FALSE, fig.align='center', fig.asp=9/8, fig.width=7, warning=FALSE, out.width='98%'}
PlotCorr(distanceDistribution50)

Supplementary Figure 2. Correlation (r²) between distances for 10 000 pairs of 50-leaf trees

Not plotted:

MASTI, which correlates perfectly with MAST;
NNI: both upper and lower bounds correlate almost perfectly with RF, due to the method of calculating the approximation.