In ms609/TreeDistData: Analysis of Phylogenetic Tree Distance Measures

if (requireNamespace('benchmarkme', quietly = TRUE)) {
  cpu <- benchmarkme::get_cpu()
  ram <- benchmarkme::get_ram()
} else {
  cpu <- list(model_name = "UNKNOWN", no_of_cores = "UNKNOWN")
  ram <- "UNKNOWN"
}

origPar <- par()

# Note: For maximum speed, check compiler options in ~/.R/Makevars when building 
# the package.  Suggest setting PKG_CXXFLAGS = -Ofast -march=native -mtune=native 
library('TreeDist')
path.dist <- phangorn::path.dist
SPR.dist <- phangorn::SPR.dist
TBRDist <- TBRDist::TBRDist
RF <- TreeDist::RobinsonFoulds

nTrees <- 45L # For c. 1000 comparisons
nRepeats <- 10L
nPairs <- nTrees * (nTrees - 1) / 2

SPRWalk <- function (nTip) {
  set.seed(0L)
  sprWalk <- vector('list', nTrees)
  sprWalk[[1]] <- lastTree <- TreeTools::PectinateTree(nTip)

  for (i in seq_len(nTrees)[-1]) {
    sprWalk[[i]] <- lastTree <- TreeSearch::SPR(lastTree)
  }

  trees <- lapply(sprWalk, TreeTools::Preorder)
  trees <- lapply(trees, TreeTools::Cladewise)
  class(trees) <- 'multiPhylo'
  trees
}

Times <- function (trees) {
  splits <- TreeTools::as.Splits(trees)
  timings <- microbenchmark::microbenchmark(
    pid = SharedPhylogeneticInfo(splits, normalize = FALSE),
    msid = MatchingSplitInfo(splits, normalize = FALSE),
    cid = MutualClusteringInfo(splits, normalize = FALSE),
    nye = NyeSimilarity(splits, normalize = FALSE),
    jnc2 = JaccardRobinsonFoulds(splits, k = 2L, similarity = TRUE, allowConflict = FALSE, normalize = FALSE),
    jnc4 = JaccardRobinsonFoulds(splits, k = 4L, similarity = TRUE, allowConflict = FALSE, normalize = FALSE),
    jco2 = JaccardRobinsonFoulds(splits, k = 2L, similarity = TRUE, allowConflict = TRUE, normalize = FALSE),
    jco4 = JaccardRobinsonFoulds(splits, k = 4L, similarity = TRUE, allowConflict = TRUE, normalize = FALSE),
    ms = MatchingSplitDistance(splits),
    qd = Quartet::ManyToManyQuartetAgreement(trees),
    mast = TreeTools::PairwiseDistances(trees, MASTSize, rooted = FALSE),
    nni = TreeTools::PairwiseDistances(trees, NNIDist, 7L),
    spr = SPR.dist(trees),
    tbr = TBRDist(trees, exact = FALSE),
    rf = RF(trees),
    icrf = InfoRobinsonFoulds(splits),
    path = path.dist(trees),
    kc = KendallColijn(trees),
    es = KendallColijn(trees, Vector = SplitVector),

    times = nRepeats
  )
}

Plot <- function (timings, nTip) {
  par(cex = 0.8)
  boxplot(timings, border = TreeDistData::TreeDistCol(
    as.character(summary(timings)[, 'expr'])),
        xlab = 'Method', ylab = 'Total time elapsed / ms',
        main = paste(nTip, 'leaves'))
}

The time taken to compare all r nPairs pairs of trees by each method was calculated, replicating timing r nRepeats times.

The below are the results of a benchmarking exercise conducted on an r cpu$model_name machine with r as.character(benchmarkme:::print.ram(ram)) of RAM, comparing r nTrees trees obtained by performing successive subtree pruning and regrafting rearrangements on a pectinate starting tree.
The values reported in Smith [-@Smith2020] were calculated on a modest desktop computer, and will differ from those reported here, which will have been generated on the hardware system used to render the documentation.

Figure

times20 <- Times(SPRWalk(20))
Plot(times20, 20)

times50 <- Times(SPRWalk(50))
Plot(times50, 50)

Tabulation

results <- summary(times20, unit = 'us')
rownames(results) <- TreeDistData::tdMdAbbrevs[as.character(results[, 'expr'])]
results <- cbind(results[, 'mean', drop = FALSE],
                 summary(times50, unit = 'us')[, 'mean', drop = FALSE])

colnames(results) <- paste(c(20, 50), 'leaves')
twoSF <- formatC(as.matrix(signif(results[order(results[, 1]), ] / nPairs, 2)),
                 format = 'fg', digits = 2L)
TreeDistData::.TDDTable(DT::datatable, twoSF,
                        caption = "Mean time per comparison / µs")