tests/testthat/test-batch_coverage.R

## Coverage tests for batch C++ functions, fast paths, and cross-pairs.
##
## These tests target code paths introduced during the OpenMP / sort+merge /
## CostMatrix-pooling optimisation cycle that are not exercised by the
## existing test suite.

library("TreeTools", quietly = TRUE)

# Shared fixtures ----
tips20 <- paste0("t", seq_len(20))
tA <- ape::as.phylo(0:4, tipLabels = tips20)
tB <- ape::as.phylo(5:9, tipLabels = tips20)
trees20 <- ape::as.phylo(0:9, tipLabels = tips20)

tips8 <- paste0("t", seq_len(8))
tA8 <- ape::as.phylo(0:2, tipLabels = tips8)
tB8 <- ape::as.phylo(3:5, tipLabels = tips8)


# DifferentPhylogeneticInfo ----

test_that("DPI all-pairs fast path agrees with per-pair", {
  batch <- DifferentPhylogeneticInfo(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               DifferentPhylogeneticInfo(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
  expect_equal(m[5, 3],
               DifferentPhylogeneticInfo(trees20[[3]], trees20[[5]]),
               tolerance = 1e-10)
})

test_that("DPI cross-pairs fast path agrees with per-pair", {
  cross <- DifferentPhylogeneticInfo(tA, tB)
  expect_equal(dim(cross), c(5L, 5L))
  expect_equal(cross[1, 1],
               DifferentPhylogeneticInfo(tA[[1]], tB[[1]]),
               tolerance = 1e-10)
  expect_equal(cross[3, 2],
               DifferentPhylogeneticInfo(tA[[3]], tB[[2]]),
               tolerance = 1e-10)
})


# MatchingSplitInfoDistance ----

test_that("MSID all-pairs fast path agrees with per-pair", {
  batch <- MatchingSplitInfoDistance(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               MatchingSplitInfoDistance(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
  expect_equal(m[5, 3],
               MatchingSplitInfoDistance(trees20[[3]], trees20[[5]]),
               tolerance = 1e-10)
})

test_that("MSID cross-pairs fast path agrees with per-pair", {
  cross <- MatchingSplitInfoDistance(tA, tB)
  expect_equal(dim(cross), c(5L, 5L))
  expect_equal(cross[1, 1],
               MatchingSplitInfoDistance(tA[[1]], tB[[1]]),
               tolerance = 1e-10)
  expect_equal(cross[3, 2],
               MatchingSplitInfoDistance(tA[[3]], tB[[2]]),
               tolerance = 1e-10)
})


# InfoRobinsonFoulds (distance mode) ----

test_that("IRF all-pairs fast path agrees with per-pair", {
  batch <- InfoRobinsonFoulds(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               InfoRobinsonFoulds(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
  expect_equal(m[5, 3],
               InfoRobinsonFoulds(trees20[[3]], trees20[[5]]),
               tolerance = 1e-10)
})

test_that("IRF cross-pairs fast path agrees with per-pair", {
  cross <- InfoRobinsonFoulds(tA, tB)
  expect_equal(dim(cross), c(5L, 5L))
  expect_equal(cross[1, 1],
               InfoRobinsonFoulds(tA[[1]], tB[[1]]),
               tolerance = 1e-10)
  expect_equal(cross[3, 2],
               InfoRobinsonFoulds(tA[[3]], tB[[2]]),
               tolerance = 1e-10)
})


# ClusteringInfoDistance (all-pairs already tested; add cross-pairs variety) ----

test_that("CID cross-pairs with different-sized collections", {
  tA3 <- ape::as.phylo(0:2, tipLabels = tips20)
  tB7 <- ape::as.phylo(3:9, tipLabels = tips20)
  cross <- ClusteringInfoDistance(tA3, tB7)
  expect_equal(dim(cross), c(3L, 7L))
  expect_equal(cross[2, 5],
               ClusteringInfoDistance(tA3[[2]], tB7[[5]]),
               tolerance = 1e-10)
})


# MSD cross-pairs via CalculateTreeDistance → .SplitDistanceManyMany ----

test_that("MSD cross-pairs batch agrees with per-pair", {
  cross <- MatchingSplitDistance(tA8, tB8)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1],
               MatchingSplitDistance(tA8[[1]], tB8[[1]]),
               tolerance = 1e-10)
  expect_equal(cross[2, 3],
               MatchingSplitDistance(tA8[[2]], tB8[[3]]),
               tolerance = 1e-10)
})


# Nye / Jaccard cross-pairs via CalculateTreeDistance → .SplitDistanceManyMany ----

test_that("NyeSimilarity cross-pairs batch agrees with per-pair", {
  cross <- NyeSimilarity(tA8, tB8)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1],
               NyeSimilarity(tA8[[1]], tB8[[1]]),
               tolerance = 1e-10)
})

test_that("JRF cross-pairs batch with k and allowConflict", {
  # Default k=1, allowConflict=TRUE
  cross1 <- JaccardRobinsonFoulds(tA8, tB8)
  expect_equal(dim(cross1), c(3L, 3L))
  expect_equal(cross1[2, 1],
               JaccardRobinsonFoulds(tA8[[2]], tB8[[1]]),
               tolerance = 1e-10)

  # k = 2
  cross_k2 <- JaccardRobinsonFoulds(tA8, tB8, k = 2)
  expect_equal(cross_k2[1, 1],
               JaccardRobinsonFoulds(tA8[[1]], tB8[[1]], k = 2),
               tolerance = 1e-10)

  # allowConflict = FALSE
  cross_nc <- JaccardRobinsonFoulds(tA8, tB8, allowConflict = FALSE)
  expect_equal(cross_nc[1, 1],
               JaccardRobinsonFoulds(tA8[[1]], tB8[[1]],
                                     allowConflict = FALSE),
               tolerance = 1e-10)

  # k = Inf
  cross_inf <- JaccardRobinsonFoulds(tA8, tB8, k = Inf)
  expect_equal(cross_inf[2, 2],
               JaccardRobinsonFoulds(tA8[[2]], tB8[[2]], k = Inf),
               tolerance = 1e-10)
})


# MSI cross-pairs via CalculateTreeDistance → .SplitDistanceManyMany ----

test_that("MSI cross-pairs batch agrees with per-pair", {
  cross <- MatchingSplitInfo(tA8, tB8)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1],
               MatchingSplitInfo(tA8[[1]], tB8[[1]]),
               tolerance = 1e-10)
})

# SPI cross-pairs via CalculateTreeDistance → .SplitDistanceManyMany ----

test_that("SPI cross-pairs batch agrees with per-pair", {
  cross <- SharedPhylogeneticInfo(tA8, tB8)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1],
               SharedPhylogeneticInfo(tA8[[1]], tB8[[1]]),
               tolerance = 1e-10)
})


# C++ batch entropy/info functions ----

test_that("cpp_clustering_entropy_batch matches R ClusteringEntropy", {
  trees <- ape::as.phylo(0:4, tipLabels = tips20)
  splits_list <- as.Splits(trees)
  nTip <- length(tips20)
  batch <- TreeDist:::cpp_clustering_entropy_batch(splits_list, as.integer(nTip))
  r_ref <- vapply(trees, ClusteringEntropy, double(1))
  expect_equal(batch, unname(r_ref), tolerance = 1e-12)
})

test_that("cpp_splitwise_info_batch matches R SplitwiseInfo", {
  trees <- ape::as.phylo(0:4, tipLabels = tips20)
  splits_list <- as.Splits(trees)
  nTip <- length(tips20)
  batch <- TreeDist:::cpp_splitwise_info_batch(splits_list, as.integer(nTip))
  r_ref <- vapply(trees, SplitwiseInfo, double(1))
  expect_equal(batch, unname(r_ref), tolerance = 1e-8)
})

test_that("cpp_clustering_entropy_batch: polytomy trees", {
  # Trees with a single non-trivial split (polytomy)
  poly1 <- ape::read.tree(text = "((t1,t2,t3),(t4,t5,t6));")
  poly2 <- ape::read.tree(text = "((t1,t2),(t3,t4,t5,t6));")
  trees <- structure(list(poly1, poly2), class = "multiPhylo")
  splits_list <- as.Splits(trees)
  nTip <- 6L
  batch <- TreeDist:::cpp_clustering_entropy_batch(splits_list, nTip)
  r_ref <- vapply(trees, ClusteringEntropy, double(1))
  expect_equal(batch, unname(r_ref), tolerance = 1e-12)
})

test_that("cpp_splitwise_info_batch: polytomy trees", {
  poly1 <- ape::read.tree(text = "((t1,t2,t3),(t4,t5,t6));")
  poly2 <- ape::read.tree(text = "((t1,t2),(t3,t4,t5,t6));")
  trees <- structure(list(poly1, poly2), class = "multiPhylo")
  splits_list <- as.Splits(trees)
  nTip <- 6L
  batch <- TreeDist:::cpp_splitwise_info_batch(splits_list, nTip)
  r_ref <- vapply(trees, SplitwiseInfo, double(1))
  expect_equal(batch, unname(r_ref), tolerance = 1e-8)
})


# KendallColijn batch paths ----

test_that("KC all-pairs uses pair_diff_euclidean", {
  trees <- ape::as.phylo(0:4, tipLabels = tips8)
  kc <- KendallColijn(trees)
  expect_s3_class(kc, "dist")
  expect_equal(attr(kc, "Size"), 5L)
  m <- as.matrix(kc)
  expect_equal(m[2, 1], KendallColijn(trees[[1]], trees[[2]]),
               tolerance = 1e-10)
  expect_equal(m[4, 3], KendallColijn(trees[[3]], trees[[4]]),
               tolerance = 1e-10)
})

test_that("KC cross-pairs uses vec_diff_euclidean", {
  cross <- KendallColijn(tA8, tB8)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1], KendallColijn(tA8[[1]], tB8[[1]]),
               tolerance = 1e-10)
  expect_equal(cross[2, 3], KendallColijn(tA8[[2]], tB8[[3]]),
               tolerance = 1e-10)
})

test_that("KC all-pairs with SplitVector", {
  trees <- ape::as.phylo(0:3, tipLabels = tips8)
  kc <- KendallColijn(trees, Vector = SplitVector)
  expect_s3_class(kc, "dist")
  m <- as.matrix(kc)
  expect_equal(m[2, 1],
               KendallColijn(trees[[1]], trees[[2]], Vector = SplitVector),
               tolerance = 1e-10)
})

test_that("KC cross-pairs with SplitVector", {
  cross <- KendallColijn(tA8, tB8, Vector = SplitVector)
  expect_equal(dim(cross), c(3L, 3L))
  expect_equal(cross[1, 1],
               KendallColijn(tA8[[1]], tB8[[1]], Vector = SplitVector),
               tolerance = 1e-10)
})


# Large-tree cross-pairs (multi-bin >64 tips) ----

test_that("Cross-pairs with >64-tip trees exercise multi-bin path", {
  skip_on_cran()
  tips100 <- paste0("t", seq_len(100))
  tA100 <- ape::as.phylo(0:2, tipLabels = tips100)
  tB100 <- ape::as.phylo(3:5, tipLabels = tips100)

  # CID cross-pairs
  cid <- ClusteringInfoDistance(tA100, tB100)
  expect_equal(dim(cid), c(3L, 3L))
  expect_equal(cid[1, 1],
               ClusteringInfoDistance(tA100[[1]], tB100[[1]]),
               tolerance = 1e-10)

  # MSD cross-pairs
  msd <- MatchingSplitDistance(tA100, tB100)
  expect_equal(dim(msd), c(3L, 3L))
  expect_equal(msd[1, 1],
               MatchingSplitDistance(tA100[[1]], tB100[[1]]),
               tolerance = 1e-10)

  # DPI cross-pairs
  dpi <- DifferentPhylogeneticInfo(tA100, tB100)
  expect_equal(dim(dpi), c(3L, 3L))
  expect_equal(dpi[1, 1],
               DifferentPhylogeneticInfo(tA100[[1]], tB100[[1]]),
               tolerance = 1e-10)
})

test_that("All-pairs entropy batch with >64-tip trees", {
  skip_on_cran()
  tips100 <- paste0("t", seq_len(100))
  trees100 <- ape::as.phylo(0:2, tipLabels = tips100)
  splits_list <- as.Splits(trees100)

  ce <- TreeDist:::cpp_clustering_entropy_batch(splits_list, 100L)
  ce_ref <- vapply(trees100, ClusteringEntropy, double(1))
  expect_equal(ce, unname(ce_ref), tolerance = 1e-12)

  si <- TreeDist:::cpp_splitwise_info_batch(splits_list, 100L)
  si_ref <- vapply(trees100, SplitwiseInfo, double(1))
  expect_equal(si, unname(si_ref), tolerance = 1e-8)
})


# SPI/MSI all-pairs batch dispatch via .SplitDistanceAllPairs ----

test_that("SPI all-pairs batch agrees with per-pair", {
  batch <- SharedPhylogeneticInfo(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               SharedPhylogeneticInfo(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
})

test_that("MSI all-pairs batch agrees with per-pair", {
  batch <- MatchingSplitInfo(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               MatchingSplitInfo(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
})


# Nye all-pairs batch dispatch ----

test_that("Nye all-pairs batch agrees with per-pair", {
  batch <- NyeSimilarity(trees20)
  m <- as.matrix(batch)
  expect_equal(m[2, 1],
               NyeSimilarity(trees20[[1]], trees20[[2]]),
               tolerance = 1e-10)
})


# Regression test for issue #162 ----
# MutualClusteringInfo matching was not globally optimal for larger trees,
# causing ClusteringInfoDistance to disagree with manual calculation.
# Fixed in v2.11.0 (PR #163).  This test verifies the fix holds in the
# batch path as well as the per-pair path.

test_that("Issue #162: CID by-hand matches function for 33-taxon trees", {
  tree1 <- ape::read.tree(text = "(B,A,((AG,AF),((((C,(E,D)),((F,G),H)),((K,I),J)),((Q,R),((((AE,(AC,AD)),(AB,(N,(P,O)))),(((Y,(Z,AA)),(W,X)),(V,(T,U)))),((M,S),L))))));")
  tree2 <- ape::read.tree(text = "(B,A,((AG,AF),((((C,(E,D)),((F,G),H)),(((Q,R),((((AE,(AC,AD)),AB),(N,(P,O))),((Y,(Z,AA)),((V,(T,U)),(W,X))))),((M,L),S))),((K,I),J))));")

  # Per-pair path: reported matching score must equal MCI value

  mci <- MutualClusteringInfo(tree1, tree2, reportMatching = TRUE)
  expect_equal(sum(attr(mci, "matchedScores")), mci[[1]], tolerance = 1e-10)

  # Per-pair path: CID by hand must equal CID function
  h <- ClusteringEntropy(tree1) + ClusteringEntropy(tree2)
  d_fn <- ClusteringInfoDistance(tree1, tree2, normalize = TRUE)
  d_hand <- (h - 2 * mci[[1]]) / h
  expect_equal(d_fn, d_hand, tolerance = 1e-10)
})

# RobinsonFoulds cross-pairs (Day 1985 ClusterTable batch) ----

test_that("RF cross-pairs fast path agrees with per-pair", {
  cross <- RobinsonFoulds(tA, tB)
  expect_equal(dim(cross), c(5L, 5L))
  expect_equal(cross[1, 1],
               RobinsonFoulds(tA[[1]], tB[[1]]),
               tolerance = 0)
  expect_equal(cross[3, 2],
               RobinsonFoulds(tA[[3]], tB[[2]]),
               tolerance = 0)
  expect_equal(cross[5, 5],
               RobinsonFoulds(tA[[5]], tB[[5]]),
               tolerance = 0)
})

test_that("RF cross-pairs similarity mode agrees with per-pair", {
  cross_sim <- RobinsonFoulds(tA, tB, similarity = TRUE)
  expect_equal(cross_sim[2, 3],
               RobinsonFoulds(tA[[2]], tB[[3]], similarity = TRUE),
               tolerance = 0)
})

test_that("RF cross-pairs normalization agrees with per-pair", {
  cross_norm <- RobinsonFoulds(tA, tB, normalize = TRUE)
  expect_equal(cross_norm[1, 4],
               RobinsonFoulds(tA[[1]], tB[[4]], normalize = TRUE),
               tolerance = 1e-10)
})

test_that("RF cross-pairs handles single-tree inputs via fallback", {
  # Single tree1 → falls back to CalculateTreeDistance
  d <- RobinsonFoulds(tA[[1]], tB)
  expect_equal(length(d), 5L)
  expect_equal(d[1], RobinsonFoulds(tA[[1]], tB[[1]]), tolerance = 0)
})

test_that("RF cross-pairs all-pairs matches cross-pairs diagonal", {
  all_dist <- as.matrix(RobinsonFoulds(trees20))
  # Cross-pairs of trees20 with itself should match all-pairs
  cross <- RobinsonFoulds(trees20, trees20)
  expect_equal(dim(cross), c(10L, 10L))
  for (i in 1:10) {
    for (j in 1:10) {
      expect_equal(cross[i, j], all_dist[i, j], tolerance = 0,
                   info = paste0("i=", i, " j=", j))
    }
  }
})

test_that("RF batch paths use heap allocation for large trees (>8192 tips)", {
  skip_on_cran()
  n <- 8193L
  tr <- list(TreeTools::RandomTree(n), TreeTools::RandomTree(n))
  class(tr) <- "multiPhylo"

  # All-pairs (hits S_heap resize in robinson_foulds_all_pairs)
  ap <- RobinsonFoulds(tr)
  expect_length(ap, 1L)
  expect_gte(ap, 0)

  # Cross-pairs (hits S_heap resize in robinson_foulds_cross_pairs)
  cp <- RobinsonFoulds(tr[1], tr[2])
  expect_equal(cp[1, 1], as.matrix(ap)[1, 2])
})

test_that("Issue #162: batch path agrees with per-pair for 33-taxon trees", {
  tree1 <- ape::read.tree(text = "(B,A,((AG,AF),((((C,(E,D)),((F,G),H)),((K,I),J)),((Q,R),((((AE,(AC,AD)),(AB,(N,(P,O)))),(((Y,(Z,AA)),(W,X)),(V,(T,U)))),((M,S),L))))));")
  tree2 <- ape::read.tree(text = "(B,A,((AG,AF),((((C,(E,D)),((F,G),H)),(((Q,R),((((AE,(AC,AD)),AB),(N,(P,O))),((Y,(Z,AA)),((V,(T,U)),(W,X))))),((M,L),S))),((K,I),J))));")
  tree3 <- PectinateTree(tree1[["tip.label"]])
  trees <- structure(list(tree1, tree2, tree3), class = "multiPhylo")

  # Batch all-pairs via fast path
  batch_cid <- ClusteringInfoDistance(trees)
  m <- as.matrix(batch_cid)

  # All three pairwise CIDs must match per-pair computation
  expect_equal(m[2, 1], ClusteringInfoDistance(tree1, tree2), tolerance = 1e-10)
  expect_equal(m[3, 1], ClusteringInfoDistance(tree1, tree3), tolerance = 1e-10)
  expect_equal(m[3, 2], ClusteringInfoDistance(tree2, tree3), tolerance = 1e-10)
})

Try the TreeDist package in your browser

Any scripts or data that you put into this service are public.

TreeDist documentation built on June 10, 2026, 5:06 p.m.