kmeanspp_streaming.R
In TreeDist: Calculate and Map Distances Between Phylogenetic Trees

# Benchmark: streaming pmin vs apply-rebuild in KMeansPP initialisation
#
# Compares old (apply-rebuild) vs new (streaming pmin) seeding loop.
# Also verifies bit-identical output under fixed set.seed().

library(TreeDist)

# ---- helpers ----------------------------------------------------------------

old_seed_loop <- function(d, n, k, nstart) {
  best_centres <- NULL
  best_score <- Inf
  for (s in seq_len(nstart)) {
    centres <- c(sample.int(n, 1L), numeric(k - 1L))
    for (i in 2:k) {
      p <- apply(d[centres, , drop = FALSE], 2, min) ^ 2
      centres[i] <- sample.int(n, 1L, prob = p)
    }
    score <- sum(apply(d[centres, , drop = FALSE], 2, min) ^ 2)
    if (score < best_score) { best_score <- score; best_centres <- centres }
  }
  best_centres
}

new_seed_loop <- function(d, n, k, nstart) {
  best_centres <- NULL
  best_score <- Inf
  for (s in seq_len(nstart)) {
    centres <- integer(k)
    centres[1L] <- sample.int(n, 1L)
    min_d <- d[centres[1L], ]
    for (i in 2:k) {
      p <- min_d ^ 2
      centres[i] <- sample.int(n, 1L, prob = p)
      min_d <- pmin.int(min_d, d[centres[i], ])
    }
    score <- sum(min_d ^ 2)
    if (score < best_score) { best_score <- score; best_centres <- centres }
  }
  best_centres
}

# ---- identity check ---------------------------------------------------------

check_identity <- function(n, k, nstart, seed = 42L) {
  set.seed(seed); x <- matrix(rnorm(n * 5), n, 5)
  d <- as.matrix(dist(x))
  # old loop stores centres as double (c(sample.int, numeric(...))); new as
  # integer. Coerce both to integer before comparing.
  set.seed(seed + 1L); old <- as.integer(old_seed_loop(d, n, k, nstart))
  set.seed(seed + 1L); new <- as.integer(new_seed_loop(d, n, k, nstart))
  identical(old, new)
}

cat("Identity checks:\n")
for (cfg in list(c(200, 10), c(500, 20))) {
  n <- cfg[1]; k <- cfg[2]
  ok <- check_identity(n, k, nstart = 10L)
  cat(sprintf("  N=%d k=%d: %s\n", n, k, if (ok) "PASS" else "FAIL"))
}

# ---- benchmark --------------------------------------------------------------

run_bench <- function(n, k, nstart = 10L, times = 5L) {
  set.seed(99L); x <- matrix(rnorm(n * 5), n, 5)
  d <- as.matrix(dist(x))

  if (requireNamespace("microbenchmark", quietly = TRUE)) {
    mb <- microbenchmark::microbenchmark(
      old = { set.seed(1L); old_seed_loop(d, n, k, nstart) },
      new = { set.seed(1L); new_seed_loop(d, n, k, nstart) },
      times = times
    )
    med <- tapply(mb$time, mb$expr, median)
    ratio <- med[["old"]] / med[["new"]]
    cat(sprintf("  N=%d k=%d: old %.2f ms | new %.2f ms | ratio %.2fx\n",
                n, k,
                med[["old"]] / 1e6,
                med[["new"]] / 1e6,
                ratio))
  } else {
    t_old <- median(replicate(times, {
      set.seed(1L); system.time(old_seed_loop(d, n, k, nstart))["elapsed"]
    }))
    t_new <- median(replicate(times, {
      set.seed(1L); system.time(new_seed_loop(d, n, k, nstart))["elapsed"]
    }))
    ratio <- t_old / t_new
    cat(sprintf("  N=%d k=%d: old %.4f s | new %.4f s | ratio %.2fx\n",
                n, k, t_old, t_new, ratio))
  }
}

cat("\nBenchmark (seeding loop only, nstart=10):\n")
for (cfg in list(c(200, 10), c(500, 20))) {
  run_bench(n = cfg[1], k = cfg[2])
}

Any scripts or data that you put into this service are public.

TreeDist documentation built on June 10, 2026, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

TreeDist
Calculate and Map Distances Between Phylogenetic Trees

tests/benchmark/kmeanspp_streaming.R
In TreeDist: Calculate and Map Distances Between Phylogenetic Trees

Try the TreeDist package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

TreeDist Calculate and Map Distances Between Phylogenetic Trees

tests/benchmark/kmeanspp_streaming.R In TreeDist: Calculate and Map Distances Between Phylogenetic Trees

Try the TreeDist package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

TreeDist
Calculate and Map Distances Between Phylogenetic Trees

tests/benchmark/kmeanspp_streaming.R
In TreeDist: Calculate and Map Distances Between Phylogenetic Trees