tests/testthat/test-build_clusters.R

testthat::skip_on_cran()

# ==============================================================================
# Tests for build_clusters()
# ==============================================================================

# ---- Test data ----

make_test_data <- function(n = 50, k = 26, n_states = 4, seed = 42) {
  set.seed(seed)
  states <- LETTERS[seq_len(n_states)]
  mat <- matrix(sample(states, n * k, replace = TRUE), nrow = n, ncol = k)
  colnames(mat) <- paste0("T", seq_len(k))
  as.data.frame(mat, stringsAsFactors = FALSE)
}

make_data_with_na <- function(n = 30, k = 20, n_states = 3, seed = 42) {
  set.seed(seed)
  states <- LETTERS[seq_len(n_states)]
  mat <- matrix(sample(states, n * k, replace = TRUE), nrow = n, ncol = k)
  # Add void markers to last few columns
  for (i in seq_len(n)) {
    trail_start <- sample(k - 5, 1) + 5
    if (trail_start <= k) mat[i, trail_start:k] <- "%"
  }
  # Add some mid-sequence NAs
  mat[sample(n * k, 10)] <- "*"
  colnames(mat) <- paste0("T", seq_len(k))
  as.data.frame(mat, stringsAsFactors = FALSE)
}

# ==============================================================================
# 1. Input validation
# ==============================================================================

test_that("build_clusters validates inputs", {
  df <- make_test_data(n = 20, k = 10)
  expect_error(build_clusters(df, k = 1), "k >= 2")
  expect_error(build_clusters(df, k = 20), "k <= n - 1")
  expect_error(build_clusters(df, k = 2, dissimilarity = "invalid"))
  expect_error(build_clusters(df, k = 2, method = "invalid"))
  expect_error(build_clusters(df, k = 2, dissimilarity = "lv", weighted = TRUE),
               "Weighting is only supported")
  expect_error(build_clusters("not a df", k = 2))
})

# ==============================================================================
# 2. Basic clustering works for all metrics
# ==============================================================================

test_that("build_clusters runs for all 9 metrics", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  for (metric in c("hamming", "osa", "lv", "dl", "lcs",
                    "qgram", "cosine", "jaccard", "jw")) {
    cl <- build_clusters(df, k = 3, dissimilarity = metric)
    expect_true(inherits(cl, "net_clustering"), info = metric)
    expect_equal(cl$k, 3L, info = metric)
    expect_equal(sum(cl$sizes), 30L, info = metric)
    expect_length(cl$assignments, 30L)
    expect_true(inherits(cl$distance, "dist"), info = metric)
    expect_true(is.numeric(cl$silhouette), info = metric)
  }
})

# ==============================================================================
# 3. Clustering methods
# ==============================================================================

test_that("PAM returns medoids", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 3, method = "pam")
  expect_false(is.null(cl$medoids))
  expect_length(cl$medoids, 3L)
  expect_true(all(cl$medoids %in% seq_len(20)))
})

test_that("hierarchical methods work", {
  df <- make_test_data(n = 20, k = 10)
  for (m in c("ward.D2", "ward.D", "complete", "average", "single")) {
    cl <- build_clusters(df, k = 3, method = m)
    expect_true(inherits(cl, "net_clustering"), info = m)
    expect_true(is.null(cl$medoids), info = m)
    expect_equal(sum(cl$sizes), 20L, info = m)
  }
})

# ==============================================================================
# 4. Weighted Hamming
# ==============================================================================

test_that("weighted Hamming produces different distances than unweighted", {
  df <- make_test_data(n = 20, k = 10)
  cl_uw <- build_clusters(df, k = 2, dissimilarity = "hamming", weighted = FALSE)
  cl_w <- build_clusters(df, k = 2, dissimilarity = "hamming",
                       weighted = TRUE, lambda = 1)
  # Distance matrices should differ
  d_uw <- as.matrix(cl_uw$distance)
  d_w <- as.matrix(cl_w$distance)
  expect_false(isTRUE(all.equal(d_uw, d_w)))
  # Weighted distances should be <= unweighted (weights <= 1)
  expect_true(all(d_w <= d_uw + 1e-10))
})

test_that("lambda = 0 weighted matches unweighted", {
  df <- make_test_data(n = 20, k = 10)
  cl_uw <- build_clusters(df, k = 2, dissimilarity = "hamming", weighted = FALSE)
  # weighted = TRUE with lambda = 0 should give same distances
  # (lambda forced to 0 when weighted = FALSE, but lambda = 0 means uniform)
  # Actually testing: explicit lambda = 0 with weighted = TRUE
  # In our impl: lambda = 0 → weights = exp(0 * ...) / max = rep(1)
  # But the function sets lambda <- if (weighted) lambda else 0
  # With weighted = TRUE, lambda = 0 → weights = 1
  cl_w0 <- build_clusters(df, k = 2, dissimilarity = "hamming",
                        weighted = TRUE, lambda = 0)
  d_uw <- as.matrix(cl_uw$distance)
  d_w0 <- as.matrix(cl_w0$distance)
  expect_equal(d_uw, d_w0, tolerance = 1e-10)
})

# ==============================================================================
# 5. NA / void marker handling
# ==============================================================================

test_that("na_syms are treated as missing", {
  df <- make_data_with_na(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  expect_s3_class(cl, "net_clustering")
  expect_equal(sum(cl$sizes), 20L)
})

test_that("custom na_syms work", {
  df <- make_test_data(n = 20, k = 10, n_states = 3)
  # Replace some values with custom NA symbol
  df[1:5, 8:10] <- "MISSING"
  cl <- build_clusters(df, k = 2, na_syms = c("*", "%", "MISSING"))
  expect_s3_class(cl, "net_clustering")
})

# ==============================================================================
# 6. Seed reproducibility
# ==============================================================================

test_that("seed produces reproducible results", {
  df <- make_test_data(n = 30, k = 10)
  cl1 <- build_clusters(df, k = 3, seed = 123)
  cl2 <- build_clusters(df, k = 3, seed = 123)
  expect_equal(cl1$assignments, cl2$assignments)
  expect_equal(as.matrix(cl1$distance), as.matrix(cl2$distance))
})

# ==============================================================================
# 7. S3 methods
# ==============================================================================

test_that("print.net_clustering works", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  out <- capture.output(print(cl))
  expect_true(any(grepl("Sequence Clustering", out)))
  expect_true(any(grepl("pam", out)))
  expect_true(any(grepl("hamming", out)))
  expect_true(any(grepl("Silhouette", out)))
})

test_that("summary.net_clustering works", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  stats <- capture.output(res <- summary(cl))
  expect_true(is.data.frame(res))
  expect_equal(nrow(res), 2L)
  expect_true("size" %in% names(res))
  expect_true("mean_within_dist" %in% names(res))
})

test_that("plot.net_clustering silhouette works", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  p <- plot(cl, type = "silhouette")
  expect_s3_class(p, "ggplot")
})

test_that("plot.net_clustering mds works", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  p <- plot(cl, type = "mds")
  expect_s3_class(p, "ggplot")
})

test_that("plot.net_clustering heatmap works", {
  df <- make_test_data(n = 20, k = 10)
  cl <- build_clusters(df, k = 2)
  p <- plot(cl, type = "heatmap")
  expect_s3_class(p, "ggplot")
})

# ==============================================================================
# 9. Cross-validation against tna
# ==============================================================================

test_that("distance matrices match tna for metrics with matching implementations", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:50, ]

  # Only compare metrics where tna and stringdist agree.
  # tna's own C implementations of osa/lv/dl/lcs/jw differ from
  # stringdist (which is the reference implementation). Confirmed:
  # stringdist::stringdist("acfhicbc", tna3, method="lv") matches
  # our result (21) while tna:::levenshtein_dist gives 18.
  for (metric in c("hamming", "qgram", "cosine", "jaccard")) {
    tna_r <- tna::cluster_data(data, k = 2, dissimilarity = metric, q = 2)
    our_r <- build_clusters(data, k = 2, dissimilarity = metric, q = 2L)
    expect_equal(
      as.matrix(our_r$distance), as.matrix(tna_r$distance),
      tolerance = 1e-10, info = metric
    )
  }
})

test_that("weighted hamming matches tna (lambda = 0.5)", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:50, ]

  tna_r <- tna::cluster_data(data, k = 2, dissimilarity = "hamming",
                             weighted = TRUE, lambda = 0.5)
  our_r <- build_clusters(data, k = 2, dissimilarity = "hamming",
                        weighted = TRUE, lambda = 0.5)
  expect_equal(
    as.matrix(our_r$distance), as.matrix(tna_r$distance),
    tolerance = 1e-10
  )
})

test_that("weighted hamming matches tna (lambda = 2.0)", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:50, ]

  tna_r <- tna::cluster_data(data, k = 2, dissimilarity = "hamming",
                             weighted = TRUE, lambda = 2.0)
  our_r <- build_clusters(data, k = 2, dissimilarity = "hamming",
                        weighted = TRUE, lambda = 2.0)
  expect_equal(
    as.matrix(our_r$distance), as.matrix(tna_r$distance),
    tolerance = 1e-10
  )
})

test_that("cluster assignments match tna for PAM", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:50, ]

  tna_r <- tna::cluster_data(data, k = 3, dissimilarity = "hamming")
  our_r <- build_clusters(data, k = 3, dissimilarity = "hamming")

  # Distance matrices must match exactly
  expect_equal(
    as.matrix(our_r$distance), as.matrix(tna_r$distance),
    tolerance = 1e-10
  )
  # PAM is deterministic on same distance matrix → same assignments
  expect_equal(our_r$assignments, tna_r$assignments)
  expect_equal(our_r$silhouette, tna_r$silhouette, tolerance = 1e-10)
})

test_that("cluster assignments match tna for hclust methods", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:50, ]

  for (m in c("complete", "average")) {
    tna_r <- tna::cluster_data(data, k = 3, dissimilarity = "hamming",
                               method = m)
    our_r <- build_clusters(data, k = 3, dissimilarity = "hamming",
                          method = m)
    expect_equal(
      as.matrix(our_r$distance), as.matrix(tna_r$distance),
      tolerance = 1e-10, info = m
    )
    expect_equal(our_r$assignments, tna_r$assignments, info = m)
  }
})

# ==============================================================================
# 10. R fallback vs stringdist consistency
# ==============================================================================

test_that("R fallback matches stringdist for all applicable metrics", {
  skip_if_not_installed("stringdist")
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  enc <- Nestimate:::.encode_sequences(df, c("*", "%"))

  for (metric in c("hamming", "osa", "lv", "dl", "lcs",
                    "qgram", "cosine", "jaccard", "jw")) {
    d_r <- as.matrix(
      Nestimate:::.dissimilarity_matrix_r(enc, metric, lambda = 0, q = 2L, p = 0.1)
    )
    d_sd <- as.matrix(
      Nestimate:::.dissimilarity_matrix_stringdist(enc, metric, lambda = 0, q = 2L, p = 0.1)
    )
    expect_equal(d_r, d_sd, tolerance = 1e-10, info = metric)
  }
})

# ==============================================================================
# 11. Edge cases
# ==============================================================================

test_that("single state data works", {
  df <- data.frame(T1 = rep("A", 10), T2 = rep("A", 10), T3 = rep("A", 10))
  cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
  # All distances should be 0
  expect_true(all(as.matrix(cl$distance) == 0))
})

test_that("two-state binary data works", {
  set.seed(1)
  df <- data.frame(
    T1 = sample(c("X", "Y"), 20, replace = TRUE),
    T2 = sample(c("X", "Y"), 20, replace = TRUE),
    T3 = sample(c("X", "Y"), 20, replace = TRUE)
  )
  cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
  expect_s3_class(cl, "net_clustering")
})

test_that("data with all trailing NAs handled", {
  df <- data.frame(
    T1 = c("A", "B", "A", "B", "C"),
    T2 = c("B", "A", "C", "A", "A"),
    T3 = c("%", "%", "%", "%", "%"),
    T4 = c("%", "%", "%", "%", "%")
  )
  cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
  expect_s3_class(cl, "net_clustering")
  expect_equal(sum(cl$sizes), 5L)
})

test_that("matrix input works", {
  mat <- matrix(sample(LETTERS[1:3], 30, replace = TRUE), nrow = 10)
  colnames(mat) <- paste0("T", 1:3)
  cl <- build_clusters(mat, k = 2)
  expect_s3_class(cl, "net_clustering")
  expect_equal(sum(cl$sizes), 10L)
})

test_that("q-gram parameter affects result", {
  df <- make_test_data(n = 20, k = 10, n_states = 4)
  cl_q2 <- build_clusters(df, k = 2, dissimilarity = "qgram", q = 2L)
  cl_q3 <- build_clusters(df, k = 2, dissimilarity = "qgram", q = 3L)
  # Different q should give different distance matrices
  expect_false(isTRUE(all.equal(
    as.matrix(cl_q2$distance), as.matrix(cl_q3$distance)
  )))
})

test_that("p parameter affects jw result", {
  df <- make_test_data(n = 20, k = 10, n_states = 4)
  cl_p1 <- build_clusters(df, k = 2, dissimilarity = "jw", p = 0.1)
  cl_p2 <- build_clusters(df, k = 2, dissimilarity = "jw", p = 0.2)
  # Different p should give different distance matrices
  expect_false(isTRUE(all.equal(
    as.matrix(cl_p1$distance), as.matrix(cl_p2$distance)
  )))
})

# ==============================================================================
# 12. Encoding tests
# ==============================================================================

test_that(".encode_sequences handles na_syms correctly", {
  df <- data.frame(T1 = c("A", "*", "B"), T2 = c("B", "A", "%"))
  enc <- Nestimate:::.encode_sequences(df, na_syms = c("*", "%"))
  # df row 1 = c("A", "B"), row 2 = c("*", "A"), row 3 = c("B", "%")
  expect_true(is.na(enc$int_mat[2, 1]))  # "*" → NA
  expect_true(is.na(enc$int_mat[3, 2]))  # "%" → NA
  expect_equal(enc$states, c("A", "B"))
  expect_equal(enc$n_states, 2L)
  expect_equal(enc$len, c(2L, 2L, 1L))  # row 3: last obs at col 1
})

test_that(".encode_sequences integer encoding is correct", {
  df <- data.frame(T1 = c("C", "A", "B"), T2 = c("A", "B", "C"))
  enc <- Nestimate:::.encode_sequences(df, na_syms = c("*", "%"))
  # States sorted: A=1, B=2, C=3
  expect_equal(enc$states, c("A", "B", "C"))
  expect_equal(enc$int_mat[1, ], c(3L, 1L))  # C=3, A=1
  expect_equal(enc$int_mat[2, ], c(1L, 2L))  # A=1, B=2
  expect_equal(enc$int_mat[3, ], c(2L, 3L))  # B=2, C=3
})

# ==============================================================================
# 13. Individual distance function tests
# ==============================================================================

test_that("hamming distance is correct", {
  # Same sequence → 0
  expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(1,2,3), 3, 3, 1), 0)
  # All different → length
  expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(4,5,6), 3, 3, 1), 3)
  # One difference
  expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(1,2,4), 3, 3, 1), 1)
})

test_that("hamming distance with weights is correct", {
  w <- exp(-1 * c(0, 1, 2))
  w <- w / max(w)
  # All different: sum of weights
  expect_equal(
    Nestimate:::.hamming_dist_r(c(1,2,3), c(4,5,6), 3, 3, w),
    sum(w)
  )
  # First position different only
  expect_equal(
    Nestimate:::.hamming_dist_r(c(1,2,3), c(4,2,3), 3, 3, w),
    w[1]
  )
})

test_that("levenshtein distance is correct", {
  # kitten → sitting = 3
  x <- c(1, 2, 3, 3, 4, 5)  # k i t t e n
  y <- c(6, 2, 3, 3, 2, 5, 7)  # s i t t i n g
  expect_equal(Nestimate:::.levenshtein_dist_r(x, y, 6, 7), 3)
  # Same → 0
  expect_equal(Nestimate:::.levenshtein_dist_r(c(1,2), c(1,2), 2, 2), 0)
  # Empty vs non-empty
  expect_equal(Nestimate:::.levenshtein_dist_r(c(1,2,3), c(1,2,3), 0, 3), 3)
})

test_that("lcs distance is correct", {
  # lcs of (1,2,3,4) and (2,4,3) → lcs length = 2 (2,3 or 2,4)
  # distance = 4 + 3 - 2*2 = 3
  expect_equal(Nestimate:::.lcs_dist_r(c(1,2,3,4), c(2,4,3), 4, 3), 3)
  # Same → 0
  expect_equal(Nestimate:::.lcs_dist_r(c(1,2,3), c(1,2,3), 3, 3), 0)
})

# ==============================================================================
# 14. Input extraction tests
# ==============================================================================

test_that("build_clusters works on netobject from sequence methods", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  net <- build_network(df, method = "relative")
  cl_net <- build_clusters(net, k = 3)
  cl_df <- build_clusters(df, k = 3)
  expect_s3_class(cl_net, "net_clustering")
  expect_equal(as.matrix(cl_net$distance), as.matrix(cl_df$distance),
               tolerance = 1e-10)
})

test_that("build_clusters works on frequency netobject", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  net <- build_network(df, method = "frequency")
  cl <- build_clusters(net, k = 3)
  expect_s3_class(cl, "net_clustering")
  expect_equal(sum(cl$sizes), 30L)
})

test_that("build_clusters rejects association-method netobjects", {
  set.seed(42)
  ndf <- data.frame(matrix(rnorm(100), 20, 5))
  colnames(ndf) <- paste0("V", 1:5)
  net <- build_network(ndf, method = "cor")
  expect_error(build_clusters(net, k = 2), "sequence data")
})

test_that("build_clusters works on tna model", {
  skip_if_not_installed("tna")
  data <- tna::group_regulation[1:30, ]
  model <- tna::tna(data)
  cl_tna <- build_clusters(model, k = 2)
  cl_df <- build_clusters(data, k = 2)
  expect_s3_class(cl_tna, "net_clustering")
  expect_equal(as.matrix(cl_tna$distance), as.matrix(cl_df$distance),
               tolerance = 1e-10)
})

test_that("build_clusters works on cograph_network", {
  skip_if_not_installed("cograph")
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  # Build a mock cograph_network with $data
  cg <- structure(
    list(data = df, weights = matrix(0, 4, 4), directed = TRUE),
    class = c("cograph_network", "list")
  )
  cl_cg <- build_clusters(cg, k = 3)
  cl_df <- build_clusters(df, k = 3)
  expect_s3_class(cl_cg, "net_clustering")
  expect_equal(as.matrix(cl_cg$distance), as.matrix(cl_df$distance),
               tolerance = 1e-10)
})

# ==============================================================================
# 15. build_network dispatch for net_clustering
# ==============================================================================

test_that("build_network dispatches on net_clustering", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  cl <- build_clusters(df, k = 3)
  grp <- build_network(cl)
  expect_s3_class(grp, "netobject_group")
  expect_length(grp, 3L)
  expect_equal(names(grp), paste("Cluster", 1:3))
  # Each sub-network should be a netobject
  for (nm in names(grp)) {
    expect_s3_class(grp[[nm]], "netobject")
  }
})

test_that("build_network(clustering) default method is relative", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  cl <- build_clusters(df, k = 3)
  grp <- build_network(cl)
  expect_equal(grp[[1]]$method, "relative")
})

test_that("build_network(clustering) respects method arg", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  cl <- build_clusters(df, k = 3)
  grp <- build_network(cl, method = "frequency")
  expect_equal(grp[[1]]$method, "frequency")
})

test_that("build_network(clustering) sub-networks have correct sizes", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  cl <- build_clusters(df, k = 3)
  grp <- build_network(cl)
  total_seqs <- sum(vapply(grp, function(net) nrow(net$data), integer(1L)))
  expect_equal(total_seqs, 30L)
  # Sizes match clustering sizes
  for (i in seq_along(grp)) {
    expect_equal(nrow(grp[[i]]$data), unname(cl$sizes[i]))
  }
})

test_that("build_network(clustering) stores clustering metadata", {
  df <- make_test_data(n = 30, k = 10, n_states = 4)
  cl <- build_clusters(df, k = 3)
  grp <- build_network(cl)
  expect_s3_class(attr(grp, "clustering"), "net_clustering")
  expect_equal(attr(grp, "clustering")$k, 3L)
})

# ==============================================================================
# 16. Covariate analysis — input forms
# ==============================================================================

make_cov_data <- function(n = 40, seed = 42) {
  set.seed(seed)
  data.frame(
    T1 = sample(LETTERS[1:3], n, replace = TRUE),
    T2 = sample(LETTERS[1:3], n, replace = TRUE),
    T3 = sample(LETTERS[1:3], n, replace = TRUE),
    Age = rnorm(n, 25, 5),
    Gender = sample(c("M", "F"), n, replace = TRUE),
    stringsAsFactors = FALSE
  )
}

test_that("all 5 covariate input forms produce identical results", {
  df <- make_cov_data()
  cl_formula <- build_clusters(df, k = 3, covariates = ~ Age + Gender)
  cl_char <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  cl_string <- build_clusters(df, k = 3, covariates = "Age + Gender")
  cl_df <- build_clusters(df, k = 3, covariates = df[, c("Age", "Gender")])

  ref <- cl_char$covariates$coefficients
  expect_equal(cl_formula$covariates$coefficients, ref)
  expect_equal(cl_string$covariates$coefficients, ref)
  expect_equal(cl_df$covariates$coefficients, ref)
})

test_that("covariates = NULL produces no covariate analysis", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = NULL)
  expect_null(cl$covariates)
})

# ==============================================================================
# 17. Covariate analysis — profiles
# ==============================================================================

test_that("numeric profiles are correct", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  np <- cl$covariates$profiles$numeric
  expect_true(is.data.frame(np))
  expect_true(all(c("cluster", "n", "pct", "variable", "mean", "sd", "median")
                   %in% names(np)))
  # Verify against manual computation
  for (clust in unique(np$cluster)) {
    mask <- cl$assignments == clust
    manual_mean <- mean(df$Age[mask])
    manual_sd <- sd(df$Age[mask])
    manual_med <- median(df$Age[mask])
    row <- np[np$cluster == clust & np$variable == "Age", ]
    expect_equal(row$mean, manual_mean, tolerance = 1e-10)
    expect_equal(row$sd, manual_sd, tolerance = 1e-10)
    expect_equal(row$median, manual_med, tolerance = 1e-10)
  }
})

test_that("categorical profiles are correct", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  cp <- cl$covariates$profiles$categorical
  expect_true(is.data.frame(cp))
  expect_true(all(c("cluster", "n", "variable", "level", "count", "pct")
                   %in% names(cp)))
  # Verify against manual computation
  for (clust in unique(cp$cluster)) {
    mask <- cl$assignments == clust
    tab <- table(df$Gender[mask])
    for (lev in names(tab)) {
      row <- cp[cp$cluster == clust & cp$variable == "Gender" &
                cp$level == lev, ]
      expect_equal(row$count, as.integer(tab[[lev]]))
    }
  }
})

test_that("all-numeric covariates produce no categorical profile", {
  df <- make_cov_data()
  df$Score <- rnorm(nrow(df))
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Score"))
  expect_null(cl$covariates$profiles$categorical)
  expect_true(is.data.frame(cl$covariates$profiles$numeric))
})

test_that("all-categorical covariates produce no numeric profile", {
  df <- make_cov_data()
  df$Group <- sample(c("X", "Y"), nrow(df), replace = TRUE)
  cl <- build_clusters(df, k = 3, covariates = c("Gender", "Group"))
  expect_null(cl$covariates$profiles$numeric)
  expect_true(is.data.frame(cl$covariates$profiles$categorical))
})

# ==============================================================================
# 18. Covariate analysis — k=2 cross-validation against glm(binomial)
# ==============================================================================

test_that("k=2 multinomial matches glm(binomial)", {
  set.seed(99)
  df <- data.frame(
    T1 = sample(LETTERS[1:4], 60, replace = TRUE),
    T2 = sample(LETTERS[1:4], 60, replace = TRUE),
    T3 = sample(LETTERS[1:4], 60, replace = TRUE),
    T4 = sample(LETTERS[1:4], 60, replace = TRUE),
    Age = rnorm(60, 30, 8),
    Score = runif(60, 0, 100),
    stringsAsFactors = FALSE
  )
  cl <- build_clusters(df, k = 2, covariates = c("Age", "Score"))
  our <- cl$covariates$coefficients

  # glm reference
  cov_df <- df[, c("Age", "Score")]
  cov_df$cluster <- factor(cl$assignments)
  ref_fit <- glm(cluster ~ Age + Score, data = cov_df, family = binomial)
  ref <- summary(ref_fit)$coefficients

  # Compare coefficients (excluding intercept from our table for variable match)
  for (vname in c("Age", "Score")) {
    our_row <- our[our$variable == vname, ]
    expect_equal(our_row$estimate[[1L]], unname(ref[vname, "Estimate"]),
                 tolerance = 1e-3, info = vname)
    expect_equal(our_row$std_error[[1L]], unname(ref[vname, "Std. Error"]),
                 tolerance = 1e-3, info = vname)
    expect_equal(our_row$z[[1L]], unname(ref[vname, "z value"]),
                 tolerance = 1e-3, info = vname)
    expect_equal(our_row$p[[1L]], unname(ref[vname, "Pr(>|z|)"]),
                 tolerance = 1e-2, info = vname)
  }
})

# ==============================================================================
# 19. Covariate analysis — k>2 manual verification
# ==============================================================================

test_that("k>2 coefficients match manual extraction from multinom", {
  set.seed(77)
  df <- data.frame(
    T1 = sample(LETTERS[1:3], 50, replace = TRUE),
    T2 = sample(LETTERS[1:3], 50, replace = TRUE),
    T3 = sample(LETTERS[1:3], 50, replace = TRUE),
    X1 = rnorm(50),
    X2 = rnorm(50),
    stringsAsFactors = FALSE
  )
  cl <- build_clusters(df, k = 3, covariates = c("X1", "X2"))
  our <- cl$covariates$coefficients

  # Manual reference from raw multinom
  fit_df <- data.frame(cluster = factor(cl$assignments), X1 = df$X1, X2 = df$X2)
  fit <- nnet::multinom(cluster ~ X1 + X2, data = fit_df, trace = FALSE)
  s <- summary(fit)
  coefs <- s$coefficients
  ses <- s$standard.errors

  for (i in seq_len(nrow(coefs))) {
    for (j in seq_len(ncol(coefs))) {
      est <- coefs[i, j]
      se <- ses[i, j]
      z_manual <- est / se
      p_manual <- 2 * (1 - pnorm(abs(z_manual)))
      or_manual <- exp(est)
      ci_lo_manual <- exp(est - 1.96 * se)
      ci_hi_manual <- exp(est + 1.96 * se)

      row <- our[our$cluster == rownames(coefs)[i] &
                 our$variable == colnames(coefs)[j], ]
      expect_equal(row$estimate, est, tolerance = 1e-10)
      expect_equal(row$std_error, se, tolerance = 1e-10)
      expect_equal(row$odds_ratio, or_manual, tolerance = 1e-10)
      expect_equal(row$ci_lower, ci_lo_manual, tolerance = 1e-10)
      expect_equal(row$ci_upper, ci_hi_manual, tolerance = 1e-10)
      expect_equal(row$z, z_manual, tolerance = 1e-10)
      expect_equal(row$p, p_manual, tolerance = 1e-10)
    }
  }
})

# ==============================================================================
# 20. Covariate analysis — model fit
# ==============================================================================

test_that("model fit stats are correct", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  fit <- cl$covariates$fit
  expect_true(is.numeric(fit$aic))
  expect_true(is.numeric(fit$bic))
  expect_true(is.numeric(fit$deviance))
  expect_true(fit$mcfadden_r2 >= 0 && fit$mcfadden_r2 <= 1)
  expect_equal(fit$reference_cluster, "1")
})

# ==============================================================================
# 21. Covariate analysis — S3 methods
# ==============================================================================

test_that("print shows covariates line", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  out <- capture.output(print(cl))
  expect_true(any(grepl("Covariates:", out)))
  expect_true(any(grepl("post-hoc", out)))
})

test_that("summary shows covariate analysis", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  out <- capture.output(res <- summary(cl))
  expect_true(any(grepl("Post-hoc Covariate Analysis", out)))
  expect_true(any(grepl("Predictors of Membership", out)))
  expect_true(any(grepl("McFadden", out)))
  expect_true(any(grepl("does not influence", out)))
  # Return value is a list with cluster_stats and covariates
  expect_true(is.list(res))
  expect_true(!is.null(res$cluster_stats))
  expect_true(!is.null(res$covariates))
})

test_that("summary without covariates returns data.frame (backwards compat)", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3)
  out <- capture.output(res <- summary(cl))
  expect_true(is.data.frame(res))
})

test_that("plot type='predictors' returns ggplot", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  p <- plot(cl, type = "predictors")
  expect_s3_class(p, "ggplot")
})

test_that("plot type='predictors' errors without covariates", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3)
  expect_error(plot(cl, type = "predictors"), "No covariate analysis")
})

# ==============================================================================
# 22. Covariate analysis — edge cases
# ==============================================================================

test_that("single covariate works", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = "Age")
  expect_true(!is.null(cl$covariates))
  expect_true(all(cl$covariates$coefficients$variable %in%
                  c("(Intercept)", "Age")))
})

test_that("covariate with NAs warns and works", {
  df <- make_cov_data()
  df$Age[c(1, 5, 10)] <- NA
  expect_warning(
    cl <- build_clusters(df, k = 3, covariates = "Age"),
    "Dropped 3 rows"
  )
  expect_true(!is.null(cl$covariates))
})

test_that("constant covariate errors", {
  df <- make_cov_data()
  df$Const <- 5
  expect_error(
    build_clusters(df, k = 3, covariates = "Const"),
    "constant"
  )
})

test_that("covariate data.frame row count mismatch errors", {
  df <- make_cov_data()
  bad_cov <- data.frame(Age = rnorm(10))
  expect_error(
    build_clusters(df, k = 3, covariates = bad_cov),
    "rows"
  )
})

test_that("netobject metadata covariates work", {
  set.seed(42)
  df <- data.frame(
    T1 = sample(LETTERS[1:3], 40, replace = TRUE),
    T2 = sample(LETTERS[1:3], 40, replace = TRUE),
    T3 = sample(LETTERS[1:3], 40, replace = TRUE),
    Age = rnorm(40, 25, 5),
    Score = runif(40, 0, 100),
    stringsAsFactors = FALSE
  )
  net <- build_network(df, method = "relative")
  # Age and Score are numeric → stored in $metadata
  expect_true(!is.null(net$metadata))
  cl <- build_clusters(net, k = 3, covariates = c("Age", "Score"))
  expect_true(!is.null(cl$covariates))
  # Should match direct data.frame input
  cl_df <- build_clusters(df, k = 3, covariates = c("Age", "Score"))
  expect_equal(cl$covariates$coefficients, cl_df$covariates$coefficients)
})

test_that("build_network round-trip preserves covariates", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  grp <- build_network(cl)
  stored <- attr(grp, "clustering")
  expect_true(!is.null(stored$covariates))
  expect_equal(stored$covariates$fit$aic, cl$covariates$fit$aic)
})

# ==============================================================================
# 23. dl distance function — zero-length edge cases (L87-88)
# ==============================================================================

test_that(".dl_dist_r zero-length edge cases work", {
  # n == 0 → return m
  expect_equal(Nestimate:::.dl_dist_r(integer(), c(1L, 2L), 0L, 2L), 2)
  # m == 0 → return n
  expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L), integer(), 2L, 0L), 2)
  # Same sequence → 0
  expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L, 3L), c(1L, 2L, 3L), 3L, 3L), 0)
  # Transposition: one transposition = 1 in DL
  expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L), c(2L, 1L), 2L, 2L), 1)
})

# ==============================================================================
# 24. levenshtein zero-length edge cases (L103)
# ==============================================================================

test_that(".levenshtein_dist_r zero-length handles both empty", {
  # both empty → 0
  expect_equal(Nestimate:::.levenshtein_dist_r(integer(), integer(), 0L, 0L), 0)
})

# ==============================================================================
# 25. qgram distance functions direct tests (L141-150, L155-160, L165-173, L178-181, L186-187, L195)
# ==============================================================================

test_that(".get_qgram_r returns empty for n < q", {
  x <- c(1L, 2L)
  result <- Nestimate:::.get_qgram_r(x, 2L, 3L)
  expect_length(result, 0L)
})

test_that(".qgram_dist_r computes correct L1 distance", {
  qx <- c("A\x01B" = 2L, "B\x01C" = 1L)
  qy <- c("A\x01B" = 1L, "C\x01D" = 2L)
  # |2-1| + |1-0| + |0-2| = 1 + 1 + 2 = 4
  expect_equal(Nestimate:::.qgram_dist_r(NULL, NULL, qx, qy), 4L)
})

test_that(".cosine_dist_r is zero for identical profiles", {
  qx <- c("AB" = 3L, "BC" = 2L)
  expect_equal(Nestimate:::.cosine_dist_r(NULL, NULL, qx, qx), 0)
})

test_that(".cosine_dist_r returns 1 when denominator is zero", {
  # Both zero-norm profiles: den == 0
  qx <- integer()
  qy <- integer()
  expect_equal(Nestimate:::.cosine_dist_r(NULL, NULL, qx, qy), 1)
})

test_that(".jaccard_dist_r returns 0 for empty vs empty", {
  expect_equal(Nestimate:::.jaccard_dist_r(integer(), integer()), 0)
})

test_that(".jaccard_dist_r is correct for known inputs", {
  qx <- c("AB" = 1L, "BC" = 1L)
  qy <- c("BC" = 1L, "CD" = 1L)
  # |intersection| / |union| = 1 / 3 → distance = 1 - 1/3 = 2/3
  expect_equal(Nestimate:::.jaccard_dist_r(qx, qy), 2/3)
})

test_that(".jaro_dist_r returns 0 for identical", {
  x <- c(1L, 2L, 3L)
  expect_equal(Nestimate:::.jaro_dist_r(x, x, 3L, 3L), 0)
})

# ==============================================================================
# 26. Cosine distance matrix with zero-norm rows (L319-322)
# ==============================================================================

test_that("cosine distance R path handles all-NA sequences (zero-norm rows)", {
  # Tests the zero_rows path in .cosine_matrix_tdm via the R fallback
  df <- data.frame(
    T1 = c("A", "%", "B"),
    T2 = c("B", "%", "C"),
    T3 = c("C", "%", "A"),
    stringsAsFactors = FALSE
  )
  enc <- Nestimate:::.encode_sequences(df, c("*", "%"))
  dm <- as.matrix(
    Nestimate:::.dissimilarity_matrix_r(enc, "cosine", lambda = 0, q = 2L, p = 0.1)
  )
  # Row 2 is all NA → distance to rows 1 and 3 should be 1
  expect_equal(dm[2, 1], 1)
  expect_equal(dm[2, 3], 1)
  expect_equal(dm[2, 2], 0)
})

# ==============================================================================
# 27. tna / cograph_network covariates rejection (L430, L440)
# ==============================================================================

test_that("tna input with column-name covariates errors", {
  skip_if_not_installed("tna")
  model <- tna::tna(tna::group_regulation[1:20, ])
  expect_error(
    build_clusters(model, k = 2, covariates = "some_col"),
    "tna/cograph_network"
  )
})

test_that("cograph_network input with column-name covariates errors", {
  cg <- structure(
    list(data = make_test_data(n = 20, k = 5),
         weights = matrix(0, 4, 4), directed = TRUE),
    class = c("cograph_network", "list")
  )
  expect_error(
    build_clusters(cg, k = 2, covariates = "some_col"),
    "tna/cograph_network"
  )
})

# ==============================================================================
# 28. netobject covariate lookup: missing column error (L477, L485)
# ==============================================================================

test_that("netobject covariate with missing column errors", {
  set.seed(1)
  df <- data.frame(
    T1 = sample(LETTERS[1:3], 30, replace = TRUE),
    T2 = sample(LETTERS[1:3], 30, replace = TRUE),
    stringsAsFactors = FALSE
  )
  net <- build_network(df, method = "relative")
  expect_error(
    build_clusters(net, k = 2, covariates = "NonExistent"),
    "not found"
  )
})

# ==============================================================================
# 29. Raw data.frame covariate with missing column error (L497-498)
# ==============================================================================

test_that("data.frame covariate missing column errors with helpful message", {
  df <- make_cov_data()
  expect_error(
    build_clusters(df, k = 3, covariates = c("Age", "NotAColumn")),
    "not found|NotAColumn"
  )
})

# ==============================================================================
# 30. Unsupported input type for build_clusters (L755 via .extract_sequence_data)
# ==============================================================================

test_that(".extract_sequence_data errors on unsupported input", {
  expect_error(
    build_clusters(list(a = 1, b = 2), k = 2),
    "Unsupported input"
  )
})

# ==============================================================================
# 31. summary.net_clustering single-member cluster (L755 singleton path)
# ==============================================================================

test_that("summary.net_clustering handles singleton clusters", {
  # Force a single-member cluster by using PAM with extreme data
  df <- data.frame(
    T1 = c("A", "A", "A", "Z"),
    T2 = c("A", "A", "A", "Z"),
    T3 = c("A", "A", "A", "Z"),
    stringsAsFactors = FALSE
  )
  cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
  out <- capture.output(res <- summary(cl))
  expect_true(is.data.frame(res))
})

# ==============================================================================
# 32. .run_covariate_analysis: small-cluster warning (L895-896)
# ==============================================================================

test_that("covariate analysis warns when cluster too small for params", {
  # Create highly imbalanced clusters: 1 observation in one cluster
  # Use tiny n and 2 clusters to force min_cl < n_params
  set.seed(2)
  df <- data.frame(
    T1 = sample(LETTERS[1:3], 8, replace = TRUE),
    T2 = sample(LETTERS[1:3], 8, replace = TRUE),
    T3 = sample(LETTERS[1:3], 8, replace = TRUE),
    X1 = rnorm(8),
    X2 = rnorm(8),
    X3 = rnorm(8),
    X4 = rnorm(8),
    stringsAsFactors = FALSE
  )
  # Use many covariates and small k to trigger the warning
  expect_warning(
    build_clusters(df, k = 2, covariates = c("X1", "X2", "X3", "X4")),
    "parameters|Estimates may be unreliable"
  )
})

# ==============================================================================
# 33. ordered factor covariate gets unordered (L925-926, L930)
# ==============================================================================

test_that("ordered factor covariates are coerced to unordered", {
  df <- make_cov_data()
  df$Level <- factor(sample(c("Low", "Med", "High"), nrow(df), replace = TRUE),
                     levels = c("Low", "Med", "High"), ordered = TRUE)
  cl <- build_clusters(df, k = 3, covariates = "Level")
  expect_true(!is.null(cl$covariates))
})

# ==============================================================================
# 34. .print_covariate_profiles called directly via summary (L950, L959-964)
# ==============================================================================

test_that(".print_covariate_profiles prints both numeric and categorical", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
  out <- capture.output(summary(cl))
  expect_true(any(grepl("Cluster Profiles.*numeric", out)))
  expect_true(any(grepl("Cluster Profiles.*categorical", out)))
})

# ==============================================================================
# 35. .print_covariate_profiles: categorical level missing in a cluster (L1242)
# ==============================================================================

test_that(".print_covariate_profiles handles missing level in a cluster", {
  # Use a rare level that may not appear in every cluster
  set.seed(9)
  df <- data.frame(
    T1 = sample(LETTERS[1:4], 30, replace = TRUE),
    T2 = sample(LETTERS[1:4], 30, replace = TRUE),
    Group = c(rep("X", 25), rep("Y", 5)),  # Y is rare
    stringsAsFactors = FALSE
  )
  cl <- build_clusters(df, k = 3, covariates = "Group")
  out <- capture.output(summary(cl))
  expect_true(any(grepl("0 \\(0%\\)", out)))
})

# ==============================================================================
# 36. .resolve_covariates: no variables specified (L971-972, L982-987, L992-993)
# ==============================================================================

test_that(".resolve_covariates formula with no vars errors", {
  df <- make_cov_data()
  # An empty formula-like string would parse to no vars; use empty char vector
  expect_error(
    build_clusters(df, k = 3, covariates = character(0)),
    "No covariate|specified"
  )
})

# ==============================================================================
# 37. .resolve_covariates: bad type (L997-1000)
# ==============================================================================

test_that(".resolve_covariates rejects non-supported type", {
  df <- make_cov_data()
  expect_error(
    build_clusters(df, k = 3, covariates = 42),
    "formula|character|data.frame"
  )
})

# ==============================================================================
# 38. .resolve_covariates: nrow mismatch via resolved cov_df (L997-1000)
# ==============================================================================

test_that(".resolve_covariates errors on row mismatch from raw data extraction", {
  # This tests the path where raw_data is df but cov_names not present → error
  # with "Available:" message
  df <- make_cov_data()
  expect_error(
    build_clusters(df, k = 3, covariates = "MissingColumn"),
    "MissingColumn|not found"
  )
})

# ==============================================================================
# 39. .run_covariate_analysis: k=2 returns vector not matrix (L1085)
# ==============================================================================

test_that("k=2 covariate analysis still produces a proper coef data.frame", {
  df <- make_cov_data()
  cl <- build_clusters(df, k = 2, covariates = c("Age", "Gender"))
  coefs <- cl$covariates$coefficients
  expect_true(is.data.frame(coefs))
  expect_true("cluster" %in% names(coefs))
  expect_true("variable" %in% names(coefs))
  # For k=2, there is only one row of clusters
  expect_equal(length(unique(coefs$cluster)), 1L)
})

# ==============================================================================
# 40. .run_covariate_analysis: n_dropped > 0 → profiles on complete-case (L1121-1124)
# ==============================================================================

test_that("covariate NA rows use complete-case for profiles", {
  df <- make_cov_data()
  df$Age[1:5] <- NA
  expect_warning(
    cl <- build_clusters(df, k = 3, covariates = "Age"),
    "Dropped 5"
  )
  # Profiles should reflect 35 complete cases
  np <- cl$covariates$profiles$numeric
  total_n <- sum(unique(np[, c("cluster", "n")])$n)
  expect_equal(total_n, 35L)
})


# ==============================================================================
# cluster_network() convenience wrapper (L1435-1452)
# ==============================================================================

test_that("cluster_network with default pam returns netobject_group (L1435-1452)", {
  set.seed(42)
  states <- c("A","B","C")
  data <- data.frame(
    V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
    V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
    stringsAsFactors = FALSE
  )
  grp <- cluster_network(data, k = 2)
  expect_true(inherits(grp, "netobject_group"))
  expect_equal(length(grp), 2)
})

test_that("cluster_network with mmm returns netobject_group (L1445-1448)", {
  set.seed(42)
  states <- c("A","B","C")
  data <- data.frame(
    V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
    V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
    stringsAsFactors = FALSE
  )
  grp <- cluster_network(data, k = 2, cluster_by = "mmm")
  expect_true(inherits(grp, "netobject_group"))
})

test_that("cluster_network from netobject inherits build_args (L1438-1443)", {
  set.seed(42)
  states <- c("A","B","C")
  data <- data.frame(
    V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
    V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
    stringsAsFactors = FALSE
  )
  net <- build_network(data, method = "relative")
  grp <- cluster_network(net, k = 2)
  expect_true(inherits(grp, "netobject_group"))
})

Try the Nestimate package in your browser

Any scripts or data that you put into this service are public.

Nestimate documentation built on April 20, 2026, 5:06 p.m.