Nothing
testthat::skip_on_cran()
# ==============================================================================
# Tests for build_clusters()
# ==============================================================================
# ---- Test data ----
make_test_data <- function(n = 50, k = 26, n_states = 4, seed = 42) {
set.seed(seed)
states <- LETTERS[seq_len(n_states)]
mat <- matrix(sample(states, n * k, replace = TRUE), nrow = n, ncol = k)
colnames(mat) <- paste0("T", seq_len(k))
as.data.frame(mat, stringsAsFactors = FALSE)
}
make_data_with_na <- function(n = 30, k = 20, n_states = 3, seed = 42) {
set.seed(seed)
states <- LETTERS[seq_len(n_states)]
mat <- matrix(sample(states, n * k, replace = TRUE), nrow = n, ncol = k)
# Add void markers to last few columns
for (i in seq_len(n)) {
trail_start <- sample(k - 5, 1) + 5
if (trail_start <= k) mat[i, trail_start:k] <- "%"
}
# Add some mid-sequence NAs
mat[sample(n * k, 10)] <- "*"
colnames(mat) <- paste0("T", seq_len(k))
as.data.frame(mat, stringsAsFactors = FALSE)
}
# ==============================================================================
# 1. Input validation
# ==============================================================================
test_that("build_clusters validates inputs", {
df <- make_test_data(n = 20, k = 10)
expect_error(build_clusters(df, k = 1), "k >= 2")
expect_error(build_clusters(df, k = 20), "k <= n - 1")
expect_error(build_clusters(df, k = 2, dissimilarity = "invalid"))
expect_error(build_clusters(df, k = 2, method = "invalid"))
expect_error(build_clusters(df, k = 2, dissimilarity = "lv", weighted = TRUE),
"Weighting is only supported")
expect_error(build_clusters("not a df", k = 2))
})
# ==============================================================================
# 2. Basic clustering works for all metrics
# ==============================================================================
test_that("build_clusters runs for all 9 metrics", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
for (metric in c("hamming", "osa", "lv", "dl", "lcs",
"qgram", "cosine", "jaccard", "jw")) {
cl <- build_clusters(df, k = 3, dissimilarity = metric)
expect_true(inherits(cl, "net_clustering"), info = metric)
expect_equal(cl$k, 3L, info = metric)
expect_equal(sum(cl$sizes), 30L, info = metric)
expect_length(cl$assignments, 30L)
expect_true(inherits(cl$distance, "dist"), info = metric)
expect_true(is.numeric(cl$silhouette), info = metric)
}
})
# ==============================================================================
# 3. Clustering methods
# ==============================================================================
test_that("PAM returns medoids", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 3, method = "pam")
expect_false(is.null(cl$medoids))
expect_length(cl$medoids, 3L)
expect_true(all(cl$medoids %in% seq_len(20)))
})
test_that("hierarchical methods work", {
df <- make_test_data(n = 20, k = 10)
for (m in c("ward.D2", "ward.D", "complete", "average", "single")) {
cl <- build_clusters(df, k = 3, method = m)
expect_true(inherits(cl, "net_clustering"), info = m)
expect_true(is.null(cl$medoids), info = m)
expect_equal(sum(cl$sizes), 20L, info = m)
}
})
# ==============================================================================
# 4. Weighted Hamming
# ==============================================================================
test_that("weighted Hamming produces different distances than unweighted", {
df <- make_test_data(n = 20, k = 10)
cl_uw <- build_clusters(df, k = 2, dissimilarity = "hamming", weighted = FALSE)
cl_w <- build_clusters(df, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 1)
# Distance matrices should differ
d_uw <- as.matrix(cl_uw$distance)
d_w <- as.matrix(cl_w$distance)
expect_false(isTRUE(all.equal(d_uw, d_w)))
# Weighted distances should be <= unweighted (weights <= 1)
expect_true(all(d_w <= d_uw + 1e-10))
})
test_that("lambda = 0 weighted matches unweighted", {
df <- make_test_data(n = 20, k = 10)
cl_uw <- build_clusters(df, k = 2, dissimilarity = "hamming", weighted = FALSE)
# weighted = TRUE with lambda = 0 should give same distances
# (lambda forced to 0 when weighted = FALSE, but lambda = 0 means uniform)
# Actually testing: explicit lambda = 0 with weighted = TRUE
# In our impl: lambda = 0 → weights = exp(0 * ...) / max = rep(1)
# But the function sets lambda <- if (weighted) lambda else 0
# With weighted = TRUE, lambda = 0 → weights = 1
cl_w0 <- build_clusters(df, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 0)
d_uw <- as.matrix(cl_uw$distance)
d_w0 <- as.matrix(cl_w0$distance)
expect_equal(d_uw, d_w0, tolerance = 1e-10)
})
# ==============================================================================
# 5. NA / void marker handling
# ==============================================================================
test_that("na_syms are treated as missing", {
df <- make_data_with_na(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
expect_s3_class(cl, "net_clustering")
expect_equal(sum(cl$sizes), 20L)
})
test_that("custom na_syms work", {
df <- make_test_data(n = 20, k = 10, n_states = 3)
# Replace some values with custom NA symbol
df[1:5, 8:10] <- "MISSING"
cl <- build_clusters(df, k = 2, na_syms = c("*", "%", "MISSING"))
expect_s3_class(cl, "net_clustering")
})
# ==============================================================================
# 6. Seed reproducibility
# ==============================================================================
test_that("seed produces reproducible results", {
df <- make_test_data(n = 30, k = 10)
cl1 <- build_clusters(df, k = 3, seed = 123)
cl2 <- build_clusters(df, k = 3, seed = 123)
expect_equal(cl1$assignments, cl2$assignments)
expect_equal(as.matrix(cl1$distance), as.matrix(cl2$distance))
})
# ==============================================================================
# 7. S3 methods
# ==============================================================================
test_that("print.net_clustering works", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
out <- capture.output(print(cl))
expect_true(any(grepl("Sequence Clustering", out)))
expect_true(any(grepl("pam", out)))
expect_true(any(grepl("hamming", out)))
expect_true(any(grepl("Silhouette", out)))
})
test_that("summary.net_clustering works", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
stats <- capture.output(res <- summary(cl))
expect_true(is.data.frame(res))
expect_equal(nrow(res), 2L)
expect_true("size" %in% names(res))
expect_true("mean_within_dist" %in% names(res))
})
test_that("plot.net_clustering silhouette works", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
p <- plot(cl, type = "silhouette")
expect_s3_class(p, "ggplot")
})
test_that("plot.net_clustering mds works", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
p <- plot(cl, type = "mds")
expect_s3_class(p, "ggplot")
})
test_that("plot.net_clustering heatmap works", {
df <- make_test_data(n = 20, k = 10)
cl <- build_clusters(df, k = 2)
p <- plot(cl, type = "heatmap")
expect_s3_class(p, "ggplot")
})
# ==============================================================================
# 9. Cross-validation against tna
# ==============================================================================
test_that("distance matrices match tna for metrics with matching implementations", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:50, ]
# Only compare metrics where tna and stringdist agree.
# tna's own C implementations of osa/lv/dl/lcs/jw differ from
# stringdist (which is the reference implementation). Confirmed:
# stringdist::stringdist("acfhicbc", tna3, method="lv") matches
# our result (21) while tna:::levenshtein_dist gives 18.
for (metric in c("hamming", "qgram", "cosine", "jaccard")) {
tna_r <- tna::cluster_data(data, k = 2, dissimilarity = metric, q = 2)
our_r <- build_clusters(data, k = 2, dissimilarity = metric, q = 2L)
expect_equal(
as.matrix(our_r$distance), as.matrix(tna_r$distance),
tolerance = 1e-10, info = metric
)
}
})
test_that("weighted hamming matches tna (lambda = 0.5)", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:50, ]
tna_r <- tna::cluster_data(data, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 0.5)
our_r <- build_clusters(data, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 0.5)
expect_equal(
as.matrix(our_r$distance), as.matrix(tna_r$distance),
tolerance = 1e-10
)
})
test_that("weighted hamming matches tna (lambda = 2.0)", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:50, ]
tna_r <- tna::cluster_data(data, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 2.0)
our_r <- build_clusters(data, k = 2, dissimilarity = "hamming",
weighted = TRUE, lambda = 2.0)
expect_equal(
as.matrix(our_r$distance), as.matrix(tna_r$distance),
tolerance = 1e-10
)
})
test_that("cluster assignments match tna for PAM", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:50, ]
tna_r <- tna::cluster_data(data, k = 3, dissimilarity = "hamming")
our_r <- build_clusters(data, k = 3, dissimilarity = "hamming")
# Distance matrices must match exactly
expect_equal(
as.matrix(our_r$distance), as.matrix(tna_r$distance),
tolerance = 1e-10
)
# PAM is deterministic on same distance matrix → same assignments
expect_equal(our_r$assignments, tna_r$assignments)
expect_equal(our_r$silhouette, tna_r$silhouette, tolerance = 1e-10)
})
test_that("cluster assignments match tna for hclust methods", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:50, ]
for (m in c("complete", "average")) {
tna_r <- tna::cluster_data(data, k = 3, dissimilarity = "hamming",
method = m)
our_r <- build_clusters(data, k = 3, dissimilarity = "hamming",
method = m)
expect_equal(
as.matrix(our_r$distance), as.matrix(tna_r$distance),
tolerance = 1e-10, info = m
)
expect_equal(our_r$assignments, tna_r$assignments, info = m)
}
})
# ==============================================================================
# 10. R fallback vs stringdist consistency
# ==============================================================================
test_that("R fallback matches stringdist for all applicable metrics", {
skip_if_not_installed("stringdist")
df <- make_test_data(n = 30, k = 10, n_states = 4)
enc <- Nestimate:::.encode_sequences(df, c("*", "%"))
for (metric in c("hamming", "osa", "lv", "dl", "lcs",
"qgram", "cosine", "jaccard", "jw")) {
d_r <- as.matrix(
Nestimate:::.dissimilarity_matrix_r(enc, metric, lambda = 0, q = 2L, p = 0.1)
)
d_sd <- as.matrix(
Nestimate:::.dissimilarity_matrix_stringdist(enc, metric, lambda = 0, q = 2L, p = 0.1)
)
expect_equal(d_r, d_sd, tolerance = 1e-10, info = metric)
}
})
# ==============================================================================
# 11. Edge cases
# ==============================================================================
test_that("single state data works", {
df <- data.frame(T1 = rep("A", 10), T2 = rep("A", 10), T3 = rep("A", 10))
cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
# All distances should be 0
expect_true(all(as.matrix(cl$distance) == 0))
})
test_that("two-state binary data works", {
set.seed(1)
df <- data.frame(
T1 = sample(c("X", "Y"), 20, replace = TRUE),
T2 = sample(c("X", "Y"), 20, replace = TRUE),
T3 = sample(c("X", "Y"), 20, replace = TRUE)
)
cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
expect_s3_class(cl, "net_clustering")
})
test_that("data with all trailing NAs handled", {
df <- data.frame(
T1 = c("A", "B", "A", "B", "C"),
T2 = c("B", "A", "C", "A", "A"),
T3 = c("%", "%", "%", "%", "%"),
T4 = c("%", "%", "%", "%", "%")
)
cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
expect_s3_class(cl, "net_clustering")
expect_equal(sum(cl$sizes), 5L)
})
test_that("matrix input works", {
mat <- matrix(sample(LETTERS[1:3], 30, replace = TRUE), nrow = 10)
colnames(mat) <- paste0("T", 1:3)
cl <- build_clusters(mat, k = 2)
expect_s3_class(cl, "net_clustering")
expect_equal(sum(cl$sizes), 10L)
})
test_that("q-gram parameter affects result", {
df <- make_test_data(n = 20, k = 10, n_states = 4)
cl_q2 <- build_clusters(df, k = 2, dissimilarity = "qgram", q = 2L)
cl_q3 <- build_clusters(df, k = 2, dissimilarity = "qgram", q = 3L)
# Different q should give different distance matrices
expect_false(isTRUE(all.equal(
as.matrix(cl_q2$distance), as.matrix(cl_q3$distance)
)))
})
test_that("p parameter affects jw result", {
df <- make_test_data(n = 20, k = 10, n_states = 4)
cl_p1 <- build_clusters(df, k = 2, dissimilarity = "jw", p = 0.1)
cl_p2 <- build_clusters(df, k = 2, dissimilarity = "jw", p = 0.2)
# Different p should give different distance matrices
expect_false(isTRUE(all.equal(
as.matrix(cl_p1$distance), as.matrix(cl_p2$distance)
)))
})
# ==============================================================================
# 12. Encoding tests
# ==============================================================================
test_that(".encode_sequences handles na_syms correctly", {
df <- data.frame(T1 = c("A", "*", "B"), T2 = c("B", "A", "%"))
enc <- Nestimate:::.encode_sequences(df, na_syms = c("*", "%"))
# df row 1 = c("A", "B"), row 2 = c("*", "A"), row 3 = c("B", "%")
expect_true(is.na(enc$int_mat[2, 1])) # "*" → NA
expect_true(is.na(enc$int_mat[3, 2])) # "%" → NA
expect_equal(enc$states, c("A", "B"))
expect_equal(enc$n_states, 2L)
expect_equal(enc$len, c(2L, 2L, 1L)) # row 3: last obs at col 1
})
test_that(".encode_sequences integer encoding is correct", {
df <- data.frame(T1 = c("C", "A", "B"), T2 = c("A", "B", "C"))
enc <- Nestimate:::.encode_sequences(df, na_syms = c("*", "%"))
# States sorted: A=1, B=2, C=3
expect_equal(enc$states, c("A", "B", "C"))
expect_equal(enc$int_mat[1, ], c(3L, 1L)) # C=3, A=1
expect_equal(enc$int_mat[2, ], c(1L, 2L)) # A=1, B=2
expect_equal(enc$int_mat[3, ], c(2L, 3L)) # B=2, C=3
})
# ==============================================================================
# 13. Individual distance function tests
# ==============================================================================
test_that("hamming distance is correct", {
# Same sequence → 0
expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(1,2,3), 3, 3, 1), 0)
# All different → length
expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(4,5,6), 3, 3, 1), 3)
# One difference
expect_equal(Nestimate:::.hamming_dist_r(c(1,2,3), c(1,2,4), 3, 3, 1), 1)
})
test_that("hamming distance with weights is correct", {
w <- exp(-1 * c(0, 1, 2))
w <- w / max(w)
# All different: sum of weights
expect_equal(
Nestimate:::.hamming_dist_r(c(1,2,3), c(4,5,6), 3, 3, w),
sum(w)
)
# First position different only
expect_equal(
Nestimate:::.hamming_dist_r(c(1,2,3), c(4,2,3), 3, 3, w),
w[1]
)
})
test_that("levenshtein distance is correct", {
# kitten → sitting = 3
x <- c(1, 2, 3, 3, 4, 5) # k i t t e n
y <- c(6, 2, 3, 3, 2, 5, 7) # s i t t i n g
expect_equal(Nestimate:::.levenshtein_dist_r(x, y, 6, 7), 3)
# Same → 0
expect_equal(Nestimate:::.levenshtein_dist_r(c(1,2), c(1,2), 2, 2), 0)
# Empty vs non-empty
expect_equal(Nestimate:::.levenshtein_dist_r(c(1,2,3), c(1,2,3), 0, 3), 3)
})
test_that("lcs distance is correct", {
# lcs of (1,2,3,4) and (2,4,3) → lcs length = 2 (2,3 or 2,4)
# distance = 4 + 3 - 2*2 = 3
expect_equal(Nestimate:::.lcs_dist_r(c(1,2,3,4), c(2,4,3), 4, 3), 3)
# Same → 0
expect_equal(Nestimate:::.lcs_dist_r(c(1,2,3), c(1,2,3), 3, 3), 0)
})
# ==============================================================================
# 14. Input extraction tests
# ==============================================================================
test_that("build_clusters works on netobject from sequence methods", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
net <- build_network(df, method = "relative")
cl_net <- build_clusters(net, k = 3)
cl_df <- build_clusters(df, k = 3)
expect_s3_class(cl_net, "net_clustering")
expect_equal(as.matrix(cl_net$distance), as.matrix(cl_df$distance),
tolerance = 1e-10)
})
test_that("build_clusters works on frequency netobject", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
net <- build_network(df, method = "frequency")
cl <- build_clusters(net, k = 3)
expect_s3_class(cl, "net_clustering")
expect_equal(sum(cl$sizes), 30L)
})
test_that("build_clusters rejects association-method netobjects", {
set.seed(42)
ndf <- data.frame(matrix(rnorm(100), 20, 5))
colnames(ndf) <- paste0("V", 1:5)
net <- build_network(ndf, method = "cor")
expect_error(build_clusters(net, k = 2), "sequence data")
})
test_that("build_clusters works on tna model", {
skip_if_not_installed("tna")
data <- tna::group_regulation[1:30, ]
model <- tna::tna(data)
cl_tna <- build_clusters(model, k = 2)
cl_df <- build_clusters(data, k = 2)
expect_s3_class(cl_tna, "net_clustering")
expect_equal(as.matrix(cl_tna$distance), as.matrix(cl_df$distance),
tolerance = 1e-10)
})
test_that("build_clusters works on cograph_network", {
skip_if_not_installed("cograph")
df <- make_test_data(n = 30, k = 10, n_states = 4)
# Build a mock cograph_network with $data
cg <- structure(
list(data = df, weights = matrix(0, 4, 4), directed = TRUE),
class = c("cograph_network", "list")
)
cl_cg <- build_clusters(cg, k = 3)
cl_df <- build_clusters(df, k = 3)
expect_s3_class(cl_cg, "net_clustering")
expect_equal(as.matrix(cl_cg$distance), as.matrix(cl_df$distance),
tolerance = 1e-10)
})
# ==============================================================================
# 15. build_network dispatch for net_clustering
# ==============================================================================
test_that("build_network dispatches on net_clustering", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
cl <- build_clusters(df, k = 3)
grp <- build_network(cl)
expect_s3_class(grp, "netobject_group")
expect_length(grp, 3L)
expect_equal(names(grp), paste("Cluster", 1:3))
# Each sub-network should be a netobject
for (nm in names(grp)) {
expect_s3_class(grp[[nm]], "netobject")
}
})
test_that("build_network(clustering) default method is relative", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
cl <- build_clusters(df, k = 3)
grp <- build_network(cl)
expect_equal(grp[[1]]$method, "relative")
})
test_that("build_network(clustering) respects method arg", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
cl <- build_clusters(df, k = 3)
grp <- build_network(cl, method = "frequency")
expect_equal(grp[[1]]$method, "frequency")
})
test_that("build_network(clustering) sub-networks have correct sizes", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
cl <- build_clusters(df, k = 3)
grp <- build_network(cl)
total_seqs <- sum(vapply(grp, function(net) nrow(net$data), integer(1L)))
expect_equal(total_seqs, 30L)
# Sizes match clustering sizes
for (i in seq_along(grp)) {
expect_equal(nrow(grp[[i]]$data), unname(cl$sizes[i]))
}
})
test_that("build_network(clustering) stores clustering metadata", {
df <- make_test_data(n = 30, k = 10, n_states = 4)
cl <- build_clusters(df, k = 3)
grp <- build_network(cl)
expect_s3_class(attr(grp, "clustering"), "net_clustering")
expect_equal(attr(grp, "clustering")$k, 3L)
})
# ==============================================================================
# 16. Covariate analysis — input forms
# ==============================================================================
make_cov_data <- function(n = 40, seed = 42) {
set.seed(seed)
data.frame(
T1 = sample(LETTERS[1:3], n, replace = TRUE),
T2 = sample(LETTERS[1:3], n, replace = TRUE),
T3 = sample(LETTERS[1:3], n, replace = TRUE),
Age = rnorm(n, 25, 5),
Gender = sample(c("M", "F"), n, replace = TRUE),
stringsAsFactors = FALSE
)
}
test_that("all 5 covariate input forms produce identical results", {
df <- make_cov_data()
cl_formula <- build_clusters(df, k = 3, covariates = ~ Age + Gender)
cl_char <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
cl_string <- build_clusters(df, k = 3, covariates = "Age + Gender")
cl_df <- build_clusters(df, k = 3, covariates = df[, c("Age", "Gender")])
ref <- cl_char$covariates$coefficients
expect_equal(cl_formula$covariates$coefficients, ref)
expect_equal(cl_string$covariates$coefficients, ref)
expect_equal(cl_df$covariates$coefficients, ref)
})
test_that("covariates = NULL produces no covariate analysis", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = NULL)
expect_null(cl$covariates)
})
# ==============================================================================
# 17. Covariate analysis — profiles
# ==============================================================================
test_that("numeric profiles are correct", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
np <- cl$covariates$profiles$numeric
expect_true(is.data.frame(np))
expect_true(all(c("cluster", "n", "pct", "variable", "mean", "sd", "median")
%in% names(np)))
# Verify against manual computation
for (clust in unique(np$cluster)) {
mask <- cl$assignments == clust
manual_mean <- mean(df$Age[mask])
manual_sd <- sd(df$Age[mask])
manual_med <- median(df$Age[mask])
row <- np[np$cluster == clust & np$variable == "Age", ]
expect_equal(row$mean, manual_mean, tolerance = 1e-10)
expect_equal(row$sd, manual_sd, tolerance = 1e-10)
expect_equal(row$median, manual_med, tolerance = 1e-10)
}
})
test_that("categorical profiles are correct", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
cp <- cl$covariates$profiles$categorical
expect_true(is.data.frame(cp))
expect_true(all(c("cluster", "n", "variable", "level", "count", "pct")
%in% names(cp)))
# Verify against manual computation
for (clust in unique(cp$cluster)) {
mask <- cl$assignments == clust
tab <- table(df$Gender[mask])
for (lev in names(tab)) {
row <- cp[cp$cluster == clust & cp$variable == "Gender" &
cp$level == lev, ]
expect_equal(row$count, as.integer(tab[[lev]]))
}
}
})
test_that("all-numeric covariates produce no categorical profile", {
df <- make_cov_data()
df$Score <- rnorm(nrow(df))
cl <- build_clusters(df, k = 3, covariates = c("Age", "Score"))
expect_null(cl$covariates$profiles$categorical)
expect_true(is.data.frame(cl$covariates$profiles$numeric))
})
test_that("all-categorical covariates produce no numeric profile", {
df <- make_cov_data()
df$Group <- sample(c("X", "Y"), nrow(df), replace = TRUE)
cl <- build_clusters(df, k = 3, covariates = c("Gender", "Group"))
expect_null(cl$covariates$profiles$numeric)
expect_true(is.data.frame(cl$covariates$profiles$categorical))
})
# ==============================================================================
# 18. Covariate analysis — k=2 cross-validation against glm(binomial)
# ==============================================================================
test_that("k=2 multinomial matches glm(binomial)", {
set.seed(99)
df <- data.frame(
T1 = sample(LETTERS[1:4], 60, replace = TRUE),
T2 = sample(LETTERS[1:4], 60, replace = TRUE),
T3 = sample(LETTERS[1:4], 60, replace = TRUE),
T4 = sample(LETTERS[1:4], 60, replace = TRUE),
Age = rnorm(60, 30, 8),
Score = runif(60, 0, 100),
stringsAsFactors = FALSE
)
cl <- build_clusters(df, k = 2, covariates = c("Age", "Score"))
our <- cl$covariates$coefficients
# glm reference
cov_df <- df[, c("Age", "Score")]
cov_df$cluster <- factor(cl$assignments)
ref_fit <- glm(cluster ~ Age + Score, data = cov_df, family = binomial)
ref <- summary(ref_fit)$coefficients
# Compare coefficients (excluding intercept from our table for variable match)
for (vname in c("Age", "Score")) {
our_row <- our[our$variable == vname, ]
expect_equal(our_row$estimate[[1L]], unname(ref[vname, "Estimate"]),
tolerance = 1e-3, info = vname)
expect_equal(our_row$std_error[[1L]], unname(ref[vname, "Std. Error"]),
tolerance = 1e-3, info = vname)
expect_equal(our_row$z[[1L]], unname(ref[vname, "z value"]),
tolerance = 1e-3, info = vname)
expect_equal(our_row$p[[1L]], unname(ref[vname, "Pr(>|z|)"]),
tolerance = 1e-2, info = vname)
}
})
# ==============================================================================
# 19. Covariate analysis — k>2 manual verification
# ==============================================================================
test_that("k>2 coefficients match manual extraction from multinom", {
set.seed(77)
df <- data.frame(
T1 = sample(LETTERS[1:3], 50, replace = TRUE),
T2 = sample(LETTERS[1:3], 50, replace = TRUE),
T3 = sample(LETTERS[1:3], 50, replace = TRUE),
X1 = rnorm(50),
X2 = rnorm(50),
stringsAsFactors = FALSE
)
cl <- build_clusters(df, k = 3, covariates = c("X1", "X2"))
our <- cl$covariates$coefficients
# Manual reference from raw multinom
fit_df <- data.frame(cluster = factor(cl$assignments), X1 = df$X1, X2 = df$X2)
fit <- nnet::multinom(cluster ~ X1 + X2, data = fit_df, trace = FALSE)
s <- summary(fit)
coefs <- s$coefficients
ses <- s$standard.errors
for (i in seq_len(nrow(coefs))) {
for (j in seq_len(ncol(coefs))) {
est <- coefs[i, j]
se <- ses[i, j]
z_manual <- est / se
p_manual <- 2 * (1 - pnorm(abs(z_manual)))
or_manual <- exp(est)
ci_lo_manual <- exp(est - 1.96 * se)
ci_hi_manual <- exp(est + 1.96 * se)
row <- our[our$cluster == rownames(coefs)[i] &
our$variable == colnames(coefs)[j], ]
expect_equal(row$estimate, est, tolerance = 1e-10)
expect_equal(row$std_error, se, tolerance = 1e-10)
expect_equal(row$odds_ratio, or_manual, tolerance = 1e-10)
expect_equal(row$ci_lower, ci_lo_manual, tolerance = 1e-10)
expect_equal(row$ci_upper, ci_hi_manual, tolerance = 1e-10)
expect_equal(row$z, z_manual, tolerance = 1e-10)
expect_equal(row$p, p_manual, tolerance = 1e-10)
}
}
})
# ==============================================================================
# 20. Covariate analysis — model fit
# ==============================================================================
test_that("model fit stats are correct", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
fit <- cl$covariates$fit
expect_true(is.numeric(fit$aic))
expect_true(is.numeric(fit$bic))
expect_true(is.numeric(fit$deviance))
expect_true(fit$mcfadden_r2 >= 0 && fit$mcfadden_r2 <= 1)
expect_equal(fit$reference_cluster, "1")
})
# ==============================================================================
# 21. Covariate analysis — S3 methods
# ==============================================================================
test_that("print shows covariates line", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
out <- capture.output(print(cl))
expect_true(any(grepl("Covariates:", out)))
expect_true(any(grepl("post-hoc", out)))
})
test_that("summary shows covariate analysis", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
out <- capture.output(res <- summary(cl))
expect_true(any(grepl("Post-hoc Covariate Analysis", out)))
expect_true(any(grepl("Predictors of Membership", out)))
expect_true(any(grepl("McFadden", out)))
expect_true(any(grepl("does not influence", out)))
# Return value is a list with cluster_stats and covariates
expect_true(is.list(res))
expect_true(!is.null(res$cluster_stats))
expect_true(!is.null(res$covariates))
})
test_that("summary without covariates returns data.frame (backwards compat)", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3)
out <- capture.output(res <- summary(cl))
expect_true(is.data.frame(res))
})
test_that("plot type='predictors' returns ggplot", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
p <- plot(cl, type = "predictors")
expect_s3_class(p, "ggplot")
})
test_that("plot type='predictors' errors without covariates", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3)
expect_error(plot(cl, type = "predictors"), "No covariate analysis")
})
# ==============================================================================
# 22. Covariate analysis — edge cases
# ==============================================================================
test_that("single covariate works", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = "Age")
expect_true(!is.null(cl$covariates))
expect_true(all(cl$covariates$coefficients$variable %in%
c("(Intercept)", "Age")))
})
test_that("covariate with NAs warns and works", {
df <- make_cov_data()
df$Age[c(1, 5, 10)] <- NA
expect_warning(
cl <- build_clusters(df, k = 3, covariates = "Age"),
"Dropped 3 rows"
)
expect_true(!is.null(cl$covariates))
})
test_that("constant covariate errors", {
df <- make_cov_data()
df$Const <- 5
expect_error(
build_clusters(df, k = 3, covariates = "Const"),
"constant"
)
})
test_that("covariate data.frame row count mismatch errors", {
df <- make_cov_data()
bad_cov <- data.frame(Age = rnorm(10))
expect_error(
build_clusters(df, k = 3, covariates = bad_cov),
"rows"
)
})
test_that("netobject metadata covariates work", {
set.seed(42)
df <- data.frame(
T1 = sample(LETTERS[1:3], 40, replace = TRUE),
T2 = sample(LETTERS[1:3], 40, replace = TRUE),
T3 = sample(LETTERS[1:3], 40, replace = TRUE),
Age = rnorm(40, 25, 5),
Score = runif(40, 0, 100),
stringsAsFactors = FALSE
)
net <- build_network(df, method = "relative")
# Age and Score are numeric → stored in $metadata
expect_true(!is.null(net$metadata))
cl <- build_clusters(net, k = 3, covariates = c("Age", "Score"))
expect_true(!is.null(cl$covariates))
# Should match direct data.frame input
cl_df <- build_clusters(df, k = 3, covariates = c("Age", "Score"))
expect_equal(cl$covariates$coefficients, cl_df$covariates$coefficients)
})
test_that("build_network round-trip preserves covariates", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
grp <- build_network(cl)
stored <- attr(grp, "clustering")
expect_true(!is.null(stored$covariates))
expect_equal(stored$covariates$fit$aic, cl$covariates$fit$aic)
})
# ==============================================================================
# 23. dl distance function — zero-length edge cases (L87-88)
# ==============================================================================
test_that(".dl_dist_r zero-length edge cases work", {
# n == 0 → return m
expect_equal(Nestimate:::.dl_dist_r(integer(), c(1L, 2L), 0L, 2L), 2)
# m == 0 → return n
expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L), integer(), 2L, 0L), 2)
# Same sequence → 0
expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L, 3L), c(1L, 2L, 3L), 3L, 3L), 0)
# Transposition: one transposition = 1 in DL
expect_equal(Nestimate:::.dl_dist_r(c(1L, 2L), c(2L, 1L), 2L, 2L), 1)
})
# ==============================================================================
# 24. levenshtein zero-length edge cases (L103)
# ==============================================================================
test_that(".levenshtein_dist_r zero-length handles both empty", {
# both empty → 0
expect_equal(Nestimate:::.levenshtein_dist_r(integer(), integer(), 0L, 0L), 0)
})
# ==============================================================================
# 25. qgram distance functions direct tests (L141-150, L155-160, L165-173, L178-181, L186-187, L195)
# ==============================================================================
test_that(".get_qgram_r returns empty for n < q", {
x <- c(1L, 2L)
result <- Nestimate:::.get_qgram_r(x, 2L, 3L)
expect_length(result, 0L)
})
test_that(".qgram_dist_r computes correct L1 distance", {
qx <- c("A\x01B" = 2L, "B\x01C" = 1L)
qy <- c("A\x01B" = 1L, "C\x01D" = 2L)
# |2-1| + |1-0| + |0-2| = 1 + 1 + 2 = 4
expect_equal(Nestimate:::.qgram_dist_r(NULL, NULL, qx, qy), 4L)
})
test_that(".cosine_dist_r is zero for identical profiles", {
qx <- c("AB" = 3L, "BC" = 2L)
expect_equal(Nestimate:::.cosine_dist_r(NULL, NULL, qx, qx), 0)
})
test_that(".cosine_dist_r returns 1 when denominator is zero", {
# Both zero-norm profiles: den == 0
qx <- integer()
qy <- integer()
expect_equal(Nestimate:::.cosine_dist_r(NULL, NULL, qx, qy), 1)
})
test_that(".jaccard_dist_r returns 0 for empty vs empty", {
expect_equal(Nestimate:::.jaccard_dist_r(integer(), integer()), 0)
})
test_that(".jaccard_dist_r is correct for known inputs", {
qx <- c("AB" = 1L, "BC" = 1L)
qy <- c("BC" = 1L, "CD" = 1L)
# |intersection| / |union| = 1 / 3 → distance = 1 - 1/3 = 2/3
expect_equal(Nestimate:::.jaccard_dist_r(qx, qy), 2/3)
})
test_that(".jaro_dist_r returns 0 for identical", {
x <- c(1L, 2L, 3L)
expect_equal(Nestimate:::.jaro_dist_r(x, x, 3L, 3L), 0)
})
# ==============================================================================
# 26. Cosine distance matrix with zero-norm rows (L319-322)
# ==============================================================================
test_that("cosine distance R path handles all-NA sequences (zero-norm rows)", {
# Tests the zero_rows path in .cosine_matrix_tdm via the R fallback
df <- data.frame(
T1 = c("A", "%", "B"),
T2 = c("B", "%", "C"),
T3 = c("C", "%", "A"),
stringsAsFactors = FALSE
)
enc <- Nestimate:::.encode_sequences(df, c("*", "%"))
dm <- as.matrix(
Nestimate:::.dissimilarity_matrix_r(enc, "cosine", lambda = 0, q = 2L, p = 0.1)
)
# Row 2 is all NA → distance to rows 1 and 3 should be 1
expect_equal(dm[2, 1], 1)
expect_equal(dm[2, 3], 1)
expect_equal(dm[2, 2], 0)
})
# ==============================================================================
# 27. tna / cograph_network covariates rejection (L430, L440)
# ==============================================================================
test_that("tna input with column-name covariates errors", {
skip_if_not_installed("tna")
model <- tna::tna(tna::group_regulation[1:20, ])
expect_error(
build_clusters(model, k = 2, covariates = "some_col"),
"tna/cograph_network"
)
})
test_that("cograph_network input with column-name covariates errors", {
cg <- structure(
list(data = make_test_data(n = 20, k = 5),
weights = matrix(0, 4, 4), directed = TRUE),
class = c("cograph_network", "list")
)
expect_error(
build_clusters(cg, k = 2, covariates = "some_col"),
"tna/cograph_network"
)
})
# ==============================================================================
# 28. netobject covariate lookup: missing column error (L477, L485)
# ==============================================================================
test_that("netobject covariate with missing column errors", {
set.seed(1)
df <- data.frame(
T1 = sample(LETTERS[1:3], 30, replace = TRUE),
T2 = sample(LETTERS[1:3], 30, replace = TRUE),
stringsAsFactors = FALSE
)
net <- build_network(df, method = "relative")
expect_error(
build_clusters(net, k = 2, covariates = "NonExistent"),
"not found"
)
})
# ==============================================================================
# 29. Raw data.frame covariate with missing column error (L497-498)
# ==============================================================================
test_that("data.frame covariate missing column errors with helpful message", {
df <- make_cov_data()
expect_error(
build_clusters(df, k = 3, covariates = c("Age", "NotAColumn")),
"not found|NotAColumn"
)
})
# ==============================================================================
# 30. Unsupported input type for build_clusters (L755 via .extract_sequence_data)
# ==============================================================================
test_that(".extract_sequence_data errors on unsupported input", {
expect_error(
build_clusters(list(a = 1, b = 2), k = 2),
"Unsupported input"
)
})
# ==============================================================================
# 31. summary.net_clustering single-member cluster (L755 singleton path)
# ==============================================================================
test_that("summary.net_clustering handles singleton clusters", {
# Force a single-member cluster by using PAM with extreme data
df <- data.frame(
T1 = c("A", "A", "A", "Z"),
T2 = c("A", "A", "A", "Z"),
T3 = c("A", "A", "A", "Z"),
stringsAsFactors = FALSE
)
cl <- build_clusters(df, k = 2, dissimilarity = "hamming")
out <- capture.output(res <- summary(cl))
expect_true(is.data.frame(res))
})
# ==============================================================================
# 32. .run_covariate_analysis: small-cluster warning (L895-896)
# ==============================================================================
test_that("covariate analysis warns when cluster too small for params", {
# Create highly imbalanced clusters: 1 observation in one cluster
# Use tiny n and 2 clusters to force min_cl < n_params
set.seed(2)
df <- data.frame(
T1 = sample(LETTERS[1:3], 8, replace = TRUE),
T2 = sample(LETTERS[1:3], 8, replace = TRUE),
T3 = sample(LETTERS[1:3], 8, replace = TRUE),
X1 = rnorm(8),
X2 = rnorm(8),
X3 = rnorm(8),
X4 = rnorm(8),
stringsAsFactors = FALSE
)
# Use many covariates and small k to trigger the warning
expect_warning(
build_clusters(df, k = 2, covariates = c("X1", "X2", "X3", "X4")),
"parameters|Estimates may be unreliable"
)
})
# ==============================================================================
# 33. ordered factor covariate gets unordered (L925-926, L930)
# ==============================================================================
test_that("ordered factor covariates are coerced to unordered", {
df <- make_cov_data()
df$Level <- factor(sample(c("Low", "Med", "High"), nrow(df), replace = TRUE),
levels = c("Low", "Med", "High"), ordered = TRUE)
cl <- build_clusters(df, k = 3, covariates = "Level")
expect_true(!is.null(cl$covariates))
})
# ==============================================================================
# 34. .print_covariate_profiles called directly via summary (L950, L959-964)
# ==============================================================================
test_that(".print_covariate_profiles prints both numeric and categorical", {
df <- make_cov_data()
cl <- build_clusters(df, k = 3, covariates = c("Age", "Gender"))
out <- capture.output(summary(cl))
expect_true(any(grepl("Cluster Profiles.*numeric", out)))
expect_true(any(grepl("Cluster Profiles.*categorical", out)))
})
# ==============================================================================
# 35. .print_covariate_profiles: categorical level missing in a cluster (L1242)
# ==============================================================================
test_that(".print_covariate_profiles handles missing level in a cluster", {
# Use a rare level that may not appear in every cluster
set.seed(9)
df <- data.frame(
T1 = sample(LETTERS[1:4], 30, replace = TRUE),
T2 = sample(LETTERS[1:4], 30, replace = TRUE),
Group = c(rep("X", 25), rep("Y", 5)), # Y is rare
stringsAsFactors = FALSE
)
cl <- build_clusters(df, k = 3, covariates = "Group")
out <- capture.output(summary(cl))
expect_true(any(grepl("0 \\(0%\\)", out)))
})
# ==============================================================================
# 36. .resolve_covariates: no variables specified (L971-972, L982-987, L992-993)
# ==============================================================================
test_that(".resolve_covariates formula with no vars errors", {
df <- make_cov_data()
# An empty formula-like string would parse to no vars; use empty char vector
expect_error(
build_clusters(df, k = 3, covariates = character(0)),
"No covariate|specified"
)
})
# ==============================================================================
# 37. .resolve_covariates: bad type (L997-1000)
# ==============================================================================
test_that(".resolve_covariates rejects non-supported type", {
df <- make_cov_data()
expect_error(
build_clusters(df, k = 3, covariates = 42),
"formula|character|data.frame"
)
})
# ==============================================================================
# 38. .resolve_covariates: nrow mismatch via resolved cov_df (L997-1000)
# ==============================================================================
test_that(".resolve_covariates errors on row mismatch from raw data extraction", {
# This tests the path where raw_data is df but cov_names not present → error
# with "Available:" message
df <- make_cov_data()
expect_error(
build_clusters(df, k = 3, covariates = "MissingColumn"),
"MissingColumn|not found"
)
})
# ==============================================================================
# 39. .run_covariate_analysis: k=2 returns vector not matrix (L1085)
# ==============================================================================
test_that("k=2 covariate analysis still produces a proper coef data.frame", {
df <- make_cov_data()
cl <- build_clusters(df, k = 2, covariates = c("Age", "Gender"))
coefs <- cl$covariates$coefficients
expect_true(is.data.frame(coefs))
expect_true("cluster" %in% names(coefs))
expect_true("variable" %in% names(coefs))
# For k=2, there is only one row of clusters
expect_equal(length(unique(coefs$cluster)), 1L)
})
# ==============================================================================
# 40. .run_covariate_analysis: n_dropped > 0 → profiles on complete-case (L1121-1124)
# ==============================================================================
test_that("covariate NA rows use complete-case for profiles", {
df <- make_cov_data()
df$Age[1:5] <- NA
expect_warning(
cl <- build_clusters(df, k = 3, covariates = "Age"),
"Dropped 5"
)
# Profiles should reflect 35 complete cases
np <- cl$covariates$profiles$numeric
total_n <- sum(unique(np[, c("cluster", "n")])$n)
expect_equal(total_n, 35L)
})
# ==============================================================================
# cluster_network() convenience wrapper (L1435-1452)
# ==============================================================================
test_that("cluster_network with default pam returns netobject_group (L1435-1452)", {
set.seed(42)
states <- c("A","B","C")
data <- data.frame(
V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
stringsAsFactors = FALSE
)
grp <- cluster_network(data, k = 2)
expect_true(inherits(grp, "netobject_group"))
expect_equal(length(grp), 2)
})
test_that("cluster_network with mmm returns netobject_group (L1445-1448)", {
set.seed(42)
states <- c("A","B","C")
data <- data.frame(
V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
stringsAsFactors = FALSE
)
grp <- cluster_network(data, k = 2, cluster_by = "mmm")
expect_true(inherits(grp, "netobject_group"))
})
test_that("cluster_network from netobject inherits build_args (L1438-1443)", {
set.seed(42)
states <- c("A","B","C")
data <- data.frame(
V1 = sample(states, 50, TRUE), V2 = sample(states, 50, TRUE),
V3 = sample(states, 50, TRUE), V4 = sample(states, 50, TRUE),
stringsAsFactors = FALSE
)
net <- build_network(data, method = "relative")
grp <- cluster_network(net, k = 2)
expect_true(inherits(grp, "netobject_group"))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.