tests/testthat/test-gmix-clustering.R

test_that("kmed initialization converges on the anisotropic same-mean regression case", {
  set.seed(111)
  gauss_dat <- rbind(
    simulate_gaussian(250, c(0, 0), diag(2)),
    simulate_gaussian(250, c(0, 0), matrix(c(20, 0, 0, 0.01), ncol = 2))
  )

  fit_kmed <- gmix(gauss_dat, K = 2, init = "kmed", save_taus = TRUE)
  fit_pam <- gmix(gauss_dat, K = 2, init = "pam")

  expect_s3_class(fit_kmed, "mbcfit")
  expect_equal(fit_kmed$info$code, 1)
  expect_true(all(is.finite(fit_kmed$posterior)))
  expect_equal(dim(fit_kmed$posterior), c(500L, 2L))
  expect_equal(rowSums(fit_kmed$posterior), rep(1, 500), tolerance = 1e-8)
  expect_gt(best_label_accuracy(fit_kmed$cluster, fit_pam$cluster), 0.95)
})

test_that("gmix recovers a well-separated three-cluster solution and predicts consistently", {
  set.seed(20260309)
  dat <- rbind(
    simulate_gaussian(90, c(-6, 0), matrix(c(1.0, 0.2, 0.2, 0.7), ncol = 2)),
    simulate_gaussian(90, c(0, 6), matrix(c(0.8, -0.1, -0.1, 1.2), ncol = 2)),
    simulate_gaussian(90, c(6, -2), matrix(c(1.1, 0.3, 0.3, 0.9), ncol = 2))
  )
  truth <- rep(1:3, each = 90)

  fit <- gmix(dat, K = 3, init = "kmed", erc = 20)
  pred <- predict(fit, dat)

  expect_equal(fit$info$code, 1)
  expect_equal(sum(fit$size), nrow(dat))
  expect_gt(best_label_accuracy(truth, fit$cluster), 0.97)
  expect_gt(best_label_accuracy(truth, pred), 0.97)
})

test_that("plot works for the previous kmed failure case", {
  set.seed(111)
  gauss_dat <- rbind(
    simulate_gaussian(250, c(0, 0), diag(2)),
    simulate_gaussian(250, c(0, 0), matrix(c(20, 0, 0, 0.01), ncol = 2))
  )

  fit <- gmix(gauss_dat, K = 2, init = "kmed")
  path <- tempfile(fileext = ".pdf")

  grDevices::pdf(path)
  on.exit({
    grDevices::dev.off()
    unlink(path)
  }, add = TRUE)

  expect_no_error(
    plot(fit, data = gauss_dat, what = c("clustering", "contour", "boundary"))
  )
})

test_that("gmix handles imbalanced clusters with a soft custom initialization", {
  set.seed(20260310)
  cov_mid <- matrix(
    c(
      1.0, 1.8, 0.0,
      1.8, 10.0, 0.0,
      0.0, 0.0, 0.02
    ),
    ncol = 3
  )
  train <- rbind(
    simulate_gaussian(120, c(-5, 0, 0.1), diag(c(1.2, 18, 0.03))),
    simulate_gaussian(70, c(2, 35, -0.2), cov_mid),
    simulate_gaussian(10, c(8, -28, 0.35), diag(c(0.15, 2.0, 0.01)))
  )
  truth <- rep(1:3, c(120, 70, 10))

  init <- matrix(0.02, nrow(train), 3)
  init[cbind(seq_len(nrow(train)), truth)] <- 0.96
  init[1:8, c(1, 2)] <- init[1:8, c(2, 1)]
  init <- init / rowSums(init)

  fit <- gmix(train, K = 3, erc = 25, init = init, save_cluster = TRUE, save_params = TRUE, save_taus = TRUE, iter.max = 300)
  score <- qscore(train, fit$params, type = "both")

  expect_s3_class(fit, "mbcfit")
  expect_true(fit$info$code %in% c(1, 2))
  if (fit$info$code == 2) {
    expect_true(fit$iter == 300L)
  } else {
    expect_equal(fit$info$code, 1)
    expect_lte(fit$iter, 300L)
  }

  expect_equal(fit$N, nrow(train))
  expect_equal(fit$P, ncol(train))
  expect_equal(fit$K, 3L)
  expect_true(is.finite(fit$loglik))
  expect_equal(sum(fit$size), nrow(train))
  expect_equal(unname(fit$size), c(120, 70, 10), tolerance = 0)

  expect_equal(length(fit$cluster), nrow(train))
  expect_true(all(fit$cluster %in% 1:3))

  expect_equal(dim(fit$posterior), c(nrow(train), 3L))
  expect_equal(rowSums(fit$posterior), rep(1, nrow(train)), tolerance = 1e-6)
  expect_true(all(is.finite(fit$posterior)))

  expect_true(all(is.finite(unlist(fit$params))))
  expect_true(all(is.finite(fit$params$proportion)))
  expect_equal(sum(fit$params$proportion), 1, tolerance = 1e-6)
  expect_true(all(fit$params$proportion > 0))

  expect_true(all(is.finite(score)))
  expect_equal(best_label_accuracy(truth, fit$cluster), 1)

  held_out <- rbind(
    simulate_gaussian(5, c(-5, 0, 0.1), diag(c(1.2, 18, 0.03))),
    simulate_gaussian(5, c(2, 35, -0.2), cov_mid),
    simulate_gaussian(5, c(8, -28, 0.35), diag(c(0.15, 2.0, 0.01)))
  )
  held_truth <- rep(1:3, each = 5)
  pred <- predict(fit, held_out)
  expect_equal(best_label_accuracy(held_truth, pred), 1)
})

test_that("gmix validates and coerces custom function initializations before entering C", {
  set.seed(20260311)
  dat <- rbind(
    simulate_gaussian(20, c(0, 0), diag(2)),
    simulate_gaussian(20, c(4, 4), diag(2))
  )

  init_ok <- function(data, K) {
    z <- matrix(0L, nrow(data), K)
    half <- nrow(data) / 2
    z[seq_len(half), 1] <- 1L
    z[seq.int(half + 1, nrow(data)), 2] <- 1L
    z
  }

  init_bad <- function(data, K) {
    matrix(1L, nrow(data) - 1, K)
  }

  init_zero_row <- init_ok(dat, 2)
  init_zero_row[1, ] <- 0

  init_df <- as.data.frame(init_ok(dat, 2))
  init_ok_df <- function(data, K) {
    as.data.frame(init_ok(data, K))
  }
  init_inf <- init_ok(dat, 2)
  init_inf[1, 1] <- Inf
  init_neg <- init_ok(dat, 2)
  init_neg[1, 1] <- -1
  init_zero_cluster <- init_ok(dat, 2)
  init_zero_cluster[, 2] <- 0

  fit <- gmix(dat, K = 2, init = init_ok, erc = 20)
  fit_zero_row <- gmix(dat, K = 2, init = init_zero_row, erc = 20)
  fit_df <- gmix(dat, K = 2, init = init_df, erc = 20)
  fit_fun_df <- gmix(dat, K = 2, init = init_ok_df, erc = 20)

  expect_s3_class(fit, "mbcfit")
  expect_equal(fit$info$code, 1)
  expect_equal(sum(fit$size), nrow(dat))
  expect_equal(fit_zero_row$info$code, 1)
  expect_equal(sum(fit_zero_row$size), nrow(dat))
  expect_equal(fit_df$info$code, 1)
  expect_equal(sum(fit_df$size), nrow(dat))
  expect_equal(fit_fun_df$info$code, 1)
  expect_equal(sum(fit_fun_df$size), nrow(dat))
  expect_error(
    gmix(dat, K = 2, init = init_bad, erc = 20),
    "incoherent number of instances"
  )
  expect_error(
    gmix(dat, K = 2, init = init_inf, erc = 20),
    "finite values"
  )
  expect_error(
    gmix(dat, K = 2, init = init_neg, erc = 20),
    "non-negative weights"
  )
  expect_error(
    gmix(dat, K = 2, init = init_zero_cluster, erc = 20),
    "positive total weight"
  )
  expect_error(
    gmix(dat, K = 2, init = "foo", erc = 20),
    "Unsupported character value for 'init'"
  )
  expect_error(
    gmix(dat, init = "kmeans", erc = 20),
    "'K' must be given if 'init' is 'kmeans'"
  )
  expect_error(
    gmix(dat, init = "pam", erc = 20),
    "'K' must be given if 'init' is 'pam'"
  )
})

test_that("mset_gmix validates init specifications early", {
  expect_error(
    mset_gmix(init = "foo"),
    "character values must be one of"
  )
  expect_error(
    mset_gmix(init = list("kmed", 5)),
    "character values must be one of"
  )
  expect_error(
    mset_gmix(init = list("foo", "kmed")),
    "character values must be one of"
  )
})

test_that("mset_gmix preserves mixed init lists of supported strings, functions and weights", {
  dat <- iris[1:20, 1:4]

  init_fun <- function(data, K) {
    z <- matrix(0, nrow(data), K)
    z[1:10, 1] <- 1
    z[11:20, 2] <- 1
    z
  }
  init_df <- as.data.frame(init_fun(dat, 2))

  mset <- mset_gmix(K = 2, init = list("kmed", init_fun, init_df))
  fits <- lapply(mset, function(m) m$fn(dat))

  expect_length(mset, 9L)
  expect_true(all(vapply(fits, inherits, logical(1), what = "mbcfit")))
  expect_true(all(vapply(fits, function(x) x$info$code == 1, logical(1))))
})

Try the qcluster package in your browser

Any scripts or data that you put into this service are public.

qcluster documentation built on June 5, 2026, 5:07 p.m.