test-misl.R
In misl: Multiple Imputation by Super Learning

# tests/testthat/test-misl2.R
# Run with: devtools::test()

# ── Shared test fixtures ──────────────────────────────────────────────────────

make_cont_data <- function(n = 50, seed = 1) {
  set.seed(seed)
  df <- data.frame(x1 = rnorm(n), x2 = rnorm(n), y = rnorm(n))
  df[sample(n, 10), "y"] <- NA
  df
}

make_bin_data <- function(n = 50, seed = 2) {
  set.seed(seed)
  df <- data.frame(x1 = rnorm(n), x2 = rnorm(n), y = rbinom(n, 1, 0.5))
  df[sample(n, 10), "y"] <- NA
  df
}

make_cat_data <- function(n = 50, seed = 3) {
  set.seed(seed)
  df <- data.frame(
    x1 = rnorm(n),
    x2 = rnorm(n),
    y  = factor(sample(c("a", "b", "c"), n, replace = TRUE))
  )
  df[sample(n, 10), "y"] <- NA
  df
}

make_ord_data <- function(n = 50, seed = 6) {
  set.seed(seed)
  df <- data.frame(
    x1 = rnorm(n),
    x2 = rnorm(n),
    y  = factor(sample(c("low", "med", "high"), n, replace = TRUE),
                levels = c("low", "med", "high"), ordered = TRUE)
  )
  df[sample(n, 10), "y"] <- NA
  df
}

make_mixed_data <- function(n = 80, seed = 4) {
  set.seed(seed)
  df <- data.frame(
    cont = rnorm(n),
    bin  = rbinom(n, 1, 0.5),
    cat  = factor(sample(c("x", "y", "z"), n, replace = TRUE)),
    pred = rnorm(n)
  )
  df[sample(n, 10), "cont"] <- NA
  df[sample(n, 10), "bin"]  <- NA
  df[sample(n, 10), "cat"]  <- NA
  df
}

make_factor_bin_data <- function(n = 50, seed = 5) {
  set.seed(seed)
  df <- data.frame(
    x = rnorm(n),
    y = factor(sample(c("Yes", "No"), n, replace = TRUE))
  )
  df[sample(n, 10), "y"] <- NA
  df
}

# ── check_dataset() ───────────────────────────────────────────────────────────

test_that("check_dataset() errors on non-data-frame input", {
  expect_error(check_dataset("a string"),    "data frame or matrix")
  expect_error(check_dataset(1:10),          "data frame or matrix")
  expect_error(check_dataset(list(a = 1:5)), "data frame or matrix")
})

test_that("check_dataset() errors on empty input", {
  expect_error(check_dataset(data.frame()),              "at least one row")
  expect_error(check_dataset(data.frame(a = integer())), "at least one row")
})

test_that("check_dataset() errors on complete dataset", {
  expect_error(check_dataset(datasets::iris), "complete")
})

test_that("check_dataset() passes silently on valid incomplete data", {
  df <- make_cont_data()
  expect_invisible(check_dataset(df))
})

# ── check_datatype() ──────────────────────────────────────────────────────────

test_that("check_datatype() identifies categorical columns", {
  expect_equal(check_datatype(factor(c("a", "b", "c"))), "categorical")
})

test_that("check_datatype() identifies ordinal columns", {
  expect_equal(
    check_datatype(factor(c("low", "med", "high"), ordered = TRUE)),
    "ordinal"
  )
})

test_that("check_datatype() identifies two-level factor as binomial", {
  expect_equal(check_datatype(factor(c("Yes", "No", "Yes", NA))), "binomial")
  expect_equal(check_datatype(factor(c("A", "B"))),                "binomial")
})

test_that("check_datatype() identifies binomial columns", {
  expect_equal(check_datatype(c(0, 1, 0, 1, NA)), "binomial")
  expect_equal(check_datatype(c(0, 0, 0)),         "binomial")
})

test_that("check_datatype() identifies continuous columns", {
  expect_equal(check_datatype(c(1.5, 2.3, 3.7)), "continuous")
  expect_equal(check_datatype(c(1L, 2L, 3L)),     "continuous")
  expect_equal(check_datatype(c(0, 1, 2)),         "continuous")
})

# ── list_learners() ───────────────────────────────────────────────────────────

test_that("list_learners() returns a tibble with expected columns", {
  result <- list_learners()
  expect_s3_class(result, "tbl_df")
  expect_true(all(c("learner", "description", "package", "installed") %in%
                    colnames(result)))
})

test_that("list_learners() filters by outcome type", {
  cont <- list_learners("continuous")
  bin  <- list_learners("binomial")
  cat  <- list_learners("categorical")
  ord  <- list_learners("ordinal")

  expect_true("glm"  %in% cont$learner)
  expect_true("glm"  %in% bin$learner)
  expect_false("glm" %in% cat$learner)
  expect_false("glm" %in% ord$learner)

  expect_true("multinom_reg"  %in% cat$learner)
  expect_false("multinom_reg" %in% cont$learner)
  expect_false("multinom_reg" %in% bin$learner)
  expect_false("multinom_reg" %in% ord$learner)

  expect_true("polr"  %in% ord$learner)
  expect_false("polr" %in% cont$learner)
  expect_false("polr" %in% bin$learner)
  expect_false("polr" %in% cat$learner)

  expect_true("rand_forest" %in% ord$learner)
  expect_true("boost_tree"  %in% ord$learner)

  expect_false("continuous"  %in% colnames(cont))
  expect_false("binomial"    %in% colnames(bin))
  expect_false("categorical" %in% colnames(cat))
  expect_false("ordinal"     %in% colnames(ord))
})

test_that("list_learners() installed_only returns only installed learners", {
  result <- list_learners(installed_only = TRUE)
  expect_true(all(result$installed))
})

test_that("list_learners() errors on invalid outcome_type", {
  expect_error(list_learners("invalid"), "should be one of")
})

# ── misl() -- output structure ────────────────────────────────────────────────

test_that("misl() returns a list of length m", {
  result <- misl(make_cont_data(), m = 2, maxit = 1, con_method = "glm", seed = 1)
  expect_type(result, "list")
  expect_length(result, 2)
})

test_that("misl() each element has $datasets and $trace", {
  result <- misl(make_cont_data(), m = 2, maxit = 1, con_method = "glm", seed = 1)
  for (i in seq_along(result)) {
    expect_named(result[[i]], c("datasets", "trace"))
  }
})

test_that("misl() imputed datasets have same dimensions as input", {
  df     <- make_cont_data()
  result <- misl(df, m = 2, maxit = 1, con_method = "glm", seed = 1)
  for (i in seq_along(result)) {
    expect_equal(dim(result[[i]]$datasets), dim(df))
  }
})

# ── misl() -- no remaining missingness ───────────────────────────────────────

test_that("misl() produces no NAs for continuous outcome", {
  result <- misl(make_cont_data(), m = 2, maxit = 2, con_method = "glm", seed = 1)
  expect_false(anyNA(result[[1]]$datasets))
  expect_false(anyNA(result[[2]]$datasets))
})

test_that("misl() produces no NAs for binomial outcome", {
  result <- misl(make_bin_data(), m = 2, maxit = 2, bin_method = "glm", seed = 2)
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() produces no NAs for categorical outcome", {
  result <- misl(make_cat_data(), m = 2, maxit = 2,
                 cat_method = "rand_forest", seed = 3)
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() produces no NAs for ordinal outcome", {
  skip_if_not_installed("MASS")
  result <- misl(make_ord_data(), m = 2, maxit = 2,
                 ord_method = "polr", seed = 6)
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() produces no NAs for mixed outcome types", {
  result <- misl(
    make_mixed_data(), m = 2, maxit = 2,
    con_method = "glm", bin_method = "glm", cat_method = "rand_forest",
    seed = 4
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() produces no NAs for factor-coded binary outcome", {
  result <- misl(make_factor_bin_data(), m = 2, maxit = 2,
                 bin_method = "glm", seed = 5)
  expect_false(anyNA(result[[1]]$datasets))
})

# ── misl() -- imputed values are plausible ────────────────────────────────────

test_that("misl() binary imputations are only 0 or 1", {
  df     <- make_bin_data()
  result <- misl(df, m = 2, maxit = 2, bin_method = "glm", seed = 2)
  for (i in seq_along(result)) {
    expect_true(all(result[[i]]$datasets$y %in% c(0, 1)))
  }
})

test_that("misl() categorical imputations stay within observed levels", {
  df     <- make_cat_data()
  result <- misl(df, m = 2, maxit = 2, cat_method = "rand_forest", seed = 3)
  for (i in seq_along(result)) {
    expect_true(all(result[[i]]$datasets$y %in% levels(df$y)))
  }
})

test_that("misl() ordinal imputations stay within observed levels and remain ordered", {
  skip_if_not_installed("MASS")
  df     <- make_ord_data()
  result <- misl(df, m = 2, maxit = 2, ord_method = "polr", seed = 6)
  for (i in seq_along(result)) {
    expect_true(all(result[[i]]$datasets$y %in% levels(df$y)))
    expect_true(is.ordered(result[[i]]$datasets$y))
  }
})

test_that("misl() factor-coded binary imputations stay within observed levels", {
  df     <- make_factor_bin_data()
  result <- misl(df, m = 2, maxit = 2, bin_method = "glm", seed = 5)
  for (i in seq_along(result)) {
    expect_true(all(result[[i]]$datasets$y %in% levels(df$y)))
    expect_s3_class(result[[i]]$datasets$y, "factor")
  }
})

test_that("misl() does not alter observed values", {
  df      <- make_cont_data()
  result  <- misl(df, m = 2, maxit = 2, con_method = "glm", seed = 1)
  obs_idx <- !is.na(df$y)
  for (i in seq_along(result)) {
    expect_equal(result[[i]]$datasets$y[obs_idx], df$y[obs_idx])
  }
})

# ── misl() -- trace plot ──────────────────────────────────────────────────────

test_that("misl() trace has expected columns", {
  result <- misl(make_cont_data(), m = 2, maxit = 2, con_method = "glm", seed = 1)
  expect_named(
    result[[1]]$trace,
    c("statistic", "variable", "m", "iteration", "value")
  )
})

test_that("misl() trace has mean and sd rows for continuous columns", {
  result <- misl(make_cont_data(), m = 2, maxit = 2, con_method = "glm", seed = 1)
  trace  <- result[[1]]$trace
  y_rows <- trace[trace$variable == "y", ]
  expect_true("mean" %in% y_rows$statistic)
  expect_true("sd"   %in% y_rows$statistic)
})

test_that("misl() trace is NA for factor-coded binary columns", {
  result <- misl(make_factor_bin_data(), m = 1, maxit = 2,
                 bin_method = "glm", seed = 5)
  trace  <- result[[1]]$trace
  y_rows <- trace[trace$variable == "y", ]
  expect_true(all(is.na(y_rows$value)))
})

test_that("misl() trace is NA for ordinal columns", {
  skip_if_not_installed("MASS")
  result <- misl(make_ord_data(), m = 1, maxit = 2,
                 ord_method = "polr", seed = 6)
  trace  <- result[[1]]$trace
  y_rows <- trace[trace$variable == "y", ]
  expect_true(all(is.na(y_rows$value)))
})

# ── misl() -- cv_folds parameter ─────────────────────────────────────────────

test_that("misl() runs with cv_folds = 3", {
  skip_if_not_installed("ranger")
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = c("glm", "rand_forest"),
    cv_folds = 3, seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() cv_folds is ignored for single learner", {
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = "glm", cv_folds = 3, seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() errors on invalid cv_folds", {
  df <- make_cont_data()
  expect_error(misl(df, m = 1, maxit = 1, con_method = "glm", cv_folds = 1),   "'cv_folds' must be an integer >= 2.")
  expect_error(misl(df, m = 1, maxit = 1, con_method = "glm", cv_folds = 1.5), "'cv_folds' must be an integer >= 2.")
  expect_error(misl(df, m = 1, maxit = 1, con_method = "glm", cv_folds = "3"), "'cv_folds' must be an integer >= 2.")
})

# ── misl() -- binomial bootstrap guard ───────────────────────────────────────

test_that("misl() handles imbalanced binary columns without error", {
  set.seed(99)
  n  <- 100
  df <- data.frame(
    x  = rnorm(n),
    y  = c(rep(1, 95), rep(0, 5))
  )
  df[sample(which(df$y == 1), 5), "y"] <- NA
  expect_no_error(
    misl(df, m = 2, maxit = 1, bin_method = "glm", seed = 99)
  )
})

# ── misl() -- ignore_predictors ──────────────────────────────────────────────

test_that("misl() respects ignore_predictors", {
  df <- make_cont_data()
  expect_no_error(
    misl(df, m = 1, maxit = 1, con_method = "glm",
         ignore_predictors = "x1", seed = 1)
  )
})

# ── misl() -- single vs multiple learners ────────────────────────────────────

test_that("misl() runs with a single learner (no stacking)", {
  result <- misl(make_cont_data(), m = 2, maxit = 1, con_method = "glm", seed = 1)
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() runs with multiple learners (stacking)", {
  skip_if_not_installed("ranger")
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = c("glm", "rand_forest"), seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

# ── misl() -- multinom_reg learner ───────────────────────────────────────────

test_that("misl() runs with multinom_reg for categorical outcome", {
  result <- misl(
    make_cat_data(), m = 2, maxit = 1,
    cat_method = "multinom_reg", seed = 3
  )
  expect_false(anyNA(result[[1]]$datasets))
  expect_true(all(result[[1]]$datasets$y %in% levels(make_cat_data()$y)))
})

# ── misl() -- polr learner ────────────────────────────────────────────────────

test_that("misl() runs with polr for ordinal outcome", {
  skip_if_not_installed("MASS")
  result <- misl(
    make_ord_data(), m = 2, maxit = 1,
    ord_method = "polr", seed = 6
  )
  expect_false(anyNA(result[[1]]$datasets))
  expect_true(all(result[[1]]$datasets$y %in% levels(make_ord_data()$y)))
  expect_true(is.ordered(result[[1]]$datasets$y))
})

# ── misl() -- custom parsnip specs ───────────────────────────────────────────

test_that("misl() accepts a parsnip spec for continuous outcome", {
  skip_if_not_installed("ranger")
  custom_spec <- parsnip::rand_forest(trees = 50) |>
    parsnip::set_engine("ranger")
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = list(custom_spec), seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() accepts a mixed list of strings and parsnip specs", {
  skip_if_not_installed("ranger")
  custom_spec <- parsnip::rand_forest(trees = 50) |>
    parsnip::set_engine("ranger")
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = list("glm", custom_spec),
    cv_folds = 3, seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() accepts a parsnip spec for binary outcome", {
  skip_if_not_installed("ranger")
  custom_spec <- parsnip::rand_forest(trees = 50) |>
    parsnip::set_engine("ranger")
  result <- misl(
    make_bin_data(), m = 2, maxit = 1,
    bin_method = list(custom_spec), seed = 2
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() accepts a parsnip spec for categorical outcome", {
  custom_spec <- parsnip::multinom_reg() |>
    parsnip::set_engine("nnet")
  result <- misl(
    make_cat_data(), m = 2, maxit = 1,
    cat_method = list(custom_spec), seed = 3
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() accepts a parsnip spec for ordinal outcome", {
  skip_if_not_installed("MASS")
  # polr is handled via MASS::polr() directly, passed as the named string
  result <- misl(
    make_ord_data(), m = 2, maxit = 1,
    ord_method = list("polr"), seed = 6
  )
  expect_false(anyNA(result[[1]]$datasets))
  expect_true(is.ordered(result[[1]]$datasets$y))
})

test_that("misl() enforces mode on user-supplied parsnip spec", {
  skip_if_not_installed("ranger")
  # Supply a spec with the wrong mode - misl should correct it
  wrong_mode_spec <- parsnip::rand_forest(trees = 50) |>
    parsnip::set_engine("ranger") |>
    parsnip::set_mode("classification")  # wrong for continuous
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = list(wrong_mode_spec), seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

test_that("misl() backwards compatible: character vector still works", {
  result <- misl(
    make_cont_data(), m = 2, maxit = 1,
    con_method = c("glm"), seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})

# ── plot_misl_trace() ─────────────────────────────────────────────────────────

test_that("plot_misl_trace() runs without error", {
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggforce")
  result <- misl(make_cont_data(), m = 3, maxit = 3, con_method = "glm", seed = 1)
  expect_no_error(plot_misl_trace(result))
})

test_that("plot_misl_trace() returns trace data invisibly", {
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggforce")
  result <- misl(make_cont_data(), m = 3, maxit = 3, con_method = "glm", seed = 1)
  out <- plot_misl_trace(result)
  expect_s3_class(out, "data.frame")
  expect_true(all(c("statistic", "variable", "m", "iteration", "value") %in%
                    colnames(out)))
})

test_that("plot_misl_trace() drops categorical variables silently", {
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggforce")
  result <- misl(make_mixed_data(), m = 2, maxit = 2,
                 con_method = "glm", bin_method = "glm",
                 cat_method = "rand_forest", seed = 4)
  out <- plot_misl_trace(result)
  # cat column should not appear in trace output since it has no trace values
  expect_false("cat" %in% unique(out$variable))
})

test_that("plot_misl_trace() returns message when no trace data available", {
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggforce")
  result <- misl(make_cat_data(), m = 2, maxit = 2,
                 cat_method = "rand_forest", seed = 3)
  expect_message(plot_misl_trace(result), "No trace data available")
})

test_that("plot_misl_trace() accepts custom ncol and nrow", {
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggforce")
  result <- misl(make_cont_data(), m = 3, maxit = 3, con_method = "glm", seed = 1)
  expect_no_error(plot_misl_trace(result, ncol = 1, nrow = 2))
})

test_that("plot_misl_trace() errors when ggplot2 not available", {
  result <- misl(make_cont_data(), m = 2, maxit = 2, con_method = "glm", seed = 1)
  local_mocked_bindings(
    requireNamespace = function(pkg, ...) if (pkg == "ggplot2") FALSE else TRUE,
    .package = "base"
  )
  expect_error(plot_misl_trace(result), "ggplot2")
})

test_that("plot_misl_trace() errors when ggforce not available", {
  result <- misl(make_cont_data(), m = 2, maxit = 2, con_method = "glm", seed = 1)
  local_mocked_bindings(
    requireNamespace = function(pkg, ...) if (pkg == "ggforce") FALSE else TRUE,
    .package = "base"
  )
  expect_error(plot_misl_trace(result), "ggforce")
})

test_that("misl() errors with a helpful message for unknown learner", {
  suppressWarnings(
    expect_error(
      misl(make_cont_data(), m = 1, maxit = 1, con_method = "lasso"),
      "Unknown learner"
    )
  )
})

test_that("misl() errors with install hint for missing backend package", {
  local_mocked_bindings(
    requireNamespace = function(pkg, ...) if (pkg == "xgboost") FALSE else TRUE,
    .package = "base"
  )
  suppressWarnings(
    expect_error(
      misl(make_cont_data(), m = 1, maxit = 1, con_method = "boost_tree"),
      "install.packages"
    )
  )
})

test_that("misl() errors with helpful message for invalid learner type", {
  suppressWarnings(
    expect_error(
      misl(make_cont_data(), m = 1, maxit = 1, con_method = list(123)),
      "named character string or a parsnip model spec"
    )
  )
})

test_that("misl() completes successfully with a valid user-supplied parsnip spec", {
  skip_if_not_installed("ranger")
  valid_spec <- parsnip::rand_forest(trees = 10) |>
    parsnip::set_engine("ranger")
  result <- misl(
    make_cont_data(), m = 1, maxit = 1,
    con_method = list(valid_spec), seed = 1
  )
  expect_false(anyNA(result[[1]]$datasets))
})