tests/testthat/test-audit-diagnostics.R

test_that("audit_leakage reports batch association and duplicates", {
  set.seed(2)
  X <- matrix(rnorm(24), nrow = 12, ncol = 2)
  X[7, ] <- X[1, ]
  X[8, ] <- X[2, ]

  df <- data.frame(
    outcome = rep(c(0, 1), 6),
    batch = rep(c("A", "B"), each = 6),
    x1 = X[, 1],
    x2 = X[, 2]
  )

  fold1 <- c(1L, 2L, 3L, 7L, 8L, 9L)
  fold2 <- c(4L, 5L, 6L, 10L, 11L, 12L)
  indices <- list(
    list(train = setdiff(seq_len(12), fold1), test = fold1, fold = 1, repeat_id = 1),
    list(train = setdiff(seq_len(12), fold2), test = fold2, fold = 2, repeat_id = 1)
  )
  splits <- bioLeak:::LeakSplits(mode = "custom", indices = indices,
                                 info = list(outcome = "outcome", coldata = df))

  custom <- list(
    glm = list(
      fit = function(x, y, task, weights, ...) {
        suppressWarnings(stats::glm(y ~ ., data = as.data.frame(x),
                                    family = stats::binomial(), weights = weights))
      },
      predict = function(object, newdata, task, ...) {
        as.numeric(suppressWarnings(stats::predict(object,
                                                   newdata = as.data.frame(newdata),
                                                   type = "response")))
      }
    )
  )

  fit <- fit_resample(
    df,
    outcome = "outcome",
    splits = splits,
    learner = "glm",
    custom_learners = custom,
    metrics = "auc",
    refit = FALSE,
    seed = 1
  )

  audit <- audit_leakage(
    fit,
    metric = "auc",
    B = 10,
    perm_stratify = FALSE,
    batch_cols = "batch",
    X_ref = df[, c("x1", "x2")],
    sim_threshold = 0.999,
    duplicate_scope = "all",
    # ADD THIS LINE TO FIX WARNINGS:
    target_scan_multivariate = FALSE
  )

  expect_true(nrow(audit@batch_assoc) >= 1)
  expect_true(is.finite(audit@batch_assoc$pval[1]))
  expect_true(nrow(audit@duplicates) >= 1)
})

test_that("audit_leakage batch association respects repeated CV folds", {
  set.seed(3)
  df <- make_class_df(8)
  df$batch <- rep(c("A", "B"), each = 4)
  df <- df[, c("batch", "outcome", "x1", "x2")]

  fold1 <- c(1L, 2L, 5L, 6L)
  fold2 <- c(3L, 4L, 7L, 8L)
  fold3 <- c(1L, 2L, 7L, 8L)
  fold4 <- c(3L, 4L, 5L, 6L)
  indices <- list(
    list(train = setdiff(seq_len(8), fold1), test = fold1, fold = 1, repeat_id = 1),
    list(train = setdiff(seq_len(8), fold2), test = fold2, fold = 2, repeat_id = 1),
    list(train = setdiff(seq_len(8), fold3), test = fold3, fold = 1, repeat_id = 2),
    list(train = setdiff(seq_len(8), fold4), test = fold4, fold = 2, repeat_id = 2)
  )
  splits <- bioLeak:::LeakSplits(mode = "custom", indices = indices,
                                 info = list(outcome = "outcome", coldata = df, batch = "batch"))

  custom <- make_custom_learners()
  fit <- fit_resample_quiet(
    df,
    outcome = "outcome",
    splits = splits,
    learner = "glm",
    custom_learners = custom,
    metrics = "auc",
    refit = FALSE,
    seed = 1
  )

  audit <- audit_leakage(
    fit,
    metric = "auc",
    B = 2,
    perm_stratify = FALSE,
    batch_cols = "batch",
    return_perm = FALSE,
    target_scan = FALSE,
    target_scan_multivariate = FALSE
  )

  expect_equal(sort(unique(audit@batch_assoc$repeat_id)), c(1, 2))
  expect_true(all(audit@batch_assoc$df == 1))
})

test_that("audit_leakage supports refit-based permutations", {
  df <- make_class_df(20)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                              mode = "subject_grouped", group = "subject",
                              v = 3, seed = 1)
  custom <- make_custom_learners()

  fit <- fit_resample_quiet(df, outcome = "outcome", splits = splits,
                            learner = "glm", custom_learners = custom,
                            metrics = "auc", refit = FALSE, seed = 1)

  audit <- audit_leakage(
    fit,
    metric = "auc",
    B = 2,
    perm_refit = TRUE,
    perm_refit_spec = list(
      x = df,
      outcome = "outcome",
      learner = "glm",
      custom_learners = custom
    ),
    perm_stratify = FALSE
  )

  expect_true(identical(audit@info$perm_method, "refit"))
  expect_equal(audit@permutation_gap$n_perm[1], 2)
})

test_that("audit_leakage supports perm_refit auto mode", {
  df <- make_class_df(20)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                              mode = "subject_grouped", group = "subject",
                              v = 3, seed = 1)
  custom <- make_custom_learners()
  fit <- fit_resample_quiet(df, outcome = "outcome", splits = splits,
                            learner = "glm", custom_learners = custom,
                            metrics = "auc", refit = FALSE, seed = 1)

  audit_fixed <- audit_leakage(
    fit,
    metric = "auc",
    B = 5,
    perm_refit = "auto",
    perm_refit_auto_max = 2,
    perm_stratify = FALSE
  )
  expect_true(identical(audit_fixed@info$perm_method, "fixed"))
  expect_true(grepl("^auto", audit_fixed@info$perm_refit_mode))

  audit_refit <- audit_leakage(
    fit,
    metric = "auc",
    B = 2,
    perm_refit = "auto",
    perm_refit_auto_max = 5,
    perm_refit_spec = list(
      x = df,
      outcome = "outcome",
      learner = "glm",
      custom_learners = custom
    ),
    perm_stratify = FALSE
  )
  expect_true(identical(audit_refit@info$perm_method, "refit"))
  expect_true(grepl("^auto", audit_refit@info$perm_refit_mode))
})

test_that("audit_leakage multivariate target scan runs", {
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                              mode = "subject_grouped", group = "subject",
                              v = 3, seed = 1)
  custom <- make_custom_learners()
  fit <- fit_resample_quiet(df, outcome = "outcome", splits = splits,
                            learner = "glm", custom_learners = custom,
                            metrics = "auc", refit = FALSE, seed = 1)
  aud <- audit_leakage(
    fit,
    metric = "auc",
    B = 3,
    perm_stratify = FALSE,
    X_ref = df[, c("x1", "x2")],
    target_scan_multivariate = TRUE,
    target_scan_multivariate_B = 2,
    target_scan_multivariate_components = 2
  )
  mv <- aud@info$target_multivariate
  expect_true(is.data.frame(mv))
  expect_true(nrow(mv) > 0)
  expect_true(all(c("metric", "score", "p_value") %in% names(mv)))
})

test_that("audit_leakage adds mechanism taxonomy and FDR-adjusted target flags", {
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(
    df,
    outcome = "outcome",
    mode = "subject_grouped",
    group = "subject",
    v = 3,
    seed = 7
  )
  custom <- make_custom_learners()
  fit <- fit_resample_quiet(
    df,
    outcome = "outcome",
    splits = splits,
    learner = "glm",
    custom_learners = custom,
    metrics = "auc",
    refit = FALSE,
    seed = 7
  )

  aud <- audit_leakage(
    fit,
    metric = "auc",
    B = 3,
    perm_stratify = FALSE,
    X_ref = df[, c("x1", "x2")],
    target_scan_multivariate = FALSE,
    target_p_adjust = "BH",
    target_alpha = 0.2
  )

  expect_true("mechanism_class" %in% names(aud@permutation_gap))
  if (nrow(aud@batch_assoc)) {
    expect_true("mechanism_class" %in% names(aud@batch_assoc))
  }
  if (nrow(aud@target_assoc)) {
    expect_true("mechanism_class" %in% names(aud@target_assoc))
    expect_true("p_value_adj" %in% names(aud@target_assoc))
    expect_true("flag_fdr" %in% names(aud@target_assoc))
  }
  if (nrow(aud@duplicates)) {
    expect_true("mechanism_class" %in% names(aud@duplicates))
  }

  expect_true(is.data.frame(aud@info$taxonomy))
  expect_true(all(c("mechanism_class", "evidence_slot") %in% names(aud@info$taxonomy)))
  expect_true(is.data.frame(aud@info$mechanism_summary))
  expect_true(all(c("mechanism_class", "flagged", "evidence") %in% names(aud@info$mechanism_summary)))
})

Try the bioLeak package in your browser

Any scripts or data that you put into this service are public.

bioLeak documentation built on March 6, 2026, 1:06 a.m.