# Tests for tune_resample(): selection rules, final refit, skipped outer folds,
# and binomial threshold tuning.
# Skip the current test unless the full tidymodels tuning stack is installed.
# Wraps testthat::skip_if_not_installed() for each required package so a test
# calling this helper is skipped (not failed) on minimal installations.
skip_if_tune_deps <- function() {
  required_pkgs <- c(
    "tune", "dials", "glmnet", "recipes",
    "rsample", "yardstick", "workflows", "parsnip"
  )
  for (pkg in required_pkgs) {
    skip_if_not_installed(pkg)
  }
}
# Call tune_resample() while discarding its printed output and warnings,
# returning the result invisibly-captured into `result`. All arguments are
# forwarded unchanged via `...`.
tune_resample_quiet <- function(...) {
  result <- NULL
  capture.output(
    result <- suppressWarnings(tune_resample(...))
  )
  result
}
# Deterministic selection test: tune::tune_grid() and tune::collect_metrics()
# are mocked so config "cfg_a" (penalty 0.01) has the best mean accuracy while
# "cfg_b" (penalty 1.0, i.e. the more regularized / simpler model) sits within
# one standard error of it. selection = "best" must pick cfg_a's penalty;
# selection = "one_std_err" must pick cfg_b's.
test_that("tune_resample selects a deterministic simple model with one_std_err", {
  skip_if_tune_deps()
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
    mode = "subject_grouped", group = "subject",
    v = 2, nested = TRUE, stratify = FALSE, seed = 11)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)
  # Mock the tune package so inner tuning is fully deterministic: two configs,
  # two metric rows each. cfg_a mean accuracy = 0.78, cfg_b mean = 0.77.
  local_mocked_bindings(
    tune_grid = function(...) structure(list(), class = "mock_tune_results"),
    collect_metrics = function(...) {
      data.frame(
        .metric = rep("accuracy", 4),
        .estimator = rep("binary", 4),
        .estimate = c(0.80, 0.76, 0.77, 0.77),
        .config = c("cfg_a", "cfg_a", "cfg_b", "cfg_b"),
        penalty = c(0.01, 0.01, 1.0, 1.0),
        stringsAsFactors = FALSE
      )
    },
    finalize_workflow = function(x, parameters, ...) x,
    .package = "tune"
  )
  # Mock bioLeak's outer-loop fitter so no real model is trained: it returns a
  # minimal LeakFit S4 object with fixed predictions and metrics.
  local_mocked_bindings(
    fit_resample = function(x, outcome, splits, ...) {
      truth <- factor(rep(c(0, 1), length.out = 4), levels = c(0, 1))
      methods::new(
        "LeakFit",
        splits = splits,
        metrics = data.frame(fold = 1, learner = "mock", accuracy = 0.75),
        metric_summary = data.frame(learner = "mock", accuracy_mean = 0.75, accuracy_sd = 0),
        audit = data.frame(),
        predictions = list(data.frame(
          id = as.character(seq_len(4)),
          truth = truth,
          pred = c(0.45, 0.55, 0.40, 0.60),
          fold = 1,
          learner = "mock",
          stringsAsFactors = FALSE
        )),
        preprocess = list(),
        learners = list(),
        outcome = outcome,
        task = "binomial",
        feature_names = c("x1", "x2"),
        info = list(sample_ids = as.character(seq_len(nrow(x))))
      )
    },
    .package = "bioLeak"
  )
  # Same data/seed, only the selection rule differs between the two runs.
  tuned_best <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
    learner = spec, preprocess = rec, grid = 2,
    metrics = "accuracy", selection = "best", seed = 11)
  tuned_ose <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
    learner = spec, preprocess = rec, grid = 2,
    metrics = "accuracy", selection = "one_std_err", seed = 11)
  expect_true(nrow(tuned_best$best_params) > 0)
  expect_true(nrow(tuned_ose$best_params) > 0)
  # "best" follows the top mean accuracy (cfg_a, penalty 0.01); "one_std_err"
  # trades a small accuracy drop for the more penalized cfg_b (penalty 1.0).
  expect_true(all(tuned_best$best_params$penalty == 0.01))
  expect_true(all(tuned_ose$best_params$penalty == 1.0))
})
# With refit = TRUE, tune_resample() must produce a final fitted model plus a
# per-outer-fold status table whose entries use the known status vocabulary.
test_that("tune_resample supports final refit and stores fold status", {
  skip_if_tune_deps()
  dat <- make_class_df(80)
  plan <- make_split_plan_quiet(dat, outcome = "outcome",
                                mode = "subject_grouped", group = "subject",
                                v = 2, nested = TRUE, stratify = FALSE,
                                seed = 2)
  model_spec <- parsnip::set_engine(
    parsnip::logistic_reg(penalty = tune::tune(), mixture = 1),
    "glmnet"
  )
  recipe_obj <- recipes::recipe(outcome ~ x1 + x2, data = dat)
  res <- tune_resample_quiet(dat, outcome = "outcome", splits = plan,
                             learner = model_spec, preprocess = recipe_obj,
                             grid = 2, metrics = "accuracy",
                             selection = "best", refit = TRUE, seed = 2)
  # Refit flag and refit artifacts must all be populated.
  expect_true(isTRUE(res$info$refit))
  expect_false(is.null(res$final_model))
  expect_s3_class(res$final_workflow, "workflow")
  expect_true(nrow(res$final_params) > 0)
  # One status row per outer fold, each with a recognized status label.
  expect_true(is.data.frame(res$fold_status))
  expect_true(nrow(res$fold_status) == length(plan@indices))
  expect_true(all(res$fold_status$status %in% c("success", "skipped", "failed")))
})
# Guard against nested-CV leakage in the final refit: the refit hyperparameters
# must be an equal-weight aggregate (median / majority vote) of every outer
# fold's inner-CV winner, and must never be copied from whichever fold scored
# best on its outer test set — that would let outer test data influence the
# final model.
test_that("final refit uses aggregated params, not single best outer fold", {
  skip_if_tune_deps()
  df <- make_class_df(80)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
    mode = "subject_grouped", group = "subject",
    v = 2, nested = TRUE, stratify = FALSE, seed = 2)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)
  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
    learner = spec, preprocess = rec, grid = 3,
    metrics = "accuracy", selection = "best",
    refit = TRUE, seed = 2)
  # refit_method must be "aggregate", not a single-fold selection
  expect_identical(tuned$info$refit_method, "aggregate")
  # refit_fold must be NA — no single fold was selected based on outer metrics
  expect_true(is.na(tuned$info$refit_fold))
  # final_params must have exactly the hyperparameter columns, no fold column
  expect_false("fold" %in% names(tuned$final_params))
  # --- Behavioral proof that outer test metrics are not consulted ---
  # final_params must exactly equal the coordinate-wise aggregate of ALL folds'
  # best_params. This is the core independence guarantee: the old (leaky) code
  # did which.max(metrics_df[[metric]]) to pick a single fold, so final_params
  # would equal that fold's row. The new code aggregates all folds equally.
  bp <- tuned$best_params
  param_cols <- setdiff(names(bp), "fold")
  expect_true(nrow(bp) >= 2,
    info = "Need at least 2 outer folds to test aggregation")
  expect_true(length(param_cols) >= 1,
    info = "Need at least 1 hyperparameter column")
  # Numeric params aggregate by median; categorical params by majority vote
  # (ties resolved by first level, matching which.max on a table).
  for (col in param_cols) {
    if (is.numeric(bp[[col]])) {
      expected <- stats::median(bp[[col]], na.rm = TRUE)
      expect_equal(tuned$final_params[[col]], expected,
        info = paste("Param", col,
          "must equal median across ALL folds, not a single fold's value"))
    } else {
      tbl <- table(bp[[col]])
      expected <- names(tbl)[which.max(tbl)]
      expect_equal(as.character(tuned$final_params[[col]]), expected,
        info = paste("Param", col,
          "must equal majority vote across ALL folds"))
    }
  }
  # Verify final_params is not simply copied from the best-metric fold.
  # Identify the fold that had the best outer metric — if it differs from the
  # aggregate, the old (leaky) logic would have returned that fold's params.
  metric_col <- tuned$info$selection_metric
  if (!is.null(metric_col) && metric_col %in% names(tuned$metrics)) {
    metric_vals <- tuned$metrics[[metric_col]]
    # Direction-aware "best": these metrics are minimized, others maximized.
    minimize <- metric_col %in% c("rmse", "mae", "log_loss", "mn_log_loss")
    best_idx <- if (minimize) which.min(metric_vals) else which.max(metric_vals)
    best_fold <- tuned$metrics$fold[best_idx]
    best_fold_params <- bp[bp$fold == best_fold, param_cols, drop = FALSE]
    # Check if folds actually disagree on params
    folds_disagree <- FALSE
    for (col in param_cols) {
      if (length(unique(bp[[col]])) > 1L) {
        folds_disagree <- TRUE
        break
      }
    }
    if (folds_disagree && nrow(best_fold_params) == 1L) {
      # When folds disagree, the aggregate should NOT equal the best-metric
      # fold's params. This is the direct proof: the old code would have
      # returned best_fold_params, the new code returns the aggregate.
      params_match_best <- all(vapply(param_cols, function(col) {
        identical(tuned$final_params[[col]], best_fold_params[[col]])
      }, logical(1)))
      expect_false(params_match_best,
        info = paste("final_params must not equal fold", best_fold,
          "params (the best-metric fold) — that would be",
          "nested-CV leakage"))
    }
  }
})
# A hand-built split plan whose outer fold 1 trains on rows 1:20 — all
# class 0 (rows 1:30 carry outcome 0) — so that fold is expected to be
# skipped. summary() and audit_leakage() must then still work on the
# partially-successful LeakTune object.
test_that("LeakTune summary and audit handle skipped outer folds", {
  skip_if_tune_deps()
  set.seed(1)
  # Balanced 60-row dataset: rows 1:30 are class 0, rows 31:60 are class 1.
  df <- data.frame(
    subject = paste0("s", seq_len(60)),
    outcome = factor(c(rep(0, 30), rep(1, 30)), levels = c(0, 1)),
    x1 = rnorm(60),
    x2 = rnorm(60),
    stringsAsFactors = FALSE
  )
  # Fold 1: single-class training data (rows 1:20, all outcome 0) -> should be
  # skipped. Fold 2: both classes present in training -> should succeed.
  indices <- list(
    list(train = 1:20, test = 21:60, fold = 1L, repeat_id = 1L),
    list(train = c(1:20, 31:60), test = 21:30, fold = 2L, repeat_id = 1L)
  )
  # Metadata mirroring what a generated split plan would record; built manually
  # so the degenerate fold above is exactly reproducible.
  split_info <- list(
    outcome = "outcome",
    v = 2L,
    repeats = 1L,
    seed = 1L,
    mode = "subject_grouped",
    perm_mode = "subject_grouped",
    group = "subject",
    batch = NULL,
    study = NULL,
    time = NULL,
    stratify = FALSE,
    nested = FALSE,
    horizon = 0,
    summary = data.frame(
      fold = c(1L, 2L),
      repeat_id = c(1L, 1L),
      train_n = c(20L, 50L),
      test_n = c(40L, 10L)
    ),
    hash = "manual",
    inner = NULL,
    compact = FALSE,
    fold_assignments = NULL,
    coldata = df
  )
  splits <- bioLeak:::LeakSplits(mode = "subject_grouped", indices = indices, info = split_info)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)
  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
    learner = spec, preprocess = rec, grid = 2,
    metrics = "accuracy", seed = 1)
  # Fold 1 produced no fit; fold 2 did. Both are tracked in fold_status.
  expect_true(is.null(tuned$outer_fits[[1]]))
  expect_true(!is.null(tuned$outer_fits[[2]]))
  expect_true(is.data.frame(tuned$fold_status))
  expect_equal(nrow(tuned$fold_status), 2)
  expect_true(any(tuned$fold_status$status == "skipped"))
  expect_true(any(tuned$fold_status$status == "success"))
  # summary() must report the success/total fold counts, not error out.
  sum_out <- paste(capture.output(summary(tuned)), collapse = "\n")
  expect_match(sum_out, "Outer Folds: 1 successful / 2 total", fixed = TRUE)
  # audit_leakage() must tolerate the missing fold (small B for test speed).
  audit <- suppressWarnings(expect_no_error(
    audit_leakage(
      tuned,
      metric = "accuracy",
      B = 3,
      boot_B = 20,
      perm_stratify = FALSE,
      return_perm = FALSE,
      target_scan = FALSE,
      target_scan_multivariate = FALSE
    )
  ))
  expect_s4_class(audit, "LeakAudit")
})
# Threshold tuning from inner predictions: collect_predictions() is mocked so
# that with truth (0, 1, 0, 1) and .pred_1 = (0.1, 0.4, 0.45, 0.6), a cutoff
# of 0.2 classifies 3/4 correctly while 0.8 gets only 2/4 — so from
# threshold_grid = c(0.2, 0.8) the tuner must pick 0.2 and forward it to
# fit_resample() for every outer fold. Also checks save_pred is enabled so
# inner predictions are actually available.
test_that("tune_resample tunes binomial thresholds from inner predictions", {
  skip_if_tune_deps()
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
    mode = "subject_grouped", group = "subject",
    v = 2, nested = TRUE, stratify = FALSE, seed = 21)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)
  # Accumulators written by the mocks (via <<-) to record what tune_resample
  # passed into them across folds.
  seen_save_pred <- logical()
  seen_thresholds <- numeric()
  # Mock the tune package: record control$save_pred, return fixed metrics and
  # fixed inner predictions for config "cfg_a".
  local_mocked_bindings(
    tune_grid = function(..., control) {
      seen_save_pred <<- c(seen_save_pred, isTRUE(control$save_pred))
      structure(list(), class = "mock_tune_results")
    },
    collect_metrics = function(...) {
      data.frame(
        .metric = rep("accuracy", 4),
        .estimator = rep("binary", 4),
        .estimate = c(0.8, 0.8, 0.7, 0.7),
        .config = c("cfg_a", "cfg_a", "cfg_b", "cfg_b"),
        penalty = c(0.01, 0.01, 1.0, 1.0),
        stringsAsFactors = FALSE
      )
    },
    collect_predictions = function(...) {
      data.frame(
        .config = rep("cfg_a", 4),
        outcome = factor(c(0, 1, 0, 1), levels = c(0, 1)),
        .pred_0 = c(0.9, 0.6, 0.55, 0.4),
        .pred_1 = c(0.1, 0.4, 0.45, 0.6),
        stringsAsFactors = FALSE
      )
    },
    finalize_workflow = function(x, parameters, ...) x,
    .package = "tune"
  )
  # Mock the outer fitter: record the classification_threshold it receives and
  # return a minimal LeakFit S4 object.
  local_mocked_bindings(
    fit_resample = function(x, outcome, splits, classification_threshold = 0.5, ...) {
      seen_thresholds <<- c(seen_thresholds, classification_threshold)
      truth <- factor(rep(c(0, 1), length.out = 4), levels = c(0, 1))
      methods::new(
        "LeakFit",
        splits = splits,
        metrics = data.frame(fold = 1, learner = "mock", accuracy = 0.75),
        metric_summary = data.frame(learner = "mock", accuracy_mean = 0.75, accuracy_sd = 0),
        audit = data.frame(),
        predictions = list(data.frame(
          id = as.character(seq_len(4)),
          truth = truth,
          pred = c(0.1, 0.4, 0.45, 0.6),
          fold = 1,
          learner = "mock",
          stringsAsFactors = FALSE
        )),
        preprocess = list(),
        learners = list(),
        outcome = outcome,
        task = "binomial",
        feature_names = c("x1", "x2"),
        info = list(sample_ids = as.character(seq_len(nrow(x))))
      )
    },
    .package = "bioLeak"
  )
  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
    learner = spec, preprocess = rec, grid = 2,
    metrics = "accuracy", selection = "best", seed = 21,
    tune_threshold = TRUE, threshold_grid = c(0.2, 0.8))
  # save_pred must be TRUE in every tune_grid call, else there would be no
  # inner predictions to tune the threshold from.
  expect_true(all(seen_save_pred))
  # Every outer fold must receive the tuned threshold of 0.2.
  expect_equal(length(seen_thresholds), length(splits@indices))
  expect_true(all(abs(seen_thresholds - 0.2) < 1e-12))
  # The tuned thresholds are also stored per fold on the result object.
  expect_true(is.data.frame(tuned$thresholds))
  expect_equal(nrow(tuned$thresholds), length(splits@indices))
  expect_true(all(abs(tuned$thresholds$threshold - 0.2) < 1e-12))
  expect_true(isTRUE(tuned$info$threshold_tuned))
  expect_equal(tuned$info$threshold_metric, "accuracy")
})
# (Trailing web-scrape boilerplate removed — it was not part of this test file.)