R/tune_resample.R
In bioLeak: Leakage-Safe Modeling and Auditing for Genomic and Clinical Data

Documented in tune_resample

# Nested tuning ---------------------------------------------------------------

#' Leakage-aware nested tuning with tidymodels
#'
#' Runs nested cross-validation for hyperparameter tuning using leakage-aware
#' splits. Inner resamples are constructed from each outer training fold to
#' avoid information leakage during tuning. Requires tidymodels tuning
#' packages and a workflow or recipe-based preprocessing. Survival tasks are
#' not yet supported.
#'
#' @param x SummarizedExperiment or matrix/data.frame.
#' @param outcome Outcome column name (if x is SE or data.frame).
#' @param splits LeakSplits object defining the outer resamples. If the splits
#'   do not already include inner folds, they are created from each outer
#'   training fold using the same split metadata. rsample splits must already
#'   include inner folds.
#' @param split_cols Optional named list/character vector or `"auto"` (default)
#'   overriding group/batch/study/time column names when `splits` is an rsample
#'   object and its attributes are missing. `"auto"` falls back to common
#'   metadata column names (e.g., `group`, `subject`, `batch`, `study`, `time`).
#'   Supported names are `group`, `batch`, `study`, and `time`.
#' @param learner A parsnip model_spec with tunable parameters, or a workflows
#'   workflow. When a model_spec is provided, a workflow is built using
#'   `preprocess` or a formula.
#' @param preprocess Optional `recipes::recipe`. Required when you need
#'   preprocessing for tuning. Ignored when `learner` is already a workflow.
#'   Recipe/workflow leakage guardrails run before tuning; configure policy via
#'   \code{options(bioLeak.validation_mode = "warn" | "error" | "off")}.
#' @param grid Tuning grid passed to `tune::tune_grid()`. Can be a data.frame or
#'   an integer size.
#' @param metrics Character vector of metric names (`auc`, `pr_auc`, `accuracy`,
#'   `macro_f1`, `log_loss`, `rmse`) or a yardstick metric set/list. Metrics are
#'   computed with yardstick; unsupported metrics are dropped with a warning.
#'   For binomial tasks, if any inner assessment fold contains a single class,
#'   probability metrics (`auc`, `roc_auc`, `pr_auc`) are dropped for tuning with
#'   a warning.
#' @param positive_class Optional value indicating the positive class for
#'   binomial outcomes. When set, the outcome levels are reordered so the
#'   positive class is second.
#' @param selection Selection rule for tuning, either `"best"` or `"one_std_err"`.
#' @param selection_metric Metric name used for selecting hyperparameters.
#'   Defaults to the first metric in `metrics`. If the chosen metric yields
#'   no valid results, the first available metric is used with a warning.
#' @param inner_v Optional number of folds for inner CV when inner splits are
#'   not precomputed. Defaults to the outer `v`.
#' @param inner_repeats Optional number of repeats for inner CV when inner
#'   splits are not precomputed. Defaults to 1.
#' @param inner_seed Optional seed for inner split generation when inner splits
#'   are not precomputed. Defaults to the outer split seed.
#' @param control Optional `tune::control_grid()` settings for tuning.
#' @param tune_threshold Logical; when `TRUE` for binomial tasks, selects a
#'   probability threshold from inner-fold predictions and applies it only to the
#'   corresponding outer-fold evaluation.
#' @param threshold_grid Numeric vector of thresholds in `[0, 1]` considered when
#'   `tune_threshold = TRUE`.
#' @param threshold_metric Metric used to pick thresholds when
#'   `tune_threshold = TRUE`. Supported values are `"accuracy"`,
#'   `"balanced_accuracy"`, and `"f1"`, or a custom function with signature
#'   `function(truth, pred_class, prob, threshold)`.
#' @param parallel Logical; passed to [fit_resample()] when evaluating outer
#'   folds (single-fold, no refit).
#' @param refit Logical; if TRUE, refits a final tuned workflow on the full
#'   dataset using aggregated hyperparameters across all outer folds (median
#'   for numeric parameters, majority vote for categorical). This avoids
#'   nested-CV leakage that would occur from selecting a single fold's params.
#' @param seed Integer seed for reproducibility.
#' @return A list of class `"LeakTune"` with components:
#'   \item{metrics}{Outer-fold metrics.}
#'   \item{metric_summary}{Mean/SD metrics across outer folds with columns
#'     \code{learner}, and \code{<metric>_mean} and \code{<metric>_sd} for
#'     each metric.}
#'   \item{best_params}{Best hyperparameters per outer fold.}
#'   \item{inner_results}{List of inner tuning results.}
#'   \item{outer_fits}{List of outer LeakFit objects.}
#'   \item{thresholds}{Per-fold threshold choices when threshold tuning is enabled.}
#'   \item{fold_status}{Outer-fold status log with stage, status, reason, and notes.}
#'   \item{final_model}{Optional final workflow fit when `refit = TRUE`.}
#'   \item{info}{Metadata about the tuning run.}
#' @examples
#' \donttest{
#'   if (requireNamespace("tune", quietly = TRUE) &&
#'       requireNamespace("recipes", quietly = TRUE) &&
#'       requireNamespace("glmnet", quietly = TRUE) &&
#'       requireNamespace("rsample", quietly = TRUE) &&
#'       requireNamespace("workflows", quietly = TRUE) &&
#'       requireNamespace("yardstick", quietly = TRUE) &&
#'      requireNamespace("dials", quietly = TRUE)) {
#'     df <- data.frame(
#'       subject = rep(1:10, each = 2),
#'       outcome = factor(rep(c(0, 1), each = 10)),
#'       x1 = rnorm(20),
#'       x2 = rnorm(20)
#'     )
#'     splits <- make_split_plan(df, outcome = "outcome",
#'                          mode = "subject_grouped", group = "subject",
#'                          v = 3, nested = TRUE, stratify = TRUE)
#'     spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
#'       parsnip::set_engine("glmnet")
#'     rec <- recipes::recipe(outcome ~ x1 + x2, data = df)
#'     tuned <- tune_resample(df, outcome = "outcome", splits = splits,
#'                           learner = spec, preprocess = rec, grid = 5)
#'     tuned$metric_summary
#'   }
#'}
#'
#' @export
tune_resample <- function(x, outcome, splits,
                          learner,
                          preprocess = NULL,
                          grid = 10,
                          metrics = NULL,
                          positive_class = NULL,
                          selection = c("best", "one_std_err"),
                          selection_metric = NULL,
                          inner_v = NULL,
                          inner_repeats = 1,
                          inner_seed = NULL,
                          control = NULL,
                          parallel = FALSE,
                          refit = FALSE,
                          seed = 1,
                          split_cols = "auto",
                          tune_threshold = FALSE,
                          threshold_grid = seq(0.1, 0.9, by = 0.05),
                          threshold_metric = "accuracy") {

  selection <- match.arg(selection)
  .bio_strict_checks(context = "tune_resample", seed = seed)
  if (!is.logical(refit) || length(refit) != 1L || is.na(refit)) {
    .bio_stop("refit must be TRUE or FALSE.", "bioLeak_input_error")
  }
  if (!is.logical(tune_threshold) || length(tune_threshold) != 1L || is.na(tune_threshold)) {
    .bio_stop("tune_threshold must be TRUE or FALSE.", "bioLeak_input_error")
  }
  if (!is.numeric(threshold_grid) || !length(threshold_grid)) {
    .bio_stop("threshold_grid must be a non-empty numeric vector.", "bioLeak_input_error")
  }
  threshold_grid <- sort(unique(as.numeric(threshold_grid)))
  if (any(!is.finite(threshold_grid)) || any(threshold_grid < 0 | threshold_grid > 1)) {
    .bio_stop("threshold_grid must contain finite values in [0, 1].", "bioLeak_input_error")
  }
  threshold_metric_is_fn <- is.function(threshold_metric)
  if (!threshold_metric_is_fn) {
    if (!is.character(threshold_metric) || length(threshold_metric) != 1L) {
      .bio_stop("threshold_metric must be one of: accuracy, balanced_accuracy, f1, or a function.",
                "bioLeak_input_error")
    }
    threshold_metric <- match.arg(threshold_metric, c("accuracy", "balanced_accuracy", "f1"))
  }

  if (!requireNamespace("tune", quietly = TRUE)) {
    .bio_stop("Package 'tune' is required for tune_resample().", "bioLeak_dependency_error")
  }
  if (!requireNamespace("dials", quietly = TRUE)) {
    .bio_stop("Package 'dials' is required for tune_resample().", "bioLeak_dependency_error")
  }
  if (!requireNamespace("yardstick", quietly = TRUE)) {
    .bio_stop("Package 'yardstick' is required for tune_resample().", "bioLeak_dependency_error")
  }
  if (!requireNamespace("rsample", quietly = TRUE)) {
    .bio_stop("Package 'rsample' is required for tune_resample().", "bioLeak_dependency_error")
  }
  if (!requireNamespace("workflows", quietly = TRUE)) {
    .bio_stop("Package 'workflows' is required for tune_resample().", "bioLeak_dependency_error")
  }

  is_parsnip_spec <- function(obj) inherits(obj, "model_spec")
  is_workflow <- function(obj) inherits(obj, "workflow")

  if (!is_workflow(learner) && !is_parsnip_spec(learner)) {
    .bio_stop("learner must be a parsnip model_spec or a workflows::workflow.", "bioLeak_input_error")
  }

  if (is_workflow(learner) && !is.null(preprocess)) {
    warning("preprocess ignored when learner is a workflow.")
  }
  if (!is.null(preprocess) && !inherits(preprocess, "recipe") && !is_workflow(learner)) {
    .bio_stop("tune_resample requires a recipe (or a workflow) for preprocessing.",
              "bioLeak_input_error")
  }
  if (!is_workflow(learner) && inherits(preprocess, "recipe") &&
      !requireNamespace("recipes", quietly = TRUE)) {
    .bio_stop("Package 'recipes' is required when preprocess is a recipe.",
              "bioLeak_dependency_error")
  }
  validation_mode <- .bio_validation_mode()
  if (inherits(preprocess, "recipe") && !is_workflow(learner)) {
    .bio_validate_recipe_graph(
      preprocess,
      context = "tune_resample",
      mode = validation_mode
    )
  }
  if (is_workflow(learner)) {
    .bio_validate_workflow_graph(
      learner,
      context = "tune_resample",
      mode = validation_mode
    )
  }

  # --- Learner Name Extraction ---
  derive_learner_label <- function(obj) {
    spec <- obj
    if (inherits(obj, "workflow")) {
      if (requireNamespace("workflows", quietly = TRUE)) {
        spec <- tryCatch(workflows::extract_spec_parsnip(obj), error = function(e) NULL)
      }
    }
    if (inherits(spec, "model_spec")) {
      cls <- class(spec)
      cls <- cls[!cls %in% c("model_spec", "object")]
      model_type <- if (length(cls) > 0) cls[1] else "model"
      engine <- spec$engine
      if (!is.null(engine)) {
        return(paste0(model_type, "/", engine))
      } else {
        return(model_type)
      }
    }
    return("tuned_model")
  }
  learner_label <- derive_learner_label(learner)
  # -------------------------------

  if (!inherits(splits, "LeakSplits")) {
    if (.bio_is_rsample(splits)) {
      coldata <- if (.bio_is_se(x)) {
        as.data.frame(SummarizedExperiment::colData(x))
      } else if (is.data.frame(x)) {
        x
      } else if (is.matrix(x)) {
        data.frame(row_id = seq_len(nrow(x)))
      } else {
        NULL
      }
      splits <- .bio_as_leaksplits_from_rsample(splits, n = nrow(.bio_get_x(x)), coldata = coldata,
                                                split_cols = split_cols)
    } else {
      .bio_stop("splits must be a LeakSplits or rsample rset/rsplit.", "bioLeak_input_error")
    }
  }
  if (identical(splits@mode, "rsample") && is.null(splits@info$inner)) {
    .bio_stop("rsample splits require precomputed inner folds for tune_resample().", "bioLeak_input_error")
  }

  Xall <- .bio_get_x(x)
  y_orig <- .bio_get_y(x, outcome)
  yall <- y_orig
  y_data <- y_orig

  if (.bio_is_survival(yall)) {
    .bio_stop("tune_resample does not yet support survival tasks.", "bioLeak_input_error")
  }

  drop_cols <- outcome
  split_info <- splits@info
  drop_cols <- unique(c(drop_cols,
                        split_info$group,
                        split_info$batch,
                        split_info$study,
                        split_info$time))
  drop_cols <- drop_cols[!is.na(drop_cols) & nzchar(drop_cols)]
  if (length(drop_cols) && !is.null(colnames(Xall))) {
    Xall <- Xall[, setdiff(colnames(Xall), drop_cols), drop = FALSE]
  }

  task <- if (.bio_is_binomial(yall)) "binomial"
  else if (.bio_is_multiclass(yall)) "multiclass"
  else if (.bio_is_regression(yall)) "gaussian"
  else if (is.factor(yall) && nlevels(yall) == 2) "binomial"
  else if (is.factor(yall) && nlevels(yall) > 2) "multiclass"
  else .bio_stop("Unsupported outcome type for tuning.", "bioLeak_input_error")

  class_levels <- NULL
  if (task == "binomial") {
    if (!is.factor(yall)) {
      yall <- factor(yall)
      y_data <- factor(y_data)
    }
    yall <- droplevels(yall)
    if (nlevels(yall) != 2) {
      .bio_stop("Binomial task requires exactly two outcome levels.", "bioLeak_input_error")
    }
    if (!is.null(positive_class)) {
      pos_chr <- as.character(positive_class)
      if (length(pos_chr) != 1L) {
        .bio_stop("positive_class must be a single value.", "bioLeak_input_error")
      }
      levels_y <- levels(yall)
      if (!pos_chr %in% levels_y) {
        .bio_stop(sprintf("positive_class '%s' not found in outcome levels: %s",
                          pos_chr, paste(levels_y, collapse = ", ")),
                  "bioLeak_input_error")
      }
      if (!identical(pos_chr, levels_y[2])) {
        levels_y <- c(setdiff(levels_y, pos_chr), pos_chr)
        yall <- factor(yall, levels = levels_y)
        if (!inherits(preprocess, "recipe")) {
          y_data <- factor(y_data, levels = levels_y)
        }
      }
    }
    class_levels <- levels(yall)
  } else if (task == "multiclass") {
    if (!is.factor(yall)) {
      yall <- factor(yall)
      y_data <- factor(y_data)
    }
    yall <- droplevels(yall)
    if (nlevels(yall) < 3) {
      .bio_stop("Multiclass task requires 3 or more outcome levels.", "bioLeak_input_error")
    }
    if (!is.null(positive_class)) {
      warning("positive_class is ignored for multiclass tasks.")
    }
    class_levels <- levels(yall)
  } else if (!is.numeric(yall)) {
    yall <- as.numeric(yall)
    y_data <- as.numeric(y_data)
    if (anyNA(yall)) stop("Gaussian task requires numeric outcome values.", call. = FALSE)
  }

  if (isTRUE(tune_threshold) && !identical(task, "binomial")) {
    warning("tune_threshold is only supported for binomial tasks; disabling threshold tuning.",
            call. = FALSE)
    tune_threshold <- FALSE
  }

  if (is.null(control)) control <- tune::control_grid()
  if (isTRUE(tune_threshold)) {
    if (!"save_pred" %in% names(control)) {
      stop("control must be created by tune::control_grid() when tune_threshold = TRUE.",
           call. = FALSE)
    }
    control$save_pred <- TRUE
  }

  resolve_metrics <- function(metrics, task) {
    macro_f1 <- yardstick::metric_tweak("macro_f1", yardstick::f_meas, estimator = "macro")
    auc_metric <- yardstick::metric_tweak("auc", yardstick::roc_auc, event_level = "second")
    roc_metric <- yardstick::metric_tweak("roc_auc", yardstick::roc_auc, event_level = "second")
    pr_metric <- yardstick::metric_tweak("pr_auc", yardstick::pr_auc, event_level = "second")
    log_loss_metric <- yardstick::metric_tweak("log_loss", yardstick::mn_log_loss)
    mn_log_loss_metric <- yardstick::metric_tweak("mn_log_loss", yardstick::mn_log_loss)

    defaults <- if (task == "binomial") c("auc", "pr_auc", "accuracy")
    else if (task == "multiclass") c("accuracy", "macro_f1")
    else c("rmse")

    metric_map <- list(
      auc = auc_metric,
      roc_auc = roc_metric,
      pr_auc = pr_metric,
      accuracy = yardstick::accuracy,
      macro_f1 = macro_f1,
      log_loss = log_loss_metric,
      mn_log_loss = mn_log_loss_metric,
      rmse = yardstick::rmse
    )

    if (is.null(metrics)) metrics <- defaults

    if (is.character(metrics)) {
      metrics[metrics == "auc"] <- "roc_auc"
      valid_keys <- intersect(metrics, names(metric_map))
      invalid <- setdiff(metrics, names(metric_map))

      if (length(invalid)) {
        warning(sprintf("Dropping unsupported metrics for %s task: %s", task,
                        paste(invalid, collapse = ", ")))
      }

      if (length(valid_keys) == 0) valid_keys <- defaults[defaults %in% names(metric_map)]

      fns <- metric_map[valid_keys]
      metric_set <- do.call(yardstick::metric_set, fns)
      return(list(set = metric_set, names = valid_keys))
    }

    if (inherits(metrics, "metric_set")) {
      return(list(set = metrics, names = names(attr(metrics, "metrics"))))
    }

    if (is.list(metrics)) {
      metric_set <- do.call(yardstick::metric_set, metrics)
      return(list(set = metric_set, names = names(attr(metric_set, "metrics"))))
    }

    stop("metrics must be a character vector or yardstick metric set.", call. = FALSE)
  }

  subset_metric_set <- function(metric_set, keep) {
    metric_list <- attr(metric_set, "metrics")
    metric_list <- metric_list[names(metric_list) %in% keep]
    do.call(yardstick::metric_set, metric_list)
  }

  metrics_resolved <- resolve_metrics(metrics, task)
  tune_metrics <- metrics_resolved$set
  metric_names <- metrics_resolved$names

  if (is.null(selection_metric)) {
    if ("roc_auc" %in% metric_names) selection_metric <- "roc_auc"
    else if ("auc" %in% metric_names) selection_metric <- "auc"
    else selection_metric <- metric_names[[1]]
  }

  if (!selection_metric %in% metric_names) {
    if (selection_metric == "auc" && "roc_auc" %in% metric_names) selection_metric <- "roc_auc"
    else if (selection_metric == "roc_auc" && "auc" %in% metric_names) selection_metric <- "auc"
    else stop(sprintf("selection_metric '%s' not found in metrics.", selection_metric), call. = FALSE)
  }

  make_fold_df <- function(X, y) {
    df <- as.data.frame(X, check.names = FALSE)
    df[[outcome]] <- y
    df[, c(outcome, setdiff(names(df), outcome)), drop = FALSE]
  }

  has_two_classes <- function(y) {
    y <- y[!is.na(y)]
    length(unique(y)) >= 2L
  }

  evaluate_threshold_metric <- function(truth, prob, threshold, metric) {
    pred_class <- factor(
      ifelse(prob >= threshold, class_levels[2], class_levels[1]),
      levels = class_levels
    )
    if (is.function(metric)) {
      return(as.numeric(metric(truth, pred_class, prob, threshold)))
    }
    if (identical(metric, "accuracy")) {
      return(mean(pred_class == truth))
    }

    truth_pos <- truth == class_levels[2]
    pred_pos <- pred_class == class_levels[2]
    tp <- sum(truth_pos & pred_pos)
    tn <- sum(!truth_pos & !pred_pos)
    fp <- sum(!truth_pos & pred_pos)
    fn <- sum(truth_pos & !pred_pos)

    if (identical(metric, "balanced_accuracy")) {
      sens <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
      spec <- if ((tn + fp) > 0) tn / (tn + fp) else NA_real_
      return(mean(c(sens, spec), na.rm = TRUE))
    }
    if (identical(metric, "f1")) {
      precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
      recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
      if (!is.finite(precision) || !is.finite(recall) || (precision + recall) == 0) {
        return(NA_real_)
      }
      return(2 * precision * recall / (precision + recall))
    }
    NA_real_
  }

  select_threshold <- function(truth, prob, grid, metric) {
    scores <- vapply(grid, function(thr) {
      evaluate_threshold_metric(truth = truth, prob = prob, threshold = thr, metric = metric)
    }, numeric(1))
    valid <- is.finite(scores)
    if (!any(valid)) {
      return(list(threshold = 0.5, metric_value = NA_real_))
    }
    grid_ok <- grid[valid]
    scores_ok <- scores[valid]
    best <- which(scores_ok == max(scores_ok))
    if (length(best) > 1L) {
      best <- best[order(abs(grid_ok[best] - 0.5), grid_ok[best])[1L]]
    } else {
      best <- best[1L]
    }
    list(threshold = grid_ok[[best]], metric_value = scores_ok[[best]])
  }

  available_metrics <- function(metrics_df) {
    if (is.null(metrics_df) || !nrow(metrics_df)) return(character())
    if (!".metric" %in% names(metrics_df)) return(character())
    value_col <- NULL
    if (".estimate" %in% names(metrics_df)) {
      value_col <- ".estimate"
    } else if ("estimate" %in% names(metrics_df)) {
      value_col <- "estimate"
    } else if ("mean" %in% names(metrics_df)) {
      value_col <- "mean"
    }
    if (is.null(value_col)) return(character())
    metric_ok <- tapply(!is.na(metrics_df[[value_col]]), metrics_df$.metric, any)
    names(metric_ok)[metric_ok]
  }

  infer_simplicity_direction <- function(name) {
    nm <- tolower(name)
    simpler_when_higher <- c(
      "penalty", "lambda", "cost_complexity", "cp", "gamma",
      "min_n", "min.node.size", "min_child_weight", "loss_reduction",
      "sample_size", "margin"
    )
    simpler_when_lower <- c(
      "mtry", "trees", "num_trees", "tree_depth", "max_depth", "depth",
      "degree", "neighbors", "k", "num_terms", "num_comp", "num_components",
      "hidden_units", "size", "epochs", "iter", "stop_iter", "learn_rate",
      "mixture"
    )
    if (nm %in% simpler_when_higher) return("higher")
    if (nm %in% simpler_when_lower) return("lower")
    "lower"
  }

  simplify_rank <- function(df, config_col = ".config") {
    if (!nrow(df)) return(numeric(0))
    param_cols <- setdiff(names(df), c(config_col, "mean", "sd", "n", "std_err"))
    if (!length(param_cols)) return(rep(0, nrow(df)))

    score <- rep(0, nrow(df))
    for (col in param_cols) {
      v <- df[[col]]
      dir <- infer_simplicity_direction(col)
      if (is.logical(v)) v <- as.numeric(v)
      if (is.factor(v)) v <- as.character(v)
      if (is.character(v)) {
        ord <- rank(v, ties.method = "average", na.last = "keep")
      } else if (is.numeric(v)) {
        ord <- rank(v, ties.method = "average", na.last = "keep")
      } else {
        ord <- rank(as.character(v), ties.method = "average", na.last = "keep")
      }
      if (dir == "higher") {
        finite_ord <- ord[is.finite(ord)]
        max_ord <- if (length(finite_ord)) max(finite_ord) else 0
        ord <- ifelse(is.finite(ord), (max_ord + 1) - ord, ord)
      }
      ord[!is.finite(ord)] <- max(ord[is.finite(ord)], 0) + 1
      score <- score + ord
    }
    score
  }

  select_config <- function(metrics_df, metric_name, selection_rule) {
    res_sub <- metrics_df[
      metrics_df$.metric == metric_name & is.finite(metrics_df$.estimate),
      c(".config", ".estimate"),
      drop = FALSE
    ]
    if (!nrow(res_sub)) return(NULL)

    agg_mean <- aggregate(.estimate ~ .config, data = res_sub, FUN = mean, na.rm = TRUE)
    names(agg_mean)[names(agg_mean) == ".estimate"] <- "mean"
    agg_sd <- aggregate(.estimate ~ .config, data = res_sub, FUN = stats::sd, na.rm = TRUE)
    names(agg_sd)[names(agg_sd) == ".estimate"] <- "sd"
    agg_n <- aggregate(.estimate ~ .config, data = res_sub, FUN = length)
    names(agg_n)[names(agg_n) == ".estimate"] <- "n"
    cfg <- Reduce(function(x, y) merge(x, y, by = ".config", all = TRUE),
                  list(agg_mean, agg_sd, agg_n))
    cfg$std_err <- ifelse(cfg$n > 1L & is.finite(cfg$sd),
                          cfg$sd / sqrt(cfg$n),
                          0)

    param_cols <- setdiff(names(metrics_df), c(".metric", ".estimator", ".estimate", "n", "std_err", ".config"))
    params_by_cfg <- unique(metrics_df[, c(".config", param_cols), drop = FALSE])
    cfg <- merge(cfg, params_by_cfg, by = ".config", all.x = TRUE, sort = FALSE)

    minimize <- metric_name %in% c("rmse", "mae", "log_loss", "mn_log_loss")
    best_idx <- if (minimize) which.min(cfg$mean) else which.max(cfg$mean)
    if (!length(best_idx) || !is.finite(cfg$mean[best_idx[1]])) return(NULL)
    best_idx <- best_idx[1]

    cand <- cfg
    if (identical(selection_rule, "one_std_err")) {
      best_mean <- cfg$mean[best_idx]
      best_se <- cfg$std_err[best_idx]
      if (!is.finite(best_se)) best_se <- 0
      keep <- if (minimize) {
        cfg$mean <= (best_mean + best_se)
      } else {
        cfg$mean >= (best_mean - best_se)
      }
      cand <- cfg[keep & is.finite(cfg$mean), , drop = FALSE]
      if (!nrow(cand)) cand <- cfg[best_idx, , drop = FALSE]
    } else {
      tol <- sqrt(.Machine$double.eps)
      tied <- which(abs(cfg$mean - cfg$mean[best_idx]) <= tol)
      cand <- cfg[tied, , drop = FALSE]
      if (!nrow(cand)) cand <- cfg[best_idx, , drop = FALSE]
    }

    cand$.simplicity <- simplify_rank(cand)
    ord <- order(cand$.simplicity, cand$.config, na.last = TRUE)
    chosen <- cand[ord[1], , drop = FALSE]

    params <- chosen[, setdiff(names(chosen), c(".config", "mean", "sd", "n", "std_err", ".simplicity")),
                     drop = FALSE]
    list(config = chosen$.config[[1]], params = params, metric = metric_name)
  }

  build_workflow <- function() {
    if (is_workflow(learner)) return(learner)
    if (inherits(preprocess, "recipe")) {
      return(workflows::workflow() |> workflows::add_model(learner) |>
               workflows::add_recipe(preprocess))
    }
    if (length(outcome) != 1L) {
      stop("Formula workflows require a single outcome column.", call. = FALSE)
    }
    form <- stats::as.formula(paste0("`", outcome, "` ~ ."))
    workflows::workflow() |>
      workflows::add_model(learner) |>
      workflows::add_formula(form)
  }

  base_workflow <- build_workflow()

  make_inner_rset <- function(inner_idx, data) {
    if (!length(inner_idx)) stop("Inner splits are empty.", call. = FALSE)
    ids <- vapply(seq_along(inner_idx), function(i) {
      fold <- inner_idx[[i]]
      if (!is.null(fold$fold)) {
        rep_id <- fold$repeat_id %||% 1L
        paste0("Fold", fold$fold, "_Repeat", rep_id)
      } else {
        paste0("Fold", i)
      }
    }, character(1))
    split_objs <- lapply(seq_along(inner_idx), function(i) {
      fold <- inner_idx[[i]]
      if (is.null(fold$train) || is.null(fold$test)) {
        stop("Inner split is missing train/test indices.", call. = FALSE)
      }
      .bio_make_rsplit(fold$train, fold$test, data = data, id = ids[[i]])
    })
    manual_rset <- utils::getFromNamespace("manual_rset", "rsample")
    do.call(manual_rset, list(splits = split_objs, ids = ids))
  }

  make_outer_splits <- function(train, test) {
    info <- splits@info
    info$v <- 1L
    info$repeats <- 1L
    info$nested <- FALSE
    info$inner <- NULL
    info$compact <- FALSE
    info$fold_assignments <- NULL
    info$summary <- data.frame(
      fold = 1L,
      repeat_id = 1L,
      train_n = length(train),
      test_n = length(test),
      stringsAsFactors = FALSE
    )
    indices <- list(list(train = train, test = test, fold = 1L, repeat_id = 1L))
    info$hash <- .bio_hash_indices(indices)
    new("LeakSplits", mode = splits@mode, indices = indices, info = info)
  }

  inner_seed <- inner_seed %||% (splits@info$seed %||% seed)
  inner_v <- inner_v %||% (splits@info$v %||% 5L)
  inner_repeats <- inner_repeats %||% 1L

  df_all <- if (is.data.frame(x)) {
    x
  } else {
    make_fold_df(Xall, y_data)
  }

  coldata <- if (.bio_is_se(x)) {
    as.data.frame(SummarizedExperiment::colData(x))
  } else if (is.data.frame(x)) {
    x
  } else if (is.matrix(x)) {
    data.frame(row_id = seq_len(nrow(Xall)))
  } else {
    NULL
  }

  folds <- splits@indices
  metrics_rows <- list()
  best_rows <- list()
  inner_results <- list()
  outer_fits <- list()
  threshold_rows <- list()
  threshold_metric_label <- if (is.function(threshold_metric)) "custom" else threshold_metric
  skipped_single_class_outer <- 0L
  skipped_single_class_inner <- 0L
  skipped_no_results <- 0L
  metrics_used <- character()
  selection_metric_requested <- selection_metric
  selection_metric_used <- selection_metric
  selection_metric_warned <- FALSE
  fold_status_rows <- vector("list", length(folds))
  mark_fold_status <- function(fold_id, stage, status, reason = NA_character_, notes = NA_character_) {
    fold_status_rows[[fold_id]] <<- data.frame(
      fold = fold_id,
      stage = as.character(stage),
      status = as.character(status),
      reason = as.character(reason),
      notes = as.character(notes),
      stringsAsFactors = FALSE
    )
  }

  for (i in seq_along(folds)) {
    fold <- folds[[i]]
    fold <- .bio_resolve_fold_indices(splits, fold, n = nrow(df_all), data = coldata)
    tr <- fold$train
    te <- fold$test
    if (!length(tr) || !length(te)) {
      warning(sprintf("Outer fold %d skipped: empty train/test.", i), call. = FALSE)
      mark_fold_status(i, stage = "outer_split", status = "skipped",
                       reason = "empty_train_or_test", notes = "Outer split has empty train or test indices.")
      next
    }

    df_train <- df_all[tr, , drop = FALSE]
    y_train_logic <- yall[tr]

    if (task %in% c("binomial", "multiclass") && !has_two_classes(y_train_logic)) {
      warning(sprintf("Outer fold %d skipped: training data has a single outcome class.", i),
              call. = FALSE)
      skipped_single_class_outer <- skipped_single_class_outer + 1L
      mark_fold_status(i, stage = "outer_train", status = "skipped",
                       reason = "single_class_training",
                       notes = "Outer training data contains fewer than two outcome classes.")
      next
    }

    inner_idx <- NULL
    inner_precomputed <- FALSE
    if (!is.null(splits@info$inner) && length(splits@info$inner) >= i) {
      inner_idx <- splits@info$inner[[i]]
      inner_precomputed <- TRUE
    }
    if (is.null(inner_idx)) {
      if (identical(splits@mode, "rsample")) {
        inner_mode <- "subject_grouped"
        if (!is.null(splits@info$perm_mode)) inner_mode <- splits@info$perm_mode
        grp <- splits@info$group
        if(is.null(grp) && "subject" %in% names(df_train)) grp <- "subject"
        if(is.null(grp) && "id" %in% names(df_train)) grp <- "id"
        x_inner <- if (.bio_is_se(x)) x[, tr] else x[tr, , drop = FALSE]
      } else {
        x_inner <- if (.bio_is_se(x)) x[, tr] else x[tr, , drop = FALSE]
      }

      inner <- make_split_plan(
        x_inner,
        outcome = outcome,
        mode = splits@mode,
        group = splits@info$group,
        batch = splits@info$batch,
        study = splits@info$study,
        time = splits@info$time,
        v = inner_v,
        repeats = inner_repeats,
        stratify = isTRUE(splits@info$stratify),
        nested = FALSE,
        seed = inner_seed + i,
        horizon = splits@info$horizon %||% 0,
        purge = splits@info$purge %||% 0,
        embargo = splits@info$embargo %||% 0,
        progress = FALSE
      )
      inner_idx <- inner@indices
    }

    inner_global <- FALSE
    if (inner_precomputed) {
      inner_positions <- unlist(lapply(inner_idx, function(fold) {
        c(fold$train, fold$test)
      }), use.names = FALSE)
      inner_positions <- inner_positions[!is.na(inner_positions)]
      if (!length(inner_positions)) {
        stop(sprintf("Outer fold %d: inner splits are empty.", i), call. = FALSE)
      }
      if (any(inner_positions < 1L)) {
        stop(sprintf("Outer fold %d: inner split indices must be positive.", i), call. = FALSE)
      }
      if (any(inner_positions > nrow(df_all))) {
        stop(sprintf("Outer fold %d: inner split indices exceed available rows.", i),
             call. = FALSE)
      }
      inner_in_train <- all(inner_positions %in% seq_len(nrow(df_train)))
      inner_in_outer <- all(inner_positions %in% tr)
      if (!inner_in_train && inner_in_outer) {
        inner_global <- TRUE
      } else if (inner_in_outer && !identical(tr, seq_len(nrow(df_train)))) {
        inner_global <- TRUE
      } else if (!inner_in_train && !inner_in_outer) {
        stop(sprintf(
          "Outer fold %d: inner split indices do not match outer training data.",
          i
        ), call. = FALSE)
      }
    }

    # Precomputed inner indices may be global; choose the matching data source.
    y_inner <- if (inner_global) yall else y_train_logic
    if (task %in% c("binomial", "multiclass")) {
      inner_has_class <- vapply(inner_idx, function(fold) {
        idx <- fold$train
        if (is.null(idx)) return(FALSE)
        has_two_classes(y_inner[idx])
      }, logical(1))
      if (!any(inner_has_class)) {
        warning(sprintf(
          "Outer fold %d skipped: inner training folds have a single outcome class. Consider `stratify = TRUE` or reducing `inner_v`.",
          i
        ), call. = FALSE)
        skipped_single_class_inner <- skipped_single_class_inner + 1L
        mark_fold_status(i, stage = "inner_train", status = "skipped",
                         reason = "single_class_training",
                         notes = "All inner training folds have fewer than two outcome classes.")
        next
      }
      if (task == "binomial") {
        drop_count <- sum(!inner_has_class)
        if (drop_count > 0L) {
          inner_idx <- inner_idx[inner_has_class]
          warning(sprintf(
            "Outer fold %d: dropped %d inner fold(s) with single-class training data.",
            i, drop_count
          ), call. = FALSE)
        }
      }
    }

    tune_metrics_fold <- tune_metrics
    metric_names_fold <- metric_names
    if (task == "binomial") {
      inner_assess_two <- vapply(inner_idx, function(fold) {
        idx <- fold$test
        if (is.null(idx)) return(FALSE)
        has_two_classes(y_inner[idx])
      }, logical(1))
      if (!all(inner_assess_two)) {
        drop_metrics <- intersect(metric_names_fold, c("auc", "roc_auc", "pr_auc"))
        keep_metrics <- setdiff(metric_names_fold, drop_metrics)
        if (length(drop_metrics) && length(keep_metrics)) {
          tune_metrics_fold <- subset_metric_set(tune_metrics_fold, keep_metrics)
          metric_names_fold <- keep_metrics
          warning(sprintf(
            "Outer fold %d: inner assessment folds lack both classes; dropping metrics %s for tuning.",
            i, paste(drop_metrics, collapse = ", ")
          ), call. = FALSE)
        }
      }
    }

    metrics_used <- unique(c(metrics_used, metric_names_fold))

    inner_data <- if (inner_global) df_all else df_train
    inner_rset <- make_inner_rset(inner_idx, inner_data)
    set.seed(seed + i)
    tune_res <- tune::tune_grid(
      base_workflow,
      resamples = inner_rset,
      grid = grid,
      metrics = tune_metrics_fold,
      control = control
    )
    inner_results[[i]] <- tune_res
    no_results <- FALSE
    metrics_raw <- tryCatch(
      tune::collect_metrics(tune_res, summarize = FALSE),
      error = function(e) {
        if (grepl("No results are available", conditionMessage(e), fixed = TRUE)) {
          no_results <<- TRUE
          return(NULL)
        }
        stop(e)
      }
    )
    if (isTRUE(no_results) || is.null(metrics_raw) || !nrow(metrics_raw)) {
      warning(sprintf(
        "Outer fold %d skipped: inner tuning produced no results. Check `tune::collect_notes()` for failures.",
        i
      ), call. = FALSE)
      skipped_no_results <- skipped_no_results + 1L
      mark_fold_status(i, stage = "inner_tuning", status = "skipped",
                       reason = "no_results",
                       notes = "tune::collect_metrics() returned no rows.")
      next
    }
    avail_metrics <- available_metrics(metrics_raw)
    metric_used <- selection_metric_used

    if (!metric_used %in% avail_metrics) {
      if ("<prb_mtrc>" %in% avail_metrics && metric_used %in% c("roc_auc", "auc", "pr_auc")) {
        metric_used <- "<prb_mtrc>"
      } else if ("<clss_mtr>" %in% avail_metrics && metric_used %in% c("accuracy", "kap")) {
        metric_used <- "<clss_mtr>"
      } else {
        if (length(avail_metrics)) {
          metric_used <- avail_metrics[1]
          if (!selection_metric_warned) {
            warning(sprintf("Selected metric not found; falling back to '%s'.", metric_used), call. = FALSE)
            selection_metric_warned <- TRUE
          }
        } else {
          warning(sprintf("Outer fold %d skipped: no usable metrics found.", i), call. = FALSE)
          skipped_no_results <- skipped_no_results + 1L
          mark_fold_status(i, stage = "metric_resolution", status = "skipped",
                           reason = "no_usable_metrics",
                           notes = "No metric with finite estimates was available for model selection.")
          next
        }
      }
    }

    selection_metric_used <- metric_used
    selected_cfg <- select_config(metrics_raw, metric_name = metric_used, selection_rule = selection)
    if (is.null(selected_cfg) || is.null(selected_cfg$params) || !nrow(selected_cfg$params)) {
      warning(sprintf("Outer fold %d skipped: could not determine best parameters.", i), call. = FALSE)
      mark_fold_status(i, stage = "model_selection", status = "skipped",
                       reason = "selection_failed",
                       notes = sprintf("Failed to select hyperparameters using metric '%s'.", metric_used))
      next
    }
    best_config <- selected_cfg$config
    best_params <- selected_cfg$params

    fold_threshold <- 0.5
    fold_threshold_metric <- NA_real_
    fold_threshold_n <- 0L
    if (isTRUE(tune_threshold) && identical(task, "binomial")) {
      inner_preds <- tryCatch(
        tune::collect_predictions(tune_res),
        error = function(e) NULL
      )
      if (is.null(inner_preds) || !nrow(inner_preds)) {
        warning(sprintf("Outer fold %d: threshold tuning skipped (no inner predictions).", i),
                call. = FALSE)
      } else {
        pred_cfg <- inner_preds[inner_preds$.config == best_config, , drop = FALSE]
        if (!nrow(pred_cfg)) {
          warning(sprintf("Outer fold %d: threshold tuning skipped (selected config has no predictions).", i),
                  call. = FALSE)
        } else {
          pos_col <- paste0(".pred_", make.names(class_levels[2]))
          prob_cols <- setdiff(grep("^\\.pred_", names(pred_cfg), value = TRUE), ".pred_class")
          if (!pos_col %in% prob_cols && length(prob_cols) >= 2L) {
            pos_col <- prob_cols[[2]]
          }
          if (!pos_col %in% names(pred_cfg)) {
            warning(sprintf("Outer fold %d: threshold tuning skipped (positive probability column not found).", i),
                    call. = FALSE)
          } else {
            prob_vals <- as.numeric(pred_cfg[[pos_col]])
            if (".row" %in% names(pred_cfg)) {
              row_idx <- as.integer(pred_cfg$.row)
              valid_rows <- is.finite(row_idx) & row_idx >= 1L & row_idx <= length(y_inner)
              row_idx <- row_idx[valid_rows]
              prob_vals <- prob_vals[valid_rows]
              truth_vals <- factor(y_inner[row_idx], levels = class_levels)
            } else if (length(outcome) == 1L && outcome %in% names(pred_cfg)) {
              truth_vals <- factor(pred_cfg[[outcome]], levels = class_levels)
            } else if ("truth" %in% names(pred_cfg)) {
              truth_vals <- factor(pred_cfg$truth, levels = class_levels)
            } else {
              truth_vals <- factor(character(0), levels = class_levels)
            }
            keep <- is.finite(prob_vals) & !is.na(truth_vals)
            prob_vals <- prob_vals[keep]
            truth_vals <- truth_vals[keep]

            if (length(prob_vals) && has_two_classes(truth_vals)) {
              threshold_pick <- select_threshold(
                truth = truth_vals,
                prob = prob_vals,
                grid = threshold_grid,
                metric = threshold_metric
              )
              fold_threshold <- threshold_pick$threshold
              fold_threshold_metric <- threshold_pick$metric_value
              fold_threshold_n <- length(prob_vals)
            } else {
              warning(sprintf(
                "Outer fold %d: threshold tuning skipped (insufficient inner predictions with both classes).",
                i
              ), call. = FALSE)
            }
          }
        }
      }
      threshold_rows[[length(threshold_rows) + 1L]] <- data.frame(
        fold = i,
        threshold = fold_threshold,
        metric_value = fold_threshold_metric,
        n_inner = fold_threshold_n,
        stringsAsFactors = FALSE
      )
    }

    final_workflow <- tune::finalize_workflow(base_workflow, best_params)
    outer_splits <- make_outer_splits(tr, te)

    # CRITICAL: Force the correct name in fit_resample by passing a named list
    learner_for_fit <- list()
    learner_for_fit[[learner_label]] <- final_workflow

    outer_fit_args <- list(
      x = x,
      outcome = outcome,
      splits = outer_splits,
      preprocess = list(),
      learner = learner_for_fit,
      metrics = tune_metrics_fold,
      positive_class = if (task == "binomial") class_levels[2] else NULL,
      parallel = parallel,
      refit = FALSE,
      seed = seed + i
    )
    if (identical(task, "binomial")) {
      outer_fit_args$classification_threshold <- fold_threshold
    }
    outer_fit <- do.call(fit_resample, outer_fit_args)

    outer_fits[[i]] <- outer_fit
    fold_metrics <- outer_fit@metrics
    fold_metrics$fold <- i
    metrics_rows[[length(metrics_rows) + 1L]] <- fold_metrics
    mark_fold_status(i, stage = "outer_evaluation", status = "success",
                     reason = NA_character_,
                     notes = sprintf("Selected config '%s' using metric '%s'.",
                                     as.character(best_config), as.character(metric_used)))

    best_df <- as.data.frame(best_params)
    best_df$fold <- i
    best_rows[[length(best_rows) + 1L]] <- best_df
  }

  if (!length(metrics_rows)) {
    msg <- "No successful outer folds were completed."
    reasons <- character()
    if (skipped_single_class_outer) {
      reasons <- c(reasons, "Some outer training folds had a single outcome class.")
    }
    if (skipped_single_class_inner) {
      reasons <- c(reasons, "Some inner training folds had a single outcome class.")
    }
    if (skipped_no_results) {
      reasons <- c(reasons, "Inner tuning produced no results for some folds.")
    }
    if (length(reasons)) {
      msg <- paste(msg, paste(reasons, collapse = " "),
                   "Consider `stratify = TRUE` in make_split_plan() or reducing `inner_v`.")
    }
    stop(msg, call. = FALSE)
  }

  # Unify metric row schemas before binding (folds may differ in computed metrics)
  all_cols <- unique(unlist(lapply(metrics_rows, names)))
  metrics_rows <- lapply(metrics_rows, function(row) {
    for (col in setdiff(all_cols, names(row))) row[[col]] <- NA_real_
    row[, all_cols, drop = FALSE]
  })
  metrics_df <- do.call(rbind, metrics_rows)
  best_params_df <- if (length(best_rows)) do.call(rbind, best_rows) else data.frame()
  fold_status_df <- if (length(fold_status_rows)) {
    do.call(rbind, lapply(seq_along(fold_status_rows), function(i) {
      row <- fold_status_rows[[i]]
      if (is.null(row)) {
        data.frame(
          fold = i,
          stage = "outer_loop",
          status = "failed",
          reason = "unknown",
          notes = "Fold ended without a recorded status.",
          stringsAsFactors = FALSE
        )
      } else {
        row
      }
    }))
  } else {
    data.frame(fold = integer(0), stage = character(0), status = character(0),
               reason = character(0), notes = character(0), stringsAsFactors = FALSE)
  }
  thresholds_df <- if (length(threshold_rows)) {
    do.call(rbind, threshold_rows)
  } else {
    data.frame(fold = integer(0), threshold = numeric(0),
               metric_value = numeric(0), n_inner = integer(0), stringsAsFactors = FALSE)
  }
  threshold_stability <- if (nrow(thresholds_df)) {
    list(
      mean = mean(thresholds_df$threshold, na.rm = TRUE),
      sd = stats::sd(thresholds_df$threshold, na.rm = TRUE),
      min = min(thresholds_df$threshold, na.rm = TRUE),
      max = max(thresholds_df$threshold, na.rm = TRUE)
    )
  } else {
    list(mean = NA_real_, sd = NA_real_, min = NA_real_, max = NA_real_)
  }

  metric_cols <- setdiff(names(metrics_df), "fold")
  metric_summary_raw <- aggregate(. ~ learner, data = metrics_df[, metric_cols, drop = FALSE],
                                  FUN = function(x) c(mean = mean(x, na.rm = TRUE),
                                                      sd = sd(x, na.rm = TRUE)),
                                  na.action = stats::na.pass)
  # flatten embedded matrices from aggregate() into separate columns
  metric_summary <- data.frame(learner = metric_summary_raw$learner,
                               stringsAsFactors = FALSE)
  for (col in setdiff(colnames(metric_summary_raw), "learner")) {
    mat <- metric_summary_raw[[col]]
    if (is.matrix(mat)) {
      metric_summary[[paste0(col, "_mean")]] <- mat[, "mean"]
      metric_summary[[paste0(col, "_sd")]] <- mat[, "sd"]
    } else {
      metric_summary[[col]] <- mat
    }
  }

  # confidence intervals -------------------------------------------------------
  tryCatch({
    ci_df <- cv_ci(metrics_df, method = "nadeau_bengio")
    ci_cols <- grep("_ci_lo$|_ci_hi$", names(ci_df), value = TRUE)
    if (length(ci_cols) && nrow(ci_df) == nrow(metric_summary)) {
      for (cc in ci_cols) {
        metric_summary[[cc]] <- ci_df[[cc]]
      }
    }
  }, error = function(e) NULL)

  final_model <- NULL
  final_workflow <- NULL
  final_params <- data.frame()
  refit_fold <- NA_integer_
  refit_metric <- NA_character_
  if (isTRUE(refit) && nrow(best_params_df) > 0) {
    refit_metric <- selection_metric_used

    # Aggregate hyperparameters across all outer folds to avoid nested-CV
    # leakage. The old approach selected the fold with the best outer-fold
    # test metric and used that fold's inner-CV params — a violation because
    # final params become informed by outer test performance.
    #
    # This coordinate-wise aggregation (median for numeric, majority vote
    # for categorical) is leakage-safe but may produce parameter combinations
    # never jointly evaluated in any single inner CV. This is an accepted
    # tradeoff: the aggregated values lack direct validation grounding but
    # preserve the independence of the outer evaluation loop.
    param_cols <- setdiff(names(best_params_df), "fold")
    if (length(param_cols)) {
      agg_list <- lapply(param_cols, function(col) {
        vals <- best_params_df[[col]]
        if (is.numeric(vals)) {
          stats::median(vals, na.rm = TRUE)
        } else {
          # majority vote
          tbl <- table(vals)
          names(tbl)[which.max(tbl)]
        }
      })
      names(agg_list) <- param_cols
      final_params <- as.data.frame(agg_list, stringsAsFactors = FALSE)
      # Restore numeric types where needed
      for (col in param_cols) {
        if (is.numeric(best_params_df[[col]])) {
          final_params[[col]] <- as.numeric(final_params[[col]])
        }
      }
      refit_fold <- NA_integer_
      final_workflow <- tune::finalize_workflow(base_workflow, final_params)
      final_model <- generics::fit(final_workflow, data = df_all)
    } else {
      warning("Refit requested but skipped: no hyperparameter columns found.", call. = FALSE)
    }
  }

  structure(
    list(
      splits = splits,
      metrics = metrics_df,
      metric_summary = metric_summary,
      best_params = best_params_df,
      inner_results = inner_results,
      outer_fits = outer_fits,
      thresholds = thresholds_df,
      fold_status = fold_status_df,
      final_model = final_model,
      final_workflow = final_workflow,
      final_params = final_params,
      info = list(
        task = task,
        metrics_requested = metric_names,
        metrics_used = if (length(metrics_used)) metrics_used else metric_names,
        positive_class = if (task == "binomial") class_levels[2] else NULL,
        selection = selection,
        selection_metric = selection_metric_used,
        selection_metric_requested = selection_metric_requested,
        threshold_tuned = isTRUE(tune_threshold) && identical(task, "binomial"),
        threshold_metric = threshold_metric_label,
        threshold_grid = threshold_grid,
        thresholds = thresholds_df,
        threshold_stability = threshold_stability,
        refit = isTRUE(refit),
        refit_method = if (isTRUE(refit)) "aggregate" else NA_character_,
        refit_metric = refit_metric,
        refit_fold = refit_fold,
        fold_status = fold_status_df,
        grid = grid,
        seed = seed,
        provenance = .bio_capture_provenance()
      )
    ),
    class = "LeakTune"
  )
}