Nothing
# Tidymodels interoperability helpers ---------------------------------------
.bio_is_rset <- function(x) inherits(x, "rset")
.bio_is_rsplit <- function(x) inherits(x, "rsplit")
.bio_is_rsample <- function(x) .bio_is_rset(x) || .bio_is_rsplit(x)
.bio_is_recipe <- function(x) inherits(x, "recipe")
.bio_is_workflow <- function(x) inherits(x, "workflow")
.bio_is_yardstick_metric <- function(x) inherits(x, "metric") || inherits(x, "metric_function")
.bio_validation_mode <- function(mode = NULL) {
if (.bio_is_strict()) return("error")
mode <- mode %||% getOption("bioLeak.validation_mode", "warn")
if (is.null(mode) || length(mode) != 1L || !is.character(mode)) {
mode <- "warn"
}
mode <- tolower(mode)
if (!mode %in% c("warn", "error", "off")) {
warning("Unknown bioLeak validation mode; using 'warn'.", call. = FALSE)
mode <- "warn"
}
mode
}
.bio_signal_policy <- function(message, mode = "warn") {
mode <- .bio_validation_mode(mode)
if (identical(mode, "off")) return(invisible(FALSE))
if (identical(mode, "error")) {
stop(message, call. = FALSE)
}
warning(message, call. = FALSE)
invisible(TRUE)
}
.bio_recipe_step_label <- function(step) {
cls <- class(step)
cls <- cls[!cls %in% c("step", "list")]
if (!length(cls)) return("step")
cls[[1]]
}
.bio_recipe_is_trained <- function(recipe_obj) {
if (!.bio_is_recipe(recipe_obj)) return(FALSE)
has_tr_info <- !is.null(recipe_obj$tr_info)
has_orig_levels <- !is.null(recipe_obj$orig_lvls)
has_fit_times <- !is.null(recipe_obj$fit_times)
isTRUE(has_tr_info || has_orig_levels || has_fit_times)
}
.bio_validate_recipe_graph <- function(recipe_obj, context = "fit_resample", mode = NULL) {
if (!.bio_is_recipe(recipe_obj)) return(invisible(TRUE))
mode <- .bio_validation_mode(mode)
if (identical(mode, "off")) return(invisible(TRUE))
if (.bio_recipe_is_trained(recipe_obj)) {
.bio_signal_policy(
sprintf(
"%s: received a trained recipe. Pass an untrained recipe to avoid fold-global leakage.",
context
),
mode
)
}
steps <- recipe_obj$steps %||% list()
if (length(steps)) {
for (i in seq_along(steps)) {
st <- steps[[i]]
if (inherits(st, "step_lag")) {
lag_vals <- suppressWarnings(as.numeric(st$lag %||% numeric(0)))
if (length(lag_vals) && any(is.finite(lag_vals) & lag_vals < 0)) {
.bio_signal_policy(
sprintf(
"%s: recipe step %d (%s) uses negative lag values; this can introduce lookahead leakage.",
context, i, .bio_recipe_step_label(st)
),
mode
)
}
}
}
}
invisible(TRUE)
}
.bio_workflow_is_trained <- function(workflow_obj) {
if (!.bio_is_workflow(workflow_obj) || !requireNamespace("workflows", quietly = TRUE)) {
return(FALSE)
}
is_trained_fn <- get0("is_trained_workflow", envir = asNamespace("workflows"), inherits = FALSE)
if (is.function(is_trained_fn)) {
out <- tryCatch(is_trained_fn(workflow_obj), error = function(e) FALSE)
return(isTRUE(out))
}
has_fit <- tryCatch({
workflows::extract_fit_parsnip(workflow_obj)
TRUE
}, error = function(e) FALSE)
isTRUE(has_fit)
}
.bio_validate_workflow_graph <- function(workflow_obj, context = "fit_resample", mode = NULL) {
if (!.bio_is_workflow(workflow_obj)) return(invisible(TRUE))
mode <- .bio_validation_mode(mode)
if (identical(mode, "off")) return(invisible(TRUE))
if (.bio_workflow_is_trained(workflow_obj)) {
.bio_signal_policy(
sprintf(
"%s: received a trained workflow. Pass an unfitted workflow to avoid fold-global leakage.",
context
),
mode
)
}
if (requireNamespace("workflows", quietly = TRUE)) {
pre <- tryCatch(workflows::extract_preprocessor(workflow_obj), error = function(e) NULL)
if (.bio_is_recipe(pre)) {
.bio_validate_recipe_graph(
pre,
context = paste0(context, " (workflow preprocessor)"),
mode = mode
)
}
}
invisible(TRUE)
}
.bio_perm_mode <- function(splits) {
if (!inherits(splits, "LeakSplits")) return(NA_character_)
mode <- splits@mode %||% NA_character_
valid_modes <- c("subject_grouped", "batch_blocked", "study_loocv", "time_series")
if (!identical(mode, "rsample")) {
return(mode)
}
perm_mode <- splits@info$perm_mode %||% NULL
if (is.null(perm_mode)) return(mode)
perm_mode <- as.character(perm_mode)
perm_mode <- perm_mode[!is.na(perm_mode) & nzchar(perm_mode)]
if (!length(perm_mode)) return(mode)
perm_mode <- perm_mode[[1]]
if (perm_mode %in% valid_modes) perm_mode else mode
}
.bio_rsample_col_name <- function(val) {
if (is.null(val) || is.logical(val)) return(NULL)
if (is.character(val)) {
val <- val[!is.na(val) & nzchar(val)]
return(if (length(val)) val else NULL)
}
if (inherits(val, "formula")) {
vars <- all.vars(val)
return(if (length(vars)) vars else NULL)
}
if (is.name(val)) return(as.character(val))
if (is.call(val)) {
vars <- all.vars(val)
return(if (length(vars)) vars else NULL)
}
NULL
}
.bio_rsample_split_cols <- function(splits) {
grab <- function(nm) .bio_rsample_col_name(attr(splits, nm, exact = TRUE))
list(
group = grab("group") %||% grab("groups"),
batch = grab("batch"),
study = grab("study"),
time = grab("time") %||% grab("index") %||% grab("date")
)
}
.bio_rsplit_indices <- function(split, n = NULL) {
if (!inherits(split, "rsplit")) {
stop("Expected an rsample rsplit object.", call. = FALSE)
}
in_id <- split$in_id %||% split$analysis
out_id <- split$out_id %||% split$assessment
if (is.null(in_id)) {
stop("rsplit object does not expose analysis indices.", call. = FALSE)
}
if (is.null(out_id)) {
if (is.null(n)) {
stop("rsplit object does not expose assessment indices; provide n.", call. = FALSE)
}
out_id <- setdiff(seq_len(n), in_id)
}
if (!is.null(n)) {
if (any(in_id < 1L | in_id > n)) {
stop("rsplit analysis indices exceed data rows.", call. = FALSE)
}
if (any(out_id < 1L | out_id > n)) {
stop("rsplit assessment indices exceed data rows.", call. = FALSE)
}
}
list(train = as.integer(in_id), test = as.integer(out_id))
}
.bio_as_leaksplits_from_rsample <- function(splits, n = NULL, coldata = NULL, split_cols = NULL) {
if (!requireNamespace("rsample", quietly = TRUE)) {
stop("Package 'rsample' is required for rsample split objects.", call. = FALSE)
}
if (.bio_is_rsplit(splits)) {
split_list <- list(splits)
split_ids <- "Fold1"
split_ids2 <- NULL
} else if (.bio_is_rset(splits)) {
split_list <- splits$splits
split_ids <- if ("id" %in% names(splits)) as.character(splits$id) else NULL
split_ids2 <- if ("id2" %in% names(splits)) as.character(splits$id2) else NULL
} else {
stop("splits must be a LeakSplits or rsample rset/rsplit.", call. = FALSE)
}
if (!length(split_list)) {
stop("rsample object contains no splits.", call. = FALSE)
}
if (is.null(split_ids)) {
split_ids <- paste0("Fold", seq_along(split_list))
}
fold_map <- as.integer(factor(split_ids, levels = unique(split_ids)))
if (is.null(split_ids2)) {
repeat_map <- rep(1L, length(split_list))
} else {
repeat_map <- as.integer(factor(split_ids2, levels = unique(split_ids2)))
}
rsample_cols_attr <- .bio_rsample_split_cols(splits)
rsample_cols <- rsample_cols_attr
split_cols_norm <- NULL
auto_cols <- FALSE
if (!is.null(split_cols)) {
if (is.character(split_cols) && length(split_cols) == 1L && identical(split_cols, "auto")) {
auto_cols <- TRUE
split_cols <- NULL
} else if (is.character(split_cols)) {
if (is.null(names(split_cols)) || !length(names(split_cols))) {
stop("split_cols must be a named character vector or list.", call. = FALSE)
}
split_cols <- as.list(split_cols)
} else if (!is.list(split_cols)) {
stop("split_cols must be a named character vector or list.", call. = FALSE)
}
if (!is.null(split_cols)) {
split_cols_norm <- list(
group = .bio_rsample_col_name(split_cols$group %||% split_cols$groups),
batch = .bio_rsample_col_name(split_cols$batch),
study = .bio_rsample_col_name(split_cols$study),
time = .bio_rsample_col_name(split_cols$time)
)
for (nm in names(split_cols_norm)) {
if (!is.null(split_cols_norm[[nm]])) {
rsample_cols[[nm]] <- split_cols_norm[[nm]]
}
}
}
}
if (isTRUE(auto_cols) && !is.null(coldata)) {
cd_names <- names(coldata)
if (is.null(rsample_cols$group)) {
if ("group" %in% cd_names) {
rsample_cols$group <- "group"
} else if ("subject" %in% cd_names) {
rsample_cols$group <- "subject"
}
}
if (is.null(rsample_cols$batch)) {
if ("batch" %in% cd_names) rsample_cols$batch <- "batch"
}
if (is.null(rsample_cols$study)) {
if ("study" %in% cd_names) rsample_cols$study <- "study"
}
if (is.null(rsample_cols$time)) {
if ("time" %in% cd_names) {
rsample_cols$time <- "time"
} else if ("date" %in% cd_names) {
rsample_cols$time <- "date"
} else if ("index" %in% cd_names) {
rsample_cols$time <- "index"
}
}
}
valid_modes <- c("subject_grouped", "batch_blocked", "study_loocv", "time_series")
mode_attr <- attr(splits, "bioLeak_mode", exact = TRUE)
if (!is.null(mode_attr)) {
mode_attr <- as.character(mode_attr)
mode_attr <- mode_attr[!is.na(mode_attr) & nzchar(mode_attr)]
mode_attr <- if (length(mode_attr)) mode_attr[[1]] else NULL
}
perm_mode_attr <- attr(splits, "bioLeak_perm_mode", exact = TRUE)
if (!is.null(perm_mode_attr)) {
perm_mode_attr <- as.character(perm_mode_attr)
perm_mode_attr <- perm_mode_attr[!is.na(perm_mode_attr) & nzchar(perm_mode_attr)]
perm_mode_attr <- if (length(perm_mode_attr)) perm_mode_attr[[1]] else NULL
}
mode_attr_valid <- !is.null(mode_attr) && mode_attr %in% valid_modes
perm_mode_attr_valid <- !is.null(perm_mode_attr) && perm_mode_attr %in% valid_modes
if (isTRUE(mode_attr_valid) && isTRUE(perm_mode_attr_valid) &&
!identical(mode_attr, perm_mode_attr)) {
stop("bioLeak_mode and bioLeak_perm_mode conflict; supply only one or ensure they match.",
call. = FALSE)
}
split_mode <- "rsample"
perm_mode <- NULL
if (isTRUE(mode_attr_valid)) {
split_mode <- mode_attr
perm_mode <- mode_attr
} else if (isTRUE(perm_mode_attr_valid)) {
perm_mode <- perm_mode_attr
} else {
time_ok <- function(col, cd) {
if (is.null(cd)) return(FALSE)
if (!is.null(col)) {
col <- col[!is.na(col) & nzchar(col)]
col <- if (length(col)) col[[1]] else NULL
}
if (is.null(col)) {
if (!"time" %in% names(cd)) return(FALSE)
col <- "time"
}
if (!col %in% names(cd)) return(FALSE)
vals <- cd[[col]]
is.numeric(vals) || inherits(vals, c("POSIXct", "Date"))
}
time_col <- rsample_cols$time
time_series_classes <- c("rolling_origin", "sliding_index", "sliding_period", "sliding_window")
if (inherits(splits, time_series_classes) && time_ok(time_col, coldata)) {
perm_mode <- "time_series"
} else if (inherits(splits, "group_rset")) {
perm_mode <- "subject_grouped"
} else {
explicit_cols <- rsample_cols_attr
if (!is.null(split_cols_norm)) {
for (nm in names(split_cols_norm)) {
explicit_cols[[nm]] <- split_cols_norm[[nm]]
}
}
has_explicit <- any(vapply(explicit_cols, function(x) {
!is.null(x) && length(x)
}, logical(1)))
perm_cols <- if (isTRUE(has_explicit)) explicit_cols else rsample_cols
candidates <- character(0)
if (!is.null(perm_cols$time) && time_ok(perm_cols$time, coldata)) {
candidates <- c(candidates, "time_series")
}
if (!is.null(perm_cols$study)) {
candidates <- c(candidates, "study_loocv")
}
if (!is.null(perm_cols$batch)) {
candidates <- c(candidates, "batch_blocked")
}
if (!is.null(perm_cols$group)) {
candidates <- c(candidates, "subject_grouped")
}
if (length(candidates) == 1L) perm_mode <- candidates[[1]]
}
}
if (is.null(perm_mode)) {
stop(paste0("rsample splits require an explicit perm_mode (set attr(splits, ",
"'bioLeak_perm_mode') or 'bioLeak_mode') or a single inferable ",
"group/batch/study/time column via split_cols."),
call. = FALSE)
}
indices <- vector("list", length(split_list))
summary_rows <- vector("list", length(split_list))
for (i in seq_along(split_list)) {
idx <- .bio_rsplit_indices(split_list[[i]], n = n)
indices[[i]] <- list(
train = idx$train,
test = idx$test,
fold = fold_map[[i]],
repeat_id = repeat_map[[i]]
)
summary_rows[[i]] <- data.frame(
fold = fold_map[[i]],
repeat_id = repeat_map[[i]],
train_n = length(idx$train),
test_n = length(idx$test),
stringsAsFactors = FALSE
)
}
split_summary <- do.call(rbind, summary_rows)
info <- list(
outcome = NULL,
v = length(unique(fold_map)),
repeats = length(unique(repeat_map)),
seed = NA_integer_,
mode = split_mode,
perm_mode = perm_mode,
group = rsample_cols$group,
batch = rsample_cols$batch,
study = rsample_cols$study,
time = rsample_cols$time,
stratify = NA,
nested = FALSE,
horizon = 0,
purge = 0,
embargo = 0,
summary = split_summary,
hash = .bio_hash_indices(indices),
inner = NULL,
compact = FALSE,
fold_assignments = NULL,
coldata = coldata,
source = "rsample",
id = split_ids,
id2 = split_ids2,
split_cols_override = split_cols_norm
)
methods::new("LeakSplits", mode = split_mode, indices = indices, info = info)
}
.bio_resolve_fold_indices <- function(splits, fold, n, data = NULL) {
if (!inherits(splits, "LeakSplits")) {
stop("splits must be a LeakSplits object.", call. = FALSE)
}
if (!isTRUE(splits@info$compact) || !is.null(fold$train)) return(fold)
fold_assignments <- splits@info$fold_assignments
if (is.null(fold_assignments) || !length(fold_assignments)) {
stop("Compact splits require fold assignments to compute indices.", call. = FALSE)
}
r <- fold$repeat_id %||% 1L
assign_vec <- fold_assignments[[r]]
if (is.null(assign_vec)) {
stop(sprintf("Missing fold assignments for repeat %s.", r), call. = FALSE)
}
test <- which(assign_vec == fold$fold)
if (identical(splits@mode, "time_series")) {
time_col <- splits@info$time
time_vec <- NULL
if (!is.null(time_col) && !is.null(data) && time_col %in% names(data)) {
time_vec <- data[[time_col]]
} else if (!is.null(time_col) && !is.null(splits@info$coldata) &&
time_col %in% names(splits@info$coldata)) {
time_vec <- splits@info$coldata[[time_col]]
}
if (is.null(time_vec)) {
stop("time_series compact splits require time column in data.", call. = FALSE)
}
split_horizon <- splits@info$horizon %||% 0
split_purge <- splits@info$purge %||% 0
split_embargo <- splits@info$embargo %||% 0
train <- .bio_time_series_train_indices(
time_vec = time_vec,
test_idx = test,
candidate_idx = seq_len(n),
horizon = split_horizon,
purge = split_purge,
embargo = split_embargo
)
} else {
train <- setdiff(seq_len(n), test)
}
list(train = train, test = test, fold = fold$fold, repeat_id = fold$repeat_id)
}
.bio_make_rsplit <- function(train, test, data, id = NULL) {
rsample_make_splits <- utils::getFromNamespace("make_splits", "rsample")
args <- list(list(analysis = train, assessment = test), data = data)
if (!is.null(id)) args$id <- id
out <- try(do.call(rsample_make_splits, args), silent = TRUE)
if (inherits(out, "try-error") && !is.null(id)) {
args$id <- NULL
out <- do.call(rsample_make_splits, args)
}
out
}
#' Convert LeakSplits to an rsample resample set
#'
#' @param x LeakSplits object created by [make_split_plan()].
#' @param data Optional data.frame used to populate rsample splits. When NULL,
#' the stored `coldata` from `x` is used (if available).
#' @param ... Additional arguments passed to methods (unused).
#' @return An rsample \code{rset} object compatible with tidymodels workflows.
#' The returned object is a tibble with class \code{rset} containing:
#' \describe{
#' \item{\code{splits}}{List-column of \code{rsplit} objects, each with
#' \code{analysis} (training indices) and \code{assessment} (test indices).}
#' \item{\code{id}}{Character column with fold identifiers (e.g., "Fold1").}
#' \item{\code{id2}}{Character column with repeat identifiers (e.g., "Repeat1")
#' when multiple repeats are present; otherwise absent.}
#' }
#' The object also carries attributes for \code{group}, \code{batch},
#' \code{study}, \code{time} (when available from the original \code{LeakSplits}),
#' and \code{bioLeak_mode} indicating the original splitting mode. This allows
#' the splits to be used with \code{tune::tune_grid()}, \code{rsample::fit_resamples()},
#' and other tidymodels functions.
#' @examples
#' if (requireNamespace("rsample", quietly = TRUE)) {
#' df <- data.frame(
#' subject = rep(1:10, each = 2),
#' outcome = rbinom(20, 1, 0.5),
#' x1 = rnorm(20),
#' x2 = rnorm(20)
#' )
#' splits <- make_split_plan(df, outcome = "outcome",
#' mode = "subject_grouped", group = "subject", v = 5)
#' rset <- as_rsample(splits, data = df)
#' }
#' @export
as_rsample <- function(x, data = NULL, ...) UseMethod("as_rsample")
#' @export
as_rsample.LeakSplits <- function(x, data = NULL, ...) {
if (!requireNamespace("rsample", quietly = TRUE)) {
stop("Package 'rsample' is required to export rsample splits.", call. = FALSE)
}
if (!inherits(x, "LeakSplits")) {
stop("x must be a LeakSplits object.", call. = FALSE)
}
if (is.null(data)) data <- x@info$coldata
if (is.null(data)) {
stop("Provide 'data' to export rsample splits.", call. = FALSE)
}
data <- as.data.frame(data, check.names = FALSE)
n <- nrow(data)
folds <- x@indices
if (!length(folds)) stop("LeakSplits object contains no folds.", call. = FALSE)
resolved <- lapply(folds, function(fold) {
.bio_resolve_fold_indices(x, fold, n = n, data = data)
})
ids <- vapply(resolved, function(fold) {
paste0("Fold", fold$fold)
}, character(1))
repeats <- vapply(resolved, function(fold) fold$repeat_id %||% 1L, integer(1))
ids2 <- if (length(unique(repeats)) > 1L) {
paste0("Repeat", repeats)
} else {
NULL
}
split_objs <- lapply(seq_along(resolved), function(i) {
fold <- resolved[[i]]
.bio_make_rsplit(fold$train, fold$test, data = data, id = ids[[i]])
})
manual_rset <- utils::getFromNamespace("manual_rset", "rsample")
args <- list(splits = split_objs, ids = ids)
if (!is.null(ids2) && "ids2" %in% names(formals(manual_rset))) {
args$ids2 <- ids2
}
rset <- do.call(manual_rset, args)
info <- x@info
if (!is.null(info$group)) attr(rset, "group") <- info$group
if (!is.null(info$batch)) attr(rset, "batch") <- info$batch
if (!is.null(info$study)) attr(rset, "study") <- info$study
if (!is.null(info$time)) attr(rset, "time") <- info$time
if (!is.null(info$mode)) attr(rset, "bioLeak_mode") <- info$mode
rset
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.