knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) library(BORG) # Check package availability has_caret <- requireNamespace("caret", quietly = TRUE) has_recipes <- requireNamespace("recipes", quietly = TRUE) has_rsample <- requireNamespace("rsample", quietly = TRUE) has_mlr3 <- requireNamespace("mlr3", quietly = TRUE)
BORG integrates with major R machine learning frameworks. This guide shows how to validate workflows and use BORG-guarded CV functions in each ecosystem.
The simplest integration - manual index-based splitting:
# Create data data <- iris set.seed(42) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate the split borg(data, train_idx = train_idx, test_idx = test_idx)
# CORRECT: Fit preprocessing on training data only train_data <- data[train_idx, ] train_means <- colMeans(train_data[, 1:4]) train_sds <- apply(train_data[, 1:4], 2, sd) # Apply train statistics to both sets scaled_train <- scale(data[train_idx, 1:4], center = train_means, scale = train_sds) scaled_test <- scale(data[test_idx, 1:4], center = train_means, scale = train_sds)
BORG can validate preProcess and trainControl objects, and provides a guarded wrapper.
library(caret) data(mtcars) train_idx <- 1:25 test_idx <- 26:32 # BAD: preProcess on full data (LEAKS!) pp_bad <- preProcess(mtcars[, -1], method = c("center", "scale")) borg_inspect(pp_bad, train_idx, test_idx, data = mtcars) # GOOD: preProcess on training data only pp_good <- preProcess(mtcars[train_idx, -1], method = c("center", "scale")) borg_inspect(pp_good, train_idx, test_idx, data = mtcars)
Use borg_trainControl() to automatically block random resampling when dependencies are detected:
# Standard caret workflow with spatial data spatial_data <- data.frame( lon = runif(200, 0, 100), lat = runif(200, 0, 100), response = rnorm(200) ) # This will warn/error if random CV is inappropriate ctrl <- borg_trainControl( data = spatial_data, coords = c("lon", "lat"), method = "cv", number = 5 ) # If spatial autocorrelation detected, blocks random CV # Use auto_block = TRUE to automatically switch to spatial blocking
library(recipes) library(rsample) data(mtcars) set.seed(123) split <- initial_split(mtcars, prop = 0.8) train_idx <- split$in_id test_idx <- setdiff(seq_len(nrow(mtcars)), train_idx) # BAD: Recipe prepped on full data rec_bad <- recipe(mpg ~ ., data = mtcars) |> step_normalize(all_numeric_predictors()) |> prep() # Uses full mtcars! borg_inspect(rec_bad, train_idx, test_idx, data = mtcars) # GOOD: Recipe prepped on training only rec_good <- recipe(mpg ~ ., data = training(split)) |> step_normalize(all_numeric_predictors()) |> prep() borg_inspect(rec_good, train_idx, test_idx, data = mtcars)
BORG provides drop-in replacements for rsample functions that respect data dependencies:
# Standard rsample folds <- vfold_cv(data, v = 5) # Random folds # BORG-guarded version folds <- borg_vfold_cv( data = spatial_data, coords = c("lon", "lat"), v = 5, auto_block = TRUE # Switches to spatial blocking if needed )
# For grouped data folds <- borg_group_vfold_cv( data = clinical_data, group = patient_id, v = 5 )
# For temporal data - enforces chronological ordering split <- borg_initial_split( data = ts_data, time = "date", prop = 0.8 )
# Validate existing rsample objects ts_data <- data.frame( date = seq(as.Date("2020-01-01"), by = "day", length.out = 200), value = cumsum(rnorm(200)) ) rolling <- rolling_origin( data = ts_data, initial = 100, assess = 20, cumulative = FALSE ) # Check for temporal leakage borg_inspect(rolling, train_idx = NULL, test_idx = NULL)
Validate mlr3 tasks and resamplings:
library(mlr3) # Create task task <- TaskClassif$new("iris", iris, target = "Species") # Create resampling resampling <- rsmp("cv", folds = 5) resampling$instantiate(task) # Validate first fold train_idx <- resampling$train_set(1) test_idx <- resampling$test_set(1) borg_inspect(task, train_idx, test_idx)
For time series and panel data, temporal ordering is critical.
set.seed(123) n <- 365 ts_data <- data.frame( date = seq(as.Date("2020-01-01"), by = "day", length.out = n), value = cumsum(rnorm(n)), feature = rnorm(n) ) # Chronological split train_idx <- 1:252 test_idx <- 253:365 # Validate temporal ordering result <- borg(ts_data, train_idx = train_idx, test_idx = test_idx, time = "date") result
rolling <- rolling_origin( data = ts_data, initial = 200, assess = 30, cumulative = FALSE ) # Validate the resampling scheme borg_inspect(rolling, train_idx = NULL, test_idx = NULL)
For spatial data, nearby points are often correlated.
set.seed(456) n <- 200 spatial_data <- data.frame( lon = runif(n, -10, 10), lat = runif(n, -10, 10), response = rnorm(n), predictor = rnorm(n) ) # Geographic split (west vs east) train_idx <- which(spatial_data$lon < 0) test_idx <- which(spatial_data$lon >= 0) # Validate with spatial awareness result <- borg(spatial_data, train_idx = train_idx, test_idx = test_idx, coords = c("lon", "lat")) result
# Let BORG generate spatially-blocked folds result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5) result$diagnosis@recommended_cv # Access the folds length(result$folds)
For hierarchical data (patients, sites, species):
# Clinical trial data with repeated measures clinical_data <- data.frame( patient_id = rep(1:50, each = 4), visit = rep(1:4, times = 50), outcome = rnorm(200) ) # Let BORG create leave-group-out folds result <- borg(clinical_data, groups = "patient_id", target = "outcome", v = 5) result$diagnosis@recommended_cv # Verify no patient appears in both train and test fold1 <- result$folds[[1]] train_patients <- unique(clinical_data$patient_id[fold1$train]) test_patients <- unique(clinical_data$patient_id[fold1$test]) length(intersect(train_patients, test_patients)) # Should be 0
Validate an entire workflow at once:
# Build a workflow data <- iris set.seed(789) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate everything result <- borg_validate(list( data = data, train_idx = train_idx, test_idx = test_idx )) result
# Workflow with overlap (common mistake) bad_workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlaps! ) result <- borg_validate(bad_workflow) result
BORG can automatically fix certain types of leakage:
# Workflow with fixable issues workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlap ) # Attempt automatic repair fixed <- borg_assimilate(workflow) if (length(fixed$unfixable) > 0) { cat("Partial assimilation:", length(fixed$unfixable), "risk(s) require manual fix:", paste(fixed$unfixable, collapse = ", "), "\n") } else { cat("Assimilation complete:", length(fixed$fixed), "risk(s) corrected\n") }
Note: Index overlap requires choosing a new split strategy and cannot be fixed automatically.
| Framework | Validation Function | Guarded Alternative |
|-----------|--------------------|--------------------|
| Base R | borg(), borg_inspect() | - |
| caret | borg_inspect(preProcess) | borg_trainControl() |
| rsample | borg_inspect(vfold_cv) | borg_vfold_cv(), borg_initial_split() |
| recipes | borg_inspect(recipe) | - |
| mlr3 | borg_inspect(task) | - |
vignette("quickstart") - Basic usage and concepts
vignette("risk-taxonomy") - Complete catalog of detectable risks
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.