Framework Integration
In BORG: Bounded Outcome Risk Guard for Model Evaluation

# Set knitr chunk defaults for the document: collapsed output, "#>" as the
# output prefix, and 7 x 5 figure dimensions.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(BORG)

# Check package availability
# requireNamespace() reports whether each optional framework is installed
# without attaching it. The flags are not read by any code visible here;
# presumably they gate evaluation of the framework-specific chunks --
# TODO confirm against the chunk options in the .Rmd source.
has_caret <- requireNamespace("caret", quietly = TRUE)
has_recipes <- requireNamespace("recipes", quietly = TRUE)
has_rsample <- requireNamespace("rsample", quietly = TRUE)
has_mlr3 <- requireNamespace("mlr3", quietly = TRUE)

BORG integrates with major R machine learning frameworks. This guide shows how to validate workflows and use BORG-guarded CV functions in each ecosystem.

Base R

The simplest integration - manual index-based splitting:

# Create data
data <- iris
set.seed(42)
n <- nrow(data)

# Random 70/30 split of the row indices.
# floor() makes the integer training-set size explicit instead of depending
# on how sample() coerces a fractional `size`.
train_idx <- sample(n, floor(0.7 * n))

# Everything not drawn for training becomes the test set. seq_len() is used
# instead of 1:n because 1:n yields c(1, 0) when n is 0.
test_idx <- setdiff(seq_len(n), train_idx)

# Validate the split
# Pass the full data set together with both index vectors so the train/test
# assignment itself is what gets checked.
borg(data, train_idx = train_idx, test_idx = test_idx)

Safe Preprocessing Pattern

# CORRECT: Fit preprocessing on training data only
# Means and standard deviations of the four numeric columns are estimated
# from the training rows alone.
train_data <- data[train_idx, ]
train_means <- colMeans(train_data[, 1:4])
# vapply() iterates over the data frame's columns directly; apply() would
# first coerce the data frame to a matrix. Both give the same named numeric
# vector of per-column standard deviations.
train_sds <- vapply(train_data[, 1:4], sd, numeric(1))

# Apply train statistics to both sets
# The test rows are centered and scaled with the *training* parameters, so
# no test-set information enters the preprocessing step.
scaled_train <- scale(
  train_data[, 1:4],
  center = train_means,
  scale = train_sds
)
scaled_test <- scale(
  data[test_idx, 1:4],
  center = train_means,
  scale = train_sds
)

caret

BORG can validate preProcess and trainControl objects, and provides a guarded wrapper.

Validating preProcess Objects

library(caret)

data(mtcars)
# Fixed, non-overlapping row ranges: rows 1-25 for training, 26-32 for testing
train_idx <- 1:25
test_idx <- 26:32

# BAD: preProcess on full data (LEAKS!)
# The centering/scaling parameters are estimated from every row of mtcars
# (all columns except the first), so the test rows contribute to them.
pp_bad <- preProcess(mtcars[, -1], method = c("center", "scale"))
borg_inspect(pp_bad, train_idx, test_idx, data = mtcars)

# GOOD: preProcess on training data only
# Same preprocessing, but estimated from the training rows alone.
pp_good <- preProcess(mtcars[train_idx, -1], method = c("center", "scale"))
borg_inspect(pp_good, train_idx, test_idx, data = mtcars)

BORG-Guarded trainControl

Use borg_trainControl() to automatically block random resampling when dependencies are detected:

# Standard caret workflow with spatial data
# Synthetic example: 200 points with uniform coordinates in [0, 100] and a
# standard-normal response. No seed is set in this chunk, so the simulated
# values depend on the RNG state left by earlier code.
spatial_data <- data.frame(
  lon = runif(200, 0, 100),
  lat = runif(200, 0, 100),
  response = rnorm(200)
)

# This will warn/error if random CV is inappropriate
# `coords` names the coordinate columns of `data`; `method` and `number`
# request 5-fold cross-validation.
ctrl <- borg_trainControl(
  data = spatial_data,
  coords = c("lon", "lat"),
  method = "cv",
  number = 5
)
# If spatial autocorrelation detected, blocks random CV
# Use auto_block = TRUE to automatically switch to spatial blocking

tidymodels (rsample + recipes)

Validating Recipe Objects

library(recipes)
library(rsample)

data(mtcars)
set.seed(123)
# 80/20 initial split; the split's `in_id` element is used as the training
# row indices, and the remaining rows of mtcars form the test indices.
split <- initial_split(mtcars, prop = 0.8)
train_idx <- split$in_id
test_idx <- setdiff(seq_len(nrow(mtcars)), train_idx)

# BAD: Recipe prepped on full data
# The recipe is defined on all of mtcars, so prep() estimates the
# normalization parameters from training and test rows together.
rec_bad <- recipe(mpg ~ ., data = mtcars) |>
  step_normalize(all_numeric_predictors()) |>
  prep()  # Uses full mtcars!

borg_inspect(rec_bad, train_idx, test_idx, data = mtcars)

# GOOD: Recipe prepped on training only
# Same steps, but the recipe only ever sees training(split).
rec_good <- recipe(mpg ~ ., data = training(split)) |>
  step_normalize(all_numeric_predictors()) |>
  prep()

borg_inspect(rec_good, train_idx, test_idx, data = mtcars)

BORG-Guarded rsample Functions

BORG provides drop-in replacements for rsample functions that respect data dependencies:

# NOTE(review): `clinical_data` and `ts_data` are not defined anywhere above
# this chunk in the visible text (they are created in later sections), so the
# chunk fails if run top-to-bottom as shown. Presumably it is illustrative
# and not evaluated in the original vignette -- TODO confirm.

# Standard rsample
# `data` is the object assigned in the Base R section (iris).
folds <- vfold_cv(data, v = 5)  # Random folds

# BORG-guarded version
# Reassigns `folds`; `coords` names the coordinate columns of spatial_data.
folds <- borg_vfold_cv(
  data = spatial_data,
  coords = c("lon", "lat"),
  v = 5,
  auto_block = TRUE  # Switches to spatial blocking if needed
)

# For grouped data
# Note that `group` is given as a bare column name here, whereas `coords`
# and `time` in the neighbouring calls are given as strings.
folds <- borg_group_vfold_cv(
  data = clinical_data,
  group = patient_id,
  v = 5
)

# For temporal data - enforces chronological ordering
# Overwrites the `split` object created in the recipes section.
split <- borg_initial_split(
  data = ts_data,
  time = "date",
  prop = 0.8
)

Validating rsample Objects

# Validate existing rsample objects
# Synthetic daily series: 200 consecutive dates starting 2020-01-01, with
# `value` built as a random walk (cumulative sum of standard-normal draws).
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 200),
  value = cumsum(rnorm(200))
)

# Rolling-origin resampling with initial = 100 and assess = 20.
# cumulative = FALSE is presumably what keeps the analysis window from
# growing between resamples -- see rsample's rolling_origin() documentation.
rolling <- rolling_origin(
  data = ts_data,
  initial = 100,
  assess = 20,
  cumulative = FALSE
)

# Check for temporal leakage
# The index arguments are passed as NULL; presumably the resampling object
# carries its own train/test splits -- TODO confirm in borg_inspect() docs.
borg_inspect(rolling, train_idx = NULL, test_idx = NULL)

mlr3

Validate mlr3 tasks and resamplings:

library(mlr3)

# Create task
# Classification task on iris with Species as the target column.
task <- TaskClassif$new("iris", iris, target = "Species")

# Create resampling
# 5-fold CV, instantiated on the task before the fold sets are queried.
resampling <- rsmp("cv", folds = 5)
resampling$instantiate(task)

# Validate first fold
# Only fold 1 is inspected here; its train/test row ids are taken from the
# resampling object and passed to borg_inspect() alongside the task.
train_idx <- resampling$train_set(1)
test_idx <- resampling$test_set(1)
borg_inspect(task, train_idx, test_idx)

Temporal Data Workflows

For time series and panel data, temporal ordering is critical.

Basic Temporal Validation

# Synthetic daily series of 365 observations starting 2020-01-01:
# `value` is a random walk, `feature` is independent standard-normal noise.
set.seed(123)
n <- 365
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = n),
  value = cumsum(rnorm(n)),
  feature = rnorm(n)
)

# Chronological split
# Rows are already in date order, so the first 252 rows train and the
# remaining 113 rows test; every test date follows every training date.
train_idx <- 1:252
test_idx <- 253:365

# Validate temporal ordering
# `time` names the date column of ts_data.
result <- borg(ts_data, train_idx = train_idx, test_idx = test_idx, time = "date")
result

Rolling Origin with rsample

# Rolling-origin resampling over ts_data with initial = 200 and assess = 30;
# cumulative = FALSE as in the earlier rolling_origin() example.
rolling <- rolling_origin(
  data = ts_data,
  initial = 200,
  assess = 30,
  cumulative = FALSE
)

# Validate the resampling scheme
# The index arguments are NULL; the resampling object itself is inspected.
borg_inspect(rolling, train_idx = NULL, test_idx = NULL)

Spatial Data Workflows

For spatial data, nearby points are often correlated.

Spatial Block Validation

# Synthetic spatial data: 200 points with uniform coordinates in [-10, 10]
# and standard-normal response and predictor columns.
set.seed(456)
n <- 200
spatial_data <- data.frame(
  lon = runif(n, -10, 10),
  lat = runif(n, -10, 10),
  response = rnorm(n),
  predictor = rnorm(n)
)

# Geographic split (west vs east)
# Points with negative longitude train; points with lon >= 0 test. The two
# conditions are complementary, so every row lands in exactly one set.
train_idx <- which(spatial_data$lon < 0)
test_idx <- which(spatial_data$lon >= 0)

# Validate with spatial awareness
# `coords` names the coordinate columns of spatial_data.
result <- borg(spatial_data,
               train_idx = train_idx,
               test_idx = test_idx,
               coords = c("lon", "lat"))
result

Automatic Spatial CV Generation

# Let BORG generate spatially-blocked folds
# No train/test indices are supplied here; the call passes the coordinate
# columns, the target column and v = 5 instead.
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5)
# `diagnosis` is accessed with `@`, i.e. as an object with a
# `recommended_cv` slot.
result$diagnosis@recommended_cv

# Access the folds
length(result$folds)

Grouped Data Workflows

For hierarchical data (patients, sites, species):

# Clinical trial data with repeated measures
# 50 patients x 4 visits = 200 rows; each patient's four visits are
# consecutive rows. `outcome` is standard-normal noise (no seed set here).
clinical_data <- data.frame(
  patient_id = rep(1:50, each = 4),
  visit = rep(1:4, times = 50),
  outcome = rnorm(200)
)

# Let BORG create leave-group-out folds
# `groups` names the column identifying which rows belong together.
result <- borg(clinical_data, groups = "patient_id", target = "outcome", v = 5)
result$diagnosis@recommended_cv

# Verify no patient appears in both train and test
# Map the first fold's train/test row indices back to patient ids and count
# the ids common to both sides.
fold1 <- result$folds[[1]]
train_patients <- unique(clinical_data$patient_id[fold1$train])
test_patients <- unique(clinical_data$patient_id[fold1$test])
length(intersect(train_patients, test_patients))  # Should be 0

Complete Pipeline Validation

Validate an entire workflow at once:

# Build a workflow
data <- iris
set.seed(789)
n <- nrow(data)

# Random 70/30 split of the row indices.
# floor() makes the integer training-set size explicit instead of depending
# on how sample() coerces a fractional `size`.
train_idx <- sample(n, floor(0.7 * n))

# Rows not drawn for training form the test set. seq_len() is used instead
# of 1:n because 1:n yields c(1, 0) when n is 0.
test_idx <- setdiff(seq_len(n), train_idx)

# Validate everything
# The workflow is described as a plain list holding the data and the two
# index vectors, and handed to borg_validate() in one call.
result <- borg_validate(list(
  data = data,
  train_idx = train_idx,
  test_idx = test_idx
))

# Auto-print the validation result
result

With Problematic Workflow

# Workflow with overlap (common mistake)
# Rows 51-100 appear in both index vectors, so 50 rows are shared between
# the training and test sets.
bad_workflow <- list(
  data = iris,
  train_idx = 1:100,
  test_idx = 51:150  # Overlaps!
)

# Same validation entry point as for the clean workflow above
result <- borg_validate(bad_workflow)
result

Automatic Repair with borg_assimilate()

BORG can automatically fix certain types of leakage:

# Workflow with fixable issues
# Same shape as a borg_validate() workflow list; rows 51-100 are in both
# index vectors.
workflow <- list(
  data = iris,
  train_idx = 1:100,
  test_idx = 51:150  # Overlap
)

# Attempt automatic repair
# The returned object is read below through its `fixed` and `unfixable`
# elements.
fixed <- borg_assimilate(workflow)

# Report the outcome of the repair attempt.
if (length(fixed$unfixable) == 0) {
  # Nothing was left over: report how many risks were corrected
  cat("Assimilation complete:", length(fixed$fixed), "risk(s) corrected\n")
} else {
  # Some risks remain: list them so they can be handled manually
  cat(
    "Partial assimilation:",
    length(fixed$unfixable),
    "risk(s) require manual fix:",
    paste(fixed$unfixable, collapse = ", "),
    "\n"
  )
}

Note: Index overlap requires choosing a new split strategy and cannot be fixed automatically.

Summary: Framework Integration Patterns

| Framework | Validation Function | Guarded Alternative |
|-----------|--------------------|--------------------|
| Base R | borg(), borg_inspect() | - |
| caret | borg_inspect(preProcess) | borg_trainControl() |
| rsample | borg_inspect(vfold_cv) | borg_vfold_cv(), borg_initial_split() |
| recipes | borg_inspect(recipe) | - |
| mlr3 | borg_inspect(task) | - |

BORG
Bounded Outcome Risk Guard for Model Evaluation

Framework Integration
In BORG: Bounded Outcome Risk Guard for Model Evaluation

Base R

Safe Preprocessing Pattern

caret

Validating preProcess Objects

BORG-Guarded trainControl

tidymodels (rsample + recipes)

Validating Recipe Objects

BORG-Guarded rsample Functions

Validating rsample Objects

mlr3

Temporal Data Workflows

Basic Temporal Validation

Rolling Origin with rsample

Spatial Data Workflows

Spatial Block Validation

Automatic Spatial CV Generation

Grouped Data Workflows

Complete Pipeline Validation

With Problematic Workflow

Automatic Repair with borg_assimilate()

Summary: Framework Integration Patterns

See Also

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG Bounded Outcome Risk Guard for Model Evaluation

Framework Integration In BORG: Bounded Outcome Risk Guard for Model Evaluation

Base R

Safe Preprocessing Pattern

caret

Validating preProcess Objects

BORG-Guarded trainControl

tidymodels (rsample + recipes)

Validating Recipe Objects

BORG-Guarded rsample Functions

Validating rsample Objects

mlr3

Temporal Data Workflows

Basic Temporal Validation

Rolling Origin with rsample

Spatial Data Workflows

Spatial Block Validation

Automatic Spatial CV Generation

Grouped Data Workflows

Complete Pipeline Validation

With Problematic Workflow

Automatic Repair with borg_assimilate()

Summary: Framework Integration Patterns

See Also

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG
Bounded Outcome Risk Guard for Model Evaluation

Framework Integration
In BORG: Bounded Outcome Risk Guard for Model Evaluation