Nothing
# ===========================================================================
# Tests for borg_assimilate()
# ===========================================================================
test_that("borg_assimilate validates required arguments", {
# Non-list workflow
expect_error(
borg_assimilate("not a list"),
"'workflow' must be a list"
)
# Invalid risks object
expect_error(
borg_assimilate(list(data = data.frame(x = 1:10), train_idx = 1:5, test_idx = 6:10),
risks = "not a BorgRisk"),
"'risks' must be a BorgRisk object"
)
})
test_that("borg_assimilate returns correct structure", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:5,
test_idx = 6:10
)
result <- borg_assimilate(workflow)
expect_true(is.list(result))
expect_true("workflow" %in% names(result))
expect_true("fixed" %in% names(result))
expect_true("unfixable" %in% names(result))
expect_true("report" %in% names(result))
expect_s4_class(result$report, "BorgRisk")
})
test_that("borg_assimilate passes through clean workflow", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:5,
test_idx = 6:10
)
result <- borg_assimilate(workflow)
expect_true(result$report@is_valid)
expect_equal(length(result$fixed), 0)
expect_equal(length(result$unfixable), 0)
})
test_that("borg_assimilate identifies unfixable index overlap", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:6,
test_idx = 5:10 # overlap at 5, 6
)
result <- borg_assimilate(workflow)
expect_false(result$report@is_valid)
expect_true("index_overlap" %in% result$unfixable)
})
test_that("borg_assimilate identifies unfixable duplicate rows", {
workflow <- list(
data = data.frame(
x = c(1, 2, 3, 4, 5, 1, 2, 3), # rows 6-8 duplicate rows 1-3
y = c(10, 20, 30, 40, 50, 10, 20, 30)
),
train_idx = 1:5,
test_idx = 6:8
)
result <- borg_assimilate(workflow)
expect_true("duplicate_rows" %in% result$unfixable)
})
test_that("borg_assimilate identifies unfixable target leakage", {
set.seed(42)
n <- 100
target <- rnorm(n)
leaked_feature <- target + rnorm(n, sd = 0.001)
workflow <- list(
data = data.frame(
y = target,
x1 = rnorm(n),
x2 = leaked_feature
),
train_idx = 1:70,
test_idx = 71:100,
target_col = "y"
)
result <- borg_assimilate(workflow)
expect_true("target_leakage_direct" %in% result$unfixable)
})
test_that("borg_assimilate accepts pre-computed risks", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:6,
test_idx = 5:10 # overlap
)
# Pre-compute risks
risks <- borg_validate(workflow)
# Pass to rewrite
result <- borg_assimilate(workflow, risks = risks)
expect_true("index_overlap" %in% result$unfixable)
})
test_that("borg_assimilate fix parameter filters risk types", {
skip_if_not_installed("caret")
set.seed(42)
data <- data.frame(
x1 = rnorm(100, mean = 10, sd = 5),
x2 = rnorm(100, mean = 50, sd = 20)
)
train_idx <- 1:70
test_idx <- 71:100
# Create leaky preProcess
pp_bad <- caret::preProcess(data, method = c("center", "scale"))
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
preprocess = pp_bad
)
# With fix = "all" (default), attempt to fix preprocessing
result_all <- borg_assimilate(workflow, fix = "all")
# With fix = "thresholds", skip preprocessing fix
result_thresholds <- borg_assimilate(workflow, fix = "thresholds")
expect_s4_class(result_all$report, "BorgRisk")
expect_s4_class(result_thresholds$report, "BorgRisk")
# fix = "all" should actually fix the preprocessing leak
expect_true("preprocessing_leak" %in% result_all$fixed)
})
test_that("borg_assimilate fixes caret preProcess leak", {
skip_if_not_installed("caret")
set.seed(42)
data <- data.frame(
x1 = rnorm(100, mean = 10, sd = 5),
x2 = rnorm(100, mean = 50, sd = 20)
)
train_idx <- 1:70
test_idx <- 71:100
# Leaky: preProcess fitted on full data
pp_bad <- caret::preProcess(data, method = c("center", "scale"))
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
preprocess = pp_bad
)
result <- borg_assimilate(workflow)
# Should be fixed
expect_true("preprocessing_leak" %in% result$fixed)
expect_false("preprocessing_leak" %in% result$unfixable)
# The rewritten preProcess should be different from the original
new_pp <- result$workflow$preprocess
expect_s3_class(new_pp, "preProcess")
# New preProcess should be based on train data stats
train_data <- data[train_idx, ]
expect_equal(new_pp$mean[["x1"]], mean(train_data$x1), tolerance = 1e-10)
expect_equal(new_pp$mean[["x2"]], mean(train_data$x2), tolerance = 1e-10)
})
test_that("borg_assimilate fixes PCA leak (base R prcomp)", {
set.seed(42)
data <- data.frame(
x1 = rnorm(100),
x2 = rnorm(100),
x3 = rnorm(100)
)
train_idx <- 1:70
test_idx <- 71:100
# Leaky: PCA on full data
pca_bad <- prcomp(data, center = TRUE, scale. = TRUE)
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
preprocess = pca_bad
)
result <- borg_assimilate(workflow)
expect_true("preprocessing_leak" %in% result$fixed)
# New PCA should use train-only stats
new_pca <- result$workflow$preprocess
expect_s3_class(new_pca, "prcomp")
expect_equal(new_pca$center[["x1"]], mean(data$x1[train_idx]), tolerance = 1e-10)
})
test_that("borg_assimilate fixes threshold on test data", {
set.seed(42)
n <- 100
data <- data.frame(
x = rnorm(n),
y = factor(sample(c("A", "B"), n, replace = TRUE))
)
train_idx <- 1:70
test_idx <- 71:100
model <- glm(y ~ x, data = data[train_idx, ], family = binomial())
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
model = model,
target_col = "y",
thresholds = list(
value = 0.5,
optimized_on = "test",
used_test_predictions = TRUE
)
)
result <- borg_assimilate(workflow)
expect_true("threshold_leak" %in% result$fixed)
expect_equal(result$workflow$thresholds$optimized_on, "train")
expect_false(result$workflow$thresholds$used_test_predictions)
})
test_that("borg_assimilate re-validates after rewriting", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:5,
test_idx = 6:10
)
result <- borg_assimilate(workflow)
# The report should reflect the state after any rewrites
expect_s4_class(result$report, "BorgRisk")
expect_true(inherits(result$report@timestamp, "POSIXct"))
})
test_that("borg_assimilate handles workflow with model", {
# Simple lm model - should pass through without issues
set.seed(42)
data <- data.frame(x = 1:20, y = 2 * (1:20) + rnorm(20))
train_idx <- 1:15
test_idx <- 16:20
model <- lm(y ~ x, data = data[train_idx, ])
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
model = model
)
result <- borg_assimilate(workflow)
expect_true(is.list(result))
expect_s4_class(result$report, "BorgRisk")
})
# ===========================================================================
# Edge-case tests for high-severity gaps
# ===========================================================================
test_that("borg_assimilate handles workflow with missing data gracefully", {
# Exercise the path where .get_train_data returns NULL because both
# data and train_idx are NULL. We construct a workflow that has the
# required fields for borg_validate (post-rewrite) but where the rewriter
# cannot extract training data.
set.seed(42)
data <- data.frame(x = 1:10, y = 11:20)
# Create a workflow where train_idx is present (for borg_validate) but
# where we simulate the rewriter encountering a risk it cannot fix.
# We pre-compute a BorgRisk with a normalization_leak and pass a workflow
# whose data is a scaled matrix without center/scale attributes and no
# caret preProcess, so the normalization rewriter falls through to the
# fallback path and returns success = FALSE.
workflow <- list(
data = data,
train_idx = 1:5,
test_idx = 6:10
)
# Construct a BorgRisk with normalization_leak
fake_risk <- new("BorgRisk",
risks = list(list(
type = "normalization_leak",
severity = "hard_violation",
description = "Normalization fitted on full data",
affected_indices = integer(0),
source_object = "scale"
)),
n_hard = 1L,
n_soft = 0L,
is_valid = FALSE,
train_indices = 1:5,
test_indices = 6:10,
timestamp = Sys.time(),
call = quote(borg_validate(x))
)
# The normalization rewriter checks for preProcess or scaled matrix.
# This workflow has neither, so it returns success=FALSE -> unfixable.
expect_no_error(result <- borg_assimilate(workflow, risks = fake_risk))
expect_true("normalization_leak" %in% result$unfixable)
expect_s4_class(result$report, "BorgRisk")
})
test_that("borg_assimilate handles unknown risk type", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:5,
test_idx = 6:10
)
# Create a BorgRisk with a fake/unknown risk type
fake_risk <- new("BorgRisk",
risks = list(list(
type = "unknown_risk_xyz",
severity = "hard_violation",
description = "Some unknown risk type for testing",
affected_indices = integer(0),
source_object = "test"
)),
n_hard = 1L,
n_soft = 0L,
is_valid = FALSE,
train_indices = 1:5,
test_indices = 6:10,
timestamp = Sys.time(),
call = quote(borg_validate(x))
)
# borg_assimilate should not error on unknown risk types
expect_no_error(result <- borg_assimilate(workflow, risks = fake_risk))
# Unknown risk type is neither in rewritable_types nor unfixable_types,
# so the switch returns NULL and it gets skipped entirely. It won't
# appear in fixed or unfixable.
expect_true(is.list(result))
expect_s4_class(result$report, "BorgRisk")
# The unknown type should NOT appear in fixed (it wasn't rewritten)
expect_false("unknown_risk_xyz" %in% result$fixed)
})
test_that("borg_assimilate handles normalization with zero-variance column", {
skip_if_not_installed("caret")
set.seed(42)
n <- 100
data <- data.frame(
x_normal = rnorm(n, mean = 10, sd = 5),
x_constant = rep(42, n) # zero variance
)
train_idx <- 1:70
test_idx <- 71:100
# Leaky: preProcess fitted on full data (including zero-variance column)
# caret warns about zero-variance columns — expected, not a BORG issue
pp_bad <- suppressWarnings(
caret::preProcess(data, method = c("center", "scale"))
)
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
preprocess = pp_bad
)
# Should not error even though one column has zero variance
# Suppress caret's zero-variance warnings from the rewriter re-fitting
expect_no_error(result <- suppressWarnings(borg_assimilate(workflow)))
expect_true(is.list(result))
expect_s4_class(result$report, "BorgRisk")
})
test_that("borg_assimilate handles threshold optimization with identical predictions", {
set.seed(42)
n <- 100
# Create data where model predictions will have near-zero variance.
# Use a target that is nearly perfectly balanced and a feature with no
# signal, so that glm predictions cluster around 0.5.
data <- data.frame(
x = rep(0, n), # no signal at all
y = factor(rep(c("A", "B"), each = n / 2))
)
train_idx <- 1:70
test_idx <- 71:100
# glm with a zero-variance predictor produces identical predictions
# (intercept only). suppress the "fitted probabilities" warning.
model <- suppressWarnings(
glm(y ~ x, data = data[train_idx, ], family = binomial())
)
# Verify predictions are indeed constant
train_preds <- suppressWarnings(
predict(model, newdata = data[train_idx, ], type = "response")
)
expect_equal(length(unique(round(train_preds, 10))), 1)
workflow <- list(
data = data,
train_idx = train_idx,
test_idx = test_idx,
model = model,
target_col = "y",
thresholds = list(
value = 0.5,
optimized_on = "test",
used_test_predictions = TRUE
)
)
# Create threshold_leak risk manually
threshold_risk <- new("BorgRisk",
risks = list(list(
type = "threshold_leak",
severity = "hard_violation",
description = "Threshold optimized on test data",
affected_indices = integer(0),
source_object = "thresholds"
)),
n_hard = 1L,
n_soft = 0L,
is_valid = FALSE,
train_indices = train_idx,
test_indices = test_idx,
timestamp = Sys.time(),
call = quote(borg_validate(x))
)
# The threshold rewriter should handle identical predictions without error.
# All predictions are ~0.5 (single unique value), so thresholds_to_try
# has one element and the Youden's J loop runs exactly once.
expect_no_error(
result <- suppressWarnings(borg_assimilate(workflow, risks = threshold_risk))
)
expect_true(is.list(result))
expect_s4_class(result$report, "BorgRisk")
# With constant predictions, the rewriter should still succeed (it picks
# a threshold; Youden's J is just 0 for all candidates)
expect_true("threshold_leak" %in% result$fixed)
expect_equal(result$workflow$thresholds$optimized_on, "train")
})
test_that("borg_assimilate handles empty risks (clean workflow)", {
workflow <- list(
data = data.frame(x = 1:10, y = 11:20),
train_idx = 1:5,
test_idx = 6:10
)
# Create a BorgRisk with zero risks (clean)
clean_risk <- new("BorgRisk",
risks = list(),
n_hard = 0L,
n_soft = 0L,
is_valid = TRUE,
train_indices = 1:5,
test_indices = 6:10,
timestamp = Sys.time(),
call = quote(borg_validate(x))
)
expect_no_error(result <- borg_assimilate(workflow, risks = clean_risk))
# Should return unchanged workflow
expect_identical(result$workflow$data, workflow$data)
expect_identical(result$workflow$train_idx, workflow$train_idx)
expect_identical(result$workflow$test_idx, workflow$test_idx)
# No fixes needed, no unfixable
expect_equal(length(result$fixed), 0)
expect_equal(length(result$unfixable), 0)
# Post-rewrite report should still be valid
expect_s4_class(result$report, "BorgRisk")
expect_true(result$report@is_valid)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.