test-borg_assimilate.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation

# ===========================================================================
# Tests for borg_assimilate()
# ===========================================================================

test_that("borg_assimilate validates required arguments", {
  # Non-list workflow
  expect_error(
    borg_assimilate("not a list"),
    "'workflow' must be a list"
  )

  # Invalid risks object
  expect_error(
    borg_assimilate(list(data = data.frame(x = 1:10), train_idx = 1:5, test_idx = 6:10),
                 risks = "not a BorgRisk"),
    "'risks' must be a BorgRisk object"
  )
})


test_that("borg_assimilate returns correct structure", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:5,
    test_idx = 6:10
  )

  result <- borg_assimilate(workflow)

  expect_true(is.list(result))
  expect_true("workflow" %in% names(result))
  expect_true("fixed" %in% names(result))
  expect_true("unfixable" %in% names(result))
  expect_true("report" %in% names(result))

  expect_s4_class(result$report, "BorgRisk")
})


test_that("borg_assimilate passes through clean workflow", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:5,
    test_idx = 6:10
  )

  result <- borg_assimilate(workflow)

  expect_true(result$report@is_valid)
  expect_equal(length(result$fixed), 0)
  expect_equal(length(result$unfixable), 0)
})


test_that("borg_assimilate identifies unfixable index overlap", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:6,
    test_idx = 5:10  # overlap at 5, 6
  )

  result <- borg_assimilate(workflow)

  expect_false(result$report@is_valid)
  expect_true("index_overlap" %in% result$unfixable)
})


test_that("borg_assimilate identifies unfixable duplicate rows", {
  workflow <- list(
    data = data.frame(
      x = c(1, 2, 3, 4, 5, 1, 2, 3),  # rows 6-8 duplicate rows 1-3
      y = c(10, 20, 30, 40, 50, 10, 20, 30)
    ),
    train_idx = 1:5,
    test_idx = 6:8
  )

  result <- borg_assimilate(workflow)

  expect_true("duplicate_rows" %in% result$unfixable)
})


test_that("borg_assimilate identifies unfixable target leakage", {
  set.seed(42)
  n <- 100
  target <- rnorm(n)
  leaked_feature <- target + rnorm(n, sd = 0.001)

  workflow <- list(
    data = data.frame(
      y = target,
      x1 = rnorm(n),
      x2 = leaked_feature
    ),
    train_idx = 1:70,
    test_idx = 71:100,
    target_col = "y"
  )

  result <- borg_assimilate(workflow)

  expect_true("target_leakage_direct" %in% result$unfixable)
})


test_that("borg_assimilate accepts pre-computed risks", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:6,
    test_idx = 5:10  # overlap
  )

  # Pre-compute risks
  risks <- borg_validate(workflow)

  # Pass to rewrite

  result <- borg_assimilate(workflow, risks = risks)

  expect_true("index_overlap" %in% result$unfixable)
})


test_that("borg_assimilate fix parameter filters risk types", {
  skip_if_not_installed("caret")

  set.seed(42)
  data <- data.frame(
    x1 = rnorm(100, mean = 10, sd = 5),
    x2 = rnorm(100, mean = 50, sd = 20)
  )
  train_idx <- 1:70
  test_idx <- 71:100

  # Create leaky preProcess
  pp_bad <- caret::preProcess(data, method = c("center", "scale"))

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    preprocess = pp_bad
  )

  # With fix = "all" (default), attempt to fix preprocessing
  result_all <- borg_assimilate(workflow, fix = "all")

  # With fix = "thresholds", skip preprocessing fix
  result_thresholds <- borg_assimilate(workflow, fix = "thresholds")

  expect_s4_class(result_all$report, "BorgRisk")
  expect_s4_class(result_thresholds$report, "BorgRisk")

  # fix = "all" should actually fix the preprocessing leak
  expect_true("preprocessing_leak" %in% result_all$fixed)
})


test_that("borg_assimilate fixes caret preProcess leak", {
  skip_if_not_installed("caret")

  set.seed(42)
  data <- data.frame(
    x1 = rnorm(100, mean = 10, sd = 5),
    x2 = rnorm(100, mean = 50, sd = 20)
  )
  train_idx <- 1:70
  test_idx <- 71:100

  # Leaky: preProcess fitted on full data
  pp_bad <- caret::preProcess(data, method = c("center", "scale"))

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    preprocess = pp_bad
  )

  result <- borg_assimilate(workflow)

  # Should be fixed

  expect_true("preprocessing_leak" %in% result$fixed)
  expect_false("preprocessing_leak" %in% result$unfixable)

  # The rewritten preProcess should be different from the original
  new_pp <- result$workflow$preprocess
  expect_s3_class(new_pp, "preProcess")

  # New preProcess should be based on train data stats
  train_data <- data[train_idx, ]
  expect_equal(new_pp$mean[["x1"]], mean(train_data$x1), tolerance = 1e-10)
  expect_equal(new_pp$mean[["x2"]], mean(train_data$x2), tolerance = 1e-10)
})


test_that("borg_assimilate fixes PCA leak (base R prcomp)", {
  set.seed(42)
  data <- data.frame(
    x1 = rnorm(100),
    x2 = rnorm(100),
    x3 = rnorm(100)
  )
  train_idx <- 1:70
  test_idx <- 71:100

  # Leaky: PCA on full data
  pca_bad <- prcomp(data, center = TRUE, scale. = TRUE)

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    preprocess = pca_bad
  )

  result <- borg_assimilate(workflow)

  expect_true("preprocessing_leak" %in% result$fixed)

  # New PCA should use train-only stats
  new_pca <- result$workflow$preprocess
  expect_s3_class(new_pca, "prcomp")
  expect_equal(new_pca$center[["x1"]], mean(data$x1[train_idx]), tolerance = 1e-10)
})


test_that("borg_assimilate fixes threshold on test data", {
  set.seed(42)
  n <- 100
  data <- data.frame(
    x = rnorm(n),
    y = factor(sample(c("A", "B"), n, replace = TRUE))
  )
  train_idx <- 1:70
  test_idx <- 71:100

  model <- glm(y ~ x, data = data[train_idx, ], family = binomial())

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    model = model,
    target_col = "y",
    thresholds = list(
      value = 0.5,
      optimized_on = "test",
      used_test_predictions = TRUE
    )
  )

  result <- borg_assimilate(workflow)

  expect_true("threshold_leak" %in% result$fixed)
  expect_equal(result$workflow$thresholds$optimized_on, "train")
  expect_false(result$workflow$thresholds$used_test_predictions)
})


test_that("borg_assimilate re-validates after rewriting", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:5,
    test_idx = 6:10
  )

  result <- borg_assimilate(workflow)

  # The report should reflect the state after any rewrites
  expect_s4_class(result$report, "BorgRisk")
  expect_true(inherits(result$report@timestamp, "POSIXct"))
})


test_that("borg_assimilate handles workflow with model", {
  # Simple lm model - should pass through without issues
  set.seed(42)
  data <- data.frame(x = 1:20, y = 2 * (1:20) + rnorm(20))
  train_idx <- 1:15
  test_idx <- 16:20

  model <- lm(y ~ x, data = data[train_idx, ])

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    model = model
  )

  result <- borg_assimilate(workflow)

  expect_true(is.list(result))
  expect_s4_class(result$report, "BorgRisk")
})


# ===========================================================================
# Edge-case tests for high-severity gaps
# ===========================================================================

test_that("borg_assimilate handles workflow with missing data gracefully", {
  # Exercise the path where .get_train_data returns NULL because both
 # data and train_idx are NULL. We construct a workflow that has the

  # required fields for borg_validate (post-rewrite) but where the rewriter
  # cannot extract training data.
  set.seed(42)
  data <- data.frame(x = 1:10, y = 11:20)

  # Create a workflow where train_idx is present (for borg_validate) but
  # where we simulate the rewriter encountering a risk it cannot fix.
  # We pre-compute a BorgRisk with a normalization_leak and pass a workflow
  # whose data is a scaled matrix without center/scale attributes and no
  # caret preProcess, so the normalization rewriter falls through to the
  # fallback path and returns success = FALSE.
  workflow <- list(
    data = data,
    train_idx = 1:5,
    test_idx = 6:10
  )

  # Construct a BorgRisk with normalization_leak
  fake_risk <- new("BorgRisk",
    risks = list(list(
      type = "normalization_leak",
      severity = "hard_violation",
      description = "Normalization fitted on full data",
      affected_indices = integer(0),
      source_object = "scale"
    )),
    n_hard = 1L,
    n_soft = 0L,
    is_valid = FALSE,
    train_indices = 1:5,
    test_indices = 6:10,
    timestamp = Sys.time(),
    call = quote(borg_validate(x))
  )

  # The normalization rewriter checks for preProcess or scaled matrix.
  # This workflow has neither, so it returns success=FALSE -> unfixable.
  expect_no_error(result <- borg_assimilate(workflow, risks = fake_risk))
  expect_true("normalization_leak" %in% result$unfixable)
  expect_s4_class(result$report, "BorgRisk")
})


test_that("borg_assimilate handles unknown risk type", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:5,
    test_idx = 6:10
  )

  # Create a BorgRisk with a fake/unknown risk type
  fake_risk <- new("BorgRisk",
    risks = list(list(
      type = "unknown_risk_xyz",
      severity = "hard_violation",
      description = "Some unknown risk type for testing",
      affected_indices = integer(0),
      source_object = "test"
    )),
    n_hard = 1L,
    n_soft = 0L,
    is_valid = FALSE,
    train_indices = 1:5,
    test_indices = 6:10,
    timestamp = Sys.time(),
    call = quote(borg_validate(x))
  )

  # borg_assimilate should not error on unknown risk types

  expect_no_error(result <- borg_assimilate(workflow, risks = fake_risk))

  # Unknown risk type is neither in rewritable_types nor unfixable_types,
  # so the switch returns NULL and it gets skipped entirely. It won't
  # appear in fixed or unfixable.
  expect_true(is.list(result))
  expect_s4_class(result$report, "BorgRisk")
  # The unknown type should NOT appear in fixed (it wasn't rewritten)
  expect_false("unknown_risk_xyz" %in% result$fixed)
})


test_that("borg_assimilate handles normalization with zero-variance column", {
  skip_if_not_installed("caret")

  set.seed(42)
  n <- 100
  data <- data.frame(
    x_normal = rnorm(n, mean = 10, sd = 5),
    x_constant = rep(42, n)  # zero variance
  )
  train_idx <- 1:70
  test_idx <- 71:100

  # Leaky: preProcess fitted on full data (including zero-variance column)
  # caret warns about zero-variance columns — expected, not a BORG issue
  pp_bad <- suppressWarnings(
    caret::preProcess(data, method = c("center", "scale"))
  )

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    preprocess = pp_bad
  )

  # Should not error even though one column has zero variance
  # Suppress caret's zero-variance warnings from the rewriter re-fitting
  expect_no_error(result <- suppressWarnings(borg_assimilate(workflow)))
  expect_true(is.list(result))
  expect_s4_class(result$report, "BorgRisk")
})


test_that("borg_assimilate handles threshold optimization with identical predictions", {
  set.seed(42)
  n <- 100

  # Create data where model predictions will have near-zero variance.
  # Use a target that is nearly perfectly balanced and a feature with no

  # signal, so that glm predictions cluster around 0.5.
  data <- data.frame(
    x = rep(0, n),  # no signal at all
    y = factor(rep(c("A", "B"), each = n / 2))
  )
  train_idx <- 1:70
  test_idx <- 71:100

  # glm with a zero-variance predictor produces identical predictions
  # (intercept only). suppress the "fitted probabilities" warning.
  model <- suppressWarnings(
    glm(y ~ x, data = data[train_idx, ], family = binomial())
  )

  # Verify predictions are indeed constant
  train_preds <- suppressWarnings(
    predict(model, newdata = data[train_idx, ], type = "response")
  )
  expect_equal(length(unique(round(train_preds, 10))), 1)

  workflow <- list(
    data = data,
    train_idx = train_idx,
    test_idx = test_idx,
    model = model,
    target_col = "y",
    thresholds = list(
      value = 0.5,
      optimized_on = "test",
      used_test_predictions = TRUE
    )
  )

  # Create threshold_leak risk manually
  threshold_risk <- new("BorgRisk",
    risks = list(list(
      type = "threshold_leak",
      severity = "hard_violation",
      description = "Threshold optimized on test data",
      affected_indices = integer(0),
      source_object = "thresholds"
    )),
    n_hard = 1L,
    n_soft = 0L,
    is_valid = FALSE,
    train_indices = train_idx,
    test_indices = test_idx,
    timestamp = Sys.time(),
    call = quote(borg_validate(x))
  )

  # The threshold rewriter should handle identical predictions without error.
  # All predictions are ~0.5 (single unique value), so thresholds_to_try
  # has one element and the Youden's J loop runs exactly once.
  expect_no_error(
    result <- suppressWarnings(borg_assimilate(workflow, risks = threshold_risk))
  )
  expect_true(is.list(result))
  expect_s4_class(result$report, "BorgRisk")
  # With constant predictions, the rewriter should still succeed (it picks
  # a threshold; Youden's J is just 0 for all candidates)
  expect_true("threshold_leak" %in% result$fixed)
  expect_equal(result$workflow$thresholds$optimized_on, "train")
})


test_that("borg_assimilate handles empty risks (clean workflow)", {
  workflow <- list(
    data = data.frame(x = 1:10, y = 11:20),
    train_idx = 1:5,
    test_idx = 6:10
  )

  # Create a BorgRisk with zero risks (clean)
  clean_risk <- new("BorgRisk",
    risks = list(),
    n_hard = 0L,
    n_soft = 0L,
    is_valid = TRUE,
    train_indices = 1:5,
    test_indices = 6:10,
    timestamp = Sys.time(),
    call = quote(borg_validate(x))
  )

  expect_no_error(result <- borg_assimilate(workflow, risks = clean_risk))

  # Should return unchanged workflow
  expect_identical(result$workflow$data, workflow$data)
  expect_identical(result$workflow$train_idx, workflow$train_idx)
  expect_identical(result$workflow$test_idx, workflow$test_idx)

  # No fixes needed, no unfixable
  expect_equal(length(result$fixed), 0)
  expect_equal(length(result$unfixable), 0)

  # Post-rewrite report should still be valid
  expect_s4_class(result$report, "BorgRisk")
  expect_true(result$report@is_valid)
})
Any scripts or data that you put into this service are public.
BORG documentation built on March 20, 2026, 5:09 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BORG
Bounded Outcome Risk Guard for Model Evaluation

tests/testthat/test-borg_assimilate.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG Bounded Outcome Risk Guard for Model Evaluation

tests/testthat/test-borg_assimilate.R In BORG: Bounded Outcome Risk Guard for Model Evaluation

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG
Bounded Outcome Risk Guard for Model Evaluation

tests/testthat/test-borg_assimilate.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation