risk-taxonomy.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation

## ----setup, include = FALSE---------------------------------------------------
# Vignette-wide knitr chunk defaults: collapsed source/output blocks,
# "#>" as the prefix for printed output, and a 7 x 5 figure size.
knitr::opts_chunk$set(
  fig.height = 5, fig.width = 7,
  comment = "#>", collapse = TRUE
)

# Attach BORG; the unqualified borg_*() calls in the chunks below
# presumably resolve to this package.
library(BORG)

## ----index-overlap------------------------------------------------------------
# Seed the RNG before the first random draw of the script: `y` is random,
# and without a seed every run simulates different data.
set.seed(42)
data <- data.frame(x = 1:100, y = rnorm(100))

# Accidental overlap: rows 51-60 appear in both the train and test indices
result <- borg_inspect(data, train_idx = 1:60, test_idx = 51:100)
result

## ----duplicate-rows-----------------------------------------------------------
# Ten rows in which rows 6-10 repeat rows 1-5 exactly:
# build the five rows once, then stack the data frame on top of itself.
dup_data <- data.frame(x = seq_len(5), y = seq_len(5))
dup_data <- rbind(dup_data, dup_data)

# Split so that every test row has an identical twin among the training rows
result <- borg_inspect(
  dup_data,
  train_idx = 1:5,
  test_idx = 6:10
)
result

## ----preprocessing-leak, eval=FALSE-------------------------------------------
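# # BAD: scale() computes its centering and scaling statistics from every row,
# # so information about the test rows leaks into the training features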
# scaled_data <- scale(data)  # Uses all rows!
# train <- scaled_data[1:70, ]
# test <- scaled_data[71:100, ]
# 
# # BORG detects this
# borg_inspect(scaled_data, train_idx = 1:70, test_idx = 71:100)

## ----target-leakage-----------------------------------------------------------
# Simulate target leakage.
# Seed the RNG so the simulated columns are identical on every run.
set.seed(123)
leaky <- data.frame(
  x = rnorm(100),
  outcome = rnorm(100)
)
# `leaked` is the outcome plus very small noise (sd = 0.01), so it is
# almost perfectly correlated with the target.
leaky$leaked <- leaky$outcome + rnorm(100, sd = 0.01)

# Inspect a 70/30 split, naming `outcome` as the target column
result <- borg_inspect(
  leaky,
  train_idx = 1:70,
  test_idx = 71:100,
  target = "outcome"
)
result

## ----group-leakage------------------------------------------------------------
# Clinical data with patient IDs: 10 patients with 10 measurements each.
# Seed before simulating so `measurement` is reproducible too; the seed
# further down only fixes the split.
set.seed(42)
clinical <- data.frame(
  patient_id = rep(1:10, each = 10),
  measurement = rnorm(100)
)

# Random split ignoring patients. The RNG is re-seeded immediately before
# sample() so the permutation does not depend on the draws made above.
set.seed(123)
all_idx <- sample(100)
train_idx <- all_idx[1:70]
test_idx <- all_idx[71:100]

# Inspect the split, passing the patient ID column as the grouping variable
result <- borg_inspect(clinical, train_idx = train_idx, test_idx = test_idx,
                       groups = "patient_id")
result

## ----temporal-leak------------------------------------------------------------
# Daily time series: 100 consecutive dates starting 2020-01-01, with a
# random-walk value (cumulative sum of standard normal steps)
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 100),
  value = cumsum(rnorm(100))
)

# Wrong: shuffling the rows before splitting disregards their time order
set.seed(42)
random_idx <- sample(100)
train_idx <- head(random_idx, 70)  # first 70 shuffled positions
test_idx <- tail(random_idx, 30)   # remaining 30 shuffled positions

# Inspect the split, passing `date` as the time column
result <- borg_inspect(
  ts_data,
  train_idx = train_idx,
  test_idx = test_idx,
  time = "date"
)
result

## ----proxy-leakage------------------------------------------------------------
# A predictor that tracks the outcome closely, but not almost perfectly
proxy <- data.frame(
  x = rnorm(100),
  outcome = rnorm(100)
)
# Noise with sd = 0.3 added to a unit-variance outcome gives r ~ 0.96
proxy[["strong_predictor"]] <- proxy[["outcome"]] + rnorm(100, sd = 0.3)

# Inspect a 70/30 split, naming `outcome` as the target column
result <- borg_inspect(
  proxy,
  train_idx = seq_len(70),
  test_idx = 71:100,
  target = "outcome"
)
result

## ----spatial-proximity--------------------------------------------------------
# One seed covers both the simulated points and the split below
set.seed(42)
# 100 points placed uniformly at random on a 100 x 100 square
spatial <- data.frame(
  lon = runif(100, min = 0, max = 100),
  lat = runif(100, min = 0, max = 100),
  value = rnorm(100)
)

# Random split intermixes nearby points: draw 70 training rows at random
# and assign every remaining row to the test set
train_idx <- sample(100, 70)
test_idx <- seq_len(100)[-train_idx]

# Inspect the split, passing the coordinate columns
result <- borg_inspect(
  spatial,
  train_idx = train_idx,
  test_idx = test_idx,
  coords = c("lon", "lat")
)
result

## ----random-cv-inflation------------------------------------------------------
# Diagnose data dependencies.
# Note: this assigns a new 200-row data frame to `spatial`.
spatial <- data.frame(
  lon = runif(200, min = 0, max = 100),
  lat = runif(200, min = 0, max = 100),
  response = rnorm(200)
)

# Run the diagnosis quietly (verbose = FALSE) on the coordinate and
# target columns
diagnosis <- borg_diagnose(
  spatial,
  coords = c("lon", "lat"),
  target = "response",
  verbose = FALSE
)
# Print the `recommended_cv` slot of the returned object
diagnosis@recommended_cv

## ----risk-access--------------------------------------------------------------
# Create result with violations: rows 51-60 are in both index sets
result <- borg_inspect(
  data.frame(x = 1:100, y = rnorm(100)),
  train_idx = 1:60,
  test_idx = 51:100
)

# Summary: read the validity flag and the two counters from the result's slots
cat("Valid:", result@is_valid, "\n")
cat("Hard violations:", result@n_hard, "\n")
cat("Soft warnings:", result@n_soft, "\n")

# Individual risks: each element of `result@risks` is accessed by name
# (type, severity, description, affected)
max_shown <- 5
for (risk in result@risks) {
  cat("\n", risk$type, "(", risk$severity, "):\n", sep = "")
  cat("  ", risk$description, "\n")
  affected <- risk$affected
  if (!is.null(affected)) {
    # Show at most `max_shown` entries, and add the ellipsis only when
    # entries were actually left out
    ending <- if (length(affected) > max_shown) "...\n" else "\n"
    cat("  Affected:", head(affected, max_shown), ending)
  }
}

# Tabular format
as.data.frame(result)

Any scripts or data that you put into this service are public.

BORG documentation built on March 20, 2026, 5:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

BORG
Bounded Outcome Risk Guard for Model Evaluation

inst/doc/risk-taxonomy.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG Bounded Outcome Risk Guard for Model Evaluation

inst/doc/risk-taxonomy.R In BORG: Bounded Outcome Risk Guard for Model Evaluation

Try the BORG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BORG
Bounded Outcome Risk Guard for Model Evaluation

inst/doc/risk-taxonomy.R
In BORG: Bounded Outcome Risk Guard for Model Evaluation