# inst/doc/risk-taxonomy.R

## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults for the rendered document: collapsed code/output blocks,
# output lines prefixed with "#>", and a figure size of 7 x 5.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>",
                      fig.width = 7, fig.height = 5)

# All borg_* functions used below come from this package.
library(BORG)

## ----index-overlap------------------------------------------------------------
# Fix the random number stream before the first draw so the simulated
# values are the same on every run of the script.
set.seed(42)
data <- data.frame(x = 1:100, y = rnorm(100))

# Accidental overlap: rows 51-60 are listed in both the training and the
# test indices.
result <- borg_inspect(data, train_idx = 1:60, test_idx = 51:100)
result

## ----duplicate-rows-----------------------------------------------------------
# Data with duplicate rows: a five-row table stacked on an exact copy of
# itself, so rows 6-10 repeat rows 1-5.
dup_data <- data.frame(x = 1:5, y = 1:5)
dup_data <- rbind(dup_data, dup_data)

# Train on the first copy, test on the second.
result <- borg_inspect(
  dup_data,
  train_idx = 1:5,
  test_idx = 6:10
)
result

## ----preprocessing-leak, eval=FALSE-------------------------------------------
# # BAD: Scale fitted on all data
# scaled_data <- scale(data)  # Uses all rows!
# train <- scaled_data[1:70, ]
# test <- scaled_data[71:100, ]
# 
# # BORG detects this
# borg_inspect(scaled_data, train_idx = 1:70, test_idx = 71:100)

## ----target-leakage-----------------------------------------------------------
# Simulate target leakage: start from two independent standard-normal
# columns ...
leaky <- data.frame(x = rnorm(100), outcome = rnorm(100))

# ... then add a column that is the outcome plus very small noise
# (sd = 0.01), giving a near-perfect correlation with the target.
leaky[["leaked"]] <- leaky[["outcome"]] + rnorm(100, sd = 0.01)

result <- borg_inspect(
  leaky,
  train_idx = 1:70,
  test_idx = 71:100,
  target = "outcome"
)
result

## ----group-leakage------------------------------------------------------------
# Clinical data with patient IDs: 10 patients, 10 rows per patient
clinical <- data.frame(
  patient_id = rep(seq_len(10), each = 10),
  measurement = rnorm(100)
)

# Random split ignoring patients: shuffle all 100 row numbers, then take
# the first 70 for training and the last 30 for testing.
set.seed(123)
all_idx <- sample(100)
train_idx <- head(all_idx, 70)
test_idx <- tail(all_idx, 30)

result <- borg_inspect(
  clinical,
  train_idx = train_idx,
  test_idx = test_idx,
  groups = "patient_id"
)
result

## ----temporal-leak------------------------------------------------------------
# Time series data: 100 consecutive days starting 2020-01-01, with a
# random-walk value (cumulative sum of standard-normal steps)
ts_data <- data.frame(
  date = seq(from = as.Date("2020-01-01"), by = "day", length.out = 100),
  value = cumsum(rnorm(100))
)

# Wrong: random split ignores time. The row numbers are shuffled and cut
# 70/30, so training and test dates are interleaved.
set.seed(42)
random_idx <- sample(100)
train_idx <- head(random_idx, 70)
test_idx <- tail(random_idx, 30)

result <- borg_inspect(
  ts_data,
  train_idx = train_idx,
  test_idx = test_idx,
  time = "date"
)
result

## ----proxy-leakage------------------------------------------------------------
# Strong but not extreme correlation: begin with two independent
# standard-normal columns.
proxy <- data.frame(x = rnorm(100), outcome = rnorm(100))

# Outcome plus noise with sd = 0.3. In expectation the correlation with the
# outcome is 1 / sqrt(1 + 0.3^2), i.e. r ~ 0.96.
proxy[["strong_predictor"]] <- proxy[["outcome"]] + rnorm(100, sd = 0.3)

result <- borg_inspect(
  proxy,
  train_idx = 1:70,
  test_idx = 71:100,
  target = "outcome"
)
result

## ----spatial-proximity--------------------------------------------------------
set.seed(42)

# 100 points with coordinates drawn uniformly from [0, 100] on each axis
spatial <- data.frame(
  lon = runif(100, min = 0, max = 100),
  lat = runif(100, min = 0, max = 100),
  value = rnorm(100)
)

# Random split intermixes nearby points: 70 rows are drawn at random for
# training and the remaining 30 rows form the test set.
train_idx <- sample(100, size = 70)
test_idx <- seq_len(100)[-train_idx]

result <- borg_inspect(
  spatial,
  train_idx = train_idx,
  test_idx = test_idx,
  coords = c("lon", "lat")
)
result

## ----random-cv-inflation------------------------------------------------------
# Diagnose data dependencies on a fresh set of 200 points (this replaces
# any earlier `spatial` object)
spatial <- data.frame(
  lon = runif(200, min = 0, max = 100),
  lat = runif(200, min = 0, max = 100),
  response = rnorm(200)
)

diagnosis <- borg_diagnose(
  spatial,
  coords = c("lon", "lat"),
  target = "response",
  verbose = FALSE
)

# Show the recommended_cv slot of the diagnosis object
diagnosis@recommended_cv

## ----risk-access--------------------------------------------------------------
# Create result with violations: rows 51-60 are in both index sets
result <- borg_inspect(
  data.frame(x = 1:100, y = rnorm(100)),
  train_idx = 1:60,
  test_idx = 51:100
)

# Summary
cat("Valid:", result@is_valid, "\n")
cat("Hard violations:", result@n_hard, "\n")
cat("Soft warnings:", result@n_soft, "\n")

# Individual risks: one header line, the description, and (when present)
# up to five of the affected entries
for (risk in result@risks) {
  cat("\n", risk$type, "(", risk$severity, "):\n", sep = "")
  cat("  ", risk$description, "\n")
  if (!is.null(risk$affected)) {
    cat("  Affected:", head(risk$affected, 5))
    # Only print the truncation marker when entries were actually omitted
    if (length(risk$affected) > 5) {
      cat(" ...")
    }
    cat("\n")
  }
}

# Tabular format
as.data.frame(result)

# Try the BORG package in your browser
#
# Any scripts or data that you put into this service are public.
#
# BORG documentation built on March 20, 2026, 5:09 p.m.