# Nothing
## ----setup, include = FALSE---------------------------------------------------
# Attach BORG so the chunks below can call its functions unqualified.
library(BORG)

# Chunk defaults applied to the rest of the document.
knitr::opts_chunk$set(
  fig.height = 5,
  fig.width = 7,
  comment = "#>",
  collapse = TRUE
)
## ----index-overlap------------------------------------------------------------
# Toy data: 100 rows holding a row counter and standard-normal noise.
data <- data.frame(x = seq_len(100), y = rnorm(n = 100))

# Accidental overlap: rows 51-60 are listed in both the train and test indices.
result <- borg_inspect(
  data,
  train_idx = seq_len(60),
  test_idx = 51:100
)
result
## ----duplicate-rows-----------------------------------------------------------
# Data with duplicate rows: the same five rows stacked twice, so rows 6-10
# repeat rows 1-5 exactly.
dup_data <- local({
  unique_rows <- data.frame(x = 1:5, y = 1:5)
  rbind(unique_rows, unique_rows)
})

# Train on the first copy, test on the second.
result <- borg_inspect(
  dup_data,
  train_idx = seq_len(5),
  test_idx = 6:10
)
result
## ----preprocessing-leak, eval=FALSE-------------------------------------------
# NOTE: this chunk is marked eval=FALSE; every statement below is commented
# out, so nothing here runs. It illustrates the anti-pattern of scaling the
# full data set first and only splitting into train/test afterwards.
# # BAD: Scale fitted on all data
# scaled_data <- scale(data) # Uses all rows!
# train <- scaled_data[1:70, ]
# test <- scaled_data[71:100, ]
#
# # BORG detects this
# borg_inspect(scaled_data, train_idx = 1:70, test_idx = 71:100)
## ----target-leakage-----------------------------------------------------------
# Simulate target leakage: two independent Gaussian columns to start with.
leaky <- data.frame(x = rnorm(n = 100), outcome = rnorm(n = 100))

# The leaked feature is the outcome plus tiny noise (sd = 0.01), giving a
# near-perfect correlation with the target.
leaky[["leaked"]] <- leaky[["outcome"]] + rnorm(n = 100, sd = 0.01)

result <- borg_inspect(
  leaky,
  train_idx = seq_len(70),
  test_idx = 71:100,
  target = "outcome"
)
result
## ----group-leakage------------------------------------------------------------
# Clinical data with patient IDs: 10 patients, 10 measurements each.
clinical <- data.frame(
  patient_id = rep(seq_len(10), each = 10),
  measurement = rnorm(n = 100)
)

# Random 70/30 split of the row numbers that ignores patients.
set.seed(123)
all_idx <- sample.int(100)
train_idx <- head(all_idx, 70)
test_idx <- tail(all_idx, 30)

result <- borg_inspect(
  clinical,
  train_idx = train_idx,
  test_idx = test_idx,
  groups = "patient_id"
)
result
## ----temporal-leak------------------------------------------------------------
# Time series data: 100 consecutive days from 2020-01-01, with a value that
# is the cumulative sum of Gaussian noise.
ts_data <- data.frame(
  date = seq(from = as.Date("2020-01-01"), by = "day", length.out = 100),
  value = cumsum(rnorm(n = 100))
)

# Wrong: a random permutation of the rows ignores time order.
set.seed(42)
random_idx <- sample.int(100)
train_idx <- random_idx[seq_len(70)]
test_idx <- random_idx[-seq_len(70)]

result <- borg_inspect(
  ts_data,
  train_idx = train_idx,
  test_idx = test_idx,
  time = "date"
)
result
## ----proxy-leakage------------------------------------------------------------
# Strong but not extreme correlation between a predictor and the target.
proxy <- data.frame(x = rnorm(n = 100), outcome = rnorm(n = 100))

# Outcome plus moderate noise (sd = 0.3): r ~ 0.96
proxy$strong_predictor <- with(proxy, outcome + rnorm(n = 100, sd = 0.3))

result <- borg_inspect(
  proxy,
  train_idx = seq_len(70),
  test_idx = 71:100,
  target = "outcome"
)
result
## ----spatial-proximity--------------------------------------------------------
set.seed(42)

# 100 points with coordinates drawn uniformly from [0, 100] on each axis.
spatial <- data.frame(
  lon = runif(n = 100, min = 0, max = 100),
  lat = runif(n = 100, min = 0, max = 100),
  value = rnorm(n = 100)
)

# Random split intermixes nearby points: 70 rows sampled for training, the
# remaining 30 (in ascending order) kept for testing.
train_idx <- sample.int(100, size = 70)
test_idx <- seq_len(100)[-train_idx]

result <- borg_inspect(
  spatial,
  train_idx = train_idx,
  test_idx = test_idx,
  coords = c("lon", "lat")
)
result
## ----random-cv-inflation------------------------------------------------------
# Diagnose data dependencies on a fresh 200-point data set
# (this replaces any earlier `spatial` object).
spatial <- data.frame(
  lon = runif(n = 200, min = 0, max = 100),
  lat = runif(n = 200, min = 0, max = 100),
  response = rnorm(n = 200)
)

diagnosis <- borg_diagnose(
  spatial,
  coords = c("lon", "lat"),
  target = "response",
  verbose = FALSE
)

# Show the recommended_cv slot of the diagnosis.
diagnosis@recommended_cv
## ----risk-access--------------------------------------------------------------
# Create result with violations: rows 51-60 appear in both index sets.
result <- borg_inspect(
  data.frame(x = 1:100, y = rnorm(100)),
  train_idx = 1:60,
  test_idx = 51:100
)

# Summary read from the result's slots.
cat("Valid:", result@is_valid, "\n")
cat("Hard violations:", result@n_hard, "\n")
cat("Soft warnings:", result@n_soft, "\n")

# Individual risks: print type, severity and description of each entry, plus
# a preview of at most `max_shown` affected items.
max_shown <- 5L
for (risk in result@risks) {
  cat("\n", risk$type, "(", risk$severity, "):\n", sep = "")
  cat(" ", risk$description, "\n")
  affected <- risk$affected
  # length() is 0 for NULL as well, so this also skips empty vectors.
  if (length(affected) > 0) {
    # Only append the ellipsis when the preview really was truncated.
    tail_mark <- if (length(affected) > max_shown) "...\n" else "\n"
    cat(" Affected:", head(affected, max_shown), tail_mark)
  }
}

# Tabular format
as.data.frame(result)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.