# BORG vignette examples: R code extracted from the package vignette.
## ----setup, include = FALSE---------------------------------------------------
# Vignette-wide knitr options: collapse source and output into one block,
# mark output lines with "#>", and default all figures to 7 x 5 inches.
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  fig.height = 5,
  fig.width = 7
)

# Attach BORG, the package demonstrated throughout this vignette.
library(BORG)
## ----basic-usage--------------------------------------------------------------
# Build a small toy data set: two independent predictors and a response.
set.seed(42)
data <- data.frame(
  x1 = rnorm(100),
  x2 = rnorm(100),
  y = rnorm(100)
)

# Manual hold-out split: first 70 rows train, last 30 rows test.
train_idx <- seq_len(70)
test_idx <- seq.int(71, 100)

# Ask BORG to inspect the split for problems (overlap, leakage, ...).
result <- borg_inspect(data, train_idx = train_idx, test_idx = test_idx)
result

## ----overlap-detection--------------------------------------------------------
# Rows 51-60 sit in BOTH the train and the test indices, so this split is
# invalid; the inspection should flag the overlap.
bad_result <- borg_inspect(data, train_idx = seq_len(60), test_idx = seq.int(51, 100))
bad_result
## ----diagnosis-mode-----------------------------------------------------------
# Simulate 200 spatial observations: coordinates, one covariate, a response.
set.seed(42)
spatial_data <- data.frame(
  lon = runif(200, -10, 10),
  lat = runif(200, -10, 10),
  elevation = rnorm(200, 500, 100),
  response = rnorm(200)
)

# Diagnosis mode: with no split supplied, borg() examines the data
# structure itself and constructs cross-validation folds.
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
result

## ----validation-mode----------------------------------------------------------
# Validation mode: score the leakage risk of a user-supplied split
# (first 150 rows train, last 50 rows test).
risk <- borg(spatial_data, train_idx = seq_len(150), test_idx = seq.int(151, 200))
risk
## ----plot-results, fig.width=7, fig.height=5----------------------------------
# Plot the risk assessment
# `risk` is the validation-mode object created above from the manual split
# of spatial_data; plot() dispatches on its class (presumably a BORG
# plot method -- confirm against the package documentation).
plot(risk)
## ----summary-results----------------------------------------------------------
# Generate methods text for publications
# `result` is the diagnosis-mode object returned by borg() above.
summary(result)
## ----spatial-example----------------------------------------------------------
# Spatial structure: pass the coordinate column names via `coords` and
# read the recommended CV scheme off the diagnosis slot.
result_spatial <- borg(spatial_data, target = "response", coords = c("lon", "lat"))
result_spatial$diagnosis@recommended_cv

## ----temporal-example---------------------------------------------------------
# Temporal structure: one daily observation over 200 days; the value is a
# cumulative-sum random walk.
temporal_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 200),
  value = cumsum(rnorm(200))
)
result_temporal <- borg(temporal_data, time = "date", target = "value")
result_temporal$diagnosis@recommended_cv

## ----grouped-example----------------------------------------------------------
# Grouped structure: 20 sites with 10 measurements each.
grouped_data <- data.frame(
  site = rep(seq_len(20), each = 10),
  measurement = rnorm(200)
)
result_grouped <- borg(grouped_data, groups = "site", target = "measurement")
result_grouped$diagnosis@recommended_cv
## ----target-leakage-----------------------------------------------------------
# Simulate target leakage: start from three independent columns...
leaky_data <- data.frame(
  x = rnorm(100),
  leaked_feature = rnorm(100), # overwritten below to leak the outcome
  outcome = rnorm(100)
)
# ...then rebuild leaked_feature as the outcome plus a little noise, so it
# is almost perfectly correlated with the target.
leaky_data$leaked_feature <- leaky_data$outcome + rnorm(100, sd = 0.05)

# With `target` supplied, the inspection can detect the suspicious feature.
result <- borg_inspect(
  leaky_data,
  train_idx = seq_len(70),
  test_idx = seq.int(71, 100),
  target = "outcome"
)
result

## ----group-leakage------------------------------------------------------------
# Clinical-style data: 10 patients, 10 repeated visits per patient.
clinical_data <- data.frame(
  patient_id = rep(seq_len(10), each = 10),
  visit = rep(seq_len(10), times = 10),
  measurement = rnorm(100)
)

# A fully random row split scatters each patient's visits across both
# sets -- a classic group-leakage mistake (hence "BAD").
set.seed(123)
all_idx <- sample(100)
train_idx <- head(all_idx, 70)
test_idx <- tail(all_idx, 30)

# With `groups` supplied, the inspection can flag patients that appear on
# both sides of the split.
result <- borg_inspect(clinical_data, train_idx = train_idx, test_idx = test_idx,
                       groups = "patient_id")
result
## ----cv-folds-----------------------------------------------------------------
# Request 5 cross-validation folds (`v = 5`) from the diagnosis-mode call.
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5)

# Number of folds
length(result$folds)

# Train/test sizes of the first fold.
fold1 <- result$folds[[1]]
cat("Fold 1 - Train:", length(fold1$train),
    "Test:", length(fold1$test), "\n")

## ----certificate--------------------------------------------------------------
# Build a validation certificate from the diagnosis object.
cert <- borg_certificate(result$diagnosis, data = spatial_data)
cert

## ----export, eval=FALSE-------------------------------------------------------
# Not run (extracted with eval=FALSE): write the validation record to disk
# in YAML or JSON form.
# # Export to file
# borg_export(result$diagnosis, spatial_data, "validation.yaml")
# borg_export(result$diagnosis, spatial_data, "validation.json")
## ----methods-text-------------------------------------------------------------
# Default APA style
# Re-run the spatial diagnosis and capture the generated methods text
# returned by summary().
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
methods_text <- summary(result)
## ----methods-nature, eval=FALSE-----------------------------------------------
# The chunks below were extracted with eval=FALSE, so purl() commented them
# out; they show alternative journal styles for the methods text.
# # Nature style
# summary(result, style = "nature")
#
# # Ecology style
# summary(result, style = "ecology")
## ----methods-with-comparison, eval=FALSE--------------------------------------
# Not run: fold an empirical CV-strategy comparison into the methods text.
# comparison <- borg_compare_cv(spatial_data, response ~ lon + lat,
# coords = c("lon", "lat"))
# summary(result, comparison = comparison)
## ----compare-cv---------------------------------------------------------------
# Empirically compare cross-validation strategies for a simple linear
# model of the spatial response.
comparison <- borg_compare_cv(
  spatial_data,
  formula = response ~ lon + lat,
  coords = c("lon", "lat"),
  repeats = 5, # kept deliberately small here; use more repeats in practice
  v = 5
)
print(comparison)

## ----compare-cv-plot, fig.width=7, fig.height=5-------------------------------
# Visualise the strategy comparison.
plot(comparison)
## ----power-analysis-----------------------------------------------------------
# Clustered data: 20 sites with 10 observations each, where most of the
# variance sits between sites (sd = 2) rather than within them (sd = 0.5).
site_id <- rep(seq_len(20), each = 10)
site_effect <- rep(rnorm(20, sd = 2), each = 10)
clustered_data <- data.frame(
  site = site_id,
  value = site_effect + rnorm(200, sd = 0.5)
)

pw <- borg_power(clustered_data, groups = "site", target = "value")
print(pw)
summary(pw)
# NOTE(review): the three lines below are website boilerplate carried over
# from the page this script was scraped from; they are not R code and are
# commented out so the file parses.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.