# BORG vignette examples: R code extracted from the package vignette.
## ----setup, include = FALSE---------------------------------------------------
# Vignette-wide knitr options: collapse source and output into one block,
# mark output lines with "#>", and default all figures to 7 x 5 inches.
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  fig.height = 5,
  fig.width = 7
)

# Attach BORG, the package demonstrated throughout this vignette.
library(BORG)
## ----basic-usage--------------------------------------------------------------
# Build a small toy data set: two independent predictors and a response.
set.seed(42)
data <- data.frame(
  x1 = rnorm(100),
  x2 = rnorm(100),
  y = rnorm(100)
)

# Manual hold-out split: first 70 rows train, last 30 rows test.
train_idx <- seq_len(70)
test_idx <- seq.int(71, 100)

# Ask BORG to inspect the split for problems (overlap, leakage, ...).
result <- borg_inspect(data, train_idx = train_idx, test_idx = test_idx)
result

## ----overlap-detection--------------------------------------------------------
# Rows 51-60 sit in BOTH the train and the test indices, so this split is
# invalid; the inspection should flag the overlap.
bad_result <- borg_inspect(data, train_idx = seq_len(60), test_idx = seq.int(51, 100))
bad_result
## ----diagnosis-mode-----------------------------------------------------------
# Simulate 200 spatial observations: coordinates, one covariate, a response.
set.seed(42)
spatial_data <- data.frame(
  lon = runif(200, -10, 10),
  lat = runif(200, -10, 10),
  elevation = rnorm(200, 500, 100),
  response = rnorm(200)
)

# Diagnosis mode: with no split supplied, borg() examines the data
# structure itself and constructs cross-validation folds.
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
result

## ----validation-mode----------------------------------------------------------
# Validation mode: score the leakage risk of a user-supplied split
# (first 150 rows train, last 50 rows test).
risk <- borg(spatial_data, train_idx = seq_len(150), test_idx = seq.int(151, 200))
risk
## ----plot-results, fig.width=7, fig.height=5----------------------------------
# Plot the risk assessment
# `risk` is the validation-mode object created above from the manual split
# of spatial_data; plot() dispatches on its class (presumably a BORG
# plot method -- confirm against the package documentation).
plot(risk)
## ----summary-results----------------------------------------------------------
# Generate methods text for publications
# `result` is the diagnosis-mode object returned by borg() above.
summary(result)
## ----spatial-example----------------------------------------------------------
# Spatial structure: pass the coordinate column names via `coords` and
# read the recommended CV scheme off the diagnosis slot.
result_spatial <- borg(spatial_data, target = "response", coords = c("lon", "lat"))
result_spatial$diagnosis@recommended_cv

## ----temporal-example---------------------------------------------------------
# Temporal structure: one daily observation over 200 days; the value is a
# cumulative-sum random walk.
temporal_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 200),
  value = cumsum(rnorm(200))
)
result_temporal <- borg(temporal_data, time = "date", target = "value")
result_temporal$diagnosis@recommended_cv

## ----grouped-example----------------------------------------------------------
# Grouped structure: 20 sites with 10 measurements each.
grouped_data <- data.frame(
  site = rep(seq_len(20), each = 10),
  measurement = rnorm(200)
)
result_grouped <- borg(grouped_data, groups = "site", target = "measurement")
result_grouped$diagnosis@recommended_cv
## ----target-leakage-----------------------------------------------------------
# Simulate target leakage: start from three independent columns...
leaky_data <- data.frame(
  x = rnorm(100),
  leaked_feature = rnorm(100), # overwritten below to leak the outcome
  outcome = rnorm(100)
)
# ...then rebuild leaked_feature as the outcome plus a little noise, so it
# is almost perfectly correlated with the target.
leaky_data$leaked_feature <- leaky_data$outcome + rnorm(100, sd = 0.05)

# With `target` supplied, the inspection can detect the suspicious feature.
result <- borg_inspect(
  leaky_data,
  train_idx = seq_len(70),
  test_idx = seq.int(71, 100),
  target = "outcome"
)
result

## ----group-leakage------------------------------------------------------------
# Clinical-style data: 10 patients, 10 repeated visits per patient.
clinical_data <- data.frame(
  patient_id = rep(seq_len(10), each = 10),
  visit = rep(seq_len(10), times = 10),
  measurement = rnorm(100)
)

# A fully random row split scatters each patient's visits across both
# sets -- a classic group-leakage mistake (hence "BAD").
set.seed(123)
all_idx <- sample(100)
train_idx <- head(all_idx, 70)
test_idx <- tail(all_idx, 30)

# With `groups` supplied, the inspection can flag patients that appear on
# both sides of the split.
result <- borg_inspect(clinical_data, train_idx = train_idx, test_idx = test_idx,
                       groups = "patient_id")
result
## ----cv-folds-----------------------------------------------------------------
# Request 5 cross-validation folds (`v = 5`) from the diagnosis-mode call.
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response", v = 5)

# Number of folds
length(result$folds)

# Train/test sizes of the first fold.
fold1 <- result$folds[[1]]
cat("Fold 1 - Train:", length(fold1$train),
    "Test:", length(fold1$test), "\n")

## ----certificate--------------------------------------------------------------
# Build a validation certificate from the diagnosis object.
cert <- borg_certificate(result$diagnosis, data = spatial_data)
cert

## ----export, eval=FALSE-------------------------------------------------------
# Not run (extracted with eval=FALSE): write the validation record to disk
# in YAML or JSON form.
# # Export to file
# borg_export(result$diagnosis, spatial_data, "validation.yaml")
# borg_export(result$diagnosis, spatial_data, "validation.json")
## ----methods-text-------------------------------------------------------------
# Default APA style
# Re-run the spatial diagnosis and capture the generated methods text
# returned by summary().
result <- borg(spatial_data, coords = c("lon", "lat"), target = "response")
methods_text <- summary(result)
## ----methods-nature, eval=FALSE-----------------------------------------------
# The chunks below were extracted with eval=FALSE, so purl() commented them
# out; they show alternative journal styles for the methods text.
# # Nature style
# summary(result, style = "nature")
#
# # Ecology style
# summary(result, style = "ecology")
## ----methods-with-comparison, eval=FALSE--------------------------------------
# Not run: fold an empirical CV-strategy comparison into the methods text.
# comparison <- borg_compare_cv(spatial_data, response ~ lon + lat,
# coords = c("lon", "lat"))
# summary(result, comparison = comparison)
## ----compare-cv---------------------------------------------------------------
# Empirically compare cross-validation strategies for a simple linear
# model of the spatial response.
comparison <- borg_compare_cv(
  spatial_data,
  formula = response ~ lon + lat,
  coords = c("lon", "lat"),
  repeats = 5, # kept deliberately small here; use more repeats in practice
  v = 5
)
print(comparison)

## ----compare-cv-plot, fig.width=7, fig.height=5-------------------------------
# Visualise the strategy comparison.
plot(comparison)
## ----power-analysis-----------------------------------------------------------
# Clustered data: 20 sites with 10 observations each, where most of the
# variance sits between sites (sd = 2) rather than within them (sd = 0.5).
site_id <- rep(seq_len(20), each = 10)
site_effect <- rep(rnorm(20, sd = 2), each = 10)
clustered_data <- data.frame(
  site = site_id,
  value = site_effect + rnorm(200, sd = 0.5)
)

pw <- borg_power(clustered_data, groups = "site", target = "value")
print(pw)
summary(pw)
# NOTE(review): the three lines below are website boilerplate carried over
# from the page this script was scraped from; they are not R code and are
# commented out so the file parses.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.