Nothing
# Target leakage and feature engineering leakage checks for borg_validate()
#' Check for target leakage in features
#' @noRd
.check_target_leakage <- function(data, train_idx, test_idx, workflow) {
risks <- list()
# If no target column specified, skip
if (is.null(workflow$target_col)) {
return(risks)
}
target_col <- workflow$target_col
if (!target_col %in% names(data)) {
return(risks)
}
target <- data[[target_col]]
features <- data[, setdiff(names(data), target_col), drop = FALSE]
# ===========================================================================
# 1. Direct correlation check
# ===========================================================================
if (is.numeric(target)) {
numeric_features <- features[vapply(features, is.numeric, logical(1))]
if (length(numeric_features) > 0) {
cor_risks <- .detect_correlation_leakage(
numeric_features, target, train_idx, target_col_name = target_col
)
risks <- c(risks, cor_risks)
}
}
# ===========================================================================
# 2. Suspicious naming patterns (common leakage indicators)
# ===========================================================================
leaky_patterns <- c(
"^outcome", "^target", "^label", "^y_",
"_outcome$", "_target$", "_label$",
"^future_", "_future$", "^next_", "_next$",
"^result", "_result$",
"_encoded$", "_mean_target$", "_target_mean$"
)
for (col in names(features)) {
for (pattern in leaky_patterns) {
if (grepl(pattern, col, ignore.case = TRUE)) {
risks <- c(risks, list(.new_risk(
type = "target_leakage_naming",
severity = "soft_inflation",
description = sprintf(
"Feature '%s' has suspicious naming pattern suggesting potential target leakage",
col
),
source_object = col
)))
break
}
}
}
# ===========================================================================
# 3. Perfect categorical prediction (target encoding leak)
# ===========================================================================
for (col in names(features)) {
feature <- features[[col]]
# Check categorical/factor features
if (is.factor(feature) || is.character(feature)) {
train_feature <- as.character(feature[train_idx])
train_target <- target[train_idx]
# Check if categories perfectly predict target
if (is.factor(train_target) || is.character(train_target)) {
# For categorical target: check if each feature level maps to one target level
cross_tab <- tryCatch(
table(train_feature, train_target),
error = function(e) NULL
)
if (!is.null(cross_tab) && nrow(cross_tab) > 1 && ncol(cross_tab) > 1) {
# Check if each row has only one non-zero entry (perfect separation)
rows_with_single_class <- sum(apply(cross_tab, 1, function(r) sum(r > 0) == 1))
if (rows_with_single_class == nrow(cross_tab)) {
risks <- c(risks, list(.new_risk(
type = "target_leakage_categorical",
severity = "hard_violation",
description = sprintf(
"Categorical feature '%s' perfectly separates target classes (likely target encoding or derived feature)",
col
),
affected_indices = integer(0),
source_object = col
)))
}
}
} else if (is.numeric(train_target)) {
# For numeric target: check if variance within categories is near zero
within_var <- tryCatch({
category_vars <- tapply(train_target, train_feature, var, na.rm = TRUE)
mean(category_vars, na.rm = TRUE)
}, error = function(e) NA)
total_var <- var(train_target, na.rm = TRUE)
if (!is.na(within_var) && !is.na(total_var) && total_var > 0) {
# R-squared equivalent
explained_ratio <- 1 - (within_var / total_var)
if (explained_ratio > 0.99) {
risks <- c(risks, list(.new_risk(
type = "target_leakage_categorical",
severity = "hard_violation",
description = sprintf(
"Categorical feature '%s' explains %.1f%% of target variance (likely derived from outcome)",
col, explained_ratio * 100
),
affected_indices = integer(0),
source_object = col
)))
}
}
}
}
}
# ===========================================================================
# 4. Temporal ordering check (if timestamp column provided)
# ===========================================================================
if (!is.null(workflow$time_col)) {
time_col <- workflow$time_col
if (time_col %in% names(data)) {
timestamps <- data[[time_col]]
# Check each feature for future information
for (col in names(features)) {
feature <- features[[col]]
# Skip non-numeric
if (!is.numeric(feature)) next
# Check if feature values correlate with future timestamps
# A leaky feature might show values that correlate with future outcomes
# This is a heuristic check - not definitive
# If feature has different variance in test vs train proportionally to time
train_times <- timestamps[train_idx]
test_times <- timestamps[test_idx]
if (inherits(train_times, c("Date", "POSIXct", "POSIXt"))) {
# Check if test times are all after train times (proper temporal split)
max_train_time <- max(train_times, na.rm = TRUE)
min_test_time <- min(test_times, na.rm = TRUE)
if (min_test_time < max_train_time) {
# Only flag once, not per feature
if (col == names(features)[1]) {
risks <- c(risks, list(.new_risk(
type = "temporal_leak",
severity = "soft_inflation",
description = sprintf(
"Temporal split is not strict: test data (%s) overlaps with training period (max train: %s)",
as.character(min_test_time), as.character(max_train_time)
),
affected_indices = test_idx[test_times < max_train_time],
source_object = time_col
)))
}
}
}
}
}
}
# ===========================================================================
# 5. Group mean leakage (target encoding without proper CV)
# ===========================================================================
if (!is.null(workflow$group_col)) {
group_col <- workflow$group_col
if (group_col %in% names(data)) {
groups <- data[[group_col]]
# Check for features that look like group-level target means
for (col in names(features)) {
feature <- features[[col]]
if (!is.numeric(feature) || !is.numeric(target)) next
# Compute what the group means would be on training data
train_groups <- groups[train_idx]
train_targets <- target[train_idx]
train_features <- feature[train_idx]
group_means <- tapply(train_targets, train_groups, mean, na.rm = TRUE)
# Check if feature values match group means suspiciously well
expected_from_groups <- group_means[as.character(train_groups)]
if (all(!is.na(expected_from_groups)) && all(!is.na(train_features))) {
cor_with_group_mean <- tryCatch(
cor(train_features, expected_from_groups, use = "complete.obs"),
error = function(e) NA
)
if (!is.na(cor_with_group_mean) && abs(cor_with_group_mean) > 0.99) {
risks <- c(risks, list(.new_risk(
type = "target_leakage_group_mean",
severity = "hard_violation",
description = sprintf(
"Feature '%s' appears to be group-level target mean (correlation %.4f with group means)",
col, cor_with_group_mean
),
affected_indices = integer(0),
source_object = col
)))
}
}
}
}
}
risks
}
#' Check for feature engineering leakage
#' @noRd
.check_feature_engineering <- function(data, train_idx, test_idx, workflow) {
risks <- list()
# ===========================================================================
# 1. Check for global statistics that should be train-only
# ===========================================================================
for (col in names(data)) {
feature <- data[[col]]
if (!is.numeric(feature)) next
train_vals <- feature[train_idx]
test_vals <- feature[test_idx]
all_vals <- feature
# Check if feature appears to be standardized using global stats
# A properly standardized feature on train only would have mean ~0 only on train
train_mean <- mean(train_vals, na.rm = TRUE)
train_sd <- sd(train_vals, na.rm = TRUE)
all_mean <- mean(all_vals, na.rm = TRUE)
all_sd <- sd(all_vals, na.rm = TRUE)
# If the feature has mean ~0 and sd ~1 on ALL data but not on train,
# it was standardized on full data
if (!is.na(all_sd) && all_sd > 0 &&
!is.na(train_sd) && train_sd > 0) {
# Check if standardized on full data (all_mean ~0, all_sd ~1)
is_globally_standardized <- abs(all_mean) < 0.01 && abs(all_sd - 1) < 0.01
# Check if NOT standardized on train only (train_mean != 0 or train_sd != 1)
train_not_standard <- abs(train_mean) > 0.05 || abs(train_sd - 1) > 0.05
if (is_globally_standardized && train_not_standard) {
risks <- c(risks, list(.new_risk(
type = "feature_engineering_leak",
severity = "hard_violation",
description = sprintf(
"Feature '%s' appears standardized on full data (global: mean=%.3f, sd=%.3f; train: mean=%.3f, sd=%.3f)",
col, all_mean, all_sd, train_mean, train_sd
),
affected_indices = test_idx,
source_object = col
)))
}
}
}
# ===========================================================================
# 2. Check for rank/percentile features computed on full data
# ===========================================================================
for (col in names(data)) {
feature <- data[[col]]
if (!is.numeric(feature)) next
# Percentile features are typically in [0, 1] or [0, 100]
feature_range <- range(feature, na.rm = TRUE)
is_percentile_like <- (feature_range[1] >= 0 && feature_range[2] <= 1) ||
(feature_range[1] >= 0 && feature_range[2] <= 100 &&
all(feature == floor(feature), na.rm = TRUE))
if (is_percentile_like && grepl("rank|percentile|quantile|pct", col, ignore.case = TRUE)) {
# Check if the ranking was done on full data
# If train values don't span the full range but test does, that's suspicious
train_range <- range(feature[train_idx], na.rm = TRUE)
test_range <- range(feature[test_idx], na.rm = TRUE)
# If test range extends beyond train range significantly, ranking used test data
if (test_range[2] > train_range[2] * 1.1 || test_range[1] < train_range[1] * 0.9) {
# This could be legitimate, so just flag as soft
risks <- c(risks, list(.new_risk(
type = "feature_engineering_rank",
severity = "soft_inflation",
description = sprintf(
"Feature '%s' appears to be a rank/percentile that may use test data (train range: [%.2f, %.2f], test range: [%.2f, %.2f])",
col, train_range[1], train_range[2], test_range[1], test_range[2]
),
affected_indices = integer(0),
source_object = col
)))
}
}
}
# ===========================================================================
# 3. Check for binned features where bin edges used test data
# ===========================================================================
for (col in names(data)) {
feature <- data[[col]]
if (!is.numeric(feature) && !is.factor(feature)) next
# Detect binned features by looking for few unique values with specific patterns
if (is.numeric(feature)) {
n_unique <- length(unique(feature[!is.na(feature)]))
n_total <- sum(!is.na(feature))
# Features with very few unique values relative to size might be binned
if (n_unique >= 2 && n_unique <= 20 && n_unique < n_total * 0.1) {
# Check if bin assignments differ systematically between train/test
# If binning used test data, the test distribution should be "too perfect"
train_tab <- table(feature[train_idx])
test_tab <- table(feature[test_idx])
# If test has bins not in train, binning may have used test data
test_only_bins <- setdiff(names(test_tab), names(train_tab))
if (length(test_only_bins) > 0) {
risks <- c(risks, list(.new_risk(
type = "feature_engineering_binning",
severity = "soft_inflation",
description = sprintf(
"Binned feature '%s' has %d bin(s) that appear only in test data",
col, length(test_only_bins)
),
affected_indices = integer(0),
source_object = col
)))
}
}
}
}
# ===========================================================================
# 4. Check for lag/lead features in time series (if time column provided)
# ===========================================================================
if (!is.null(workflow$time_col) && !is.null(workflow$target_col)) {
time_col <- workflow$time_col
target_col <- workflow$target_col
if (time_col %in% names(data) && target_col %in% names(data)) {
timestamps <- data[[time_col]]
target <- data[[target_col]]
# Check for features that look like lagged target values
for (col in setdiff(names(data), c(time_col, target_col))) {
feature <- data[[col]]
if (!is.numeric(feature) || !is.numeric(target)) next
# Check if this feature is a shifted version of target
for (lag in 1:min(5, floor(length(target) / 10))) {
if (length(target) <= lag) next
# Compare feature to lagged target
lagged_target <- c(rep(NA, lag), target[1:(length(target) - lag)])
if (all(!is.na(feature)) && all(!is.na(lagged_target[-c(1:lag)]))) {
cor_with_lag <- tryCatch(
cor(feature[-(1:lag)], lagged_target[-(1:lag)], use = "complete.obs"),
error = function(e) NA
)
if (!is.na(cor_with_lag) && abs(cor_with_lag) > 0.99) {
risks <- c(risks, list(.new_risk(
type = "feature_engineering_lag",
severity = "soft_inflation",
description = sprintf(
"Feature '%s' highly correlates (%.4f) with target lagged by %d - verify lag direction",
col, cor_with_lag, lag
),
affected_indices = integer(0),
source_object = col
)))
break
}
}
}
}
}
}
risks
}
#' Check for rolling/window feature engineering leakage
#'
#' Detects lag features, rolling means, cumulative sums, and other
#' time-derived features that may have been computed using future data.
#' @noRd
.check_rolling_features <- function(data, train_idx, test_idx, workflow) {
risks <- list()
# Only relevant if time column is specified
time_col <- workflow$time_col
if (is.null(time_col) || !time_col %in% names(data)) return(risks)
timestamps <- data[[time_col]]
time_numeric <- as.numeric(timestamps)
ord <- order(time_numeric)
train_sorted <- sort(train_idx)
test_sorted <- sort(test_idx)
target_col <- workflow$target_col
exclude_cols <- c(time_col, target_col)
feature_cols <- setdiff(names(data), exclude_cols)
feature_cols <- feature_cols[vapply(data[feature_cols], is.numeric, logical(1))]
for (col in feature_cols) {
feature <- data[[col]]
# Heuristic 1: Name-based detection
is_rolling_name <- grepl(
"roll|rolling|window|moving|lag|lead|cumsum|cummax|cummin|cummean|ewm|ema|sma|diff_",
col, ignore.case = TRUE
)
# Heuristic 2: Leading NA pattern (lag/rolling features have NAs at start)
na_positions <- which(is.na(feature[ord]))
has_leading_nas <- length(na_positions) > 0 &&
all(na_positions == seq_along(na_positions))
n_leading_nas <- if (has_leading_nas) length(na_positions) else 0
if (!is_rolling_name && n_leading_nas == 0) next
# Heuristic 3: Boundary smoothness check
# A rolling feature computed on full data will be smooth across the
# train/test boundary. One computed on train-only will show a
# discontinuity (jump or NA) at the boundary.
if (length(test_sorted) >= 3 && length(train_sorted) >= 3) {
boundary_test <- test_sorted[1:min(3, length(test_sorted))]
boundary_train <- train_sorted[(max(1, length(train_sorted) - 2)):length(train_sorted)]
test_vals <- feature[boundary_test]
train_vals <- feature[boundary_train]
if (all(!is.na(test_vals)) && all(!is.na(train_vals)) && is_rolling_name) {
all_boundary <- c(train_vals, test_vals)
boundary_smooth <- sd(diff(all_boundary), na.rm = TRUE)
overall_roughness <- sd(diff(feature[ord]), na.rm = TRUE)
if (!is.na(boundary_smooth) && !is.na(overall_roughness) &&
overall_roughness > 0) {
smoothness_ratio <- boundary_smooth / overall_roughness
if (smoothness_ratio < 0.3) {
risks <- c(risks, list(.new_risk(
type = "rolling_feature_leak",
severity = "soft_inflation",
description = sprintf(
"Feature '%s' appears to be a rolling computation with no discontinuity at the train/test boundary (smoothness ratio: %.2f). Verify it was computed on training data only.",
col, smoothness_ratio
),
affected_indices = boundary_test,
source_object = col
)))
next
}
}
}
}
# Heuristic 4: Named rolling/lag feature with leading NAs
if (is_rolling_name && n_leading_nas > 0) {
first_test_in_order <- which(ord %in% test_sorted)[1]
if (!is.na(first_test_in_order) && first_test_in_order > n_leading_nas) {
risks <- c(risks, list(.new_risk(
type = "rolling_feature_leak",
severity = "soft_inflation",
description = sprintf(
"Feature '%s' has %d leading NAs (suggesting window size %d). Values at test indices may include future data in their window. Verify it was computed on training data only.",
col, n_leading_nas, n_leading_nas
),
affected_indices = test_sorted[1:min(n_leading_nas, length(test_sorted))],
source_object = col
)))
}
}
}
risks
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.