bioLeak Audit Report

if (is.null(params$audit) || !inherits(params$audit, "LeakAudit")) {
  stop("Parameter 'audit' must be a LeakAudit object.", call. = FALSE)
}

audit <- params$audit

empty_df <- function(msg) data.frame(note = msg, stringsAsFactors = FALSE)
safe_df <- function(x, msg = "No results available.") {
  if (is.null(x) || !is.data.frame(x) || !nrow(x)) return(empty_df(msg))
  x
}

task <- tryCatch(audit@fit@task, error = function(e) NA_character_)
outcome <- tryCatch(audit@fit@outcome, error = function(e) NA_character_)
metric_summary <- tryCatch(audit@fit@metric_summary, error = function(e) NULL)
perm_gap <- tryCatch(audit@permutation_gap, error = function(e) NULL)
batch_assoc <- tryCatch(audit@batch_assoc, error = function(e) NULL)
target_assoc <- tryCatch(audit@target_assoc, error = function(e) NULL)
duplicates <- tryCatch(audit@duplicates, error = function(e) NULL)
perm_values <- tryCatch(audit@perm_values, error = function(e) numeric(0))
fold_status <- tryCatch(audit@fit@info$fold_status, error = function(e) NULL)
provenance <- tryCatch(audit@fit@info$provenance, error = function(e) NULL)
bio_version <- tryCatch(as.character(utils::packageVersion("bioLeak")),
                        error = function(e) "unknown")
has_ggplot2 <- requireNamespace("ggplot2", quietly = TRUE)

Overview

Cross-validated Metrics

print(safe_df(metric_summary, "Metric summary is unavailable."))

Permutation-Gap Test

The permutation-gap test compares the observed cross-validated metric against a null distribution constructed by permuting labels. A large gap with a small p-value indicates the model has learned signal beyond random label assignment.

print(safe_df(perm_gap, "Permutation results are unavailable."))
if (length(perm_values) > 0L) {
  obs <- NA_real_
  if (is.data.frame(perm_gap) && nrow(perm_gap) > 0L && "metric_obs" %in% names(perm_gap)) {
    obs <- as.numeric(perm_gap$metric_obs[[1]])
  }
  hist(
    perm_values,
    breaks = max(10L, min(50L, floor(sqrt(length(perm_values))))),
    main = "Permutation Metric Distribution",
    xlab = "Metric",
    col = "grey85",
    border = "white"
  )
  if (is.finite(obs)) {
    abline(v = obs, col = "firebrick", lwd = 2)
  }
} else {
  plot.new()
  text(0.5, 0.5, "No permutation values available.")
}

Batch / Study Association

print(safe_df(batch_assoc, "No batch/study association results available."))

Confounder Sensitivity

tryCatch({
  cs <- confounder_sensitivity(audit@fit)
  if (is.data.frame(cs) && nrow(cs) > 0L) {
    print(cs)
  } else {
    print(empty_df("Confounder sensitivity analysis returned no results."))
  }
}, error = function(e) {
  print(empty_df(paste("Confounder sensitivity unavailable:", e$message)))
})

Calibration

tryCatch({
  if (identical(task, "binomial")) {
    cal <- calibration_summary(audit@fit)
    if (is.data.frame(cal) && nrow(cal) > 0L) {
      print(cal)
    } else {
      print(empty_df("Calibration summary returned no results."))
    }
  } else {
    print(empty_df("Calibration analysis is only available for binomial tasks."))
  }
}, error = function(e) {
  print(empty_df(paste("Calibration unavailable:", e$message)))
})
tryCatch({
  if (identical(task, "binomial") && has_ggplot2) {
    cal <- calibration_summary(audit@fit)
    if (is.data.frame(cal) && nrow(cal) > 0L &&
        all(c("bin_midpoint", "observed_rate") %in% names(cal))) {
      p <- ggplot2::ggplot(cal, ggplot2::aes(x = bin_midpoint, y = observed_rate)) +
        ggplot2::geom_point() +
        ggplot2::geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
        ggplot2::labs(x = "Predicted probability", y = "Observed rate",
                      title = "Calibration Curve") +
        ggplot2::theme_minimal()
      print(p)
    }
  }
}, error = function(e) NULL)

Target Leakage Scan

if (is.data.frame(target_assoc) && nrow(target_assoc) > 0L) {
  print(utils::head(target_assoc, 20L))
} else {
  print(empty_df("No target association results available."))
}

Duplicate Detection

if (is.data.frame(duplicates) && nrow(duplicates) > 0L) {
  print(utils::head(duplicates, 20L))
} else {
  print(empty_df("No near-duplicate samples detected."))
}

Fold Balance

tryCatch({
  if (!is.null(fold_status) && is.data.frame(fold_status) && nrow(fold_status) > 0L) {
    status_tbl <- table(fold_status$status)
    barplot(status_tbl, col = c(success = "steelblue", skipped = "orange",
                                 failed = "firebrick")[names(status_tbl)],
            main = "Fold Status Distribution",
            ylab = "Count", las = 1)
  } else {
    plot.new()
    text(0.5, 0.5, "No fold status information available.")
  }
}, error = function(e) {
  plot.new()
  text(0.5, 0.5, "Fold balance plot unavailable.")
})

Methods and Interpretation

This report summarizes a post-hoc leakage audit performed by bioLeak. Each section targets a different aspect of data leakage and confounding:

Session Information

tryCatch({
  if (!is.null(provenance) && is.list(provenance)) {
    cat("R version:", provenance$r_version, "\n")
    cat("Platform:", provenance$platform, "\n")
    cat("Timestamp:", format(provenance$timestamp), "\n")
    if (!is.na(provenance$git_sha)) {
      cat("Git SHA:", provenance$git_sha, "\n")
    }
    if (!is.null(provenance$hardware)) {
      cat("System:", provenance$hardware$sysname, "/",
          provenance$hardware$machine, "\n")
    }
    cat("\nKey packages:\n")
    print(utils::head(provenance$packages, 20))
  } else {
    print(utils::sessionInfo())
  }
}, error = function(e) {
  print(utils::sessionInfo())
})


Try the bioLeak package in your browser

Any scripts or data that you put into this service are public.

bioLeak documentation built on March 6, 2026, 1:06 a.m.