R/batch_pcc.R
In IOBR: Immune Oncology Biological Research

Documented in batch_pcc

#' Batch Calculation of Partial Correlation Coefficients
#'
#' @description
#' Computes partial correlation coefficients between multiple features and a
#' target variable while controlling for an interference (confounding) variable.
#' Adjusts p-values for multiple testing using the Benjamini-Hochberg method.
#'
#' @param input Data frame containing feature variables, target variable, and
#'   interference variable.
#' @param interferenceid Character string specifying the column name of the
#'   interference (confounding) variable to control for.
#' @param target Character string specifying the column name of the target
#'   variable.
#' @param features Character vector specifying the column names of feature
#'   variables to correlate with the target.
#' @param method Character string specifying the correlation method. Options are
#'   `"pearson"`, `"spearman"`, or `"kendall"`. Default is `"pearson"`.
#'
#' @return Tibble containing the following columns for each feature:
#' \describe{
#'   \item{sig_names}{Feature name}
#'   \item{p.value}{Raw p-value}
#'   \item{statistic}{Partial correlation coefficient}
#'   \item{p.adj}{Adjusted p-value (Benjamini-Hochberg method)}
#'   \item{log10pvalue}{Negative log10-transformed p-value}
#'   \item{stars}{Significance stars: **** p.adj<0.0001, *** p.adj<0.001,
#'     ** p.adj<0.01, * p.adj<0.05, + p.adj<0.5}
#' }
#'
#' @author Rongfang Shen
#' @export
#'
#' @examples
#' # Create small example data
#' set.seed(123)
#' test_data <- data.frame(
#'   TumorPurity = runif(100),
#'   TargetVar = rnorm(100),
#'   Signature1 = rnorm(100),
#'   Signature2 = rnorm(100)
#' )
#' # Calculate partial correlations controlling for tumor purity
#' res <- batch_pcc(
#'   input = test_data,
#'   interferenceid = "TumorPurity",
#'   target = "TargetVar",
#'   method = "pearson",
#'   features = c("Signature1", "Signature2")
#' )
#' head(res)
batch_pcc <- function(input,
                      interferenceid,
                      target,
                      features,
                      method = c("pearson", "spearman", "kendall")) {
  if (is.null(input)) return(NULL)
  # Input validation
  if (!is.data.frame(input)) {
    cli::cli_abort("{.arg input} must be a non-null data frame.")
  }
  if (nrow(input) == 0) {
    cli::cli_abort("{.arg input} has no rows.")
  }
  if (!interferenceid %in% colnames(input)) {
    cli::cli_abort(
      "Interference variable {.val {interferenceid}} not found in data."
    )
  }
  if (!target %in% colnames(input)) {
    cli::cli_abort("Target variable {.val {target}} not found in data.")
  }
  if (!is.character(features) || length(features) == 0) {
    cli::cli_abort("{.arg features} must be a non-empty character vector.")
  }

  method <- rlang::arg_match(method)

  # Filter valid features
  features <- setdiff(features, c(interferenceid, target))
  valid_features <- features[features %in% colnames(input)]
  invalid_features <- setdiff(features, colnames(input))

  if (length(invalid_features) > 0) {
    cli::cli_alert_warning(
      "Ignoring {length(invalid_features)} invalid feature{?s}:",
      " {.val {invalid_features}}}"
    )
  }

  if (length(valid_features) == 0) {
    cli::cli_abort("No valid features found in data.")
  }

  cli::cli_alert_info(
    "Computing {method} partial correlation for",
    " {length(valid_features)} feature{?s}"
  )

  # Partial correlation test function
  pcor_test <- function(x, y, z, method) {
    dat <- stats::na.omit(cbind(x, y, z))
    if (nrow(dat) < 4) {
      return(c(estimate = NA_real_, p.value = NA_real_))
    }

    r_mat <- stats::cor(dat, method = method)
    rxy <- r_mat[1, 2]
    rxz <- r_mat[1, 3]
    ryz <- r_mat[2, 3]

    # Partial correlation formula
    rho <- (rxy - rxz * ryz) / sqrt((1 - rxz^2) * (1 - ryz^2))

    n <- nrow(dat)
    if (abs(rho) >= 1) {
      return(c(estimate = rho, p.value = 0))
    }

    tstat <- rho * sqrt((n - 3) / (1 - rho^2))
    pval <- 2 * stats::pt(abs(tstat), df = n - 3, lower.tail = FALSE)

    c(estimate = rho, p.value = pval)
  }

  # Vectorized partial correlation calculation
  results <- vapply(valid_features, function(feat) {
    pcor_test(
      input[[feat]],
      input[[target]],
      input[[interferenceid]],
      method = method
    )
  }, numeric(2))

  # Build results data frame
  cc <- data.frame(
    sig_names = valid_features,
    p.value = results["p.value", ],
    statistic = results["estimate", ],
    stringsAsFactors = FALSE,
    row.names = NULL
  )

  cc$p.adj <- stats::p.adjust(cc$p.value, method = "BH")
  cc$log10pvalue <- -log10(cc$p.value)
  cc$stars <- cut(cc$p.adj,
    breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, 0.5, Inf),
    labels = c("****", "***", "**", "*", "+", "")
  )

  cc <- cc[order(cc$p.value), , drop = FALSE]

  cli::cli_alert_success("Partial correlation analysis complete")

  tibble::as_tibble(cc)
}