R/cyt_univiariate.R
In CytoProfile: Cytokine Profiling Analysis Tool

Documented in cyt_univariate

#' Pairwise Univariate Tests Between Two Groups
#'
#' @description
#' `cyt_univariate` supports additional scaling options and explicit
#' choice of statistical test.
#' For each categorical predictor with exactly two levels and each
#' numeric outcome, a two‑sample t‑test or Wilcoxon rank–sum test is
#' performed.  Results are returned either as a list of test objects
#' or, if `format_output = TRUE`, as a tidy data frame with one
#' row per comparison.
#'
#' @param data A data frame or matrix containing both categorical
#'   and numeric variables.
#' @param scale A character specifying a transformation to apply to
#'   numeric variables prior to testing.  Choices are `NULL` (no
#'   transformation), "log2", "log10", "zscore", or
#'   "custom".  When set to "custom", supply a function via
#'   `custom_fn`.
#' @param method Character specifying the test to perform.  Use
#'   "auto" (default) to select between t‑test and Wilcoxon based
#'   on Shapiro–Wilk normality tests for each outcome; "ttest" to
#'   always use Student’s t‑test; or "wilcox" to always use the
#'   Wilcoxon rank–sum test.
#' @param verbose Logical indicating whether to return the results.
#'   Provided for backward compatibility but has no effect on printing.
#' @param format_output Logical.  If `TRUE`, returns the results as
#'   a tidy data frame; if `FALSE` (default), returns a list of
#'   test objects similar to the original function.
#' @param custom_fn A function to apply when `scale = "custom"`.
#' @param p_adjust_method Character or \code{NULL}. Method passed to
#'   \code{p.adjust()} for correcting p-values across all comparisons
#'   (e.g., \code{"BH"} for Benjamini-Hochberg). If \code{NULL} (default)
#'   no adjustment is performed.
#' @return If `format_output = FALSE`, a named list of test objects
#'   keyed by "Outcome_Categorical".  If `format_output = TRUE`, a
#'   data frame with columns `Outcome`, `Categorical`, `Comparison`,
#'   `Test`, `Estimate`, `Statistic`, and `P_value`.
#' @examples
#' data_df <- ExampleData1[, -c(3)]
#' data_df <- dplyr::filter(data_df, Group != "ND", Treatment != "Unstimulated")
#' cyt_univariate(data_df[, c(1:2, 5:6)], scale = "log2",
#'                method = "auto", format_output = TRUE)
#' @author Shubh Saraswat
#' @importFrom stats as.formula sd shapiro.test t.test wilcox.test
#' @export
cyt_univariate <- function(
  data,
  scale = NULL,
  method = c("auto", "ttest", "wilcox"),
  verbose = TRUE,
  format_output = FALSE,
  custom_fn = NULL,
  p_adjust_method = NULL
) {
  names(data) <- make.names(names(data), unique = TRUE)
  method <- match.arg(method)
  # Convert to data frame
  x1_df <- as.data.frame(data)
  # Convert character variables to factors
  cat_vars <- sapply(x1_df, is.character)
  if (any(cat_vars)) {
    x1_df[cat_vars] <- lapply(x1_df[cat_vars], as.factor)
  }
  # Identify categorical predictors and continuous variables
  cat_preds <- sapply(x1_df, is.factor)
  cont_vars <- sapply(x1_df, is.numeric)
  # Apply scaling using shared utility for consistency
  if (!is.null(scale)) {
    x1_df <- apply_scale(
      data = x1_df,
      scale = scale,
      custom_fn = custom_fn
    )
    # Recompute predictors and continuous-variable indicators after scaling
    cat_preds <- sapply(x1_df, is.factor)
    cont_vars <- sapply(x1_df, is.numeric)
  }
  # Empty list to store test results
  test_results <- list()
  # Loop over categorical predictors and continuous outcomes
  for (cat_var in names(x1_df)[cat_preds]) {
    # Only consider predictors with exactly two levels
    if (length(levels(x1_df[[cat_var]])) != 2) {
      next
    }
    for (outcome in names(x1_df)[cont_vars]) {
      group_levels <- levels(x1_df[[cat_var]])
      group1 <- x1_df[[outcome]][x1_df[[cat_var]] == group_levels[1]]
      group2 <- x1_df[[outcome]][x1_df[[cat_var]] == group_levels[2]]
      # Check for sufficient data and variance in both groups
      if (length(group1) < 2 || length(group2) < 2) {
        warning(
          "Skipping test for ",
          outcome,
          " in ",
          cat_var,
          " due to insufficient data in one of the groups."
        )
        next
      } else if (stats::sd(group1) == 0 || stats::sd(group2) == 0) {
        warning(
          "Skipping test for ",
          outcome,
          " in ",
          cat_var,
          " due to low variance."
        )
        next
      }
      # Determine test to use
      test_used <- switch(
        method,
        auto = {
          p1 <- tryCatch(
            stats::shapiro.test(group1)$p.value,
            error = function(e) 0
          )
          p2 <- tryCatch(
            stats::shapiro.test(group2)$p.value,
            error = function(e) 0
          )
          if (p1 > 0.05 && p2 > 0.05) "ttest" else "wilcox"
        },
        ttest = "ttest",
        wilcox = "wilcox"
      )
      # Perform the test
      if (test_used == "ttest") {
        tt <- stats::t.test(
          stats::as.formula(paste(outcome, "~", cat_var)),
          data = x1_df
        )
      } else {
        tt <- stats::wilcox.test(
          stats::as.formula(paste(outcome, "~", cat_var)),
          data = x1_df,
          conf.int = TRUE,
          exact = FALSE
        )
      }
      key <- paste(outcome, cat_var, sep = "_")
      test_results[[key]] <- tt
    }
  }
  if (length(test_results) == 0) {
    return("No valid tests were performed.")
  }
  # Return results in tidy format if requested
  if (!format_output) {
    return(test_results)
  }
  # Format into data frame
  out_df <- do.call(
    rbind,
    lapply(names(test_results), function(key) {
      tt <- test_results[[key]]
      parts <- strsplit(key, "_")[[1]]
      outcome <- parts[1]
      cat_var <- parts[2]
      lvls <- levels(x1_df[[cat_var]])
      comp <- paste(lvls[1], "vs", lvls[2])
      est <- if (!is.null(tt$estimate)) unname(tt$estimate)[1] else NA_real_
      stat <- if (!is.null(tt$statistic)) unname(tt$statistic)[1] else NA_real_
      data.frame(
        Outcome = outcome,
        Categorical = cat_var,
        Comparison = comp,
        Test = tt$method,
        Estimate = round(est, 3),
        Statistic = round(stat, 3),
        P_value = round(tt$p.value, 3),
        stringsAsFactors = FALSE
      )
    })
  )
  if (!is.null(p_adjust_method)) {
    out_df$P_adj <- adjust_p(out_df$P_value, method = p_adjust_method)
  }
  out_df
}