# R/testEdges.R

#' @title Test edges from SCORPION networks
#' @description Performs statistical testing of network edges from runSCORPION output.
#' Supports single-sample tests (testing if edges differ from zero) and two-sample
#' tests (comparing edges between two groups).
#' @param networksDF A data.frame output from \code{\link{runSCORPION}} containing
#'   TF-target pairs as rows and network identifiers as columns.
#' @param testType Character specifying the test type. Options are:
#'   \itemize{
#'     \item{"single": Single-sample test (one-sample t-test against zero)}
#'     \item{"two.sample": Two-sample comparison (t-test between two groups)}
#'   }
#' @param group1 Character vector of column names in \code{networksDF} representing
#'   the first group (or the only group for single-sample tests).
#' @param group2 Character vector of column names in \code{networksDF} representing
#'   the second group. Required for two-sample tests, ignored for single-sample tests.
#' @param paired Logical indicating whether to perform a paired t-test. Default FALSE.
#'   When TRUE, group1 and group2 must have the same length and be in matched order
#'   (e.g., group1[1] is paired with group2[1]). Useful for comparing matched samples
#'   such as Tumor vs Normal from the same patient.
#' @param alternative Character specifying the alternative hypothesis. Options:
#'   "two.sided" (default), "greater", or "less".
#' @param padjustMethod Character specifying the p-value adjustment method for multiple
#'   testing correction. See \code{\link[stats]{p.adjust}} for options. Default "BH"
#'   (Benjamini-Hochberg FDR).
#' @param minMeanEdge Numeric threshold for minimum mean absolute edge weight to
#'   include in testing. Edges with mean absolute weight below this threshold are
#'   excluded. Default 0 (no filtering).
#' @return A data.frame containing:
#'   \itemize{
#'     \item{tf: Transcription factor}
#'     \item{target: Target gene}
#'     \item{meanEdge: Mean edge weight}
#'     \item{tStatistic: Test statistic}
#'     \item{pValue: Raw p-value}
#'     \item{pAdj: Adjusted p-value}
#'     \item{For two-sample tests: meanGroup1, meanGroup2, diffMean (Group1 - Group2), log2FoldChange}
#'   }
#' @details
#' For single-sample tests, the function tests whether the mean edge weight across
#' replicates significantly differs from zero using a one-sample t-test.
#'
#' For two-sample tests, the function compares edge weights between two groups
#' using Welch's t-test (unequal variances assumed).
#'
#' For paired tests, the function calculates the difference between matched pairs
#' and performs a one-sample t-test on the differences (testing if mean difference
#' differs from zero). This is appropriate when samples are matched (e.g., Tumor
#' and Normal from the same patient).
#'
#' Edges are tested independently, and p-values are adjusted for multiple testing
#' using the specified method.
#'
#' Per-edge statistics are computed with row-wise operations (\code{rowMeans},
#' \code{rowSums}, and per-row \code{sd}/\code{var}) rather than by calling
#' \code{t.test} on every edge, which keeps the function practical for large-scale
#' analyses with millions of edges. A single-edge cross-check against
#' \code{stats::t.test} is sketched at the end of the examples.
#' @examples
#' \dontrun{
#' # Load test data and build networks by donor and region
#' # Note: T = Tumor, N = Normal, B = Border regions
#' data(scorpionTest)
#' nets <- runSCORPION(
#'   gexMatrix = scorpionTest$gex,
#'   tfMotifs = scorpionTest$tf,
#'   ppiNet = scorpionTest$ppi,
#'   cellsMetadata = scorpionTest$metadata,
#'   groupBy = c("donor", "region")
#' )
#'
#' # Single-sample test: Test if edges in Tumor region differ from zero
#' tumor_nets <- grep("--T$", colnames(nets), value = TRUE)  # T = Tumor
#' results_single <- testEdges(
#'   networksDF = nets,
#'   testType = "single",
#'   group1 = tumor_nets
#' )
#'
#' # Two-sample test: Compare Tumor vs Border regions
#' tumor_nets <- grep("--T$", colnames(nets), value = TRUE)  # T = Tumor
#' border_nets <- grep("--B$", colnames(nets), value = TRUE)  # B = Border
#' results_tumor_vs_border <- testEdges(
#'   networksDF = nets,
#'   testType = "two.sample",
#'   group1 = tumor_nets,
#'   group2 = border_nets
#' )
#'
#' # View top differential edges (Tumor vs Border)
#' head(results_tumor_vs_border[order(results_tumor_vs_border$pAdj), ])
#'
#' # Compare Tumor vs Normal regions
#' normal_nets <- grep("--N$", colnames(nets), value = TRUE)  # N = Normal
#' results_tumor_vs_normal <- testEdges(
#'   networksDF = nets,
#'   testType = "two.sample",
#'   group1 = tumor_nets,
#'   group2 = normal_nets
#' )
#'
#' # Filter by minimum edge weight for focused analysis
#' results_filtered <- testEdges(
#'   networksDF = nets,
#'   testType = "two.sample",
#'   group1 = tumor_nets,
#'   group2 = normal_nets,
#'   minMeanEdge = 0.1  # Only test edges with |mean| >= 0.1
#' )
#'
#' # Paired t-test: Compare matched Tumor vs Normal samples (same patient)
#' # Ensure columns are ordered by patient: P31--T with P31--N, P32--T with P32--N, etc.
#' tumor_nets_ordered <- c("P31--T", "P32--T", "P33--T")
#' normal_nets_ordered <- c("P31--N", "P32--N", "P33--N")
#' results_paired <- testEdges(
#'   networksDF = nets,
#'   testType = "two.sample",
#'   group1 = tumor_nets_ordered,
#'   group2 = normal_nets_ordered,
#'   paired = TRUE
#' )
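#'
#' # Cross-check sketch (assumes no edges were filtered out, so result rows
#' # follow the row order of 'nets'): the vectorized Welch test for one edge
#' # should agree with stats::t.test() on the same values
#' i <- 1  # hypothetical edge index
#' welch <- t.test(unlist(nets[i, tumor_nets]), unlist(nets[i, border_nets]))
#' c(welch$statistic, results_tumor_vs_border$tStatistic[i])
#' c(welch$p.value, results_tumor_vs_border$pValue[i])
#'
#' # Likewise, the paired test is equivalent to a one-sample t-test on the
#' # per-patient differences
#' d <- unlist(nets[i, tumor_nets_ordered]) - unlist(nets[i, normal_nets_ordered])
#' c(t.test(d)$statistic, results_paired$tStatistic[i])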
#' }
#' @export
#' @importFrom stats pt p.adjust var sd
testEdges <- function(networksDF,
                      testType = c("single", "two.sample"),
                      group1,
                      group2 = NULL,
                      paired = FALSE,
                      alternative = c("two.sided", "greater", "less"),
                      padjustMethod = "BH",
                      minMeanEdge = 0) {
  
  # Input validation
  testType <- match.arg(testType)
  alternative <- match.arg(alternative)
  
  if (missing(group1) || is.null(group1)) {
    stop("group1 must be specified")
  }
  
  if (!all(group1 %in% colnames(networksDF))) {
    missing_cols <- setdiff(group1, colnames(networksDF))
    stop("Some group1 columns not found in networksDF: ", paste(missing_cols, collapse = ", "))
  }
  
  if (testType == "two.sample") {
    if (is.null(group2)) {
      stop("group2 must be specified for two.sample test")
    }
    if (!all(group2 %in% colnames(networksDF))) {
      missing_cols <- setdiff(group2, colnames(networksDF))
      stop("Some group2 columns not found in networksDF: ", paste(missing_cols, collapse = ", "))
    }
    if (paired && length(group1) != length(group2)) {
      stop("For paired tests, group1 and group2 must have the same length")
    }
  }
  
  if (paired && testType == "single") {
    stop("Paired tests require testType = 'two.sample'")
  }
  
  # Check that the required 'tf' and 'target' identifier columns are present
  if (!all(c("tf", "target") %in% colnames(networksDF))) {
    stop("networksDF must contain 'tf' and 'target' columns")
  }
  
  # Perform tests based on testType
  if (testType == "single") {
    results <- testEdgesSingle(
      networksDF = networksDF,
      group1 = group1,
      alternative = alternative,
      minMeanEdge = minMeanEdge
    )
  } else if (paired) {
    results <- testEdgesPaired(
      networksDF = networksDF,
      group1 = group1,
      group2 = group2,
      alternative = alternative,
      minMeanEdge = minMeanEdge
    )
  } else {
    results <- testEdgesTwoSample(
      networksDF = networksDF,
      group1 = group1,
      group2 = group2,
      alternative = alternative,
      minMeanEdge = minMeanEdge
    )
  }
  
  # Adjust p-values
  results$pAdj <- p.adjust(results$pValue, method = padjustMethod)
  
  rownames(results) <- NULL
  
  return(results)
}

#' @keywords internal
testEdgesSingle <- function(networksDF, group1, alternative, minMeanEdge) {
  
  # Extract edge weights for group1
  edge_data <- networksDF[, group1, drop = FALSE]
  edge_matrix <- as.matrix(edge_data)
  
  # Calculate mean edge weight
  meanEdge <- rowMeans(edge_matrix, na.rm = TRUE)
  
  # Filter by minimum mean edge weight
  keep_idx <- abs(meanEdge) >= minMeanEdge
  edge_matrix <- edge_matrix[keep_idx, , drop = FALSE]
  meanEdge <- meanEdge[keep_idx]
  tf_target <- networksDF[keep_idx, c("tf", "target")]
  
  # Vectorized one-sample t-test computation
  # Calculate statistics for all edges at once
  
  # Count non-NA values per row
  n_valid <- rowSums(!is.na(edge_matrix))
  
  # Calculate standard deviation per row (using only non-NA values)
  sd_edge <- apply(edge_matrix, 1, sd, na.rm = TRUE)
  
  # Calculate t-statistic: t = (mean - mu) / (sd / sqrt(n))
  # For one-sample test against mu = 0
  test_stats <- meanEdge / (sd_edge / sqrt(n_valid))
  
  # Calculate p-values from t-distribution
  # df = n - 1
  df <- n_valid - 1
  
  pvalues <- switch(alternative,
    "two.sided" = 2 * pt(abs(test_stats), df = df, lower.tail = FALSE),
    "greater" = pt(test_stats, df = df, lower.tail = FALSE),
    "less" = pt(test_stats, df = df, lower.tail = TRUE)
  )
  
  # Set NA for edges with insufficient data
  insufficient_data <- n_valid < 2 | is.na(sd_edge) | sd_edge == 0
  test_stats[insufficient_data] <- NA
  pvalues[insufficient_data] <- NA
  
  # Compile results
  results <- data.frame(
    tf = tf_target$tf,
    target = tf_target$target,
    meanEdge = meanEdge,
    tStatistic = test_stats,
    pValue = pvalues,
    stringsAsFactors = FALSE
  )
  
  return(results)
}
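
# Sanity-check sketch (commented illustration only, not package code): the
# row-wise statistics above should match stats::t.test() edge by edge, e.g.
#   m  <- matrix(rnorm(12), nrow = 3)
#   tt <- t.test(m[1, ], mu = 0)
#   all.equal(unname(tt$statistic),
#             mean(m[1, ]) / (sd(m[1, ]) / sqrt(ncol(m))))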

#' @keywords internal
testEdgesTwoSample <- function(networksDF, group1, group2, alternative, minMeanEdge) {
  
  # Extract edge weights for both groups
  edge_data1 <- networksDF[, group1, drop = FALSE]
  edge_data2 <- networksDF[, group2, drop = FALSE]
  edge_matrix1 <- as.matrix(edge_data1)
  edge_matrix2 <- as.matrix(edge_data2)
  
  # Calculate mean edge weights
  meanEdge1 <- rowMeans(edge_matrix1, na.rm = TRUE)
  meanEdge2 <- rowMeans(edge_matrix2, na.rm = TRUE)
  meanEdge <- (meanEdge1 + meanEdge2) / 2
  diffMean <- meanEdge1 - meanEdge2
  
  # Filter by minimum mean edge weight
  keep_idx <- abs(meanEdge) >= minMeanEdge
  edge_matrix1 <- edge_matrix1[keep_idx, , drop = FALSE]
  edge_matrix2 <- edge_matrix2[keep_idx, , drop = FALSE]
  meanEdge1 <- meanEdge1[keep_idx]
  meanEdge2 <- meanEdge2[keep_idx]
  meanEdge <- meanEdge[keep_idx]
  diffMean <- diffMean[keep_idx]
  tf_target <- networksDF[keep_idx, c("tf", "target")]
  
  # Vectorized two-sample t-test computation (Welch's t-test)
  # Count non-NA values per row for each group
  n1 <- rowSums(!is.na(edge_matrix1))
  n2 <- rowSums(!is.na(edge_matrix2))
  
  # Calculate variance per row (using only non-NA values)
  var1 <- apply(edge_matrix1, 1, var, na.rm = TRUE)
  var2 <- apply(edge_matrix2, 1, var, na.rm = TRUE)
  
  # Calculate Welch's t-statistic: t = (mean1 - mean2) / sqrt(var1/n1 + var2/n2)
  se <- sqrt(var1/n1 + var2/n2)
  test_stats <- diffMean / se
  
  # Calculate degrees of freedom (Welch-Satterthwaite equation)
  df <- (var1/n1 + var2/n2)^2 / ((var1/n1)^2/(n1-1) + (var2/n2)^2/(n2-1))
  
  # Calculate p-values from t-distribution
  pvalues <- switch(alternative,
    "two.sided" = 2 * pt(abs(test_stats), df = df, lower.tail = FALSE),
    "greater" = pt(test_stats, df = df, lower.tail = FALSE),
    "less" = pt(test_stats, df = df, lower.tail = TRUE)
  )
  
  # Set NA for edges with insufficient data
  insufficient_data <- n1 < 2 | n2 < 2 | is.na(var1) | is.na(var2) | 
                       var1 == 0 | var2 == 0 | se == 0 | is.na(se)
  test_stats[insufficient_data] <- NA
  pvalues[insufficient_data] <- NA
  df[insufficient_data] <- NA
  
  # Calculate log2 fold change
  # Initialize with NA
  log2FC <- rep(NA_real_, length(meanEdge1))
  
  # Define threshold for near-zero values
  epsilon <- 1e-16
  
  # Case 1: Both groups are positive and non-zero
  idx_both_pos <- meanEdge1 > epsilon & meanEdge2 > epsilon
  if (any(idx_both_pos, na.rm = TRUE)) {
    log2FC[idx_both_pos] <- log2(meanEdge1[idx_both_pos] / meanEdge2[idx_both_pos])
  }
  
  # Case 2: Both groups are negative and non-zero
  idx_both_neg <- meanEdge1 < -epsilon & meanEdge2 < -epsilon
  if (any(idx_both_neg, na.rm = TRUE)) {
    log2FC[idx_both_neg] <- log2(abs(meanEdge1[idx_both_neg]) / abs(meanEdge2[idx_both_neg]))
  }
  
  # Case 3: Mixed signs but both non-zero
  idx_mixed <- abs(meanEdge1) > epsilon & abs(meanEdge2) > epsilon & 
               !(idx_both_pos | idx_both_neg)
  if (any(idx_mixed, na.rm = TRUE)) {
    log2FC[idx_mixed] <- sign(diffMean[idx_mixed]) * 
                         log2(abs(meanEdge1[idx_mixed]) / abs(meanEdge2[idx_mixed]))
  }
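
  # Worked numbers for the three cases above (toy values, for illustration only):
  #   meanEdge1 =  4, meanEdge2 =  1  ->  log2(4/1) = 2            (both positive)
  #   meanEdge1 = -4, meanEdge2 = -1  ->  log2(4/1) = 2            (both negative: ratio of magnitudes)
  #   meanEdge1 =  4, meanEdge2 = -1  ->  sign(5) * log2(4/1) = 2  (mixed signs: signed by diffMean)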
  
  # Compile results
  results <- data.frame(
    tf = tf_target$tf,
    target = tf_target$target,
    meanGroup1 = meanEdge1,
    meanGroup2 = meanEdge2,
    diffMean = diffMean,
    log2FoldChange = log2FC,
    meanEdge = meanEdge,
    tStatistic = test_stats,
    pValue = pvalues,
    stringsAsFactors = FALSE
  )
  
  return(results)
}

#' @keywords internal
testEdgesPaired <- function(networksDF, group1, group2, alternative, minMeanEdge) {
  
  # Extract edge weights for both groups
  edge_data1 <- networksDF[, group1, drop = FALSE]
  edge_data2 <- networksDF[, group2, drop = FALSE]
  edge_matrix1 <- as.matrix(edge_data1)
  edge_matrix2 <- as.matrix(edge_data2)
  
  # Calculate mean edge weights
  meanEdge1 <- rowMeans(edge_matrix1, na.rm = TRUE)
  meanEdge2 <- rowMeans(edge_matrix2, na.rm = TRUE)
  meanEdge <- (meanEdge1 + meanEdge2) / 2
  
  # Calculate paired differences
  diff_matrix <- edge_matrix1 - edge_matrix2
  diffMean <- rowMeans(diff_matrix, na.rm = TRUE)
  
  # Filter by minimum mean edge weight
  keep_idx <- abs(meanEdge) >= minMeanEdge
  diff_matrix <- diff_matrix[keep_idx, , drop = FALSE]
  edge_matrix1 <- edge_matrix1[keep_idx, , drop = FALSE]
  edge_matrix2 <- edge_matrix2[keep_idx, , drop = FALSE]
  meanEdge1 <- meanEdge1[keep_idx]
  meanEdge2 <- meanEdge2[keep_idx]
  meanEdge <- meanEdge[keep_idx]
  diffMean <- diffMean[keep_idx]
  tf_target <- networksDF[keep_idx, c("tf", "target")]
  
  # Vectorized paired t-test computation
  # For paired t-test: t = mean(d) / (sd(d) / sqrt(n))
  # where d = differences between pairs
  
  # Count valid pairs per row (both values must be non-NA)
  valid_pairs <- !is.na(edge_matrix1) & !is.na(edge_matrix2)
  n_valid <- rowSums(valid_pairs)
  
  # Calculate standard deviation of differences
  sd_diff <- apply(diff_matrix, 1, sd, na.rm = TRUE)
  
  # Calculate t-statistic
  test_stats <- diffMean / (sd_diff / sqrt(n_valid))
  
  # Calculate degrees of freedom: df = n - 1
  df <- n_valid - 1
  
  # Calculate p-values from t-distribution
  pvalues <- switch(alternative,
    "two.sided" = 2 * pt(abs(test_stats), df = df, lower.tail = FALSE),
    "greater" = pt(test_stats, df = df, lower.tail = FALSE),
    "less" = pt(test_stats, df = df, lower.tail = TRUE)
  )
  
  # Set NA for edges with insufficient data
  insufficient_data <- n_valid < 2 | is.na(sd_diff) | sd_diff == 0
  test_stats[insufficient_data] <- NA
  pvalues[insufficient_data] <- NA
  
  # Calculate log2 fold change
  log2FC <- rep(NA_real_, length(meanEdge1))
  epsilon <- 1e-16
  
  # Case 1: Both groups are positive and non-zero
  idx_both_pos <- meanEdge1 > epsilon & meanEdge2 > epsilon
  if (any(idx_both_pos, na.rm = TRUE)) {
    log2FC[idx_both_pos] <- log2(meanEdge1[idx_both_pos] / meanEdge2[idx_both_pos])
  }
  
  # Case 2: Both groups are negative and non-zero
  idx_both_neg <- meanEdge1 < -epsilon & meanEdge2 < -epsilon
  if (any(idx_both_neg, na.rm = TRUE)) {
    log2FC[idx_both_neg] <- log2(abs(meanEdge1[idx_both_neg]) / abs(meanEdge2[idx_both_neg]))
  }
  
  # Case 3: Mixed signs but both non-zero
  idx_mixed <- abs(meanEdge1) > epsilon & abs(meanEdge2) > epsilon & 
               !(idx_both_pos | idx_both_neg)
  if (any(idx_mixed, na.rm = TRUE)) {
    log2FC[idx_mixed] <- sign(diffMean[idx_mixed]) * 
                         log2(abs(meanEdge1[idx_mixed]) / abs(meanEdge2[idx_mixed]))
  }
  
  # Compile results
  results <- data.frame(
    tf = tf_target$tf,
    target = tf_target$target,
    meanGroup1 = meanEdge1,
    meanGroup2 = meanEdge2,
    diffMean = diffMean,
    log2FoldChange = log2FC,
    meanEdge = meanEdge,
    tStatistic = test_stats,
    pValue = pvalues,
    stringsAsFactors = FALSE
  )
  
  return(results)
}

#' @title Regression analysis of edges across ordered conditions
#' @description Performs linear regression on network edges from runSCORPION output
#' to identify edges that show significant trends across ordered conditions (e.g.,
#' disease progression: Normal -> Border -> Tumor).
#' @param networksDF A data.frame output from \code{\link{runSCORPION}} containing
#'   TF-target pairs as rows and network identifiers as columns.
#' @param orderedGroups A named list where each element is a character vector of
#'   column names in \code{networksDF}. Names represent ordered conditions
#'   (e.g., list(Normal = c("P31--N", "P32--N"), Border = c("P31--B", "P32--B"),
#'   Tumor = c("P31--T", "P32--T"))). The order of list elements defines the
#'   progression (first to last).
#' @param padjustMethod Character specifying the p-value adjustment method for multiple
#'   testing correction. See \code{\link[stats]{p.adjust}} for options. Default "BH"
#'   (Benjamini-Hochberg FDR).
#' @param minMeanEdge Numeric threshold for minimum mean absolute edge weight to
#'   include in testing. Edges with mean absolute weight below this threshold are
#'   excluded. Default 0 (no filtering).
#' @return A data.frame containing:
#'   \itemize{
#'     \item{tf: Transcription factor}
#'     \item{target: Target gene}
#'     \item{slope: Regression slope (change in edge weight per condition step)}
#'     \item{intercept: Regression intercept}
#'     \item{rSquared: R-squared value (proportion of variance explained)}
#'     \item{fStatistic: F-statistic for the regression}
#'     \item{pValue: Raw p-value for the slope}
#'     \item{pAdj: Adjusted p-value}
#'     \item{meanEdge: Overall mean edge weight across all conditions}
#'     \item{One column per condition showing mean edge weight in that condition}
#'   }
#' @details
#' This function performs simple linear regression for each edge, modeling edge weight
#' as a function of an ordered categorical variable (coded as 0, 1, 2, ... for each
#' condition level).
#'
#' The slope coefficient indicates the average change in edge weight per step along
#' the ordered progression. Positive slopes indicate increasing edge weights,
#' negative slopes indicate decreasing edge weights.
#'
#' The per-edge regressions are computed with closed-form least-squares formulas
#' rather than repeated calls to \code{lm}, keeping the analysis tractable for
#' large edge sets. A single-edge cross-check against \code{stats::lm} is sketched
#' at the end of the examples.
#' @examples
#' \dontrun{
#' # Load test data and build networks by donor and region
#' # Note: T = Tumor, N = Normal, B = Border regions
#' data(scorpionTest)
#' nets <- runSCORPION(
#'   gexMatrix = scorpionTest$gex,
#'   tfMotifs = scorpionTest$tf,
#'   ppiNet = scorpionTest$ppi,
#'   cellsMetadata = scorpionTest$metadata,
#'   groupBy = c("donor", "region")
#' )
#'
#' # Define ordered progression: Normal -> Border -> Tumor
#' normal_nets <- grep("--N$", colnames(nets), value = TRUE)
#' border_nets <- grep("--B$", colnames(nets), value = TRUE)
#' tumor_nets <- grep("--T$", colnames(nets), value = TRUE)
#'
#' ordered_conditions <- list(
#'   Normal = normal_nets,
#'   Border = border_nets,
#'   Tumor = tumor_nets
#' )
#'
#' # Perform regression analysis
#' results_regression <- regressEdges(
#'   networksDF = nets,
#'   orderedGroups = ordered_conditions
#' )
#'
#' # View top edges with strongest trends
#' head(results_regression[order(results_regression$pAdj), ])
#'
#' # Edges with positive slopes (increasing from N to T)
#' increasing <- results_regression[results_regression$pAdj < 0.05 & 
#'                                   results_regression$slope > 0, ]
#' print(paste("Edges increasing along N->B->T:", nrow(increasing)))
#'
#' # Edges with negative slopes (decreasing from N to T)
#' decreasing <- results_regression[results_regression$pAdj < 0.05 & 
#'                                   results_regression$slope < 0, ]
#' print(paste("Edges decreasing along N->B->T:", nrow(decreasing)))
#'
#' # Filter by minimum edge weight and R-squared
#' strong_trends <- results_regression[results_regression$pAdj < 0.05 & 
#'                                      results_regression$rSquared > 0.7 &
#'                                      abs(results_regression$meanEdge) > 0.1, ]
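#'
#' # Cross-check sketch for a single edge (assumes no edges were filtered out,
#' # so result rows follow the row order of 'nets'): the closed-form slope and
#' # p-value should agree with stats::lm on the 0/1/2-coded conditions
#' i <- 1  # hypothetical edge index
#' y <- unlist(nets[i, c(normal_nets, border_nets, tumor_nets)])
#' x <- rep(0:2, times = c(length(normal_nets), length(border_nets),
#'                         length(tumor_nets)))
#' fit <- summary(lm(y ~ x))
#' c(fit$coefficients["x", "Estimate"], results_regression$slope[i])
#' c(fit$coefficients["x", "Pr(>|t|)"], results_regression$pValue[i])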
#' }
#' @export
#' @importFrom stats pf p.adjust
regressEdges <- function(networksDF,
                         orderedGroups,
                         padjustMethod = "BH",
                         minMeanEdge = 0) {
  
  # Input validation
  if (missing(orderedGroups) || is.null(orderedGroups)) {
    stop("orderedGroups must be specified")
  }
  
  if (!is.list(orderedGroups) || is.null(names(orderedGroups))) {
    stop("orderedGroups must be a named list")
  }
  
  if (length(orderedGroups) < 2) {
    stop("orderedGroups must contain at least 2 conditions")
  }
  
  # Validate all columns exist
  all_cols <- unlist(orderedGroups, use.names = FALSE)
  if (!all(all_cols %in% colnames(networksDF))) {
    missing_cols <- setdiff(all_cols, colnames(networksDF))
    stop("Some columns not found in networksDF: ", paste(missing_cols, collapse = ", "))
  }
  
  # Check that the required 'tf' and 'target' identifier columns are present
  if (!all(c("tf", "target") %in% colnames(networksDF))) {
    stop("networksDF must contain 'tf' and 'target' columns")
  }
  
  # Prepare data for regression
  n_conditions <- length(orderedGroups)
  condition_names <- names(orderedGroups)
  
  # Create predictor variable (0, 1, 2, ... for ordered conditions)
  x <- numeric()
  edge_data_list <- list()
  
  for (i in seq_along(orderedGroups)) {
    cols <- orderedGroups[[i]]
    edge_data_list[[i]] <- as.matrix(networksDF[, cols, drop = FALSE])
    x <- c(x, rep(i - 1, length(cols)))  # 0-indexed for first condition
  }
  
  # Combine all edge data
  edge_matrix <- do.call(cbind, edge_data_list)
  
  # Calculate mean edge weight across all conditions
  meanEdge <- rowMeans(edge_matrix, na.rm = TRUE)
  
  # Calculate mean for each condition
  condition_means <- matrix(NA, nrow = nrow(edge_matrix), ncol = n_conditions)
  for (i in seq_along(orderedGroups)) {
    condition_means[, i] <- rowMeans(edge_data_list[[i]], na.rm = TRUE)
  }
  colnames(condition_means) <- paste0("mean", condition_names)
  
  # Filter by minimum mean edge weight
  keep_idx <- abs(meanEdge) >= minMeanEdge
  edge_matrix <- edge_matrix[keep_idx, , drop = FALSE]
  meanEdge <- meanEdge[keep_idx]
  condition_means <- condition_means[keep_idx, , drop = FALSE]
  tf_target <- networksDF[keep_idx, c("tf", "target")]
  
  # Simple linear regression computed per edge via closed-form least squares
  n_edges <- nrow(edge_matrix)
  
  # Initialize result vectors
  slopes <- numeric(n_edges)
  intercepts <- numeric(n_edges)
  r_squared <- numeric(n_edges)
  f_stats <- numeric(n_edges)
  pvalues <- numeric(n_edges)
  
  # Compute regression for each edge
  for (i in seq_len(n_edges)) {
    y <- edge_matrix[i, ]
    
    # Remove NA values
    valid_idx <- !is.na(y)
    y_valid <- y[valid_idx]
    x_valid <- x[valid_idx]
    n_valid <- length(y_valid)
    
    if (n_valid < 3) {
      slopes[i] <- NA
      intercepts[i] <- NA
      r_squared[i] <- NA
      f_stats[i] <- NA
      pvalues[i] <- NA
      next
    }
    
    # Compute regression coefficients
    x_valid_mean <- mean(x_valid)
    y_valid_mean <- mean(y_valid)
    x_valid_centered <- x_valid - x_valid_mean
    y_valid_centered <- y_valid - y_valid_mean
    
    sxy <- sum(x_valid_centered * y_valid_centered)
    sxx_valid <- sum(x_valid_centered^2)
    syy <- sum(y_valid_centered^2)
    
    # Slope and intercept
    slopes[i] <- sxy / sxx_valid
    intercepts[i] <- y_valid_mean - slopes[i] * x_valid_mean
    
    # R-squared
    ss_res <- sum((y_valid - (intercepts[i] + slopes[i] * x_valid))^2)
    ss_tot <- syy
    r_squared[i] <- 1 - (ss_res / ss_tot)
    
    # F-statistic and p-value
    df_reg <- 1
    df_res <- n_valid - 2
    ms_reg <- (ss_tot - ss_res) / df_reg
    ms_res <- ss_res / df_res
    
    if (ms_res > 0) {
      f_stats[i] <- ms_reg / ms_res
      pvalues[i] <- pf(f_stats[i], df1 = df_reg, df2 = df_res, lower.tail = FALSE)
    } else {
      f_stats[i] <- NA
      pvalues[i] <- NA
    }
  }
  
  # Adjust p-values
  pAdj <- p.adjust(pvalues, method = padjustMethod)
  
  # Compile results
  results <- data.frame(
    tf = tf_target$tf,
    target = tf_target$target,
    slope = slopes,
    intercept = intercepts,
    rSquared = r_squared,
    fStatistic = f_stats,
    pValue = pvalues,
    pAdj = pAdj,
    meanEdge = meanEdge,
    condition_means,
    stringsAsFactors = FALSE
  )
  
  return(results)
}
