gbuehler-MiniChip: MiniChip

Documented in SummarizeHeatmaps

#' @title SummarizeHeatmaps
#'
#' @description Summarize heatmap count matrices from individual bam samples.
#'
#' @details This function takes a list of heatmap count matrices (from individual bam files) and summarizes them (by user defined groups).
#' The matrices should have unique row names and consistent dimensions. This implementation is optimized for performance.
#'
#' @param counts A list of counts across peak summits generated by SummitHeatmap.
#' Each element must be a matrix with unique row names.
#' @param sampleList A list where each element is a character vector of sample names that should be grouped.
#' The name of each element of this list specifies the group name.
#' @param summarizing Function to use for summarizing data. Defaults to mean.
#' Must be a function that can be applied to a numeric vector (e.g., mean, median, sum).
#' @param verbose Logical indicating whether to print progress messages. Default is TRUE.
#'
#' @return A list of summarized (by group) counts across peak summits.
#'
#' @examples
#' counts <- list(matrix(rnorm(21000,2,1),ncol=21,
#' nrow=100,dimnames=list(1:100,-10:10)),
#'                matrix(rnorm(21000,2,1),ncol=21,
#'                nrow=100,dimnames=list(1:100,-10:10)),
#'                matrix(rnorm(21000,2,1),ncol=21,
#'                nrow=100,dimnames=list(1:100,-10:10)),
#'                matrix(rnorm(21000,2,1),ncol=21,
#'                nrow=100,dimnames=list(1:100,-10:10)))
#' bamNames <- c("counts1","counts2","counts3","counts4")
#' names(counts) <- bamNames
#' SummarizeHeatmaps(counts,sampleList=list
#' (countsA=c("counts1","counts2"),countsB=c("counts3","counts4")))
#'
#'@importFrom utils head
#'
#' @export
SummarizeHeatmaps <- function(counts, sampleList, summarizing = mean, verbose = TRUE) {
  # Fast input validation
  if (!is.list(counts)) stop("'counts' must be a list of matrices")
  if (length(counts) == 0) stop("'counts' list is empty")
  if (is.null(names(counts))) stop("'counts' list elements must be named")
  if (!is.list(sampleList)) stop("'sampleList' must be a list")
  if (length(sampleList) == 0) stop("'sampleList' is empty")
  if (is.null(names(sampleList))) stop("'sampleList' elements must be named to specify group names")
  if (!is.function(summarizing)) stop("'summarizing' must be a function (e.g., mean, median, sum)")
  
  # Check first matrix once
  if (!is.matrix(counts[[1]])) stop("First element in 'counts' is not a matrix")
  if (is.null(rownames(counts[[1]]))) stop("First matrix does not have row names")
  
  # Store dimensions once
  nrow_first <- nrow(counts[[1]])
  ncol_first <- ncol(counts[[1]])
  
  # Quick check for duplicate rownames in first matrix
  if (anyDuplicated(rownames(counts[[1]])) > 0) {
    dups <- duplicated(rownames(counts[[1]]))
    stop(sprintf("First matrix contains %d duplicated rownames: %s",
                 sum(dups), paste(head(rownames(counts[[1]])[dups], 5), collapse=", ")))
  }
  
  # Check all matrices in one pass
  is_valid <- vapply(seq_along(counts)[-1], function(i) {
    if (!is.matrix(counts[[i]])) {
      stop(sprintf("Element %s in 'counts' is not a matrix", names(counts)[i]))
    }
    if (is.null(rownames(counts[[i]]))) {
      stop(sprintf("Matrix %s does not have row names", names(counts)[i]))
    }
    if (nrow(counts[[i]]) != nrow_first || ncol(counts[[i]]) != ncol_first) {
      stop(sprintf("Matrix %s has inconsistent dimensions (%d×%d vs %d×%d)",
                   names(counts)[i], nrow(counts[[i]]), ncol(counts[[i]]), nrow_first, ncol_first))
    }
    if (anyDuplicated(rownames(counts[[i]])) > 0) {
      dups <- duplicated(rownames(counts[[i]]))
      stop(sprintf("Matrix %s contains %d duplicated rownames: %s",
                   names(counts)[i], sum(dups), paste(head(rownames(counts[[i]])[dups], 5), collapse=", ")))
    }
    return(TRUE)
  }, logical(1))
  
  # Check if all samples in sampleList exist in counts
  all_samples <- names(counts)
  for (i in seq_along(sampleList)) {
    missing_samples <- setdiff(sampleList[[i]], all_samples)
    if (length(missing_samples) > 0) {
      stop(sprintf("Group '%s' contains samples that don't exist in counts: %s",
                   names(sampleList)[i], paste(missing_samples, collapse=", ")))
    }
  }
  
  # Pre-allocate output list
  meanCounts <- vector("list", length(sampleList))
  names(meanCounts) <- names(sampleList)
  
  # Process each group
  for (i in seq_along(sampleList)) {
    group_name <- names(meanCounts)[i]
    if (verbose) message(sprintf("Summarizing group '%s' with %d samples", group_name, length(sampleList[[i]])))
    
    # Check if group has at least one sample
    if (length(sampleList[[i]]) == 0) {
      warning(sprintf("Group '%s' contains no samples, skipping", group_name))
      next
    }
    
    # Extract samples for this group
    group_samples <- sampleList[[i]]
    
    # Optimization: For single sample groups, just copy the matrix
    if (length(group_samples) == 1) {
      meanCounts[[i]] <- counts[[group_samples]]
      next
    }
    
    # For multiple samples, optimize by using arrays directly
    tryCatch({
      # Pre-allocate a 3D array to store all matrices in this group
      arr_size <- c(length(group_samples), nrow_first, ncol_first)
      arr <- array(0, dim = arr_size)
      
      # Fill the array more efficiently
      for (j in seq_along(group_samples)) {
        arr[j,,] <- counts[[group_samples[j]]]
      }
      
      # Apply the summarizing function along the first dimension
      meanCounts[[i]] <- apply(arr, c(2, 3), summarizing)
      
      # Ensure proper dimension names
      rownames(meanCounts[[i]]) <- rownames(counts[[1]])
      colnames(meanCounts[[i]]) <- colnames(counts[[1]])
      
    }, error = function(e) {
      stop(sprintf("Error when summarizing group '%s': %s", group_name, e$message))
    })
  }
  
  return(meanCounts)
}