R/counts_normalization.R

Defines functions counts_normalization

Documented in counts_normalization

#' Calculate CPM Normalization for Raw Gene Expression Counts 
#'
#' @param raw_counts numeric matrix or data frame of raw gene expression counts.  
#' @param MARGIN "row" or "col" to identify the place of sample (library size). If the \code{raw_counts} matrix has a \code{nxG} size, put "row" if it's opposite put "col".
#' @param log2 a logical indicating whether the normalized counts will be transformed in log2. Default is \code{TRUE}.
#' @param plot a logical indicating whether a graph will be displayed for the normalization check. Default is \code{TRUE}.
#' @param norm_method a character string indicating which method to use to normalize the data. See \code{\link[edgeR]{normLibSizes}} for details. Default method is \code{TMM}.
#'
#' @return normalized counts matrix 
#' 
#' @importFrom utils menu
#' @importFrom edgeR DGEList calcNormFactors
#'  
#' @author Mélanie Huchon & Quentin Laval
#' 
#' @export
#' 
#' @examples
#' 
#' #Generate data
#' sim_Genes <- c(runif(90,0,1), rbinom(60,2,0.4), runif(30,1250,2000), runif(20,100,300))
#' raw_data <- matrix(sim_Genes, nrow = 10)
#' colnames(raw_data) <- paste0("G", sample(1:ncol(raw_data), ncol(raw_data), replace = FALSE))
#' rownames(raw_data) <- paste0("Sample_", 1:nrow(raw_data))
#' 
#' #Use the method
#' norm_data <- counts_normalization(raw_counts = data.frame(raw_data), MARGIN = "row")

counts_normalization <- function(raw_counts, MARGIN = c("row", "col"), log2 = TRUE, plot = TRUE, norm_method = c("TMM", "TMMwsp", "none", "RLE", "upperquartile")){
  
  # Ensure that MARGIN is either "row" or "col"
  MARGIN <- match.arg(MARGIN, several.ok = FALSE) 
  norm_method <- match.arg(norm_method, several.ok = FALSE)
  
  if (MARGIN == "row" && ncol(raw_counts) < nrow(raw_counts)) {
    input <- menu(c("Yes", "No"), title = "Are you sure that your samples are in rows? (nb_col < nb_row)")
    if (input == 2) stop("The normalization has been stopped. Please check your input dimensions.")
  }
  
  if (MARGIN == "col" && ncol(raw_counts) > nrow(raw_counts)) {
    input <- menu(c("Yes", "No"), title = "Are you sure that your samples are in columns? (nb_col > nb_row)")
    if (input == 2) stop("The normalization has been stopped. Please check your input dimensions.")
  }
  
  
  # Transpose if samples are in rows
  if (MARGIN == "row") {
    raw_counts <- t(raw_counts)
  }
  
  # Create DGEList object and calculate TMM normalization factors
  dge <- DGEList(counts = raw_counts)
  dge <- calcNormFactors(dge, method = norm_method)
  
  # Apply log2 transformation on CPM if requested
  if (log2) {
    norm_counts <- mapply(function(v, lib_size, norm_factor) {
      log2((v + 0.5) / (lib_size * norm_factor + 1) * 1e6)
    },
    as.data.frame(dge$counts), 
    dge$samples$lib.size, 
    dge$samples$norm.factors,
    SIMPLIFY = TRUE)
  } else {
    norm_counts <- mapply(function(v, lib_size, norm_factor) {
      v / (lib_size * norm_factor) * 1e6
    },
    as.data.frame(dge$counts), 
    dge$samples$lib.size, 
    dge$samples$norm.factors,
    SIMPLIFY = TRUE)
  }
  
  # Transpose back if samples were originally in rows
  if (MARGIN == "row") {
    norm_counts <- t(norm_counts)
  }
  
  # Plot normalized counts if requested
  if (plot) {
    if (log2) {
      norm_counts_plot <- 2^norm_counts
    } else {
      norm_counts_plot <- norm_counts
    }
    
    barplot(if (MARGIN == "row") rowSums(norm_counts) else colSums(norm_counts), 
            xlab = "Samples", main = "Normalized Counts")
  }
  
  return(as.data.frame(norm_counts))
}
sistm/sistmr documentation built on June 11, 2025, 1:33 a.m.