R/normalisation.R

Defines functions geomxNorm rpkm2tpm calNormCount

Documented in geomxNorm

# DEseq2 normalization
calNormCount <- function(spe_object, log = TRUE) {
  # get raw count
  count_df <- assay(spe_object, 1)
  # compute geometric mean
  loggeomeans <- rowMeans(log(count_df))
  # compute size factor
  sf <- apply(count_df, 2, function(x) {
    exp(stats::median((log(x) - loggeomeans)[is.finite(loggeomeans) & x > 0]))
  })

  # normalise by size factor
  norm_count <- t(t(count_df) / sf)

  if (isTRUE(log)) {
    lognorm_count <- log2(norm_count + 1)
  } else {
    lognorm_count <- norm_count
  }
  return(list(lognorm_count, sf))
}



rpkm2tpm <- function(x) {
  colSumMat <- edgeR::expandAsMatrix(colSums(x, na.rm = TRUE),
    byrow = TRUE,
    dim = dim(x)
  )
  tpm <- x / colSumMat * 1e6
  return(tpm)
}


#' Perform normalization to GeoMX data
#'
#' @param spe_object A SpatialExperiment object.
#' @param method Normalization method to use. Options: TMM, RPKM, TPM, CPM, upperquartile, sizefactor. RPKM and TPM require gene length information, which should be added into rowData(spe). Note that TMM here is TMM + CPM.
#' @param log Log-transformed or not.
#'
#' @return A SpatialExperiment object, with the second assay being the normalized count matrix. The normalised count is stored in the assay slot called "logcounts" by default. 
#' With method TMM and sizefactor, the norm.factor will be saved in the metadata of the SpatialExperiment object.
#' @export
#'
#' @references Robinson, M. D., McCarthy, D. J., & Smyth, G. K. (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics, 26(1), 139-140.
#' @references Love, M., Anders, S., & Huber, W. (2014). Differential analysis of count data–the DESeq2 package. Genome Biol, 15(550), 10-1186.
#'
#' @note The normalised count is not intended to be used directly for linear modelling. For linear modelling, it is better to include the normalized factors in the "norm.factors" column of the DGEList object.
#' 
#' @examples
#' data("dkd_spe_subset")
#'
#' spe_tmm <- geomxNorm(dkd_spe_subset, method = "TMM")
#' spe_upq <- geomxNorm(dkd_spe_subset, method = "upperquartile")
#' spe_deseqnorm <- geomxNorm(dkd_spe_subset, method = "sizefactor")
#'
geomxNorm <- function(spe_object, method = c(
                        "TMM", "RPKM", "TPM", "CPM",
                        "upperquartile", "sizefactor"
                      ),
                      log = TRUE) {

  if (!(method %in% c("TMM", "RPKM", "TPM", "CPM", "upperquartile", "sizefactor"))) {
    stop("Please make sure method matched one of the following strings: 
         TMM, 
         RPKM, 
         TPM, 
         CPM, 
         upperquartile, 
         sizefactor")
  }
  
  method <- match.arg(method)

  # spe object to dgelist
  spe <- spe_object

  y <- edgeR::SE2DGEList(spe)

  ## TMM

  if (method == "TMM") {
    S4Vectors::metadata(spe)$norm.factor <- edgeR::calcNormFactors(y)$samples$norm.factors
    S4Vectors::metadata(spe)$norm.method <- "TMM"
    if (isTRUE(log)) {
      assay(spe, "logcounts") <- edgeR::calcNormFactors(y) |>
        (\(.) edgeR::cpm(., log = TRUE))()
    } else {
      assay(spe, "logcounts") <- edgeR::calcNormFactors(y) |>
        (\(.) edgeR::cpm(.))()
    }
  }

  ## RPKM

  if (method == "RPKM") {
    S4Vectors::metadata(spe)$norm.method <- "RPKM"
    if (isTRUE(log)) {
      assay(spe, "logcounts") <- edgeR::rpkm(y, log = TRUE, prior.count = 0)
    } else {
      assay(spe, "logcounts") <- edgeR::rpkm(y, log = FALSE, prior.count = 0)
    }
  }


  ## TPM

  if (method == "TPM") {
    S4Vectors::metadata(spe)$norm.method <- "TPM"
    if (isTRUE(log)) {
      assay(spe, "logcounts") <- log(rpkm2tpm(edgeR::rpkm(y)) + 1, 2)
    } else {
      assay(spe, "logcounts") <- rpkm2tpm(edgeR::rpkm(y))
    }
  }


  ## upper quantile

  if (method == "upperquartile") {
    S4Vectors::metadata(spe)$norm.method <- "upperquartile"
    S4Vectors::metadata(spe)$norm.factor <- edgeR::calcNormFactors(y, method = "upperquartile")$samples$norm.factors
    if (isTRUE(log)) {
      assay(spe, "logcounts") <- edgeR::calcNormFactors(y, method = "upperquartile") |>
        (\(.) edgeR::cpm(., log = TRUE))()
    } else {
      assay(spe, "logcounts") <- edgeR::calcNormFactors(y, method = "upperquartile") |>
        (\(.) edgeR::cpm(.))()
    }
  }


  ## calculating size factor based on geomean

  if (method == "sizefactor") {
    S4Vectors::metadata(spe)$norm.method <- "sizefactor"
    S4Vectors::metadata(spe)$norm.factor <- calNormCount(spe, log = TRUE)[[2]]
    if (isTRUE(log)) {
      assay(spe, "logcounts") <- calNormCount(spe, log = TRUE)[[1]]
    } else {
      assay(spe, "logcounts") <- calNormCount(spe, log = FALSE)[[1]]
    }
  }

  return(spe)
}

#' Transfer SpatialExperiment object into DGEList object for DE analysis
#'
#' @param spe SpatialExperiment object.
#'
#' @return A DGEList.
#' @export 
#'
#' @examples 
#' data("dkd_spe_subset")
#' 
#' 
#' spe_tmm <- geomxNorm(dkd_spe_subset, method = "TMM")
#' dge <- spe2dge(spe_tmm)
#' 
spe2dge <- function(spe){
  
  dge <- edgeR::SE2DGEList(spe)
  nm <- S4Vectors::metadata(spe)$norm.method
  if(nm %in% c("TMM", "upperquartile", "sizefactor")){
    dge$samples$norm.factors <- S4Vectors::metadata(spe)$norm.factor
  }
  
  return(dge)
}



utils::globalVariables(c("."))
DavisLaboratory/standR documentation built on June 28, 2024, 11:32 a.m.