R/lcpm.R

#' Transform Counts
#'
#' This convenient wrapper function converts raw counts to the log2-counts per 
#' million scale, following normalization for library size and a minimal count 
#' shift.
#'
#' @param dat Omic data matrix or matrix-like object with rows corresponding to
#'   probes and columns to samples.
#' @param filter Optional vector of length 2 specifying the filter criterion. 
#'   Each probe must have more than \code{filter[1]} log2-counts per million in 
#'   at least \code{filter[2]} libraries to pass the expression threshold.
#' @param method Normalization method to be used. See Details.
#'
#' @details
#' \code{lcpm} applies the \code{\link[limma]{voom}} transformation to 
#' sequencing data, converting counts to approximately normal distributions on 
#' the log2-CPM scale. Data can now be modelled using traditional linear 
#' techniques (at least once spot weights have been applied; see Law, et al. 
#' (2014)), or used for unsupervised clustering analysis via PCA, MDS, or other 
#' methods.
#'
#' It is recommended that low count genes be filtered out prior to 
#' transformation. There is no general algorithm for determining the most 
#' appropriate expression filter for a given data set. As a rule of thumb, the 
#' \code{limma} authors advise setting \code{filter[1]} to either 1, or 10 / 
#' (\emph{L} / 1,000,000), where \emph{L} = the minimum library size for a given 
#' count matrix. The former corresponds to a log2-CPM of 0, while the latter may 
#' be preferable for studies with especially deep read coverage. For \code{
#' filter[2]}, the authors recommend using the number of replicates in the 
#' largest group, to guarantee that a gene is expresed in at least one sample 
#' for any groupwise comparison. These are broad guidelines, however, not 
#' strict rules. See Law, et al. (2016) for some worked through examples.
#'
#' \code{method = "TMM"} is the weighted trimmed mean of M-values (to the 
#' reference) proposed by Robinson & Oshlack (2010), where the weights are from 
#' the delta method on binomial data.
#'
#' \code{method = "RLE"} is the scaling factor method proposed by Anders & Huber
#' (2010). We call it "relative log expression", as median library is calculated 
#' from the geometric mean of all columns and the median ratio of each sample to 
#' the median library is taken as the scale factor.
#'
#' \code{method = "upperquartile"} is the upper-quartile normalization method of
#' Bullard, et al. (2010), in which the scale factors are calculated from the 
#' 75% quantile of the counts for each library, after removing genes which are 
#' zero in all libraries.
#'
#' If \code{method = "none"}, then the normalization factors are set to 1.
#'
#' For symmetry, normalization factors are adjusted to multiply to 1. The 
#' effective library size is then the original library size multiplied by the 
#' scaling factor.
#'
#' Note that rows that have zero counts for all columns are trimmed before
#' normalization factors are computed. Therefore rows with all zero counts do 
#' not affect the estimated factors.
#'
#' @return A numeric matrix of normalized counts on the log2-CPM scale.
#'
#' @references
#' Anders, S. & Huber, W. (2010).
#' \href{https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106}{
#' Differential expression analysis for sequence count data}. \emph{Genome 
#' Biology}, 11:R106.
#'
#' Bullard, J.H., Purdom, E., Hansen, K.D. & Dudoit, S. (2010).
#' \href{http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-94}{
#' Evaluation of statistical methods for normalization and differential 
#' expression in mRNA-Seq experiments}. \emph{BMC Bioinformatics}, 11:94.
#'
#' Law, C.W., Alhamdoosh, M., Su, S., Smyth, G.K., & Ritchie, M.E. (2016).
#' \href{https://f1000research.com/articles/5-1408/v2}{RNA-seq analysis is 
#' easy as 1-2-3 with limma, Glimma and edgeR}. \emph{F1000 Research, 5}(1408).
#'
#' Law, C.W., Chen, Y., Shi, W., & Smyth, G.K. (2014).
#' \href{https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-2-r29}{
#' voom: precision weights unlock linear model analysis tools for RNA-seq read 
#' counts}. \emph{Genome Biology}, \strong{15}:R29.
#'
#' Robinson, M.D. & Oshlack, A. (2010).
#' \href{https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25}{
#' A scaling normalization method for differential expression analysis of 
#' RNA-seq data}. \emph{Genome Biology}, 11:R25.
#'
#' @examples
#' # Simulate count data
#' mat <- matrix(rnbinom(5000, mu = 10, size = 5), nrow = 1000, ncol = 5)
#'
#' # Plot raw counts
#' plot_density(mat)
#'
#' # Plot transformed counts
#' trans_mat <- lcpm(mat)
#' plot_density(trans_mat)
#'
#' @seealso
#' \code{\link[limma]{voom}}, \code{\link[edgeR]{cpm}}
#'
#' @export
#' @importFrom edgeR cpm DGEList calcNormFactors
#'

lcpm <- function(dat,
                 filter = NULL,
                 method = 'TMM') {
  
  # Preliminaries
  if (!is.null(filter)) {
    if (length(filter) != 2L) {
      stop('filter must be a vector of length 2.')
    }
  }
  
  # DESeqDataSet?
  if (is(dat, 'DESeqDataSet')) {
    suppressPackageStartupMessages(require(DESeq2))
    if (!is.null(filter)) {
      keep <- rowSums(cpm(counts(dat))) > filter[1L] >= filter[2L]
      dat <- dat[keep, , drop = FALSE]
    }
    if (is.null(sizeFactors(dat)) & is.null(normalizationFactors(dat))) {
      dat <- estimateSizeFactors(dat)
    }
    y <- cpm(counts(fit, normalized = TRUE), log = TRUE, prior.count = 1L)
  } else {
  # Raw count matrix?
    if (is.null(filter)) {
      y <- DGEList(mat)
    } else {
      keep <- rowSums(cpm(mat) > filter[1]) >= filter[2]
      y <- DGEList(mat[keep, , drop = FALSE])
    }
    y <- calcNormFactors(y, method = method)
    lib.size <- with(y$samples, lib.size * norm.factors)
    y <- t(log2(t(y$counts + 0.5) / (lib.size + 1L) * 1e6L))
    colnames(y) <- colnames(mat)
  }
  
  # Export
  return(y)
  
}
dswatson/biomisc documentation built on May 15, 2019, 4:52 p.m.