R/check_resid.R

#' Check the residuals of a DESeq model
#'
#' This function evaluates the impact of different expression filters on the 
#' residuals of a fitted DESeqDataSet.
#'
#' @param dds A \code{DESeqDataSet} object that has been fit with a negative
#'   binomial GLM.
#' @param filter Numeric vector of length two specifying the filter criterion. Each 
#'   probe must have more than \code{filter[1]} log2-counts per million in at least 
#'   \code{filter[2]} libraries to pass the expression threshold.
#'
#' @details
#' The statistical tests upon which \code{qmod} is based presume that residuals
#' for a fitted model are approximately normally distributed. This is not generally
#' true of negative binomal GLMs, the family of models used by DESeq. The 
#' non-normality of residuals is especially pronounced for low count probes, which are
#' by default not filtered out until after modeling in the DESeq pipeline. (See 
#' \code{\link[DESeq2]{results}} for more details.) To run \code{qmod} on 
#' \code{DESeqDataSet} objects, it is necessary to filter out underexpressed probes 
#' and apply a variance stabilizing transformation. We recommend applying the lightest
#' possible expression filter, although there is no precise algorithm for determining
#' what this should be. 
#' 
#' As a rule of thumb, the \code{limma} authors advise setting \code{filter[1]} to 
#' either 1, or 10 / (\emph{L} / 1,000,000), where \emph{L} = the minimum library size 
#' for a given count matrix. The former corresponds to a log2-CPM of 0, while the 
#' latter may be preferable in cases where read depth is especially shallow. For 
#' \code{filter[2]}, the authors recommend using the number of replicates in the 
#' largest group, to guarantee that a gene is expresed in at least one sample for 
#' any groupwise comparison. These are broad guidelines, however, not strict rules. 
#'
#' @examples
#' library(DESeq2)
#' dds <- makeExampleDEESeqDataSet()
#' dds <- DESeq(dds)
#' check_resid(dds)
#'
#' @export
#' @importFrom edgeR cpm
#' @importFrom DESeq2 counts normalizationFactors  
#' @importFrom tidyr gather
#' @import dplyr 
#' @import ggplot2
#'

check_resid <- function(dds, 
                        filter = c(1, 1)) {
  
  # Preliminaries
  if (!is(dds, 'DESeqDataSet')) {
    stop('dds must be a DESeqDataSet.')
  }
  if (is.null(assays(dds)[['mu']])) {
    stop('dds must be fit with a negative binomial GLM.')
  }
  if (length(filter) != 2L) {
    stop('filter must be a vector of length 2.')
  }

  # Create resid_mat
  p <- nrow(dds)
  keep <- rowSums(cpm(counts(dds)) > filter[1]) >= filter[2]
  dds <- dds[keep, , drop = FALSE]
  if (is.null(sizeFactors(dds))) {
    nf <- normalizationFactors(dds) + 1L
    cnts <- log2((counts(dds) + 0.5) / nf * 1e6L)
    signal_mat <- log2((assays(dds)[['mu']] + 0.5) / nf * 1e6L)
  } else {
    cnts <- calcNormFactors(DGEList(counts(dds)), method = 'RLE')
    nf <- with(cnts$samples, lib.sizes * norm.factors) + 1L
    cnts <- t(log2(t(cnts$counts + 0.5) / nf * 1e6L))
    signal_mat <- t(log2(t(assays(dds)[['mu']] + 0.5) / nf * 1e6L))
  }
  resid_mat <- cnts - signal_mat

  # Output
  bad <- p - sum(keep)
  bad_p <- round((bad / p) * 100L, 2L)
  cat(paste0('Filter criterion removes ', bad, ' (', bad_p, '%) of probes.'))
  
  # Plot
  data_frame(Expected = qnorm(ppoints(1e4L)),
             Observed = quantile(gather(tbl_df(resid_mat), Sample, Resid)$Resid, 
                                 probs = ppoints(1e4L))) %>%
    ggplot(aes(Expected, Observed)) + 
    geom_point(size = 0.5) + 
    labs(title = 'Normal Q-Q Plot',
         x = 'Expected Quantiles',
         y = 'Observed Quantiles') + 
    theme_bw() + 
    theme(plot.title = element_text(hjust = 0.5))
  
}
dswatson/biowrapr documentation built on May 15, 2019, 4:52 p.m.