zinbwave: Zero-Inflated Negative Binomial Model for RNA-Seq Data

Documented in glmWeightedF independentFiltering pvalueAdjustment

#' Zero-inflation adjusted statistical tests for assessing differential
#' expression.
#'
#' This function recycles an old version of the \code{\link[edgeR]{glmLRT}}
#' method that allows an F-test with adjusted denominator degrees of freedom to
#' account for the downweighting in the zero-inflation model.
#'
#' @param glmfit a \code{\link[edgeR]{DGEGLM-class}} object, usually output from
#'   \code{\link[edgeR]{glmFit}}.
#' @param coef integer or character vector indicating which coefficients of the
#'   linear model are to be tested equal to zero. Values must be columns or
#'   column names of design. Defaults to the last coefficient. Ignored if
#'   \code{contrast} is specified.
#' @param contrast numeric vector or matrix specifying one or more contrasts of
#'   the linear model coefficients to be tested equal to zero. Number of rows
#'   must equal to the number of columns of \code{design}. If specified, then
#'   takes precedence over \code{coef}.
#' @param ZI logical, specifying whether the degrees of freedom in the
#'   statistical test should be adjusted according to the weights in the
#'   \code{fit} object to account for the downweighting. Defaults to TRUE and
#'   this option is highly recommended.
#' @param independentFiltering logical, specifying whether independent filtering
#'   should be performed.
#' @param filter vector of values to perform filtering on. Default is the mean
#'   of the fitted values from glmfit.
#' @references McCarthy, DJ, Chen, Y, Smyth, GK (2012). Differential expression
#' analysis of multifactor RNA-Seq experiments with respect to biological
#' variation. Nucleic Acids Research 40, 4288-4297.
#' Bourgon, Richard, Robert Gentleman, and Wolfgang Huber (2010) Independent
#' Filtering Increases Detection Power for High-Throughput Experiments. PNAS
#' 107 (21): 9546-51.
#' @note This function uses an adapted version of the \code{glmLRT} function
#'   that was originally written by Gordon Smyth, Davis McCarthy and Yunshun
#'   Chen as part of the edgeR package. Koen Van den Berge wrote code to adjust
#'   residual degree of freedoom and added the independent filtering step.
#' @details When `independentFiltering=TRUE` (default) an independent filtering
#' step is applied prior to the multiple testing procedure, as described in
#' great details in the `DESeq2`` vignette.
#' The values in the `padjFilter` column refer to this procedure. They are
#' identical to the `FDR` values if the filtering step does not remove any gene,
#' since the function uses the Benjamini-Hochberg correction by default. If the
#' procedure filters some genes, the adjusted p-values will typically result in
#' greater power to detect DE genes. The theory behind independent filtering is
#' described in  Bourgon et al. (2010).
#' @seealso \code{\link[edgeR]{glmLRT}}
#' @export
#' @importFrom edgeR aveLogCPM glmFit
#' @importFrom stats pf quantile
glmWeightedF <- function(glmfit, coef = ncol(glmfit$design),
                         contrast = NULL, ZI = TRUE,
                         independentFiltering = TRUE, filter = NULL){
    # Original function obtained from https://github.com/Bioconductor-mirror/edgeR/blob/release-3.0/R/glmfit.R
    # Tagwise likelihood ratio tests for DGEGLM
    # Gordon Smyth, Davis McCarthy and Yunshun Chen.
    # Created 1 July 2010.  Last modified 22 Nov 2013.
    if (!is(glmfit, "DGEGLM")) {
        if (is(glmfit, "DGEList") && is(coef, "DGEGLM")) {
            stop("First argument is no longer required. Rerun with just the glmfit and coef/contrast arguments.")
        }
        stop("glmfit must be an DGEGLM object (usually produced by glmFit).")
    }
    if (is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
    nlibs <- ncol(glmfit)

    #	Check design matrix
    design <- as.matrix(glmfit$design)
    nbeta <- ncol(design)
    if(nbeta < 2) stop("Need at least two columns for design, usually the first is the intercept column")
    coef.names <- colnames(design)

    #	Evaluate logFC for coef to be tested
    #	Note that contrast takes precedence over coef: if contrast is given
    #	then reform design matrix so that contrast of interest is last column.
    if(is.null(contrast)) {
        if(length(coef) > 1) coef <- unique(coef)
        if(is.character(coef)) {
            check.coef <- coef %in% colnames(design)
            if(any(!check.coef)) stop("One or more named coef arguments do not match a column of the design matrix.")
            coef.name <- coef
            coef <- match(coef, colnames(design))
        }
        else
            coef.name <- coef.names[coef]
        logFC <- glmfit$coefficients[, coef, drop = FALSE] / log(2)
    } else {
        contrast <- as.matrix(contrast)
        qrc <- qr(contrast)
        ncontrasts <- qrc$rank
        if(ncontrasts == 0) stop("contrasts are all zero")
        coef <- 1:ncontrasts
        if(ncontrasts < ncol(contrast)) contrast <- contrast[,qrc$pivot[coef]]
        logFC <- drop((glmfit$coefficients %*% contrast) / log(2))
        if(ncontrasts > 1) {
            coef.name <- paste("LR test of", ncontrasts, "contrasts")
        } else {
            contrast <- drop(contrast)
            i <- contrast != 0
            coef.name <- paste(paste(contrast[i], coef.names[i], sep = "*"), collapse = " ")
        }
        Dvec <- rep.int(1, nlibs)
        Dvec[coef] <- diag(qrc$qr)[coef]
        Q <- qr.Q(qrc, complete = TRUE, Dvec = Dvec)
        design <- design %*% Q
    }
    if(length(coef) == 1) logFC <- as.vector(logFC)

    #	Null design matrix
    design0 <- design[, -coef, drop = FALSE]

    #	Null fit
    fit.null <- glmFit(glmfit$counts, design = design0,
                       offset = glmfit$offset, weights =glmfit$weights,
                       dispersion = glmfit$dispersion, prior.count = 0)

    #	Likelihood ratio statistic
    LR <- fit.null$deviance - glmfit$deviance

    if(ZI) fit.null$df.residual <- rowSums(fit.null$weights) - ncol(design0)
    if(ZI) glmfit$df.residual <- rowSums(glmfit$weights) - ncol(design)

    df.test <- fit.null$df.residual - glmfit$df.residual ## okay

    LRT.pvalue <- {
        phi <- quantile(glmfit$dispersion, p = 0.5)
        mu <- quantile(glmfit$fitted.values, p = 0.5)
        gamma.prop <- (phi * mu / (1 + phi * mu) )^2
        prior.df <- glmfit$prior.df
        if(is.null(prior.df)) prior.df <- 20
        glmfit$df.total <- glmfit$df.residual + prior.df/gamma.prop
        pf(LR/df.test, df1 = df.test, df2 = glmfit$df.total,
           lower.tail = FALSE, log.p = FALSE)
    }
    ### set p-values to NA if residual df non-positive
    LRT.pvalue[glmfit$df.residual<=0] = NA

    rn <- rownames(glmfit)
    if(is.null(rn))
        rn <- 1:nrow(glmfit)
    else
        rn <- make.unique(rn)
    tab <- data.frame(
        logFC     = logFC,
        logCPM    = glmfit$AveLogCPM,
        LR        = LR,
        PValue    = LRT.pvalue,
        row.names = rn
    )
    glmfit$counts <- NULL
    glmfit$table <- tab
    glmfit$comparison <- coef.name
    glmfit$df.test <- df.test
    res <- new("DGELRT",unclass(glmfit))
    if(independentFiltering){
        if(is.null(filter)) filter = rowMeans(glmfit$fitted.values) #aprrox. linear w\ basemean
        res <- independentFiltering(res, filter = filter,
                                    objectType = "edgeR")
    } else return(res)
}


#' Perform independent filtering in differential expression analysis.
#'
#' This function uses the \code{DESeq2} independent filtering method
#' to increase detection power in high throughput gene expression
#' studies.
#'
#' @param object Either a \code{\link[edgeR]{DGELRT-class}} object or
#' a \code{\link{data.frame}} with differential expression results.
#' @param filter The characteristic to use for filtering, usually a
#' measure of normalized mean expression for the features.
#' @param objectType Either \code{"edgeR"} or \code{"limma"}. If
#' \code{"edgeR"}, it is assumed that \code{object} is of class
#' \code{\link[edgeR]{DGELRT-class}}, the output of
#' \code{\link[edgeR]{glmLRT}}. If \code{"limma"}, it is assumed that
#' \code{object} is a \code{\link{data.frame}} and the output of a
#' limma-voom analysis.
#' @seealso \code{\link[DESeq2]{results}}
#' @references
#' Michael I Love, Wolfgang Huber, and Simon Anders.
#' Moderated estimation of fold change and dispersion for RNA-seq data
#' with DESeq2. Genome Biology, 15(12):550, dec 2014.
#' @author Koen Van den Berge
independentFiltering <- function(object, filter,
                                 objectType = c("edgeR","limma")){
    if(objectType == "edgeR"){
        hlp <- pvalueAdjustment(filter = filter,
                                pValue = object$table$PValue)
        object$table$padjFilter <- hlp$padj
        return(object)

    } else if(objectType == "limma"){
        hlp <- pvalueAdjustment(filter = filter,
                                pValue = object$P.Value)
        object$padjFilter <- hlp$padj
        return(object)

    } else stop("objectType must be either one of 'edgeR' or 'limma'.")
}


#' Perform independent filtering in differential expression analysis.
#'
#' This function performs independent filtering to increase detection
#' power in high throughput gene expression studies.
#'
#' @param baseMean A vector of mean values.
#' @param filter A vector of stage-one filter statistics.
#' @param pValue A vector of unadjusted p-values, or a function which
#' is able to compute this vector from the filtered portion of data,
#' if data is supplied. The option to supply a function is useful when
#' the value of the test statistic depends on which hypotheses are
#' filtered out at stage one. (The limma t-statistic is an example.)
#' @param theta A vector with one or more filtering fractions to
#' consider. Actual cutoffs are then computed internally by applying
#' quantile to the filter statistics contained in (or produced by)
#' the filter argument.
#' @param alpha A cutoff to which p-values, possibly adjusted for
#' multiple testing, will be compared. Default is 0.05.
#' @param pAdjustMethod The unadjusted p-values contained in
#' (or produced by) test will be adjusted for multiple testing after
#' filtering. Default is "BH".
#' @return a list with pvalues, filtering threshold, theta,
#' number of rejections, and alpha.
#' @importFrom genefilter filtered_p
#' @importFrom stats lowess quantile
#' @note This function is an adapted version of the
#' \code{pvalueAdjustment} function that was originally written by
#' Michael I. Love as part of the DESeq2 package.
#' Koen Van den Berge adapted the function.
pvalueAdjustment <- function(baseMean, filter, pValue, theta,
                             alpha = 0.05, pAdjustMethod = "BH") {
    if (missing(filter)) {
        filter <- baseMean
    }
    if (missing(theta)) {
        lowerQuantile <- mean(filter == 0)
        if (lowerQuantile < .95) upperQuantile <- .95 else upperQuantile <- 1
        theta <- seq(lowerQuantile, upperQuantile, length = 50)
    }

    # do filtering using genefilter
    stopifnot(length(theta) > 1)
    filtPadj <- filtered_p(filter = filter, test = pValue,
                           theta = theta, method = pAdjustMethod)
    numRej  <- colSums(filtPadj < alpha, na.rm = TRUE)
    # prevent over-aggressive filtering when all genes are null,
    # by requiring the max number of rejections is above a fitted curve.
    # If the max number of rejection is not greater than 10, then don't
    # perform independent filtering at all.
    lo.fit <- lowess(numRej ~ theta, f = 1/5)
    if (max(numRej) <= 10) {
        j <- 1
    } else {
        residual <- if (all(numRej == 0)) {
            0
        } else {
            numRej[numRej > 0] - lo.fit$y[numRej > 0]
        }
        thresh <- max(lo.fit$y) - sqrt(mean(residual^2))
        j <- if (any(numRej > thresh)) {
            which(numRej > thresh)[1]
        } else {
            1
        }
    }
    padj <- filtPadj[, j, drop = TRUE]
    cutoffs <- quantile(filter, theta)
    filterThreshold <- cutoffs[j]
    filterNumRej <- data.frame(theta = theta, numRej = numRej)
    filterTheta <- theta[j]

    return(list(padj = padj, filterThreshold = filterThreshold,
                filterTheta = filterTheta, filterNumRej = filterNumRej,
                lo.fit = lo.fit, alpha = alpha))

}