R/pseudoBulkDGE.R
In scran: Methods for Single-Cell RNA-Seq Data Analysis

#' Quickly perform pseudo-bulk DE analyses
#'
#' A wrapper function around \pkg{edgeR}'s quasi-likelihood methods
#' to conveniently perform differential expression analyses on pseudo-bulk profiles,
#' allowing detection of cell type-specific changes between conditions in replicated studies.
#'
#' @param x A numeric matrix of counts where rows are genes and columns are pseudo-bulk profiles.
#' Alternatively, a SummarizedExperiment object containing such a matrix in its assays.
#' @param col.data A data.frame or \linkS4class{DataFrame} containing metadata for each column of \code{x}.
#' @param label A vector of factor of length equal to \code{ncol(x)},
#' specifying the cluster or cell type assignment for each column of \code{x}.
#' @param design A formula to be used to construct a design matrix from variables in \code{col.data}.
#' Alternatively, a function that accepts a data.frame with the same fields as \code{col.data} and returns a design matrix.
#' @param condition A vector or factor of length equal to \code{ncol(x)},
#' specifying the experimental condition for each column of \code{x}.
#' Only used for abundance-based filtering of genes.
#' @param coef String or character vector containing the coefficients to drop from the design matrix to form the null hypothesis.
#' Can also be an integer scalar or vector specifying the indices of the relevant columns.
#' @param contrast Numeric vector or matrix containing the contrast of interest.
#' Alternatively, a character vector to be passed to \code{\link{makeContrasts}} to create this numeric vector/matrix.
#' If specified, this takes precedence over \code{coef}.
#' @param lfc Numeric scalar specifying the log-fold change threshold to use in \code{\link{glmTreat}} or \code{\link{treat}}.
#' @param assay.type String or integer scalar specifying the assay to use from \code{x}.
#' @param include.intermediates Logical scalar indicating whether the intermediate \pkg{edgeR} objects should be returned.
#' @param row.data A \linkS4class{DataFrame} containing additional row metadata for each gene in \code{x},
#' to be included in each of the output DataFrames.
#' This should have the same number and order of rows as \code{x}.
#' @param ... For the generic, additional arguments to pass to individual methods.
#'
#' For the SummarizedExperiment method, additional arguments to pass to the ANY method.
#' @param method String specifying the DE analysis framework to use.
#' @param robust Logical scalar indicating whether robust empirical Bayes shrinkage should be performed.
#' @param qualities Logical scalar indicating whether quality weighting should be used when \code{method="voom"},
#' see \code{\link{voomWithQualityWeights}} for more details.
#' @param sorted Logical scalar indicating whether the output tables should be sorted by p-value.
#' @param sample Deprecated.
#' 
#' @return 
#' A \linkS4class{List} with one \linkS4class{DataFrame} of DE results per unique (non-failed) level of \code{cluster}.
#' This contains columns from \code{\link{topTags}} if \code{method="edgeR"} or \code{\link{topTable}} if \code{method="voom"}.
#' All DataFrames have row names equal to \code{rownames(x)}.
#' 
#' The \code{\link{metadata}} of the List contains \code{failed},
#' a character vector with the names of the labels for which the comparison could not be performed - see Details.
#'
#' The \code{\link{metadata}} of the individual DataFrames contains \code{design}, the final design matrix for that label.
#' If \code{include.intermediates}, the \code{\link{metadata}} will also contain
#' \code{y}, the DGEList used for the analysis; and \code{fit}, the DGEGLM object after GLM fitting.
#' 
#' @details
#' In replicated multi-condition scRNA-seq experiments,
#' we often have clusters comprised of cells from different samples of different experimental conditions.
#' It is often desirable to check for differential expression between conditions within each cluster,
#' allowing us to identify cell-type-specific responses to the experimental perturbation.
#'
#' Given a set of pseudo-bulk profiles (usually generated by \code{\link{sumCountsAcrossCells}}),
#' this function loops over the labels and uses \pkg{edgeR} or \code{\link{voom}} to detect DE genes between conditions.
#' The DE analysis for each label is largely the same as a standard analysis for bulk RNA-seq data,
#' using \code{design} and \code{coef} or \code{contrast} as described in the \pkg{edgeR} or \pkg{limma} user guides.
#' Generally speaking, \pkg{edgeR} handles low counts better via its count-based model
#' but \code{method="voom"} supports variable sample precision when \code{quality=TRUE}.
#'
#' Performing pseudo-bulk DGE enables us to reuse well-tested methods developed for bulk RNA-seq data analysis.
#' Each pseudo-bulk profile can be treated as an \emph{in silico} mimicry of a real bulk RNA-seq sample
#' (though in practice, it tends to be much more variable due to the lower numbers of cells).
#' This also models the relevant variability between experimental replicates (i.e., across samples) 
#' rather than that between cells in the same sample, without resorting to expensive mixed-effects models.
#'
#' The DE analysis for each label is independent of that for any other label.
#' This aims to minimize problems due to differences in abundance and variance between labels,
#' at the cost of losing the ability to share information across labels.
#'
#' In some cases, it will be impossible to perform a DE analysis for a label.
#' The most obvious reason is if there are no residual degrees of freedom;
#' other explanations include impossible contrasts or a failure to construct an appropriate design matrix
#' (e.g., if a cell type only exists in one condition).
#'
#' Note that we assume that \code{x} has already been filtered to remove unstable pseudo-bulk profiles generated from few cells.
#'
#' @section Comments on abundance filtering:
#' For each label, abundance filtering is performed using \code{\link{filterByExpr}} prior to further analysis.
#' Genes that are filtered out will still show up in the DataFrame for that label, but with all statistics set to \code{NA}.
#' As this is done separately for each label, a different set of genes may be filtered out for each label,
#' which is largely to be expected if there is any label-specific expression.
#' 
#' By default, the minimum group size for \code{filterByExpr} is determined using the design matrix.
#' However, this may not be optimal if the design matrix contains additional terms (e.g., blocking factors)
#' in which case it is not easy to determine the minimum size of the groups relevant to the comparison of interest.
#' To overcome this, users can specify \code{condition.field} to specify the group to which each sample belongs,
#' which is used by \code{filterByExpr} to obtain a more appropriate minimum group size.
#'
#' @author Aaron Lun
#'
#' @references
#' Tung P-Y et al. (2017).
#' Batch effects and the effective design of single-cell gene expression studies.
#' \emph{Sci. Rep.} 7, 39921
#' 
#' Lun ATL and Marioni JC (2017).
#' Overcoming confounding plate effects in differential expression analyses of single-cell RNA-seq data.
#' \emph{Biostatistics} 18, 451-464
#'
#' Crowell HL et al. (2019).
#' On the discovery of population-specific state transitions from multi-sample multi-condition single-cell RNA sequencing data.
#' \emph{biorXiv}
#'
#' @examples
#' set.seed(10000)
#' library(scuttle)
#' sce <- mockSCE(ncells=1000)
#' sce$samples <- gl(8, 125) # Pretending we have 8 samples.
#'
#' # Making up some clusters.
#' sce <- logNormCounts(sce)
#' clusters <- kmeans(t(logcounts(sce)), centers=3)$cluster
#'
#' # Creating a set of pseudo-bulk profiles:
#' info <- DataFrame(sample=sce$samples, cluster=clusters)
#' pseudo <- sumCountsAcrossCells(sce, info)
#'
#' # Making up an experimental design for our 8 samples.
#' pseudo$DRUG <- gl(2,4)[pseudo$sample]
#' 
#' # DGE analysis:
#' out <- pseudoBulkDGE(pseudo, 
#'    label=pseudo$cluster,
#'    condition=pseudo$DRUG,
#'    design=~DRUG,
#'    coef="DRUG2"
#' )
#' out[[1]]
#' metadata(out[[1]])$design
#' @seealso
#' \code{\link{sumCountsAcrossCells}}, to easily generate the pseudo-bulk count matrix.
#'
#' \code{\link{decideTestsPerLabel}}, to generate a summary of the DE results across all labels.
#'
#' \code{\link{pseudoBulkSpecific}}, to look for label-specific DE genes.
#'
#' \code{pbDS} from the \pkg{muscat} package, which uses a similar approach.
#' @name pseudoBulkDGE
NULL

.pseudo_bulk_master <- function(x, col.data, label, design, coef, contrast=NULL, 
    condition=NULL, lfc=0, include.intermediates=TRUE, row.data=NULL, sorted=FALSE, 
    method=c("edgeR", "voom"), qualities=TRUE, robust=TRUE, sample=NULL)
{
    if (is.function(design) || is(design, "formula")) {
        .pseudo_bulk_dge(x=x, col.data=col.data, label=label, condition=condition, 
            design=design, coef=coef, contrast=contrast, lfc=lfc, row.data=row.data, 
            sorted=sorted, include.intermediates=include.intermediates,
            method=match.arg(method), qualities=qualities, robust=robust)
    } else {
        .Deprecated(msg="matrix arguments for 'design=' are deprecated. \nUse functions or formulas instead.")
        .pseudo_bulk_old(x=x, label=label, sample=sample, design=design, condition=condition,
            coef=coef, contrast=contrast, lfc=lfc) 
    }
}

#' @importFrom edgeR DGEList 
#' @importFrom S4Vectors DataFrame SimpleList metadata metadata<-
.pseudo_bulk_dge <- function(x, col.data, label, design, coef, contrast=NULL, 
    condition=NULL, lfc=0, null.lfc.list=NULL, row.data=NULL, sorted=FALSE, include.intermediates=FALSE,
    method=c("edgeR", "voom"), qualities=TRUE, robust=TRUE)
{
    de.results <- list()
    failed <- character(0)
    label <- as.character(label)
    method <- match.arg(method)

    # Avoid requiring 'coef' if 'contrast' is specified.
    if (!is.null(contrast)) {
        coef <- NULL
    }

    for (i in sort(unique(label))) {
        chosen <- i==label

        curx <- x[,chosen,drop=FALSE]
        curdata <- col.data[chosen,,drop=FALSE]
        y <- DGEList(curx, samples=as.data.frame(curdata))
        curcond <- condition[chosen]

        curdesign <- try({
            if (is.function(design)) {
                design(curdata)
            } else {
                model.matrix(design, data=curdata)
            }
        }, silent=TRUE)

        if (is(curdesign, "try-error")) {
            failed <- c(failed, i)
            next

        } else {
            args <- list(y, row.names=rownames(x), curdesign=curdesign, curcond=curcond,
                coef=coef, contrast=contrast, lfc=lfc, null.lfc=null.lfc.list[[i]],
                robust=robust, include.intermediates=include.intermediates)

            if (method=="edgeR") {
                stuff <- do.call(.pseudo_bulk_edgeR, args)
                pval.field <- "PValue"
            } else {
                stuff <- do.call(.pseudo_bulk_voom, c(args, list(qualities=qualities)))
                pval.field <- "p.value"
            }

            if (is.null(stuff)) {
                failed <- c(failed, i)
                next
            }
        }

        if (!is.null(row.data)) {
            # Adding stuff[,0] to preserve the DF's metadata and row names.
            stuff <- cbind(stuff[,0], row.data, stuff)
        }
        if (sorted) {
            o <- order(stuff[,pval.field])
            stuff <- stuff[o,,drop=FALSE]
        }
        de.results[[i]] <- stuff
    }

    output <- SimpleList(de.results)
    metadata(output)$failed <- failed
    output
}

#' @importFrom edgeR DGEList 
#' @importFrom S4Vectors DataFrame List metadata metadata<-
.pseudo_bulk_old <- function(x, sample, label, design, coef, contrast=NULL, condition=NULL, lfc=0) {
    sample <- as.character(sample)
    label <- as.character(label)

    if (!identical(sort(rownames(design)), sort(unique(sample)))) {
        stop("'rownames(design)' and 'sample' should have the same levels")
    }

    de.results <- list()
    failed <- character(0)

    for (i in sort(unique(label))) {
        chosen <- i==label

        curx <- x[,chosen,drop=FALSE]
        y <- DGEList(curx, samples=data.frame(sample=sample[chosen], stringsAsFactors=FALSE))

        m <- match(as.character(y$samples$sample), rownames(design))
        curdesign <- design[m,,drop=FALSE]
        curcond <- condition[m]

        de.out <- .pseudo_bulk_edgeR(y, curdesign=curdesign, curcond=curcond,
            coef=coef, contrast=contrast, lfc=lfc, row.names=rownames(x),
            null.lfc=NULL, include.intermediates=FALSE)

        if (is.null(de.out)) {
            failed <- c(failed, i)
            next
        }
        de.results[[i]] <- de.out
    }

    output <- List(de.results)
    metadata(output)$failed <- failed
    output
}

#' @importFrom S4Vectors DataFrame metadata metadata<-
#' @importFrom edgeR estimateDisp glmQLFit glmQLFTest getOffset scaleOffset
#' calcNormFactors filterByExpr topTags glmLRT glmFit glmTreat
#' @importFrom limma makeContrasts
.pseudo_bulk_edgeR <- function(y, row.names, curdesign, curcond, coef, contrast, 
    lfc, null.lfc, include.intermediates, robust=TRUE) 
{
    ngenes <- nrow(y)
    gkeep <- filterByExpr(y, design=curdesign, group=curcond)
    y <- y[gkeep,]
    y <- calcNormFactors(y)

    rank <- qr(curdesign)$rank
    if (rank == nrow(curdesign) || rank < ncol(curdesign)) { 
        return(NULL)
    }
    if (is.character(contrast)) {
        contrast <- makeContrasts(contrasts=contrast, levels=curdesign)
    } 

    lfc.out <- .compute_offsets_by_lfc(design=curdesign, coef=coef, 
        contrast=contrast, filtered=gkeep, null.lfc=null.lfc)
    if (!is.null(lfc.out)) {
        offsets <- t(t(lfc.out)/log2(exp(1)) + getOffset(y))
        y <- scaleOffset(y, offsets)
    }

    y <- estimateDisp(y, curdesign)
    fit <- glmQLFit(y, curdesign, robust=TRUE)

    if (lfc==0) {
        res <- glmQLFTest(fit, coef=coef, contrast=contrast)
    } else {
        res <- glmTreat(fit, lfc=lfc, coef=coef, contrast=contrast)
    }

    tab <- topTags(res, n=Inf, sort.by="none")
    expander <- match(seq_len(ngenes), which(gkeep))
    tab <- DataFrame(tab$table[expander,,drop=FALSE])
    rownames(tab) <- row.names

    metadata(tab)$design <- curdesign
    if (include.intermediates) {
        metadata(tab)$y <- y
        metadata(tab)$fit <- fit
    }

    tab 
}

#' @importFrom limma contrastAsCoef
.compute_offsets_by_lfc <- function(design, coef, contrast, filtered, null.lfc) {
    if (is.null(null.lfc) || all(null.lfc==0)) {
        return(NULL) 
    }

    if (!is.null(contrast)) {
        out <- contrastAsCoef(design, contrast)
        design <- out$design
        coef <- out$coef
    }
    stopifnot(length(coef)==1)

    null.lfc <- rep(null.lfc, length.out=length(filtered))
    null.lfc <- null.lfc[filtered]
    outer(null.lfc, design[,coef])
}

#' @importFrom S4Vectors DataFrame metadata metadata<-
#' @importFrom edgeR calcNormFactors filterByExpr 
#' @importFrom limma voom voomWithQualityWeights lmFit 
#' contrasts.fit eBayes treat topTable makeContrasts
.pseudo_bulk_voom <- function(y, row.names, curdesign, curcond, coef, contrast, 
    lfc, null.lfc, include.intermediates, qualities=TRUE, robust=TRUE) 
{
    ngenes <- nrow(y)
    gkeep <- filterByExpr(y, design=curdesign, group=curcond)
    y <- y[gkeep,]
    y <- calcNormFactors(y)

    rank <- qr(curdesign)$rank
    if (rank == nrow(curdesign) || rank < ncol(curdesign)) { 
        return(NULL)
    }

    if (qualities) {
        v <- voomWithQualityWeights(y, curdesign)
    } else {
        v <- voom(y, curdesign)
    }
    if (is.character(contrast)) {
        contrast <- makeContrasts(contrasts=contrast, levels=curdesign)
    } 

    lfc.out <- .compute_offsets_by_lfc(design=curdesign, coef=coef, 
        contrast=contrast, filtered=gkeep, null.lfc=null.lfc)
    if (!is.null(lfc.out)) {
        v$E <- v$E - lfc.out
    }

    fit <- lmFit(v)
    if (!is.null(contrast)) {
        fit <- contrasts.fit(fit, contrast)
        coef <- 1
    }

    if (lfc==0) {
        res <- eBayes(fit, robust=robust)
    } else {
        res <- treat(fit, lfc=lfc, robust=robust)
    }

    tab <- topTable(res, coef=coef, number=Inf, sort.by="none")
    expander <- match(seq_len(ngenes), which(gkeep))
    tab <- DataFrame(tab[expander,,drop=FALSE])
    rownames(tab) <- row.names

    metadata(tab)$design <- curdesign
    if (include.intermediates) {
        metadata(tab)$y <- y
        metadata(tab)$v <- v
        metadata(tab)$fit <- fit
    }

    tab
}

#' @export
#' @rdname pseudoBulkDGE
setGeneric("pseudoBulkDGE", function(x, ...) standardGeneric("pseudoBulkDGE"))

#' @export
#' @rdname pseudoBulkDGE
setMethod("pseudoBulkDGE", "ANY", .pseudo_bulk_master)

#' @export
#' @rdname pseudoBulkDGE
#' @importFrom SummarizedExperiment assay colData
setMethod("pseudoBulkDGE", "SummarizedExperiment", function(x, col.data=colData(x), ..., assay.type=1) {
    .pseudo_bulk_master(assay(x, assay.type), col.data=col.data, ...)
})