R/aggregateAcrossCells.R
In scuttle: Single-Cell RNA-Seq Analysis Utilities

#' Aggregate data across groups of cells
#' 
#' Sum counts or average expression values for each feature across groups of cells,
#' while also aggregating values in the \code{\link{colData}} and other fields in a SummarizedExperiment.
#'
#' @param x A \linkS4class{SingleCellExperiment} or \linkS4class{SummarizedExperiment}
#' containing one or more matrices of expression values to be aggregated;
#' possibly along with \code{\link{colData}}, \code{\link{reducedDims}} and \code{\link{altExps}} elements.
#' @inheritParams summarizeAssayByGroup
#' @param statistics Character vector specifying the type of statistics to be computed, see \code{?\link{summarizeAssayByGroup}}.
#' If not specified, defaults to \code{"sum"}.
#' @param average Deprecated, specifies whether to compute the average - use \code{statistics="mean"} instead.
#' Only used if \code{statistics=NULL}.
#' @param subset.row An integer, logical or character vector specifying the features to use.
#' If \code{NULL}, defaults to all features.
#' For the \linkS4class{SingleCellExperiment} method, this argument will not affect alternative Experiments,
#' where aggregation is always performed for all features (or not at all, depending on \code{use.altexps}).
#' @param suffix Logical scalar indicating whether to always suffix the assay name with the statistic type.
#' @param ... For the generic, further arguments to be passed to specific methods.
#'
#' For the SummarizedExperiment method, further arguments to be passed to \code{\link{summarizeAssayByGroup}}.
#'
#' For the SingleCellExperiment method, arguments to be passed to the SummarizedExperiment method.
#' @param use.assay.type A character or integer vector specifying the assay(s) of \code{x} containing count matrices.
#' @param use.altexps Logical scalar indicating whether aggregation should be performed for alternative experiments. 
#' Alternatively, a character or integer vector specifying the alternative experiments to be aggregated.
#' @param use.dimred Logical scalar indicating whether aggregation should be performed for dimensionality reduction results.
#' Alternatively, a character or integer vector specifying the dimensionality reduction results to be aggregated.
#' @param coldata.merge A named list of functions specifying how each column metadata field should be aggregated.
#' Each function should be named according to the name of the column in \code{\link{colData}} to which it applies.
#' Alternatively, a single function can be supplied, see below for more details.
#' @param dimred.stats A character vector specifying how the reduced dimensions should be aggregated by group.
#' This can be one or more of \code{"mean"} and \code{"median"}.
#' @param subset_row,subset_col,store_number,use_exprs_values,use_altexps,use_dimred,coldata_merge
#' Soft deprecated equivalents to the arguments described above.
#'
#' @return 
#' A SummarizedExperiment of the same class of \code{x} is returned containing summed/averaged matrices 
#' generated by \code{\link{summarizeAssayByGroup}} on all assays in \code{use.assay.type}.
#' Column metadata are also aggregated according to the rules in \code{coldata.merge}, see below.
#'
#' For the SingleCellExperiment method, 
#' the output also contains aggregated values for the reduced dimensions and alternative Experiments. 
#'
#' @details
#' This function summarizes the assay values in \code{x} for each group in \code{ids} using \code{\link{summarizeAssayByGroup}}
#' while also aggregating metadata across cells in a \dQuote{sensible} manner.
#' This makes it useful for obtaining an aggregated \linkS4class{SummarizedExperiment} during an analysis session;
#' in contrast, \code{\link{summarizeAssayByGroup}} is more lightweight and is better for use inside other functions.
#' 
#' Aggregation of the \code{\link{colData}} is controlled using functions in \code{coldata.merge}.
#' This can either be:
#' \itemize{
#' \item A function that takes a subset of entries for any given column metadata field and returns a single value.
#' This can be set to, e.g., \code{\link{sum}} or \code{\link{median}} for numeric covariates,
#' or a function that takes the most abundant level for categorical factors.
#' \item A named list of such functions, where each function is applied to the column metadata field after which it is named.
#' Any field that does not have an entry in \code{coldata.merge} is \dQuote{unspecified} and handled as described below.
#' A list element can also be set to \code{FALSE}, in which case no aggregation is performed for the corresponding field.
#' \item \code{NULL}, in which case all fields are considered to be unspecified.
#' \item \code{FALSE}, in which case no aggregation of column metadata is performed.
#' }
#' For any unspecified field, we check if all cells of a group have the same value.
#' If so, that value is reported, otherwise a \code{NA} is reported for the offending group.
#'
#' By default, each matrix values is returned with the same name as the original per-cell matrix from which it was derived.
#' If \code{statistics} is of length greater than 1 or \code{suffix=TRUE},
#' the names of all aggregated matrices are suffixed with their type of aggregate statistic.
#'
#' If \code{ids} is a \linkS4class{DataFrame}, the combination of levels corresponding to each column is also reported in the column metadata.
#' Otherwise, the level corresponding to each column is reported in the \code{ids} column metadata field as well as in the column names.
#'
#' @section Dealing with SingleCellExperiments:
#' If \code{x} is a \linkS4class{SingleCellExperiment}, aggregation is repeated for each entry of \code{\link{altExps}}.
#' This is done by calling \code{aggregateAcrossCells} on that entry with the same arguments used for the main Experiment -
#' as such, any column metadata in those entries will also be aggregated following the rules in \code{coldata.merge}.
#' The exception is \code{subset.row}, which is not applied to the alternative Experiments as the feature sets are different.
#' 
#' If \code{x} is a \linkS4class{SingleCellExperiment}, each entry of \code{\link{reducedDims}} is averaged across cells.
#' This assumes that the average of low-dimensional coordinates has some meaning for a group of cells but the sum does not.
#' We can explicitly specify computation of the \code{"mean"} or \code{"median"} (or both) with \code{dimred.stats}.
#' If \code{dimred.stats} is of length greater than 1 or \code{suffix=TRUE},
#' the name of each matrix in the output \code{\link{reducedDims}} is suffixed with the type of average.
#'
#' Users can tune the behavior of the function for these additional fields with \code{use.altexps} and \code{use.dimred}.
#' Note that if the alternative experiments themselves are \linkS4class{SingleCellExperiment}s,
#' any further nested alternative experiment or reduced dimensions will always be aggregated
#' regardless of the value of \code{use.altexps} or \code{use.dimred}.
#' 
#' @author Aaron Lun
#' @name aggregateAcrossCells
#'
#' @seealso
#' \code{\link{summarizeAssayByGroup}}, which does the heavy lifting at the assay level.
#'
#' @examples
#' example_sce <- mockSCE()
#' ids <- sample(LETTERS[1:5], ncol(example_sce), replace=TRUE)
#' out <- aggregateAcrossCells(example_sce, ids)
#' out
#'
#' batches <- sample(1:3, ncol(example_sce), replace=TRUE)
#' out2 <- aggregateAcrossCells(example_sce, 
#'       DataFrame(label=ids, batch=batches))
#' out2
#'
#' # Using another column metadata merge strategy.
#' example_sce$stuff <- runif(ncol(example_sce))
#' out3 <- aggregateAcrossCells(example_sce, ids, 
#'      coldata_merge=list(stuff=sum))
#' out3
NULL

#' @export
#' @rdname aggregateAcrossCells
setGeneric("aggregateAcrossCells", function(x, ...) standardGeneric("aggregateAcrossCells"))

#' @export
#' @rdname aggregateAcrossCells 
#' @importFrom S4Vectors DataFrame
#' @importFrom SummarizedExperiment assay assays<- colData<- colData assayNames
setMethod("aggregateAcrossCells", "SummarizedExperiment", function(x, ids, ..., statistics=NULL, average=NULL, suffix=FALSE,
    subset.row=NULL, subset.col=NULL, store.number="ncells", coldata.merge=NULL, use.assay.type="counts",
    subset_row=NULL, subset_col=NULL, store_number="ncells", coldata_merge=NULL, use_exprs_values=NULL)
{
    subset.row <- .replace(subset.row, subset_row)
    subset.col <- .replace(subset.col, subset_col)
    store.number <- .replace(store.number, store_number)
    coldata.merge <- .replace(coldata.merge, coldata_merge)
    use.assay.type <- .replace(use.assay.type, use_exprs_values)

    new.ids <- .process_ids(x, ids, subset.col)
    new.ids.char <- as.character(new.ids) # Avoid re-coercion on every call to the output function.

    # Organizing the assays.
    use.assay.type <- .use_names_to_integer_indices(use.assay.type, x=x, nameFUN=assayNames, msg="use.assay.type")
    if (length(use.assay.type)==0L) {
        stop("'use.assay.type' must specify at least one assay")
    }

    if (is.null(statistics)) {
        if (is.null(average)) {
            statistics <- "sum"
        } else {
            .Deprecated(msg="'average=' is deprecated, use 'statistics=' instead")
            statistics <- .average2statistic(average)
        }
    }

    collected <- vector("list", length(use.assay.type))
    ncells <- NULL
    for (i in seq_along(use.assay.type)) {
        sum.out <- .summarize_assay(assay(x, use.assay.type[i]), ids=new.ids, 
            statistics=statistics, ..., subset.row=subset.row)
        ncells <- sum.out$freq
        collected[[i]] <- sum.out$summary
    }
    collected <- .apply_suffixes(collected, assay.names=assayNames(x)[use.assay.type], suffix=suffix)

    # Organizing the column metadata.
    cn <- colnames(collected[[1]])
    m <- match(cn, new.ids.char)
    coldata <- .create_coldata(ids, mapping=m, freq=ncells, store.number=store.number)

    new.cd <- .merge_DF_rows(colData(x), ids=new.ids.char, final=cn, mergeFUN=coldata.merge)
    if (length(new.cd)) {
        new.cd <- do.call(DataFrame, c(new.cd, list(check.names=FALSE, row.names=rownames(coldata))))
        coldata <- cbind(new.cd, coldata)
    }

    # Endomorphic creation of a new SummarizedExperiment.
    shell <- x[,m]
    if (!is.null(subset.row)) {
        shell <- shell[subset.row,]
    }

    assays(shell, withDimnames=FALSE) <- collected
    colData(shell) <- coldata
    shell
})

.apply_suffixes <- function(mat.list, assay.names, suffix) {
    if (!suffix && all(lengths(mat.list)==1)) {
        mat.list <- lapply(mat.list, unname)
    }
    names(mat.list) <- assay.names
    unlist(mat.list, use.names=TRUE, recursive=FALSE)
}

#' @importFrom BiocGenerics match
#' @importFrom S4Vectors split extractROWS bindROWS I
.merge_DF_rows <- function(x, ids, final, mapping=match(final, ids), mergeFUN=NULL) {
    collected <- list()
    if (isFALSE(mergeFUN)) {
        return(collected)
    }

    for (cn in colnames(x)) {
        if (!is.function(mergeFUN)) {
            FUN <- mergeFUN[[cn]]
            if (isFALSE(FUN)) {
                collected[[cn]] <- NULL
                next
            }
        } else {
            FUN <- mergeFUN
        }

        grouped <- split(x[[cn]], ids)[final]

        if (is.null(FUN)) {
            # Obtaining a NA of matched type.
            FUN <- function(x) {
                if (NROW(val <- unique(x))==1L) {
                    val 
                } else {
                    extractROWS(val, NA_integer_)
                }
            }
        }

        per.group <- lapply(grouped, FUN)
        per.group <- unname(per.group)
        if (length(per.group)>=1L) {
            col <- bindROWS(per.group[[1]], per.group[-1])
        } else {
            # Obtaining a column of the correct type.
            col <- extractROWS(FUN(x[[cn]]), 0L)
        }

        collected[[cn]] <- I(col)
    }

    collected
}

#' @export
#' @rdname aggregateAcrossCells 
#' @importFrom SingleCellExperiment altExp altExps altExp<- altExps<-
#' reducedDimNames reducedDim<- reducedDim reducedDims<- reducedDims
setMethod("aggregateAcrossCells", "SingleCellExperiment", function(x, ids, ..., 
    subset.row=NULL, subset.col=NULL, use.altexps=TRUE, use.dimred=TRUE, dimred.stats=NULL, suffix=FALSE,
    subset_row=NULL, subset_col=NULL, use_altexps=NULL, use_dimred=NULL)
{
    subset.row <- .replace(subset.row, subset_row)
    subset.col <- .replace(subset.col, subset_col)
    use.altexps <- .replace(use.altexps, use_altexps)
    use.dimred <- .replace(use.dimred, use_dimred)

    base.args <- list(x=x, ids=ids, subset.col=subset.col, suffix=suffix, ...)
    y <- do.call(callNextMethod, c(base.args, list(subset.row=subset.row)))

    # Aggregating alternative experiments. 
    use.altexps <- .use_names_to_integer_indices(use.altexps, x=x, nameFUN=altExpNames, msg="use.altexps")
    for (i in use.altexps) {
        # Do NOT pass use.altexps and use.dimred into the aggregateAcrossCells
        # call, as this part must work with any SE object.
        args <- base.args
        args$x <- altExp(x, i)
        altExp(y, i) <- do.call(aggregateAcrossCells, args)
    }
    altExps(y) <- altExps(y, withColData=FALSE)[use.altexps]

    new.ids <- .process_ids(x, ids, subset.col)
    use.dimred <- .use_names_to_integer_indices(use.dimred, x=x, nameFUN=reducedDimNames, msg="use.dimred")

    if (is.null(dimred.stats)) {
        dimred.stats <- "mean"
    } else {
        dimred.stats <- match.arg(dimred.stats, c("mean", "median"), several.ok=TRUE)
    }

    collected <- vector("list", length(use.dimred))
    for (i in seq_along(use.dimred)) {
        current <- t(reducedDim(x, use.dimred[i]))
        out <- .summarize_assay(current, ids=new.ids, statistics=dimred.stats)
        collected[[i]] <- lapply(out$summary, t)
    }
    collected <- .apply_suffixes(collected, assay.names=reducedDimNames(x)[use.dimred], suffix=suffix)

    reducedDims(y) <- collected
    y
})