Nothing
#' Aggregate data across groups of cells
#'
#' Sum counts or average expression values for each feature across groups of cells,
#' while also aggregating values in the \code{\link{colData}} and other fields in a SummarizedExperiment.
#'
#' @param x A \linkS4class{SingleCellExperiment} or \linkS4class{SummarizedExperiment}
#' containing one or more matrices of expression values to be aggregated;
#' possibly along with \code{\link{colData}}, \code{\link{reducedDims}} and \code{\link{altExps}} elements.
#' @inheritParams summarizeAssayByGroup
#' @param statistics Character vector specifying the type of statistics to be computed, see \code{?\link{summarizeAssayByGroup}}.
#' If not specified, defaults to \code{"sum"}.
#' @param average Deprecated, specifies whether to compute the average - use \code{statistics="mean"} instead.
#' Only used if \code{statistics=NULL}.
#' @param subset.row An integer, logical or character vector specifying the features to use.
#' If \code{NULL}, defaults to all features.
#' For the \linkS4class{SingleCellExperiment} method, this argument will not affect alternative Experiments,
#' where aggregation is always performed for all features (or not at all, depending on \code{use.altexps}).
#' @param suffix Logical scalar indicating whether to always suffix the assay name with the statistic type.
#' @param ... For the generic, further arguments to be passed to specific methods.
#'
#' For the SummarizedExperiment method, further arguments to be passed to \code{\link{summarizeAssayByGroup}}.
#'
#' For the SingleCellExperiment method, arguments to be passed to the SummarizedExperiment method.
#' @param use.assay.type A character or integer vector specifying the assay(s) of \code{x} containing count matrices.
#' @param use.altexps Logical scalar indicating whether aggregation should be performed for alternative experiments.
#' Alternatively, a character or integer vector specifying the alternative experiments to be aggregated.
#' @param use.dimred Logical scalar indicating whether aggregation should be performed for dimensionality reduction results.
#' Alternatively, a character or integer vector specifying the dimensionality reduction results to be aggregated.
#' @param coldata.merge A named list of functions specifying how each column metadata field should be aggregated.
#' Each function should be named according to the name of the column in \code{\link{colData}} to which it applies.
#' Alternatively, a single function can be supplied, see below for more details.
#' @param dimred.stats A character vector specifying how the reduced dimensions should be aggregated by group.
#' This can be one or more of \code{"mean"} and \code{"median"}.
#' @param subset_row,subset_col,store_number,use_exprs_values,use_altexps,use_dimred,coldata_merge
#' Soft deprecated equivalents to the arguments described above.
#'
#' @return
#' A SummarizedExperiment of the same class of \code{x} is returned containing summed/averaged matrices
#' generated by \code{\link{summarizeAssayByGroup}} on all assays in \code{use.assay.type}.
#' Column metadata are also aggregated according to the rules in \code{coldata.merge}, see below.
#'
#' For the SingleCellExperiment method,
#' the output also contains aggregated values for the reduced dimensions and alternative Experiments.
#'
#' @details
#' This function summarizes the assay values in \code{x} for each group in \code{ids} using \code{\link{summarizeAssayByGroup}}
#' while also aggregating metadata across cells in a \dQuote{sensible} manner.
#' This makes it useful for obtaining an aggregated \linkS4class{SummarizedExperiment} during an analysis session;
#' in contrast, \code{\link{summarizeAssayByGroup}} is more lightweight and is better for use inside other functions.
#'
#' Aggregation of the \code{\link{colData}} is controlled using functions in \code{coldata.merge}.
#' This can either be:
#' \itemize{
#' \item A function that takes a subset of entries for any given column metadata field and returns a single value.
#' This can be set to, e.g., \code{\link{sum}} or \code{\link{median}} for numeric covariates,
#' or a function that takes the most abundant level for categorical factors.
#' \item A named list of such functions, where each function is applied to the column metadata field after which it is named.
#' Any field that does not have an entry in \code{coldata.merge} is \dQuote{unspecified} and handled as described below.
#' A list element can also be set to \code{FALSE}, in which case no aggregation is performed for the corresponding field.
#' \item \code{NULL}, in which case all fields are considered to be unspecified.
#' \item \code{FALSE}, in which case no aggregation of column metadata is performed.
#' }
#' For any unspecified field, we check if all cells of a group have the same value.
#' If so, that value is reported, otherwise a \code{NA} is reported for the offending group.
#'
#' By default, each matrix values is returned with the same name as the original per-cell matrix from which it was derived.
#' If \code{statistics} is of length greater than 1 or \code{suffix=TRUE},
#' the names of all aggregated matrices are suffixed with their type of aggregate statistic.
#'
#' If \code{ids} is a \linkS4class{DataFrame}, the combination of levels corresponding to each column is also reported in the column metadata.
#' Otherwise, the level corresponding to each column is reported in the \code{ids} column metadata field as well as in the column names.
#'
#' @section Dealing with SingleCellExperiments:
#' If \code{x} is a \linkS4class{SingleCellExperiment}, aggregation is repeated for each entry of \code{\link{altExps}}.
#' This is done by calling \code{aggregateAcrossCells} on that entry with the same arguments used for the main Experiment -
#' as such, any column metadata in those entries will also be aggregated following the rules in \code{coldata.merge}.
#' The exception is \code{subset.row}, which is not applied to the alternative Experiments as the feature sets are different.
#'
#' If \code{x} is a \linkS4class{SingleCellExperiment}, each entry of \code{\link{reducedDims}} is averaged across cells.
#' This assumes that the average of low-dimensional coordinates has some meaning for a group of cells but the sum does not.
#' We can explicitly specify computation of the \code{"mean"} or \code{"median"} (or both) with \code{dimred.stats}.
#' If \code{dimred.stats} is of length greater than 1 or \code{suffix=TRUE},
#' the name of each matrix in the output \code{\link{reducedDims}} is suffixed with the type of average.
#'
#' Users can tune the behavior of the function for these additional fields with \code{use.altexps} and \code{use.dimred}.
#' Note that if the alternative experiments themselves are \linkS4class{SingleCellExperiment}s,
#' any further nested alternative experiment or reduced dimensions will always be aggregated
#' regardless of the value of \code{use.altexps} or \code{use.dimred}.
#'
#' @author Aaron Lun
#' @name aggregateAcrossCells
#'
#' @seealso
#' \code{\link{summarizeAssayByGroup}}, which does the heavy lifting at the assay level.
#'
#' @examples
#' example_sce <- mockSCE()
#' ids <- sample(LETTERS[1:5], ncol(example_sce), replace=TRUE)
#' out <- aggregateAcrossCells(example_sce, ids)
#' out
#'
#' batches <- sample(1:3, ncol(example_sce), replace=TRUE)
#' out2 <- aggregateAcrossCells(example_sce,
#' DataFrame(label=ids, batch=batches))
#' out2
#'
#' # Using another column metadata merge strategy.
#' example_sce$stuff <- runif(ncol(example_sce))
#' out3 <- aggregateAcrossCells(example_sce, ids,
#' coldata_merge=list(stuff=sum))
#' out3
NULL
#' @export
#' @rdname aggregateAcrossCells
setGeneric("aggregateAcrossCells", function(x, ...) standardGeneric("aggregateAcrossCells"))
#' @export
#' @rdname aggregateAcrossCells
#' @importFrom S4Vectors DataFrame
#' @importFrom SummarizedExperiment assay assays<- colData<- colData assayNames
setMethod("aggregateAcrossCells", "SummarizedExperiment", function(x, ids, ..., statistics=NULL, average=NULL, suffix=FALSE,
subset.row=NULL, subset.col=NULL, store.number="ncells", coldata.merge=NULL, use.assay.type="counts",
subset_row=NULL, subset_col=NULL, store_number="ncells", coldata_merge=NULL, use_exprs_values=NULL)
{
subset.row <- .replace(subset.row, subset_row)
subset.col <- .replace(subset.col, subset_col)
store.number <- .replace(store.number, store_number)
coldata.merge <- .replace(coldata.merge, coldata_merge)
use.assay.type <- .replace(use.assay.type, use_exprs_values)
new.ids <- .process_ids(x, ids, subset.col)
new.ids.char <- as.character(new.ids) # Avoid re-coercion on every call to the output function.
# Organizing the assays.
use.assay.type <- .use_names_to_integer_indices(use.assay.type, x=x, nameFUN=assayNames, msg="use.assay.type")
if (length(use.assay.type)==0L) {
stop("'use.assay.type' must specify at least one assay")
}
if (is.null(statistics)) {
if (is.null(average)) {
statistics <- "sum"
} else {
.Deprecated(msg="'average=' is deprecated, use 'statistics=' instead")
statistics <- .average2statistic(average)
}
}
collected <- vector("list", length(use.assay.type))
ncells <- NULL
for (i in seq_along(use.assay.type)) {
sum.out <- .summarize_assay(assay(x, use.assay.type[i]), ids=new.ids,
statistics=statistics, ..., subset.row=subset.row)
ncells <- sum.out$freq
collected[[i]] <- sum.out$summary
}
collected <- .apply_suffixes(collected, assay.names=assayNames(x)[use.assay.type], suffix=suffix)
# Organizing the column metadata.
cn <- colnames(collected[[1]])
m <- match(cn, new.ids.char)
coldata <- .create_coldata(ids, mapping=m, freq=ncells, store.number=store.number)
new.cd <- .merge_DF_rows(colData(x), ids=new.ids.char, final=cn, mergeFUN=coldata.merge)
if (length(new.cd)) {
new.cd <- do.call(DataFrame, c(new.cd, list(check.names=FALSE, row.names=rownames(coldata))))
coldata <- cbind(new.cd, coldata)
}
# Endomorphic creation of a new SummarizedExperiment.
shell <- x[,m]
if (!is.null(subset.row)) {
shell <- shell[subset.row,]
}
assays(shell, withDimnames=FALSE) <- collected
colData(shell) <- coldata
shell
})
.apply_suffixes <- function(mat.list, assay.names, suffix) {
if (!suffix && all(lengths(mat.list)==1)) {
mat.list <- lapply(mat.list, unname)
}
names(mat.list) <- assay.names
unlist(mat.list, use.names=TRUE, recursive=FALSE)
}
#' @importFrom BiocGenerics match
#' @importFrom S4Vectors split extractROWS bindROWS I
.merge_DF_rows <- function(x, ids, final, mapping=match(final, ids), mergeFUN=NULL) {
collected <- list()
if (isFALSE(mergeFUN)) {
return(collected)
}
for (cn in colnames(x)) {
if (!is.function(mergeFUN)) {
FUN <- mergeFUN[[cn]]
if (isFALSE(FUN)) {
collected[[cn]] <- NULL
next
}
} else {
FUN <- mergeFUN
}
grouped <- split(x[[cn]], ids)[final]
if (is.null(FUN)) {
# Obtaining a NA of matched type.
FUN <- function(x) {
if (NROW(val <- unique(x))==1L) {
val
} else {
extractROWS(val, NA_integer_)
}
}
}
per.group <- lapply(grouped, FUN)
per.group <- unname(per.group)
if (length(per.group)>=1L) {
col <- bindROWS(per.group[[1]], per.group[-1])
} else {
# Obtaining a column of the correct type.
col <- extractROWS(FUN(x[[cn]]), 0L)
}
collected[[cn]] <- I(col)
}
collected
}
#' @export
#' @rdname aggregateAcrossCells
#' @importFrom SingleCellExperiment altExp altExps altExp<- altExps<-
#' reducedDimNames reducedDim<- reducedDim reducedDims<- reducedDims
setMethod("aggregateAcrossCells", "SingleCellExperiment", function(x, ids, ...,
subset.row=NULL, subset.col=NULL, use.altexps=TRUE, use.dimred=TRUE, dimred.stats=NULL, suffix=FALSE,
subset_row=NULL, subset_col=NULL, use_altexps=NULL, use_dimred=NULL)
{
subset.row <- .replace(subset.row, subset_row)
subset.col <- .replace(subset.col, subset_col)
use.altexps <- .replace(use.altexps, use_altexps)
use.dimred <- .replace(use.dimred, use_dimred)
base.args <- list(x=x, ids=ids, subset.col=subset.col, suffix=suffix, ...)
y <- do.call(callNextMethod, c(base.args, list(subset.row=subset.row)))
# Aggregating alternative experiments.
use.altexps <- .use_names_to_integer_indices(use.altexps, x=x, nameFUN=altExpNames, msg="use.altexps")
for (i in use.altexps) {
# Do NOT pass use.altexps and use.dimred into the aggregateAcrossCells
# call, as this part must work with any SE object.
args <- base.args
args$x <- altExp(x, i)
altExp(y, i) <- do.call(aggregateAcrossCells, args)
}
altExps(y) <- altExps(y, withColData=FALSE)[use.altexps]
new.ids <- .process_ids(x, ids, subset.col)
use.dimred <- .use_names_to_integer_indices(use.dimred, x=x, nameFUN=reducedDimNames, msg="use.dimred")
if (is.null(dimred.stats)) {
dimred.stats <- "mean"
} else {
dimred.stats <- match.arg(dimred.stats, c("mean", "median"), several.ok=TRUE)
}
collected <- vector("list", length(use.dimred))
for (i in seq_along(use.dimred)) {
current <- t(reducedDim(x, use.dimred[i]))
out <- .summarize_assay(current, ids=new.ids, statistics=dimred.stats)
collected[[i]] <- lapply(out$summary, t)
}
collected <- .apply_suffixes(collected, assay.names=reducedDimNames(x)[use.dimred], suffix=suffix)
reducedDims(y) <- collected
y
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.