Nothing
#' Remove the ambient profile
#'
#' Estimate and remove the ambient profile from a count matrix, given pre-existing groupings of similar cells.
#' This function is largely intended for plot beautification rather than real analysis.
#'
#' @param y A numeric matrix-like object containing counts for each gene (row) and cell/library (column).
#' @param ambient A numeric vector of length equal to \code{nrow(y)},
#' containing the proportions of transcripts for each gene in the ambient solution.
#' @param groups A vector of length equal to \code{ncol(y)}, specifying the assigned group for each cell.
#' This can also be a \linkS4class{DataFrame}, see \code{?\link{sumCountsAcrossCells}}.
#' @param features A vector of control features or a list of mutually exclusive feature sets,
#' see \code{?\link{controlAmbience}} for more details.
#' @param ... Further arguments to pass to \code{\link{maximumAmbience}}.
#' @param dispersion Numeric scalar specifying the dispersion to use in the quantile-quantile mapping.
#' @param size.factors Numeric scalar specifying the size factors for each column of \code{y},
#' defaults to library size-derived size factors.
#' @param sink An optional \linkS4class{RealizationSink} object of the same dimensions as \code{y}.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.
#'
#' @return
#' A numeric matrix-like object of the same dimensions as \code{y},
#' containing the counts after removing the ambient contamination.
#' The exact representation of the output will depend on the class of \code{y} and whether \code{sink} was used.
#'
#' @details
#' This function will aggregate counts from each group of related cells into an average profile.
#' For each group, we estimate the contribution of the ambient profile and subtract it from the average.
#' By default, this is done with \code{\link{maximumAmbience}}, but if enough is known about the biological system, users can specify \code{feaures} to use \code{\link{controlAmbience}} instead.
#'
#' We then perform quantile-quantile mapping of counts in \code{y} from the old to new averages.
#' This approach preserves the mean-variance relationship and improves the precision of estimate of the ambient contribution, but relies on a sensible grouping of similar cells, e.g., unsupervised clusters or cell type annotations.
#' As such, this function is best used at the end of the analysis to clean up expression matrices prior to visualization.
#'
#' @author Aaron Lun
#'
#' @examples
#' # Making up some data.
#' ngenes <- 1000
#' ambient <- runif(ngenes, 0, 0.1)
#' cells <- c(runif(100) * 10, integer(900))
#' y <- matrix(rpois(ngenes * 100, cells + ambient), nrow=ngenes)
#'
#' # Pretending that all cells are in one group, in this example.
#' removed <- removeAmbience(y, ambient, groups=rep(1, ncol(y)))
#' summary(rowMeans(removed[1:100,]))
#' summary(rowMeans(removed[101:1000,]))
#'
#' @seealso
#' \code{\link{maximumAmbience}} and \code{\link{controlAmbience}}, to estimate the ambient contribution.
#'
#' \code{\link{estimateAmbience}}, to estimate the ambient profile.
#'
#' The \pkg{SoupX} package, which provides another implementation of the same general approach.
#'
#' @export
#' @importFrom BiocParallel SerialParam
#' @importFrom SummarizedExperiment assay colData
#' @importFrom scuttle sumCountsAcrossCells librarySizeFactors
#' @importFrom BiocGenerics match
#' @importFrom DelayedArray blockApply colAutoGrid
removeAmbience <- function(y, ambient, groups, features=NULL, ...,
size.factors=librarySizeFactors(y), dispersion=0.1,
sink=NULL, BPPARAM=SerialParam())
{
summed <- sumCountsAcrossCells(y, groups, store.number=NULL)
old.means <- assay(summed)
sf.summed <- sumCountsAcrossCells(rbind(size.factors), groups)
group.sf <- assay(sf.summed)[1,]
if (is.null(features)) {
profile <- maximumAmbience(old.means, ambient=ambient, ..., mode="profile")
} else {
profile <- controlAmbience(old.means, ambient=ambient, features=features, mode="profile")
}
new.means <- old.means - profile
new.means[new.means < 0] <- 0
if (is(groups, "DataFrame")) {
matches <- match(groups, colData(summed))
} else {
matches <- match(groups, colData(summed)[,1])
}
if (!is.null(sink)) {
# Force serial execution to avoid multi-threaded writing to sink.
BPPARAM <- NULL
}
out <- blockApply(y, FUN=.remap_quantiles,
matches=matches, old.means=old.means, new.means=new.means,
dispersion=dispersion, size.factors=size.factors, group.sf=group.sf,
sink=sink,
BPPARAM=BPPARAM, as.sparse=NA, grid=colAutoGrid(y))
if (!is.null(sink)) {
as(sink, "DelayedArray")
} else {
do.call(cbind, out)
}
}
#' @importFrom methods is
#' @importFrom stats pnbinom qnbinom
#' @importFrom edgeR q2qnbinom
#' @importFrom DelayedArray makeNindexFromArrayViewport write_block currentViewport
.remap_quantiles <- function(block, matches, old.means, new.means, size.factors, group.sf, dispersion, sink) {
vp <- currentViewport()
idx <- makeNindexFromArrayViewport(vp, expand.RangeNSBS = TRUE)
if (!is.null(idx[[2]])) {
matches <- matches[idx[[2]]]
size.factors <- size.factors[idx[[2]]]
}
old.means <- old.means[, matches, drop=FALSE]
new.means <- new.means[, matches, drop=FALSE]
rescale <- size.factors / group.sf[matches]
old.means <- t(t(old.means) * rescale)
new.means <- t(t(new.means) * rescale)
originals <- as.matrix(block)
p <- pnbinom(originals, mu=old.means, size=1/dispersion, log.p=TRUE)
q <- qnbinom(p, mu=new.means, size=1/dispersion, log.p=TRUE)
failed <- !is.finite(q)
fallback <- q2qnbinom(originals[failed], input.mean=old.means[failed], output.mean=new.means[failed], dispersion=dispersion)
fallback <- pmax(0, round(fallback))
q[failed] <- fallback
dim(q) <- dim(originals)
if (!is.null(sink)) {
write_block(sink, vp, q)
NULL
} else if (is(block, "SparseArraySeed")) {
as(q, "dgCMatrix")
} else {
q
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.