#' @title Calculate normalized expression of a gene set
#'
#' @description Calculate normalized expression for a set of genes in each cell
#' from a \code{SingleCellExperiment}, using random sets of similarly
#' expressed genes as background to account for cell quality and
#' sequencing depth.
#'
#' @author Michael Stadler
#'
#' @param sce \code{SingleCellExperiment} object.
#' @param genes \code{character} vector with the genes in the set. Must be a
#' subset of \code{rownames(sce)}.
#' @param expr_values Integer scalar or string indicating which assay of
#' \code{sce} contains the expression values.
#' @param R Integer scalar giving the number of random gene sets to sample
#' for normalization.
#' @param nbins Integer scalar, specifying the number of bins to group the
#' average expression levels into before sampling (passed to
#' \code{sampleControlElements}). Higher numbers of bins
#' will increase the match to the target distribution(s), but may fail if
#' there are few elements to sample from.
#' @param subset.row Sample random genes only from these. If \code{NULL}
#' (the default), the function will sample from all genes in \code{sce}.
#' Alternatively, \code{subset.row} can be a logical, integer or character
#' vector indicating the rows (genes) of \code{sce} to use for sampling.
#' This allows for example to exclude highly variable genes from the sampling
#' which are likely expressed only in certain cell types.
#' @param BPPARAM An optional \code{\link[BiocParallel]{BiocParallelParam}}
#' instance determining the parallel back-end to be used during evaluation.
#'
#' @return A \code{numeric} vector with normalized gene set scores for each
#' cell in \code{sce}.
#'
#' @examples
#' if (require(SingleCellExperiment)) {
#' # get sce
#' example(SingleCellExperiment, echo=FALSE)
#' rownames(sce) <- paste0("g", seq.int(nrow(sce)))
#'
#' # calculate gene set expression scores
#' markers <- c("g1", "g13", "g27")
#' scores <- normGenesetExpression(sce, markers, R = 50)
#'
#' # compare expression of marker genes with scores
#' plotdat <- cbind(scores, t(logcounts(sce)[markers, ]))
#' cor(plotdat)
#' pairs(plotdat)
#' }
#'
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom matrixStats rowMeans2 rowSds
#'
#' @export
normGenesetExpression <- function(sce,
genes,
expr_values = "logcounts",
subset.row = NULL,
R = 200,
nbins = 100,
BPPARAM = SerialParam()) {
# pre-flight checks
.assertPackagesAvailable("SummarizedExperiment")
stopifnot(exprs = {
is(sce, "SingleCellExperiment")
is.character(genes)
length(genes) > 0L
all(genes %in% rownames(sce))
is.null(subset.row) || !any(is.na(subset.row))
is.numeric(R)
length(R) == 1L
R > 0
is.numeric(nbins)
length(nbins) == 1L
nbins > 0
inherits(BPPARAM, "BiocParallelParam")
})
if (is.numeric(expr_values)) {
stopifnot(exprs = {
length(expr_values) == 1L
expr_values <= length(SummarizedExperiment::assays(sce))
})
} else if (is.character(expr_values)) {
stopifnot(exprs = {
length(expr_values) == 1L
expr_values %in% SummarizedExperiment::assayNames(sce)
})
} else {
stop("'expr_values' is not a valid value for use in assay()")
}
# exclude genes if subset.row is given
expr <- as.matrix(SummarizedExperiment::assay(sce, expr_values))
i <- match(genes, rownames(sce))
if (!is.null(subset.row)) {
if (is.logical(subset.row) && length(subset.row) == nrow(sce)) {
sel <- which(subset.row)
} else if (is.numeric(subset.row) && min(subset.row) >= 0 && max(subset.row) <= nrow(sce)) {
sel <- subset.row
} else if (is.character(subset.row) && all(subset.row %in% rownames(sce))) {
sel <- match(subset.row, rownames(sce))
} else {
stop("'subset.row' is not a valid selector of rows (genes) in 'sce'")
}
sel <- unique(c(i, sel)) # keep marker genes
expr <- expr[sel, ]
i <- match(genes, rownames(expr))
}
# calculate average expression of selected genes
avgExpr <- rowMeans(expr)
val.obs <- colSums(expr[i, , drop = FALSE])
# calculate expression in random gene sets
val.rand <- do.call(cbind, bplapply(seq_len(R), function(r) {
irand <- sampleControlElements(x = avgExpr, idxTarget = i, nbins = nbins)
colSums(expr[irand, , drop = FALSE])
}, BPPARAM = BPPARAM))
# normalize observed gene set expression
# z values: (obs - exp) / sqrt(Var[exp])
mus <- rowMeans2(val.rand)
sds <- rowSds(val.rand)
val.obs.norm <- (val.obs - mus) / sds
names(val.obs.norm) <- colnames(sce)
return(val.obs.norm)
}
#' @title Assign labels to cells using known marker genes
#'
#' @description Given marker gene sets for cell types, identify cells with
#' high expression of the marker genes (positive examples), then use these
#' cells to create a reference transcriptome profile for each cell type and
#' identify additional cells of each type using \code{SingleR}. These
#' marker genes should specifically expressed a single cell type, e.g.
#' CD3 which is expressed by all T cell subtypes would not be suitable
#' for specific T cell subtypes.
#'
#' @author Michael Stadler
#'
#' @param sce \code{SingleCellExperiment} object.
#' @param markergenes Named \code{list} of \code{character} vectors with the
#' marker genes for each cell types. The marker genes must be a subset of
#' \code{rownames(sce)}.
#' @param fraction_topscoring \code{numeric} vector of length 1 or the same
#' length as \code{markergenes} giving the fraction(s) of top scoring cells
#' for each cell type to pick to create the reference transcriptome profile.
#' @param expr_values Integer scalar or string indicating which assay of
#' \code{sce} contains the expression values.
#' @param normGenesetExpressionParams \code{list} with additional parameters
#' for \code{\link{normGenesetExpression}}.
#' @param aggregateReferenceParams \code{list} with additional parameters
#' for \code{\link[SingleR]{aggregateReference}}.
#' @param SingleRParams \code{list} with additional parameters for
#' \code{\link[SingleR]{SingleR}}.
#' @param BPPARAM An optional \code{\link[BiocParallel]{BiocParallelParam}}
#' instance determining the parallel back-end to be used during evaluation.
#'
#' @return A \code{list} of three elements named \code{cells}, \code{refs} and \code{labels}.
#' \code{cells} contains a \code{list} with the numerical indices of the top
#' scoring cells for each cell type.
#' \code{refs} contains the pseudo-bulk transcriptome profiles used as a
#' reference for label assignment, as returned by \code{\link[SingleR]{aggregateReference}}.
#' \code{labels} contains a \code{\link[S4Vectors]{DataFrame}} with the
#' annotation statistics for each cell (one cell per row), generated by
#' \code{\link[SingleR]{SingleR}}.
#'
#' @seealso \code{\link{normGenesetExpression}} used to calculate scores for
#' marker gene sets; \code{\link[SingleR]{aggregateReference}} used to
#' create reference profiles; \code{\link[SingleR]{SingleR}} used to assign
#' labels to cells.
#'
#' @examples
#' if (requireNamespace("SingleR", quietly = TRUE) &&
#' requireNamespace("SingleCellExperiment", quietly = TRUE) &&
#' requireNamespace("scrapper", quietly = TRUE)) {
#'
#' # create SingleCellExperiment with cell-type specific genes
#' library(SingleCellExperiment)
#' n_types <- 3
#' n_per_type <- 30
#' n_cells <- n_types * n_per_type
#' n_genes <- 500
#' fraction_specific <- 0.1
#' n_specific <- round(n_genes * fraction_specific)
#'
#' set.seed(42)
#' mu <- ceiling(runif(n = n_genes, min = 0, max = 30))
#' u <- do.call(rbind, lapply(mu, function(x) rpois(n_cells, lambda = x)))
#' rownames(u) <- paste0("g", seq.int(nrow(u)))
#' celltype.labels <- rep(paste0("t", seq.int(n_types)), each = n_per_type)
#' celltype.genes <- split(sample(rownames(u), size = n_types * n_specific),
#' rep(paste0("t", seq.int(n_types)), each = n_specific))
#' for (i in seq_along(celltype.genes)) {
#' j <- celltype.genes[[i]]
#' k <- celltype.labels == paste0("t", i)
#' u[j, k] <- 2 * u[j, k]
#' }
#' v <- log2(u + 1)
#' sce <- SingleCellExperiment(assays=list(counts=u, logcounts=v))
#'
#' # define marker genes (subset of true cell-type-specific genes)
#' marker.genes <- lapply(celltype.genes, "[", 1:5)
#' marker.genes
#'
#' # predict cell types
#' res <- labelCells(sce, marker.genes,
#' fraction_topscoring = 0.1,
#' normGenesetExpressionParams = list(R = 50))
#'
#' # high-scoring cells used as references for each celltype
#' res$cells
#'
#' # ... from these, pseudo-bulks were created:
#' res$refs
#'
#' # ... and used to predict labels for all cells
#' res$labels$pruned.labels
#'
#' # compare predicted to true cell types
#' table(true = celltype.labels, predicted = res$labels$pruned.labels)
#' }
#'
#' @importFrom BiocParallel SerialParam bplapply
#'
#' @export
labelCells <- function(sce,
markergenes,
fraction_topscoring = 0.01,
expr_values = "logcounts",
normGenesetExpressionParams = list(R = 200),
aggregateReferenceParams = list(power = 0.5),
SingleRParams = list(),
BPPARAM = SerialParam()) {
## pre-flight checks
.assertPackagesAvailable(c("SingleR","SummarizedExperiment"))
stopifnot(exprs = {
# sce
is(sce, "SingleCellExperiment")
# markergenes
is(markergenes, "list")
!is.null(names(markergenes))
length(markergenes) > 1L
all(lengths(markergenes) > 0L)
all(vapply(markergenes, is.character, logical(1)))
all(unlist(markergenes) %in% rownames(sce))
# fraction_topscoring
is.numeric(fraction_topscoring)
length(fraction_topscoring) == 1L || length(fraction_topscoring) == length(markergenes)
all(fraction_topscoring > 0) && all(fraction_topscoring < 1)
# normGenesetExpressionParams
is(normGenesetExpressionParams, "list")
length(normGenesetExpressionParams) == 0 || !is.null(names(normGenesetExpressionParams))
# aggregateReferenceParams
is(aggregateReferenceParams, "list")
length(aggregateReferenceParams) == 0 || !is.null(names(aggregateReferenceParams))
# SingleRParams
is(SingleRParams, "list")
length(SingleRParams) == 0 || !is.null(names(SingleRParams))
})
if (is.numeric(expr_values)) {
stopifnot(exprs = {
length(expr_values) == 1L
expr_values <= length(SummarizedExperiment::assays(sce))
})
} else if (is.character(expr_values)) {
stopifnot(exprs = {
length(expr_values) == 1L
expr_values %in% SummarizedExperiment::assayNames(sce)
})
} else {
stop("'expr_values' is not a valid value for use in assay()")
}
## calculate scores for marker gene sets
todrop <- intersect(names(normGenesetExpressionParams),
c("sce", "genes", "expr_values", "BPPARAM"))
if (length(todrop) > 0) {
warning("ignoring the following user-provided elements in ",
"'normGenesetExpressionParams': ",
paste(todrop, collapse = ", "))
normGenesetExpressionParams[todrop] <- NULL
}
scoreL <- lapply(markergenes, function(gns)
do.call(normGenesetExpression,
c(list(sce = sce, genes = gns,
expr_values = expr_values,
BPPARAM = BPPARAM),
normGenesetExpressionParams))
)
## identify top scoring cells of each type and create references
if (length(fraction_topscoring) == 1L)
fraction_topscoring <- rep(fraction_topscoring, length(markergenes))
topL <- lapply(seq_along(scoreL), function(i) {
which(scoreL[[i]] > quantile(scoreL[[i]],
probs = 1 - fraction_topscoring[i], na.rm = TRUE))
})
names(topL) <- names(scoreL)
topcells <- unlist(topL, use.names = FALSE)
topcelltypes <- factor(rep(names(topL), lengths(topL)),
levels = names(markergenes))
todrop <- intersect(names(aggregateReferenceParams),
c("ref", "labels", "BPPARAM"))
if (length(todrop) > 0) {
warning("ignoring the following user-provided elements in ",
"'aggregateReferenceParams': ",
paste(todrop, collapse = ", "))
aggregateReferenceParams[todrop] <- NULL
}
refs <- do.call(SingleR::aggregateReference,
c(list(ref = sce[, topcells],
labels = topcelltypes,
BPPARAM = BPPARAM),
aggregateReferenceParams))
# transfer labels using SingleR
todrop <- intersect(names(SingleRParams),
c("test", "ref", "labels", "BPPARAM"))
if (length(todrop) > 0) {
warning("ignoring the following user-provided elements in ",
"'SingleRParams': ",
paste(todrop, collapse = ", "))
SingleRParams[todrop] <- NULL
}
df <- do.call(SingleR::SingleR,
c(list(test = sce,
ref = refs,
labels = refs$label,
BPPARAM = BPPARAM),
SingleRParams))
# return results
list(cells = topL, refs = refs, labels = df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.