Nothing
#' Identify cell markers
#'
#' Uses geometric method based on vector dot product to identify genes which are
#' the best markers for individual cell types.
#'
#' If `verbose = TRUE`, the function will display an estimate of the required
#' memory. But importantly this estimate is only a guide. It is provided to help
#' users choose the optimal number of cores during parallelisation. Real memory
#' usage might well be more, theoretically up to double this amount, due to R's
#' use of copy-on-modify.
#'
#' @param scdata Single-cell data matrix with genes in rows and cells in
#' columns. Can be sparse matrix or DelayedMatrix. Must have rownames
#' representing gene IDs or gene symbols.
#' @param bulkdata Optional data matrix containing bulk RNA-Seq data with genes
#' in rows and samples in columns. This matrix is only used for its rownames
#' (gene IDs), to ensure that cell markers are selected from genes in the bulk
#' dataset.
#' @param subclass Vector of cell subclasses matching the columns in `scdata`
#' @param cellgroup Optional grouping vector of major cell types matching the
#' columns in `scdata`. `subclass` is assumed to contain subclasses which are
#' subsets within `cellgroup` overarching classes.
#' @param nsubclass Number of genes to select for each single cell subclass.
#' Either a single number or a vector with the number of genes for each
#' subclass.
#' @param ngroup Number of genes to select for each cell group. Either a single
#' number or a vector with the number of genes for each group.
#' @param expfilter Genes whose maximum mean expression on log2 scale per cell
#' type are below this value are removed and not considered for the signature.
#' @param noisefilter Sets an upper bound for `noisefraction` cut-off below
#' which gene expression is set to 0. Essentially gene expression above this
#' level must be retained in the signature. Setting this higher can allow more
#' suppression via `noisefraction` and can favour more highly expressed genes.
#' @param noisefraction Numeric value. Maximum mean log2 gene expression across
#' cell types is calculated and values in celltypes below this fraction are
#' set to 0. Set in conjunction with `noisefilter.` Note: if this is set too
#' high (too close to 1), it can have a deleterious effect on deconvolution.
#' @param min_cells Numeric value specifying minimum number of cells in a
#' subclass category. Subclass categories with fewer cells will be ignored.
#' @param remove_subclass Character vector of `subclass` levels to be removed
#' from the analysis.
#' @param dual_mean Logical whether to calculate arithmetic mean of counts as
#' well as mean(log2(counts +1)). This is mainly useful for simulation.
#' @param meanFUN Either a character value or function for applying mean which
#' is passed to [scmean()]. Options include `"logmean"` (the default) or
#' `"trimmean"` which is a trimmed after excluding the top/bottom 5% of
#' values. `rowMeans` can be used for the arithmetic mean of counts.
#' @param postFUN Optional function applied to `genemeans` matrices after mean
#' has been calculated. If `meanFUN` is set to `"trimmean"` or `"rowMeans"`,
#' then `postFUN` defaults to `log2s`. See [scmean()].
#' @param verbose Logical whether to show messages.
#' @param sliceMem Max amount of memory in GB to allow for each subsetted count
#' matrix object. When `scdata` is subsetted by each cell subclass, if the
#' amount of memory would be above `sliceMem` then slicing is activated and
#' the subsetted count matrix is divided into chunks and processed separately.
#' This is indicated by addition of '...' in the printed timings. The limit is
#' just under 17.2 GB (2^34 / 1e9). Above this the subsetted matrix breaches
#' the long vector limit (>2^31 elements).
#' @param cores Integer, number of cores to use for parallelisation using
#' `mclapply()`. Parallelisation is not available on windows. Warning:
#' parallelisation has increased memory requirements. See [scmean()].
#' @param ... Additional arguments passed to [scmean()] such as `use_future`.
#' @returns
#' A list object with S3 class 'cellMarkers' containing:
#' \item{call}{the matched call}
#' \item{best_angle}{named list containing a matrix for each cell type with
#' genes in rows. Rows are ranked by lowest specificity angle for that cell
#' type and highest maximum expression. Columns are:
#' `angle` the specificity angle in radians,
#' `angle.deg` the same angle in degrees,
#' `max` the maximum mean expression across all cell types,
#' `rank` the rank of the mean gene expression for that cell type compared to
#' the other cell types}
#' \item{group_angle}{named list of matrices similar to `best_angle`, for each
#' cell subclass}
#' \item{geneset}{character vector of selected gene markers for cell types}
#' \item{group_geneset}{character vector of selected gene markers for cell
#' subclasses}
#' \item{genemeans}{matrix of mean log2+1 gene expression with genes in rows
#' and cell types in columns}
#' \item{genemeans_filtered}{matrix of gene expression for cell types
#' following noise reduction}
#' \item{groupmeans}{matrix of mean log2+1 gene expression with genes in rows
#' and cell subclasses in columns}
#' \item{groupmeans_filtered}{matrix of gene expression for cell subclasses
#' following noise reduction}
#' \item{cell_table}{factor encoded vector containing the groupings of the
#' cell types within cell subclasses, determined by which subclass contains
#' the maximum number of cells for each cell type}
#' \item{spillover}{matrix of spillover values between cell types}
#' \item{subclass_table}{contingency table of the number of cells in each
#' subclass}
#' \item{opt}{list storing options, namely arguments `nsubclass`, `ngroup`,
#' `expfilter`, `noisefilter`, `noisefraction`}
#' \item{genemeans_ar}{if `dual_mean` is `TRUE`, optional matrix of arithmetic
#' mean, i.e. log2(mean(counts)+1)}
#' \item{genemeans_filtered_ar}{optional matrix of arithmetic mean
#' following noise reduction}
#' The 'cellMarkers' object is designed to be passed to [deconvolute()] to
#' deconvolute bulk RNA-Seq data. It can be updated rapidly with different
#' settings using [updateMarkers()]. Ensembl gene ids can be substituted for
#' recognisable gene symbols by applying [gene2symbol()].
#' @seealso [deconvolute()] [updateMarkers()] [gene2symbol()]
#' @author Myles Lewis
#' @importFrom matrixStats rowMaxs
#' @export
#'
cellMarkers <- function(scdata,
bulkdata = NULL,
subclass,
cellgroup = NULL,
nsubclass = 25,
ngroup = 10,
expfilter = 0.5,
noisefilter = 2,
noisefraction = 0.25,
min_cells = 10,
remove_subclass = NULL,
dual_mean = FALSE,
meanFUN = "logmean",
postFUN = NULL,
verbose = TRUE,
sliceMem = 16,
cores = 1L, ...) {
.call <- match.call()
if (!inherits(scdata, c("dgCMatrix", "matrix", "Seurat", "DelayedMatrix"))) {
scdata <- as.matrix(scdata)
}
if (is.null(rownames(scdata))) stop("scdata is missing rownames/ gene ids")
dimx <- dim(scdata)
if (!is.factor(subclass)) subclass <- factor(subclass)
if (min_cells > 0) {
tab <- table(subclass)
if (any(tab) < min_cells) {
rem <- names(which(tab < min_cells))
subclass[subclass %in% rem] <- NA
subclass <- factor(subclass)
}
}
if (!is.null(remove_subclass)) {
subclass[subclass %in% remove_subclass] <- NA
subclass <- factor(subclass)
}
if (verbose) {
message("Dataset: ", dimx[2], " cells, ", dimx[1], " genes")
if (!inherits(scdata, "DelayedMatrix")) {
mem_estimate(dimx, subclass, cellgroup, sliceMem, cores)
}
}
ok <- TRUE
if (!is.null(bulkdata)) {
if (inherits(bulkdata, "data.frame")) bulkdata <- as.matrix(bulkdata)
ok <- rownames(scdata) %in% rownames(bulkdata)
if (verbose) message("Removing ", sum(!ok), " genes not found in bulkdata")
}
nsub <- nlevels(subclass)
if (verbose) {
if (dimx[2] != sum(!is.na(subclass))) {
message("Analysing ", sum(!is.na(subclass)), " cells, ", appendLF = FALSE)
}
message(nsub, " cell subclasses\nSubclass analysis")
}
nsubclass2 <- rep_len(nsubclass, nsub)
if (is.character(meanFUN) && meanFUN %in% c("trimmean", "rowMeans") &&
is.null(postFUN)) {
postFUN <- log2s
}
if (dual_mean) {
gm <- scmean2(scdata, subclass, meanFUN, postFUN, verbose, sliceMem, cores)
genemeans <- gm[[1]]
genemeans_ar <- gm[[2]]
} else {
genemeans <- scmean(scdata, subclass, meanFUN, postFUN, verbose, sliceMem,
cores, ...)
}
if (any(!ok)) {
genemeans <- genemeans[ok, ]
if (dual_mean) genemeans_ar <- genemeans_ar[ok, ]
dimx[1] <- nrow(genemeans)
}
highexp <- rowMaxs(genemeans) > expfilter
genemeans_filtered <- reduceNoise(genemeans[highexp, ], noisefilter,
noisefraction)
if (dual_mean) {
genemeans_filtered_ar <- reduceNoise(genemeans_ar[highexp, ], noisefilter,
noisefraction)
}
best_angle <- gene_angle(genemeans_filtered)
geneset <- lapply(seq_along(best_angle), function(i) {
rownames(best_angle[[i]])[seq_len(nsubclass2[i])]
})
geneset <- unique(unlist(geneset))
if (!is.null(cellgroup)) {
if (!is.factor(cellgroup)) cellgroup <- factor(cellgroup)
# cellgroup[is.na(subclass)] <- NA
if (min_cells > 0) {
tab <- table(cellgroup)
if (any(tab) < min_cells) {
rem <- names(which(tab < min_cells))
cellgroup[cellgroup %in% rem] <- NA
cellgroup <- factor(cellgroup)
}
}
if (verbose) message("Basic cell group analysis")
# test nesting
tab <- table(subclass, cellgroup)
groupmeans <- scmean(scdata, cellgroup, meanFUN, postFUN, verbose,
sliceMem, cores, ...)
if (any(!ok)) {
groupmeans <- groupmeans[ok, ]
}
highexp <- rowMaxs(groupmeans) > expfilter
groupmeans_filtered <- reduceNoise(groupmeans[highexp, ], noisefilter,
noisefraction)
# add extra rows
rn <- rownames(groupmeans_filtered)
miss <- rn[!rn %in% rownames(genemeans_filtered)]
if (length(miss) > 0) {
extra <- reduceNoise(genemeans[miss, , drop = FALSE], noisefilter,
noisefraction)
genemeans_filtered <- rbind(genemeans_filtered, extra)
}
group_angle <- gene_angle(groupmeans_filtered)
ngroup2 <- rep_len(ngroup, length(group_angle))
group_geneset <- lapply(seq_along(group_angle), function(i) {
rownames(group_angle[[i]])[seq_len(ngroup2[i])]
})
group_geneset <- unique(unlist(group_geneset))
cell_table <- apply(tab, 1, function(i) names(which.max(i)))
cell_table <- factor(cell_table, levels = unique(cell_table))
} else {
# no cell groups
group_geneset <- group_angle <- groupmeans <- groupmeans_filtered <- NULL
cell_table <- NULL
}
# determine spillover
gene_sig <- genemeans_filtered[geneset, ]
m_itself <- dotprod(gene_sig, gene_sig)
if (verbose) message(length(geneset), " marker genes")
out <- list(call = .call,
best_angle = best_angle,
group_angle = group_angle,
geneset = geneset,
group_geneset = group_geneset,
genemeans = genemeans,
genemeans_filtered = genemeans_filtered,
groupmeans = groupmeans,
groupmeans_filtered = groupmeans_filtered,
cell_table = cell_table,
spillover = m_itself,
subclass_table = table(subclass, dnn = NULL),
opt = list(nsubclass = nsubclass,
ngroup = ngroup,
expfilter = expfilter,
noisefilter = noisefilter,
noisefraction = noisefraction))
if (dual_mean) {
out$genemeans_ar <- genemeans_ar
out$genemeans_filtered_ar <- genemeans_filtered_ar
}
class(out) <- "cellMarkers"
out
}
#' @export
summary.cellMarkers <- function(object, ...) {
cat("scRNA-Seq dataset:", nrow(object$genemeans), "genes,",
sum(object$subclass_table), "cells\n")
cat("Cell subclasses:", ncol(object$genemeans), "\n")
cat("Cell subclass signature:", length(object$geneset), "genes\n")
cat("Cell groups:", ncol(object$groupmeans), "\n")
cat("Cell group signature:", length(object$group_geneset), "genes\n")
mmeth <- if (is.null(object$call[["meanFUN"]])) {
"logmean"
} else as.character(object$call[["meanFUN"]])
cat("Mean method:", mmeth, "\n")
for (i in 1:5) {
cat(paste0(names(object$opt)[i], ": ", object$opt[i], "\n"))
}
}
# estimate memory requirement
mem_estimate <- function(dimx, subclass, cellgroup, sliceMem, cores) {
tab <- table(subclass)
tab2 <- table(cellgroup)
dimmax <- as.numeric(max(c(tab, tab2), na.rm = TRUE)) * dimx[1]
mem <- structure(dimmax * 8, class = "object_size")
if (mem > 1e9)
message("Largest subclass/group object ", format(mem, units = "GB"))
if (mem > sliceMem * 1e9) {
message("Slicing above ", sliceMem, " Gb")
} else message("No slicing")
nsubcl <- sort(tab, decreasing = TRUE)[1:pmin(cores, length(tab))]
ngroup <- sort(tab2, decreasing = TRUE)[1:pmin(cores, length(tab2))]
msubcl <- nsubcl * as.numeric(dimx[1]) / 1.25e8
mgroup <- ngroup * as.numeric(dimx[1]) / 1.25e8
msubcl[msubcl > sliceMem] <- sliceMem
mgroup[mgroup > sliceMem] <- sliceMem
mem <- max(c(sum(msubcl), sum(mgroup)))
message("Estimated minimum RAM required ", format(mem, digits = 2), " Gb")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.