Nothing
#' Generate random cell number samples
#'
#' Used for simulating pseudo-bulk RNA-Seq from a 'cellMarkers' object. Cell
#' counts are randomly sampled from the uniform distribution, using the original
#' subclass contingency table as a limit on the maximum number of cells in each
#' subclass.
#'
#' @param object A 'cellMarkers' class object
#' @param n Integer value for the number of samples to generate
#' @param equal_sample Logical whether to sample subclasses equally or generate
#' samples with proportions of cells in keeping with the original subtotal of
#' cells in the main scRNA-Seq data.
#' @param method Either "unif" or "dirichlet" to specify whether cell numbers
#' are drawn from uniform distribution or dirichlet distribution.
#' @param alpha Shape parameter for `gtools::rdirichlet()`. Automatically
#' expanded to be a vector whose length is the number of subclasses.
#' @param zero_fraction Numeric from 0 to 1 specifying proportion of cell
#' subclasses to randomly set to zero in each sample.
#' @returns An integer matrix with `n` rows, with columns for each cell
#' subclasses in `object`, representing cell counts for each cell subclass.
#' Designed to be passed to [simulate_bulk()].
#' @details
#' Leaving `equal_sample = TRUE` is better for tuning deconvolution parameters.
#'
#' @seealso [simulate_bulk()]
#' @importFrom gtools rdirichlet
#' @export
generate_samples <- function(object, n, equal_sample = TRUE,
method = c("unif", "dirichlet"),
alpha = 1.5, zero_fraction = 0) {
lim <- object$subclass_table
nc <- length(lim)
method <- match.arg(method)
if (method == "unif") {
sim_counts <- matrix(runif(n * nc), ncol = nc,
dimnames = list(paste0("S", c(1:n)), names(lim)))
if (equal_sample) {
fac <- sum(lim) / nc
sim_counts <- sim_counts * fac
} else sim_counts <- t(t(sim_counts) * as.vector(lim))
} else {
sim_counts <- rdirichlet(n, rep_len(alpha, nc)) * sum(lim)
dimnames(sim_counts) <- list(paste0("S", c(1:n)), names(lim))
}
mode(sim_counts) <- "integer"
if (zero_fraction > 0) {
zc <- round(zero_fraction * nc)
wr <- rep(seq_len(n), each = zc)
wc <- sample(nc, n * zc, replace = TRUE)
w <- matrix(c(wr, wc), ncol = 2)
sim_counts[w] <- 0L
}
sim_counts
}
#' Simulate pseudo-bulk RNA-Seq
#'
#' Simulates pseudo-bulk RNA-Seq dataset using two modes. The first mode uses a
#' 'cellMarkers' class object and a matrix of counts for the numbers of cells of
#' each cell subclass. This method converts the log2 gene means back for
#' each cell subclass back to count scale and then calculates pseudo-bulk count
#' values based on the cell amounts specified in `samples`. In the 2nd mode, a
#' single-cell RNA-Seq dataset is required, such as a matrix used as input to
#' [cellMarkers()]. Cells from the relevant subclass are sampled from the
#' single-cell matrix in the appropriate amounts based on `samples`, except that
#' sampling is scaled up by the factor `times`.
#'
#' The first method can give perfect deconvolution if the following settings are
#' used with [deconvolute()]: `count_space = TRUE`, `convert_bulk = FALSE`,
#' `use_filter = FALSE` and `comp_amount = 1`.
#'
#' @param object Either a 'cellMarkers' class object, or a single cell count
#' matrix with genes in rows and cells in columns, with rownames representing
#' gene IDs/symbols. The matrix can be a sparse matrix or DelayedMatrix.
#' @param samples An integer matrix of cell counts with samples in rows and
#' columns for each cell subclass in `object`. This can be generated using
#' [generate_samples()].
#' @param subclass Vector of cell subclasses matching the columns in `object`.
#' Only used if `object` is a single cell count matrix.
#' @param times Scaling factor to increase sampling of cells. Cell counts in
#' `samples` are scaled up by being multiplied by this number. Only used if
#' `object` is a single cell count matrix.
#' @param method Either "dirichlet" or "unif" to specify whether cells are
#' sampled based on the Dirichlet distribution with K = number of cells in
#' each subclass, or sampled uniformly. When cells are oversampled uniformly,
#' in the limit the summed gene expression tends to the arithmetic mean of the
#' subclass x sample frequency. Dirichlet sampling provides proper randomness
#' with sampling.
#' @param alpha Shape parameter for Dirichlet sampling.
#' @returns An integer count matrix with genes in rows and cell subclasses in
#' columns. This can be used as `test` with the [deconvolute()] function.
#' @seealso [generate_samples()] [deconvolute()] [add_noise()]
#' @export
simulate_bulk <- function(object, samples, subclass, times = 1,
method = c("dirichlet", "unif"), alpha = 1) {
if (inherits(object, "cellMarkers")) {
genemean_counts <- 2^object$genemeans -1
if (ncol(genemean_counts) != ncol(samples)) stop("incompatible number of columns")
sim_pseudo <- genemean_counts %*% t(samples)
if (all(sim_pseudo <= .Machine$integer.max)) mode(sim_pseudo) <- "integer"
return(sim_pseudo)
}
# sample from count matrix
if (!inherits(object, c("dgCMatrix", "matrix", "Seurat", "DelayedMatrix"))) {
object <- as.matrix(object)
}
if (ncol(object) != length(subclass)) stop("incompatible dimensions")
if (!is.factor(subclass)) subclass <- factor(subclass)
if (any(!colnames(samples) %in% levels(subclass))) stop("incompatible subclasses")
method <- match.arg(method)
start <- Sys.time()
message("Creating sampling matrix", appendLF = FALSE)
samples <- samples * times
subclass_lev <- levels(subclass)
subclass_lev <- subclass_lev[subclass_lev %in% colnames(samples)]
cmat <- vapply(seq_len(nrow(samples)), function(j) {
s <- unlist(lapply(subclass_lev, function(i) {
w <- which(subclass == i)
n <- samples[j, i]
if (method == "unif" || n < length(w) * 2) {
sample(w, n, replace = TRUE)
} else {
rd <- as.vector(rdirichlet(1, rep_len(alpha, length(w))))
sample(w, n, replace = TRUE, prob = rd)
}
}))
tabulate(s, nbins = length(subclass))
}, numeric(length(subclass)))
cat_timer(start)
start <- Sys.time()
message("Matrix multiplication", appendLF = FALSE)
sim_pseudo <- as.matrix(object %*% cmat)
colnames(sim_pseudo) <- rownames(samples)
if (max(sim_pseudo) <= .Machine$integer.max) mode(sim_pseudo) <- "integer"
cat_timer(start)
sim_pseudo
}
cat_timer <- function(start) {
message(" (", format(Sys.time() - start, digits = 3), ")")
}
#' Scatter plots to compare deconvoluted subclasses
#'
#' Produces a set of scatter plots using base graphics to compare actual cell
#' counts against deconvoluted cell counts from bulk (or pseudo-bulk) RNA-Seq
#' for each cell subclass. Mainly for use if ground truth is available, e.g. for
#' simulated pseudo-bulk RNA-Seq data.
#'
#' @param obs Observed matrix of cell amounts with subclasses in columns and
#' samples in rows.
#' @param pred Predicted (deconvoluted) matrix of cell amounts with rows and
#' columns matching `obs`. Alternatively a 'deconv' class fitted model can be
#' supplied and its `$subclass$output` cell counts element will be used.
#' @param mfrow Optional vector of length 2 for organising plot layout. See
#' `par()`.
#' @param show_zero Logical whether to force plot to include the origin.
#' @param show_identity Logical whether to show the identity line.
#' @param cols Optional vector of column indices to plot to show either a subset
#' of columns or change the order in which columns are plotted. `NA` skips a
#' plot space to introduce a gap between plots.
#' @param colour Colour for the regression lines.
#' @param title Title for page of plots.
#' @param cex.title Font size for title.
#' @param ... Optional arguments passed to `plot()`.
#' @returns No return value. Produces scatter plots using base graphics.
#' @importFrom graphics abline mtext plot.new
#' @importFrom stats lm runif rnorm
#' @export
plot_set <- function(obs, pred, mfrow = NULL,
show_zero = FALSE,
show_identity = FALSE,
cols = NULL,
colour = "blue",
title = "", cex.title = 1, ...) {
if (inherits(pred, "deconv")) pred <- pred$subclass$output
if (!identical(dim(obs), dim(pred))) stop("incompatible dimensions")
if (anyNA(pred)) {
message("`pred` contains NA")
pred[is.na(pred)] <- 0
}
if (is.null(cols)) cols <- TRUE
subclasses <- colnames(obs)[cols]
nr1 <- ceiling(sqrt(length(subclasses)))
nr2 <- ceiling(length(subclasses) / nr1)
if (is.null(mfrow)) mfrow <- c(nr1, nr2)
oma <- par("oma")
if (title != "" & oma[3] < 1.5) oma[3] <- 1.5
op <- par(bty = "l", mgp = c(2.2, 0.6, 0), tcl = -0.3, oma = oma,
mar = c(3.7, 3.7, 1.5, 1.1), mfrow = mfrow)
on.exit(par(op))
scheme <- rev(hue_pal(h = c(0, 270), c = 120)(11))
xlim <- ylim <- NULL
new.args <- list(...)
for (i in subclasses) {
if (is.na(i)) {plot.new(); next}
if (show_zero) {
xr <- range(obs[, i], na.rm = TRUE)
xlim <- c(min(xr[1], 0), xr[2])
yr <- range(pred[, i], na.rm = TRUE)
ylim <- c(min(yr[1], 0), yr[2])
}
args <- list(x = obs[, i], y = pred[, i], cex = 0.8, pch = 16, las = 1,
xlab = i, ylab = "Predicted", xlim = xlim, ylim = ylim)
if (length(new.args)) args[names(new.args)] <- new.args
do.call(plot, args)
fit <- lm(pred[, i] ~ obs[, i])
rsq <- summary(fit)$r.squared
col <- if (colour == "rainbow") scheme[ceiling(rsq*10) +1] else colour
abline(fit, col = col, lwd = 1.5)
if (show_identity) abline(0, 1, col = "grey50", lty = 2)
mtext(bquote(R^2 == .(format(rsq, digits = 3))), cex = par("cex"), adj = 0.04)
}
mtext(title, outer = TRUE, cex = cex.title * par("cex"), adj = 0.05, line = 0)
}
#' Calculate R-squared and metrics on deconvoluted cell subclasses
#'
#' Calculates Pearson r-squared, R-squared and RMSE comparing subclasses in each
#' column of `obs` with matching columns in deconvoluted `pred`. Samples are in
#' rows. For use if ground truth is available, e.g. simulated pseudo-bulk
#' RNA-Seq data.
#'
#' Pearson r-squared ranges from 0 to 1. R-squared, calculated as 1 - rss/tss,
#' ranges from -Inf to 1.
#'
#' @param obs Observed matrix of cell amounts with subclasses in columns and
#' samples in rows.
#' @param pred Predicted (deconvoluted) matrix of cell amounts with rows and
#' columns matching `obs`.
#' @returns Matrix containing Pearson r-squared, R-squared and RMSE values.
#' @importFrom stats cor
#' @export
metric_set <- function(obs, pred) {
if (!identical(dim(obs), dim(pred))) stop("incompatible dimensions")
if (anyNA(pred)) {
message("`pred` contains NA")
pred[is.na(pred)] <- 0
}
d <- pred - obs
r1 <- colRsq(obs, pred)
r2 <- colRmse(d)
bias <- colMeans(d)
var <- predVar(d)
cors <- diag(cor(obs, pred))^2 |> suppressWarnings()
out <- cbind(cors, r1, r2, bias, var)
colnames(out) <- c("pearson.rsq", "Rsq", "RMSE", "bias", "var")
out
}
rmse <- function(obs, pred) {
sqrt(mean((pred - obs)^2))
}
Rsq <- function(obs, pred) {
rss <- sum((pred - obs)^2)
tss <- sum((obs - mean(obs))^2)
1 - rss/tss
}
# Lin's CCC
#' @importFrom stats cov var
ccc <- function(x, y) {
x <- as.vector(x)
y <- as.vector(y)
n <- length(x)
2* cov(x, y) / (var(x) + var(y) + ((mean(x) - mean(y))^2 * n/(n-1)))
}
# column vectorised versions
colRmse <- function(d) {
sqrt(colMeans(d^2))
}
colRsq <- function(obs, pred) {
rss <- colSums((pred - obs)^2)
tss <- rowSums((t(obs) - colMeans(obs))^2)
1 - rss/tss
}
predVar <- function(d) {
n <- nrow(d)
matrixStats::colVars(d) * (n-1)/n
}
#' Scatter plot to compare deconvoluted subclasses
#'
#' Produces a single scatter plot using base graphics to compare actual cell
#' counts against deconvoluted cell counts from bulk (or pseudo-bulk) RNA-Seq
#' Cell subclasses are shown in different colours. Designed for use if ground
#' truth is available, e.g. for simulated pseudo-bulk RNA-Seq data.
#'
#' @param obs Observed matrix of cell amounts with subclasses in columns and
#' samples in rows.
#' @param pred Predicted (deconvoluted) matrix of cell amounts with rows and
#' columns matching `obs`. Alternatively a 'deconv' class fitted model can be
#' supplied and its `$subclass$output` cell counts element will be used.
#' @param mk Optional matching cellMarkers object. This is used for its
#' `cell_table` element to try to colour subclasses by group.
#' @param scheme Vector of colours, one for each cell subclass.
#' @param ellipse Either a single number for the number of ellipses to plot, in
#' which case ellipses are shown for cell subclasses with the lowest R^2; or a
#' character vector of cell subclasses to be outlined with ellipses. Requires
#' the ggforce package to be installed.
#' @param labels Logical whether to add labels to ellipses.
#' @param main Title to add to plot.
#' @returns A ggplot2 scatter plot. An overall R^2 (coefficient of
#' determination) comparing all observed and predicted results is shown.
#' @importFrom ggplot2 geom_abline alpha margin
#' @export
plot_pred <- function(obs, pred, mk = NULL, scheme = NULL, ellipse = NULL,
labels = TRUE, main = "") {
if (inherits(pred, "deconv")) pred <- pred$subclass$output
if (!identical(dim(obs), dim(pred))) stop("incompatible dimensions")
if (anyNA(pred)) {
message("`pred` contains NA")
pred[is.na(pred)] <- 0
}
subclasses <- colnames(obs)
if (is.null(scheme)) {
if (is.null(mk)) {
scheme <- hue_pal(h = c(0, 300))(ncol(obs))
} else {
if (!inherits(mk, "cellMarkers")) stop("mk is not a cellMarkers object")
scheme <- material_colours(mk)
}
}
dat <- data.frame(obs = as.vector(obs), pred = as.vector(pred),
subclass = factor(rep(subclasses, each = nrow(obs))))
ccc <- ccc(dat$obs, dat$pred) |> format(digits = 3)
rsq <- format(Rsq(obs, pred), digits = 3)
if (nchar(main) > 0) main <- paste0(main, " ")
title <- bquote(.(main) * CCC == .(ccc) * "," ~ R^2 == .(rsq))
p <- ggplot(dat, aes(x = .data$obs, y = .data$pred, color = .data$subclass,
fill = .data$subclass)) +
geom_point(size = 1, alpha = 0.8) +
scale_colour_manual(values = scheme) +
scale_fill_manual(values = scheme) +
geom_abline(slope = 1, intercept = 0) +
labs(xlab = "Observed", ylab = "Predicted", subtitle = title) +
theme_classic() +
theme(axis.text = element_text(colour = "black"),
axis.ticks = element_line(colour = "black"),
legend.position = "none")
if (!is.null(ellipse)) {
if (is.character(ellipse)) {
subdat <- dat[dat$subclass %in% ellipse, ]
} else {
mset <- metric_set(obs, pred)
o <- order(mset[, "Rsq"])[seq_len(ellipse)]
subdat <- dat[dat$subclass %in% colnames(obs)[o], ]
}
if (!requireNamespace("ggforce", quietly = TRUE))
stop("'ggforce' package is not installed", call. = FALSE)
p <- p +
ggforce::geom_mark_ellipse(data = subdat,
aes(x = .data$obs, y = .data$pred,
color = .data$subclass, fill = .data$subclass,
label = if (labels) .data$subclass else NULL),
label.fontface = "plain",
label.margin = margin(1, 1, 1, 1, "mm"),
label.fontsize = 10,
label.fill = alpha("white", 0.6),
label.buffer = unit(5, "mm"),
con.cap = 0,
con.type = "straight",
con.border = "none",
expand = 0, linewidth = unit(0.3, "pt"))
}
p
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.