#' Null distribution for standardized module scores.
#'
#' Generates a null distribution of standardized module scores by scoring randomly sampled gene sets of varying sizes, then fits null models relating score mean and variance to gene set size. The output is used by `mikoScore` to correct module scores for gene set size.
#'
#' @param object Seurat Object
#' @param assay Name of assay to use
#' @param n.replicate Number of replicates
#' @param min.gs.size minimum gene set size
#' @param max.gs.size maximum gene set size
#' @param step.size step size; increment of sequence between `min.gs.size` and `max.gs.size`.
#' @param nworkers number of workers used for parallel implementation. Default is 1.
#' @param subsample.n Numeric [1,ncol(object)]. Number of cells to subsample. If specified and smaller than the number of cells in `object`, the object is downsampled to `subsample.n` cells prior to scoring.
#' @param nbin Number of bins of aggregate expression levels for all analyzed features. See `AddModuleScore` for details.
#' @param verbose Print progress. Default is TRUE.
#' @name nullScore
#' @author Nicholas Mikolajewicz
#' @concept miko_score
#' @return list of results that are used as input into `mikoScore`.
#' @seealso \code{\link{AddSModuleScore}} for standardized module scoring, \code{\link{mikoScore}} for miko scoring, \code{\link{sigScore}} for derivation of p values for miko scores.
#' @examples
#' maxgs <- max(unlist(lapply(gene.list, length)))
#' if (maxgs > 200) maxgs <- 200
#' stepsize <- round(maxgs / 15)
#' if (stepsize < 1) stepsize <- 1
#' if (stepsize > 20) stepsize <- 10
#'
#' # Generating null distributions for genesets
#' bm.null <- nullScore(object = object, assay = "RNA", n.replicate = 10, nbin = 24,
#'                      min.gs.size = 2, max.gs.size = maxgs + 5, step.size = stepsize,
#'                      nworkers = 20, verbose = T, subsample.n = 5000)
#'
#' # variance.raw.plot <- bm.null$variance.raw.plot
#' # variance.mean.plot <- bm.null$variance.mean.plot
#' # plt.null.model <- bm.null$mean.plot
#' # plt.corrected.plot <- bm.null$corrected.plot
#'
#' # Compute miko scores for genesets
#' object <- mikoScore(object = object, geneset = gene.list, nbin = 24,
#' nullscore = bm.null, assay = "RNA", nworkers = 20)
#'
#' # Get significant miko scores for scored genesets
#' score.result <- sigScore(object = object, geneset = gene.list, reduction = "umap")
#'
nullScore <- function(object, assay = DefaultAssay(object), n.replicate = 25,
min.gs.size = 2, max.gs.size = 100, step.size = 2, nworkers = 1, subsample.n = NULL, nbin = 24, verbose = T){
require(foreach)
if (!is.numeric(min.gs.size)) min.gs.size <- 2
if (min.gs.size < 0) min.gs.size <- 2
if (!is.numeric(max.gs.size)) max.gs.size <- 100
if (max.gs.size < min.gs.size) max.gs.size <- min.gs.size + 10
if (!is.numeric(step.size)) step.size <- 2
if (!is.numeric(nbin)) nbin <- 24
stopifnot("Seurat" %in% class(object))
if (!is.null(subsample.n)){
if (subsample.n < ncol(object)){
set.seed(1023)
object <- downsampleSeurat(object = object, subsample.n = subsample.n, verbose = verbose)
}
}
# ncores <- nworkers
nbin <- optimalBinSize(object = object, nbin = nbin)
miko_message("Generating random gene sets...", verbose = verbose)
# gene-set sizes to sample: always cover small sets (3-20), then step from min.gs.size to max.gs.size by step.size
base.seq <- unique(c(3:10, seq(10, 20, by = 2), seq(min.gs.size, max.gs.size, by = step.size)))
n.rep <- n.replicate
rsize <- rep(base.seq, n.rep)
rgene.list <- list()
for (j in 1:length(rsize)){
rgene.list[[j]] <- sample(rownames(object), rsize[j])
}
names(rgene.list) <- paste0("gs", seq(1, length(rgene.list)))
miko_message("Scoring gene modules...", verbose = verbose)
if (nworkers > length(rgene.list)) nworkers <- length(rgene.list)
object <- AddSModuleScore(
object = object,
features = rgene.list,
pool = NULL,
nbin = nbin,
ctrl = 100,
k = FALSE,
nworkers = nworkers,
assay = assay,
name = "MS",
seed = 1,
search = F
)
# df.ms <- object@meta.data[ ,grepl("MS", colnames(object@meta.data) )]
df.ms <- object@meta.data[ ,paste0("MS", names(rgene.list) )]
if (is.numeric(df.ms)){
df.ms <- as.data.frame(df.ms)
rownames(df.ms) <- colnames(object)
}
colnames(df.ms) <- names(rgene.list)
scores <- as.matrix(df.ms)
# }
# compute null model
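# variance of the null scores is modeled as a function of gene-set size with a Gamma GLM
# (inverse-Gaussian fallback); the mean is modeled with a cubic B-spline GLM
# (log-log Gaussian fallback). Predictions from these models are later used by
# mikoScore() to center/scale scores for arbitrary gene-set sizes.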
miko_message("Fitting null models...", verbose = verbose)
suppressMessages({
suppressWarnings({
df.var <- data.frame(size = rsize,
x.var = apply(scores, 2, var, na.rm = T),
x.mean = apply(scores, 2, mean, na.rm = T),
x.median = apply(scores, 2, median, na.rm = T)
)
})
})
df.var <- df.var %>%
dplyr::group_by(size) %>%
dplyr::mutate(y.mean = median((x.var), na.rm = T),
y.sd = mad((x.var), na.rm = T) )
df.var.sum <- df.var %>%
dplyr::group_by(size) %>%
dplyr::summarize(x.var = median(x.var),
x.mean = median(x.mean))
miko_message("Generating summary plots...", verbose = verbose)
variance.model <- tryCatch({
glm(formula = x.var ~ size, family = Gamma, data = df.var.sum )
}, error = function(e){
variance.model <- glm(formula = x.var ~ size, family = inverse.gaussian, data = df.var.sum )
return(variance.model)
})
# df.var$sd.pred <- sqrt(variance.model[["fitted.values"]])
df.var$sd.pred <-sqrt(predict.glm(object = variance.model, newdata = df.var, type="response") )
df.var.sum$sd.pred <-sqrt(predict.glm(object = variance.model, newdata = df.var.sum, type="response") )
variance.raw.plot <- df.var %>%
dplyr::arrange(size) %>%
ggplot(aes(x = size, y = (x.var))) +
# scattermore::geom_scattermore() +
geom_point(pch = 21, color = "white", size = 2, fill = "black") +
geom_line(aes(y = (sd.pred^2)), color = "red", size = 1) +
labs(x = "Geneset Size", y = "Score Variance", title = "Null variance model") +
theme_miko(grid = T)
variance.mean.plot <- df.var.sum %>%
dplyr::arrange(size) %>%
ggplot(aes(x = size, y = (x.var))) +
geom_point(pch = 21, color = "white", size = 2, fill = "black") +
# scattermore::geom_scattermore() +
geom_line(aes(y = (sd.pred^2)), color = "red", size = 1) +
labs(x = "Geneset Size", y = "Score Variance", title = "Null variance model") +
theme_miko(grid = T)
model.offset <- 1 + abs(min(df.var.sum$x.mean))
mean.model <- tryCatch({
library(splines)
list(fit = glm(formula = (x.mean ) ~ bs(size, degree = 3), family = gaussian, data = df.var.sum ),
model = "spline")
}, error = function(e){
# glm(formula = log10(x.mean + model.offset) ~ log10(size), family = inverse.gaussian, data = df.var.sum )
fit.list <- list(fit = glm(formula = log10(x.mean + model.offset) ~ log10(size), family = gaussian, data = df.var.sum ),
model = "gaussian")
return(fit.list)
})
if (mean.model$model == "gaussian"){
df.var$mean.pred <- ( 10^predict.glm(object = mean.model$fit, newdata =df.var, type="response")) - model.offset
df.var.sum$mean.pred <-( 10^predict.glm(object = mean.model$fit, newdata =df.var.sum, type="response")) - model.offset
linear.model <- list(
fit = mean.model,
model.offset = model.offset
)
} else if (mean.model$model == "spline") {
df.var$mean.pred <- predict.glm(object = mean.model$fit, newdata =df.var, type="response")
linear.model <- list(
fit = mean.model,
model.offset = 0
)
}
df.var$pi.pred.high <- df.var$mean.pred + 2*(df.var$sd.pred)
df.var$pi.pred.lo <- df.var$mean.pred-2*(df.var$sd.pred)
df.raw.score <- as.data.frame(t(scores))
score.name <- colnames(df.raw.score)
df.raw.score$size = rsize
df.raw.score.long <- pivot_longer(data = df.raw.score, cols = score.name)
yscale <- c(min(df.var$pi.pred.lo, na.rm = T), max(df.var$pi.pred.high, na.rm = T))
if (yscale[1] <=0){
yscale[1] <- yscale[1]*1.1
} else {
yscale[1] <- yscale[1]*0.9
}
if (yscale[2] <=0){
yscale[2] <- yscale[2]*0.9
} else {
yscale[2] <- yscale[2]*1.1
}
plt.mean <- df.var %>%
ggplot(aes(x = size, y = (x.mean))) +
scattermore::geom_scattermore(data = df.raw.score.long, aes(x = size, y = value), color = "grey") +
geom_point(pch = 21, color = "white", size = 2, fill = "black") +
geom_line(aes(y = mean.pred), color = "red", size = 1) +
geom_line(aes(y = pi.pred.high), color = "red", size = 1) +
geom_line(aes(y = pi.pred.lo), color = "red", size = 1) +
labs(x = "Geneset Size", y = "Score Mean", title = "Null model with 95% CI",
subtitle = paste0("Empirical False Positive Rate = ", 100*signif(1- sum(df.var$pi.pred.high > df.var$x.mean) / nrow(df.var), 3), "%")) +
theme_miko(grid = T) +
coord_cartesian(ylim = yscale)
plt.null.correction <- df.var %>%
  ggplot(aes(x = size, y = (x.mean - mean.pred)/sd.pred, fill = (x.mean - mean.pred)/sd.pred > 2)) +
  # scattermore::geom_scattermore() +
  geom_point(pch = 21, color = "white", size = 2) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2) +
  scale_fill_manual(values = c("TRUE" = "tomato", "FALSE" = "grey")) +
  labs(fill = "z > 2", x = "Geneset Size", y = "Corrected Score") + theme_miko(grid = T)
list(
variance.model = variance.model,
variance.raw.plot = variance.raw.plot,
variance.mean.plot = variance.mean.plot,
mean.model = mean.model,
mean.plot = plt.mean,
corrected.plot = plt.null.correction,
null.scores = scores,
geneset = rgene.list
)
}
#' Calculate standardized module scores for feature expression programs in single cells.
#'
#' Calculates the standardized expression level of each program at the single-cell level by subtracting the aggregated expression of control feature sets (as in `AddModuleScore`) and then scaling the difference by the pooled variance of the gene set of interest and the control features. Like `AddModuleScore`, all analyzed features are binned based on average expression, and control features are randomly selected from each bin.
#'
#' @param object Seurat Object
#' @param features A list of vectors of features for expression programs; each entry should be a vector of feature names
#' @param pool List of features to check expression levels against, defaults to `rownames(x = object)`
#' @param nbin Number of bins of aggregate expression levels for all analyzed features
#' @param ctrl Number of control features selected from the same bin per analyzed feature
#' @param nworkers Number of workers for parallel implementation. Default is 1.
#' @param scale scale score by pooled variance. Default is T.
#' @param k Use feature clusters returned from DoKMeans
#' @param assay Name of assay to use
#' @param name Name for the expression programs; will append a number to the end for each entry in features (e.g., if `features` has three programs, the results will be stored as name1, name2, name3, respectively)
#' @param seed Set a random seed. If NULL, seed is not set.
#' @param search Search for symbol synonyms for features in features that don't match features in object? Searches the HGNC's gene names database; see UpdateSymbolList for more details
#' @param ... Extra parameters passed to UpdateSymbolList
#' @name AddSModuleScore
#' @concept miko_score
#' @author Nicholas Mikolajewicz
#' @return Returns Seurat object with standardized module scores added to the object meta data; each module is stored as name# for each module program present in `features`. Raw (unstandardized) scores are stored in `object@misc[["raw_score"]]`.
#' @seealso \code{\link{AddModuleScore}} for original module scoring function implemented in `Seurat`.
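#' @examples
#'
#' # Minimal usage sketch (assumes `so.query` is a normalized Seurat object and
#' # `gene.list` is a named list of feature vectors):
#' so.query <- AddSModuleScore(object = so.query, features = gene.list,
#'                             nbin = 24, ctrl = 100, name = "MS")
#'
#' # standardized scores are appended to the meta data (one column per gene set)
#' head(so.query@meta.data[, paste0("MS", make.names(names(gene.list)))])
#'
#' # raw (unstandardized) scores are stored in the misc slot
#' head(so.query@misc[["raw_score"]])
#'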
AddSModuleScore <- function (object, features, pool = NULL, nbin = NULL, ctrl = 100, nworkers = 1, scale = T,
k = FALSE, assay = NULL, name = "Cluster", seed = 1, search = FALSE, ...){
require(parallel)
require(foreach)
ncore <- nworkers
if (is.null(names(features))) names(features) <- paste0("gs", 1:length(features))
names(features) <- make.names(names(features))
if (is.null(nbin)) nbin <- optimalBinSize(object)
if (!is.null(x = seed)) {
set.seed(seed = seed)
}
assay.old <- DefaultAssay(object = object)
assay <- assay %||% assay.old
DefaultAssay(object = object) <- assay
assay.data <- GetAssayData(object = object)
features.old <- features
if (k) {
.NotYetUsed(arg = "k")
features <- list()
for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster ==
i))
}
cluster.length <- length(x = features)
} else {
if (is.null(x = features)) {
stop("Missing input feature list")
}
features <- lapply(X = features, FUN = function(x) {
missing.features <- setdiff(x = x, y = rownames(x = object))
if (length(x = missing.features) > 0) {
if (search) {
tryCatch(expr = {
updated.features <- UpdateSymbolList(symbols = missing.features,
...)
names(x = updated.features) <- missing.features
for (miss in names(x = updated.features)) {
index <- which(x == miss)
x[index] <- updated.features[miss]
}
}, error = function(...) {
warning("Could not reach HGNC's gene names database",
call. = FALSE, immediate. = TRUE)
})
missing.features <- setdiff(x = x, y = rownames(x = object))
}
}
return(intersect(x = x, y = rownames(x = object)))
})
cluster.length <- length(x = features)
}
LengthCheck <- function(values, cutoff = 0) {
return(vapply(
X = values,
FUN = function(x) {
return(length(x = x) > cutoff)
},
FUN.VALUE = logical(1)
))
}
if (!all(LengthCheck(values = features))) {
features <- lapply(X = features.old, FUN = CaseMatch,
match = rownames(x = object))
}
if (!all(LengthCheck(values = features))) {
stop(paste("The following feature lists do not have enough features present in the object:",
paste(names(x = which(x = !LengthCheck(values = features)))),
"exiting..."))
}
pool <- pool %||% rownames(x = object)
data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
data.avg <- data.avg[order(data.avg)]
data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30,
n = nbin, labels = FALSE, right = FALSE)
names(x = data.cut) <- names(x = data.avg)
ctrl.use <- vector(mode = "list", length = cluster.length)
miko_message("Preparing control genesets...", ...)
ctrl.use2 <- pbapply::pblapply(seq_along(features), function(i){
features.use <- features[[i]]
for (j in 1:length(x = features.use)) {
which.ind <- which(x = data.cut == data.cut[features.use[j]])
if (length(which.ind) < ctrl){
ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which.ind], size = ctrl, replace = T)))
} else {
ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which.ind], size = ctrl, replace = FALSE)))
}
}
return(ctrl.use[[i]])
})
ctrl.use <- ctrl.use2; rm(ctrl.use2)
ctrl.use <- lapply(X = ctrl.use, FUN = unique)
# start cluster
if (ncore > detectCores()){
n.work.score <- detectCores()
} else {
n.work.score <- ncore
}
cl <- parallel::makeCluster(n.work.score)
doParallel::registerDoParallel(cl)
# miko_message("Scoring gene sets...", ...)
# iterate through each gene signature list
score.results <- foreach(i = 1:cluster.length, .packages = c("Matrix")) %dopar% {
czscore <- fzscore <- NULL
features.use <- ctrl.use[[i]]
ad <- assay.data[features.use, ]
cscore <- Matrix::colMeans(x = ad)
if (scale) czscore <- sparseMatrixStats::colVars(x = ad)
features.use <- features[[i]]
data.use <- assay.data[features.use, , drop = FALSE]
fscore <- Matrix::colMeans(x = data.use)
if (scale) fzscore <- sparseMatrixStats::colVars(x = data.use)
return(
list(
cscore = cscore,
czscore = czscore,
fscore = fscore,
fzscore = fzscore
)
)
}
# stop workers
parallel::stopCluster(cl)
ctrl.scores <- ((do.call(rbind, pbapply::pblapply(score.results, function(x){ x[["cscore"]]} ))))
features.scores <- ((do.call(rbind, pbapply::pblapply(score.results, function(x){ x[["fscore"]]} ))))
if (scale){
ctrl.var <- ((do.call(rbind, pbapply::pblapply(score.results, function(x){ x[["czscore"]]} ))))
features.var <- ((do.call(rbind, pbapply::pblapply(score.results, function(x){ x[["fzscore"]]} ))))
# features.scores.use <- (features.scores - ctrl.scores)
features.var[features.var == 0] <- median(ctrl.var[ctrl.var != 0])
ctrl.var[ctrl.var == 0] <- median(ctrl.var[ctrl.var != 0])
}
# raw score
raw.score <- features.scores - ctrl.scores
# rownames(x = raw.score) <- paste0(name, 1:cluster.length)
rownames(x = raw.score) <- paste0("raw_", names(features))
raw.score <- as.data.frame(x = t(x = raw.score))
rownames(x = raw.score) <- colnames(x = object)
object@misc[["raw_score"]] <- raw.score
# standardized score
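# standardized score = (gene-set mean - control mean) / sqrt(var(control) + var(gene set)),
# computed per cell; without scaling it reduces to the AddModuleScore-style difference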
if (scale){
features.scores.use <- (features.scores - ctrl.scores)/sqrt(ctrl.var + features.var) #+ features.var
} else {
features.scores.use <- (features.scores - ctrl.scores)
}
# rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
rownames(x = features.scores.use) <- paste0(name, names(features))
features.scores.use <- as.data.frame(x = t(x = features.scores.use))
rownames(x = features.scores.use) <- colnames(x = object)
object[[colnames(x = features.scores.use)]] <- features.scores.use
CheckGC()
DefaultAssay(object = object) <- assay.old
return(object)
}
#' Calculate Miko module scores for feature expression programs
#'
#' Feature expression programs are scored by first computing standardized module scores with `AddSModuleScore` and then scaling the scores by the null distributions calculated using `nullScore`. The resulting scores are robust to gene set size and can be further used to determine whether a feature expression program is significantly up-regulated in a single-cell population.
#'
#' @param object Seurat Object
#' @param geneset A list of vectors of features for expression programs; each entry should be a vector of feature names
#' @param nullscore `nullScore` output for provided `object`. Must run `nullScore` prior to running `mikoScore`.
#' @param do.center center scores using null model predictions. Default is FALSE.
#' @param do.scale scale scores by null model variance predictions. Default is TRUE.
#' @param assay Name of assay to use.
#' @param nworkers Number of workers for parallel implementation. Default is 1.
#' @param nbin Number of bins of aggregate expression levels for all analyzed features. See `AddModuleScore` for details.
#' @param verbose Print progress. Default is TRUE.
#' @name mikoScore
#' @author Nicholas Mikolajewicz
#' @concept miko_score
#' @return Seurat object with Miko (cluster-level) scores appended to the meta data (one `cluster_` column per gene set); raw module scores are stored in `object@misc[["raw_score"]]`.
#' @seealso \code{\link{AddSModuleScore}} for standardized module scoring, \code{\link{nullScore}} for calculating null score distributions, \code{\link{sigScore}} for derivation of p values for miko scores.
#' @examples
#'
#' maxgs <- max(unlist(lapply(gene.list, length)))
#' if (maxgs > 200) maxgs <- 200
#' stepsize <- round(maxgs / 15)
#' if (stepsize < 1) stepsize <- 1
#' if (stepsize > 20) stepsize <- 10
#'
#' # Generating null distributions for genesets
#' bm.null <- nullScore(object = object, assay = "RNA", n.replicate = 10, nbin = 24,
#'                      min.gs.size = 2, max.gs.size = maxgs + 5, step.size = stepsize,
#'                      nworkers = 20, verbose = T, subsample.n = 5000)
#'
#' # variance.raw.plot <- bm.null$variance.raw.plot
#' # variance.mean.plot <- bm.null$variance.mean.plot
#' # plt.null.model <- bm.null$mean.plot
#' # plt.corrected.plot <- bm.null$corrected.plot
#'
#' # Compute miko scores for genesets
#' object <- mikoScore(object = object, geneset = gene.list, nbin = 24,
#' nullscore = bm.null, assay = "RNA", nworkers = 20)
#'
#' # Get significant miko scores for scored genesets
#' score.result <- sigScore(object = object, geneset = gene.list, reduction = "umap")
#'
mikoScore <- function(object, geneset, nullscore, do.center = F, do.scale = T, assay = DefaultAssay(object), nworkers = 1, nbin = 24 , verbose = T){
# ncores <- nworkers
nbin <- optimalBinSize(object = object, nbin = nbin)
miko_message("Running cell scoring...", verbose = verbose)
nonrelevant.field <- colnames(object@meta.data)[grepl("cell|cluster", colnames(object@meta.data))]
names(geneset) <- make.names(names(geneset))
if (nworkers > length(geneset)) nworkers <- length(geneset)
object <- AddSModuleScore(
object = object,
features = geneset,
pool = NULL,
nbin = nbin,
ctrl = 100,
k = FALSE,
nworkers = nworkers,
assay = assay,
name = "cell_",
seed = 1,
search = F
)
# raw scores
df.raw <- object@misc[["raw_score"]]
if (is.numeric(df.raw)){
  df.raw <- as.data.frame(df.raw)
  rownames(df.raw) <- colnames(object)
}
colnames(df.raw) <- names(geneset)
# raw.scores <- as.matrix(df.raw)
object@misc[["raw_score"]] <-df.raw
# cell scores
df.ms <- object@meta.data[ ,grepl("cell", colnames(object@meta.data) )]
av.field <- colnames(df.ms)
nonrelevant.field2 <- nonrelevant.field[nonrelevant.field %in% av.field]
try({df.ms <- df.ms %>% dplyr::select(-nonrelevant.field2)}, silent = T)
if (is.numeric(df.ms)){
df.ms <- as.data.frame(df.ms)
rownames(df.ms) <- colnames(object)
}
colnames(df.ms) <- gsub("cell_", "", colnames(df.ms))
gs.scores <- as.matrix(df.ms)
# clusters scores
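# the null models fit by nullScore() predict the expected mean and variance of a
# random gene set of the same size; scores are optionally centered by the predicted
# mean and scaled by the predicted standard deviation to yield size-robust miko scores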
miko_message("Running cluster scoring...", verbose = verbose)
df.gs.size <- data.frame(size = unlist(lapply(geneset, length)))
if (nullscore$mean.model$model == "spline"){
var.pred <- predict.glm(object = nullscore$variance.model, newdata = df.gs.size, type="response")
mean.pred <- predict.glm(object = nullscore$mean.model$fit, newdata = df.gs.size, type="response")
gs.score.correct <- gs.scores
if (do.center) gs.score.correct <- (gs.score.correct - mean.pred)
if (do.scale) gs.score.correct <- gs.score.correct/sqrt(var.pred)
} else if (nullscore$mean.model$model == "gaussian"){
var.pred <- predict.glm(object = nullscore$variance.model, newdata = df.gs.size, type="response")
mean.pred <- (10^(predict.glm(object = nullscore$mean.model$fit, newdata = df.gs.size, type="response"))) - nullscore$mean.model$model.offset
# gs.score.correct <- (gs.scores - mean.pred)/sqrt(var.pred)
gs.score.correct <- gs.scores
if (do.center) gs.score.correct <- (gs.score.correct - mean.pred)
if (do.scale) gs.score.correct <- gs.score.correct/sqrt(var.pred)
}
gs.score.correct <- as.data.frame(gs.score.correct)
colnames(x = gs.score.correct) <- paste0("cluster_", colnames(x = gs.score.correct))
object[[colnames(x = gs.score.correct)]] <- gs.score.correct
miko_message("Complete!", verbose = verbose)
return( object)
}
#' Benchmark Miko scoring pipeline using cluster-specific markers that have been variably contaminated with random genes.
#'
#' Differentially-expressed gene sets for each cluster are derived and variably contaminated with random genes prior to calculating cluster-specific miko scores for each gene set. The performance of miko scoring is then evaluated on cluster-specific and non-specific gene sets.
#'
#' @param object Seurat Object
#' @param geneset.size size of differentially-expressed gene sets
#' @param group_by Name of grouping variable in `object` meta feature. Default is "seurat_clusters".
#' @param assay Name of assay to use
#' @param deg.logFC.threshold logFC threshold for differential-expression analysis. Default is 0.5.
#' @param deg.fdr.threshold FDR threshold for differential-expression analysis. Default is 0.05.
#' @param miko.fdr.threshold FDR threshold for Miko scores. Default is 0.05.
#' @param nworkers Number of workers for parallel implementation. Default is 1.
#' @param verbose Print progress. Default is TRUE.
#' @name benchmarkScores
#' @author Nicholas Mikolajewicz
#' @concept miko_score
#' @return list of benchmark results.
#' @seealso \code{\link{AddSModuleScore}} for standardized module scoring, \code{\link{wilcoxauc}} for differential expression analysis
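#' @examples
#'
#' # Benchmarking sketch (assumes `so.query` is a clustered Seurat object with
#' # "seurat_clusters" in its meta data):
#' bm <- benchmarkScores(object = so.query, geneset.size = 15,
#'                       group_by = "seurat_clusters", assay = "RNA",
#'                       nworkers = 4, verbose = T)
#'
#' # fraction of significant gene sets vs. contamination, split by cluster specificity
#' # print(bm$benchmark_plot)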
benchmarkScores <- function(object, geneset.size = 15, group_by = "seurat_clusters", assay = DefaultAssay(object), deg.logFC.threshold = 0.5, deg.fdr.threshold = 0.05, miko.fdr.threshold = 0.05, verbose = T, nworkers = 1){
miko_message("Performing differential expression analysis...", verbose = verbose)
require(presto)
df.prest <- presto::wilcoxauc(X = object, group_by = group_by, assay = "data", seurat_assay = assay)
gs.size <- geneset.size
df.presto.sig <- df.prest %>% dplyr::filter(logFC > deg.logFC.threshold, padj < deg.fdr.threshold)
u.group <- as.numeric(unique(df.presto.sig$group))
u.group <- u.group[order(u.group)]
miko_message("Generating benchmark genesets...", verbose = verbose)
contam.vector <-unique( round(seq(0, gs.size, by = gs.size / 10)))
all.gene <- rownames(object)
gene.list <- list()
for (i in 1:length(u.group)){
df.presto.sig.cur <- df.presto.sig %>% dplyr::filter(group == u.group[i])
if (nrow(df.presto.sig.cur) < gs.size) next
nonsig.gene <- all.gene[!c(all.gene %in% df.presto.sig.cur$feature)]
for (j in 1:length(contam.vector)){
n.sig <- gs.size - contam.vector[j]
n.rand <- contam.vector[j]
gene.list[[paste0("c", u.group[i], "_", contam.vector[j], "contam")]] <- c(sample(df.presto.sig.cur$feature, n.sig),
sample(nonsig.gene, n.rand))
}
}
maxgs <- gs.size
if (maxgs > 200) maxgs <- 200
stepsize <- round(maxgs / 15)
if (stepsize < 1) stepsize <- 1
if (stepsize > 20) stepsize <- 10
miko_message("Generating null distributions for benchmarking genesets...", verbose = verbose)
bm.null <- nullScore(object = object, assay = assay, n.replicate = 10, nbin = 24,
min.gs.size = 2, max.gs.size = gs.size + 5, step.size = stepsize,
nworkers =nworkers, verbose = T, subsample.n = 5000)
miko_message("Computing miko scores for benchmarking genesets...", verbose = verbose)
object <- mikoScore(object = object, geneset = gene.list, nbin = 24,
nullscore = bm.null, assay = assay, nworkers = nworkers)
miko_message("Getting significant miko scores for benchmarking genesets...", verbose = verbose)
score.result <- sigScore(object = object, geneset = gene.list, reduction = "umap")
miko_message("Computing coherence scores for benchmarking genesets...", verbose = verbose)
raw.mat <- object@misc[["raw_score"]]
colnames(raw.mat) <- gsub("raw_", "", colnames(raw.mat))
df.cscore <- coherentFraction(object = object, score.matrix =raw.mat, nworkers = nworkers,
genelist = gene.list, assay = assay, slot = "data", subsample.cluster.n = 500)
df.score <- score.result$cluster_stats
df.score$gs <- df.score$name
df.merge <- merge(df.cscore, df.score, by = c("gs", "cluster"))
df.val <- df.merge
df.val$cell.type <- df.val$gs
df.val$target.cluster <- gsub("c", "", stringr::str_extract(df.val$cell.type, "c[0-9]*"))
df.val$contam <- as.numeric(gsub("contam", "", gsub("_", "", stringr::str_remove(df.val$cell.type, "c[0-9]*"))))/gs.size
df.val$is.target <- gsub("c", "", df.val$cluster) == df.val$target.cluster
# overall summary plots #####################################################
df.val.sum <- df.val %>%
dplyr::group_by(is.target, contam) %>%
dplyr::summarize(
sig.enrich = sum(fdr < miko.fdr.threshold & miko_score > 0)/length(fdr),
sig.enrich.80frac = sum( (fdr < miko.fdr.threshold) & (miko_score > 0) & (coherence_fraction >= 0.8)) / length(fdr),
sig.enrich.90frac = sum( (fdr < miko.fdr.threshold) & (miko_score > 0) & (coherence_fraction >= 0.9)) / length(fdr),
sig.enrich.100frac = sum( (fdr < miko.fdr.threshold) & (miko_score > 0) & (coherence_fraction >= 1)) / length(fdr),
.groups = 'drop'
)
df.val.sum.long <- pivot_longer(df.val.sum, cols = c("sig.enrich", "sig.enrich.80frac", "sig.enrich.90frac",
"sig.enrich.100frac"))
df.val.sum.long$is.target2 <- "cluster non-specific"
df.val.sum.long$is.target2[df.val.sum.long$is.target] <- "cluster-specific"
plt.ce.performance <- df.val.sum.long %>%
ggplot(aes(x = contam*100, y = value, color = name)) +
geom_point() +
coord_cartesian(ylim = c(0,1)) +
geom_line() +
facet_wrap(~is.target2) +
theme_miko(legend = T, grid = T, color.palette = "ptol") +
labs(y = "Fraction of genesets that are significant", x = "Geneset contamination (%)", color = "scoring")
return(
list(
benchmark_data = df.val,
benchmark_plot = plt.ce.performance,
benchmark_statistics= df.val.sum.long,
benchmark_genesets = gene.list
)
)
}
#' Examine cluster-specific scoring performance for cluster-specific gene sets that have been variably contaminated with random genes.
#'
#' Examine cluster-specific scoring performance for cluster-specific gene sets that have been variably contaminated with random genes.
#'
#' @param object Seurat Object
#' @param benchmark_data `benchmark_data` returned in list from `benchmarkScores` function. Must run `benchmarkScores` prior to `benchmarkCluster`.
#' @param benchmark_genesets `benchmark_genesets` returned in list from `benchmarkScores` function. Must run `benchmarkScores` prior to `benchmarkCluster`.
#' @param which_cluster Name of group within grouping variable used to run `benchmarkScores`. E.g., '3' if `benchmarkScores(... , group_by = "seurat_clusters")`
#' @param miko.fdr.threshold FDR threshold for Miko scores. Default is 0.05.
#' @param coherence.fraction.threshold Coherence fraction threshold. Default is 0.
#' @name benchmarkCluster
#' @author Nicholas Mikolajewicz
#' @return list of benchmark results for specified cluster.
#' @seealso \code{\link{benchmarkScores}} for miko scoring benchmarking
#' @examples
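#'
#' # Cluster-level benchmarking sketch (assumes `bm` is the output of
#' # benchmarkScores() run on `so.query`, and that cluster "3" exists in the
#' # grouping variable):
#' bc <- benchmarkCluster(object = so.query,
#'                        benchmark_data = bm$benchmark_data,
#'                        benchmark_genesets = bm$benchmark_genesets,
#'                        which_cluster = "3")
#' # print(bc$benchmark_cluster_plot)
#'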
benchmarkCluster <- function(object, benchmark_data, benchmark_genesets, which_cluster, miko.fdr.threshold = 0.05, coherence.fraction.threshold = 0){
benchmark_data.cluster <- benchmark_data %>% dplyr::filter(cluster == which_cluster, is.target)
val.gs <- benchmark_genesets[benchmark_data.cluster$cell.type]
df.dot.all <- NULL
for (i in 1:length(val.gs)){
df.dot <- DotPlot(object = object, features = val.gs[[i]])[["data"]]
df.dot <- df.dot %>% dplyr::filter(id == which_cluster)
df.dot$gs = names(val.gs)[i]
df.dot.all <- bind_rows(df.dot.all, df.dot)
}
df.dot.all$target.cluster <- gsub("c", "", stringr::str_extract(df.dot.all$gs, "c[0-9]*"))
df.dot.all$contam <- as.numeric(gsub("contam", "", gsub("_", "", stringr::str_remove(df.dot.all$gs, "c[0-9]*"))))/unique(unlist(lapply(val.gs, length)))
df.dot.all <- df.dot.all %>%
dplyr::group_by(gs) %>%
dplyr::arrange(-avg.exp.scaled) %>%
dplyr::mutate(ind = seq(1, length(avg.exp.scaled)))
plt.dot <- df.dot.all %>%
ggplot(aes(x = ind, y = as.factor(round(contam*100)), color = avg.exp.scaled, size = pct.exp)) +
geom_point() +
scale_color_gradient2(high = scales::muted("red"), low = scales::muted("blue")) +
labs(x = "Gene Index", y = "Gene Set Contamination (%)", title = "Gene Set Expression Profiles", subtitle = "Same cellular population, varying gene set contamination", size = "Percent Expr.", color = "Scaled Expr.") +
theme_miko(legend = T, grid = T) +
theme(legend.position = "bottom")
plt.enrich <- benchmark_data.cluster %>%
ggplot(aes(x = miko_score, y = as.factor(round(contam*100)), fill = fdr < miko.fdr.threshold & miko_score > 0))+
scale_fill_manual(values = c("TRUE" = "tomato", "FALSE" = "grey")) +
theme_miko(legend = T, grid = T) +
geom_bar(stat = "identity") +
labs(x= "Miko Score", y = "Gene Set Contamination (%)", title = "Miko Score", fill = paste0("FDR<", miko.fdr.threshold)) +
theme(legend.position = "bottom")
plt.coh <- benchmark_data.cluster %>%
ggplot(aes(x = coherence_fraction, y = as.factor(round(contam*100)), fill =coherence_fraction > coherence.fraction.threshold))+
scale_fill_manual(values = c("TRUE" = "tomato", "FALSE" = "grey")) +
theme_miko(legend = T, grid = T) +
geom_bar(stat = "identity") +
geom_vline(xintercept = coherence.fraction.threshold, linetype = "dashed") +
labs(x= "Coherence Fraction", y = "Gene Set Contamination (%)", title = "Coherence Fraction", fill = paste0("coh.frac>", coherence.fraction.threshold)) +
theme(legend.position = "bottom")
plt.cluster.rep <- cowplot::plot_grid(plt.dot, plt.enrich, plt.coh, nrow = 1, align = "h")
return(
list(benchmark_cluster_plot = plt.cluster.rep,
benchmark_data_cluster = benchmark_data.cluster)
)
}
#' Calculate Miko score significance
#'
#' Miko scores are transformed to p values and corrected for multiple comparisons using the Benjamini-Hochberg method.
#'
#' @param object Seurat Object. Must contain miko scores (i.e., must run `mikoScore` prior to `sigScore`)
#' @param geneset gene set list used for miko scoring (i.e., those provided as input into `mikoScore`).
#' @param reduction Dimensionality reduction to add to cell-level statistics data.frame. Intended for downstream use (e.g., viewing miko scores in UMAP space).
#' @name sigScore
#' @author Nicholas Mikolajewicz
#' @return list of data frames containing cell- and cluster-level statistics. Also returns a plot comparing raw (unstandardized) module scores with Miko scores.
#' @seealso \code{\link{AddSModuleScore}} for standardized module scoring, \code{\link{mikoScore}} for miko scoring
#' @examples
#'
#' maxgs <- max(unlist(lapply(gene.list, length)))
#' if (maxgs > 200) maxgs <- 200
#' stepsize <- round(maxgs / 15)
#' if (stepsize < 1) stepsize <- 1
#' if (stepsize > 20) stepsize <- 10
#'
#' # Generating null distributions for genesets
#' bm.null <- nullScore(object = object, assay = "RNA", n.replicate = 10, nbin = 24,
#'                      min.gs.size = 2, max.gs.size = maxgs + 5, step.size = stepsize,
#'                      nworkers = 20, verbose = T, subsample.n = 5000)
#'
#' # variance.raw.plot <- bm.null$variance.raw.plot
#' # variance.mean.plot <- bm.null$variance.mean.plot
#' # plt.null.model <- bm.null$mean.plot
#' # plt.corrected.plot <- bm.null$corrected.plot
#'
#' # Compute miko scores for genesets
#' object <- mikoScore(object = object, geneset = gene.list, nbin = 24,
#' nullscore = bm.null, assay = "RNA", nworkers = 20)
#'
#' # Get significant miko scores for scored genesets
#' score.result <- sigScore(object = object, geneset = gene.list, reduction = "umap")
#'
sigScore <- function(object, geneset, reduction = "umap"){
names(geneset) <- make.names(names(geneset))
df.cs <- object@meta.data[ ,paste0("cluster_", names(geneset))]
df.rs <- object@misc[["raw_score"]]
set.names <- colnames(df.cs)
df.cs$cluster <- object@meta.data$seurat_clusters
try({
df.cs$reduction.x <- object@reductions[[reduction]]@cell.embeddings[,1]
df.cs$reduction.y <- object@reductions[[reduction]]@cell.embeddings[,2]
}, silent = T)
df.cs.sum <- df.cs %>% pivot_longer(cols = set.names)
df.cs.sum$name <- gsub("cluster_", "", df.cs.sum$name)
set.names <- colnames(df.rs)
df.rs$cluster <- object@meta.data$seurat_clusters
try({
df.rs$reduction.x <- object@reductions[[reduction]]@cell.embeddings[,1]
df.rs$reduction.y <- object@reductions[[reduction]]@cell.embeddings[,2]
}, silent = T)
df.rs.sum <- df.rs %>%
pivot_longer(cols = set.names)
df.cs.sum$value.rs <- df.rs.sum$value
df.cs.sum <- as.data.frame(df.cs.sum)
df.cs.sum2 <- df.cs.sum %>%
dplyr::group_by( name, cluster) %>%
dplyr::summarise(
raw_score = mean(value.rs, na.rm = T),
miko_score = mean(value, na.rm = T),
.groups = 'drop'
)
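# under the null model, cluster-averaged miko scores are treated as standard normal,
# so a two-sided p value is obtained from the normal tail probability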
df.cs.sum2$p <- 2*pnorm(q=abs(df.cs.sum2$miko_score), lower.tail=FALSE)
df.cs.sum2 <- df.cs.sum2 %>%
dplyr::group_by(cluster) %>%
dplyr::mutate(
fdr = p.adjust(p, method = "BH")
)
plt.score.comp <- df.cs.sum2 %>%
ggplot(aes(x = raw_score, y = miko_score, fill = fdr < 0.05 & miko_score > 0)) +
scale_fill_manual(values = c("TRUE" = "tomato", "FALSE" = "grey")) +
geom_point(pch = 21, color = "white", size = 2) +
# scattermore::geom_scattermore() +
theme_miko(legend = T, grid= T) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(title = "Raw vs. Miko Enrichment Scores", x = "Raw Score", y = "Miko Score",
subtitle = paste0(signif(sum(df.cs.sum2$fdr < 0.05 & df.cs.sum2$miko_score > 0)/nrow(df.cs.sum2), 3)*100, "% Significance Rate (FDR < 0.05)"),
fill = "FDR<0.05")
return(
list(
cluster_stats = df.cs.sum2,
cell_stats = df.cs.sum,
score_plot = plt.score.comp
)
)
}
#' Calculate coherent fraction for feature expression program.
#'
#' Calculate the fraction of genes that are positively correlated with the feature expression program score. Performed on a per-cluster basis ("seurat_clusters" in `object` meta data).
#'
#' @param object Seurat Object.
#' @param score.matrix matrix of feature expression program scores. Can be computed using `AddModuleScore`, `AddSModuleScore`, `mikoScore`, among others.
#' @param genelist gene set list used for obtaining `score.matrix` (e.g., gene set provided as input into `mikoScore` or `AddSModuleScore`).
#' @param method Correlation method: "pearson" or "spearman". Default is "pearson".
#' @param assay Name of assay to use.
#' @param slot Use expression data from this slot in `object`.
#' @param subsample.cluster.n number of cells to subsample within each cluster ("seurat_clusters" in `object` meta data). Default is 500.
#' @param nworkers Number of workers for parallel implementation. Default is 1.
#' @param verbose Print progress. Default is TRUE.
#' @name coherentFraction
#' @author Nicholas Mikolajewicz
#' @return data.frame containing cluster-level coherent fractions.
#' @seealso \code{\link{AddSModuleScore}} for standardized module scoring, \code{\link{mikoScore}} for miko scoring
#' @examples
#'
#' so.query <- AddSModuleScore(object = so.query, features = gene.list)
#'
#' raw.mat <- so.query@misc[["raw_score"]]
#' colnames(raw.mat) <- gsub("raw_", "", colnames(raw.mat))
#'
#' df.cscore <- coherentFraction(object = so.query, score.matrix = raw.mat, nworkers = 20,
#'                               genelist = gene.list, assay = DefaultAssay(so.query),
#'                               slot = "data", subsample.cluster.n = 500)
#'
#'
coherentFraction <- function(object, score.matrix, genelist, method = c("pearson", "spearman"), assay = DefaultAssay(object), slot = "data", subsample.cluster.n = 500, nworkers = 1, verbose = T){
method <- match.arg(method)
require(foreach)
set.seed(1023)
stopifnot("'object' is not a Seurat object" = ("Seurat" %in% class(object)))
names(genelist) <- make.names(names(genelist))
colnames(score.matrix) <- make.names(colnames(score.matrix))
all.genes <- unique(unlist(genelist))
expr.mat <- getExpressionMatrix(so = object, only.variable = F, which.data = slot, which.assay = assay)
expr.mat <- expr.mat[rownames(expr.mat) %in% all.genes, ]
expr.mat <- tryCatch({
t(expr.mat)
}, error = function(e){
return(t(as.matrix(expr.mat)))
})
if ("seurat_clusters" %in% colnames(object@meta.data)){
df.meta <- data.frame(cluster = object@meta.data$seurat_clusters)
} else {
object <- setResolution(object = object, resolution = 0.8)
df.meta <- data.frame(cluster = object@meta.data$seurat_clusters)
}
uclust <- as.numeric(as.character(unique(df.meta$cluster))); uclust <- uclust[order(uclust)]
# prerank
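# for Spearman correlation, rank-transform both matrices once up front so that the
# Pearson correlation computed on the ranks below is equivalent to Spearman's rho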
if (method == "spearman"){
miko_message("Preranking expression and score matrices...", verbose = verbose)
expr.mat.rank <- pbapply::pbapply(expr.mat, 2, rank)
rs.res.rank <- pbapply::pbapply(score.matrix, 2, rank) #, ties.method = "random", na.last = NA
} else if (method == "pearson"){
expr.mat.rank <- as.matrix(expr.mat)
rs.res.rank <- as.matrix(score.matrix)
}
df.cscore <- NULL
miko_message("Computing cluster-level correlations between gene expression and scores...", verbose = verbose)
# nworkers <- nworkers
if (length(uclust) < nworkers) nworkers <- length(uclust)
cl <- parallel::makeCluster(nworkers)
doParallel::registerDoParallel(cl)
# iterate through each gene signature list
coh.results <- foreach(i = 1:length(uclust)) %dopar% { #
# for (i in 1:length(uclust)){
# print(i)
which.cell <- which(df.meta$cluster == uclust[i])
if (subsample.cluster.n < length(which.cell)){
which.cell <- sample(which.cell, subsample.cluster.n)
}
if (length(which.cell) > 1){
expr.mat.cur <- expr.mat.rank[which.cell, ]
rs.res.cur <- rs.res.rank[which.cell, ]
if (all(class(expr.mat.cur) == class(rs.res.cur))){
cor.score <- cor(expr.mat.cur, rs.res.cur)
}
rownames(cor.score) <- colnames(expr.mat.cur)
colnames(cor.score) <- colnames(rs.res.cur)
cor.score[is.na(cor.score)] <- 0
cor.gs <- lapply(seq_along(genelist), function(j){
cor.score[rownames(cor.score) %in% genelist[[j]], names(genelist)[j]]
})
# suppressWarnings({
# suppressMessages({
# cwct <- lapply(cor.gs, function(x){ wilcox.test(x = x, mu = 0, alternative = "greater")[["p.value"]]})
cfrac <- lapply(cor.gs, function(x){ sum(x> 0, na.rm = T)/ length(x)})
cn <- lapply(cor.gs, length)
cmean <- lapply(cor.gs, median, na.rm = T)
cvar <- lapply(cor.gs, var, na.rm = T)
df.res <- data.frame(cluster = uclust[i],
gs = names(genelist),
coherence_fraction = unlist(cfrac),
r_mean = unlist(cmean),
r_variance = unlist(cvar),
# coh.sd = unlist(csd),
gene.n = unlist(cn),
cell.n = length(which.cell)
)
} else {
df.res <- data.frame(cluster = uclust[i],
gs = names(genelist),
coherence_fraction = 0,
r_mean = 0,
r_variance = 0,
gene.n = unlist( lapply(genelist, length)),
cell.n = length(which.cell)
)
}
return(df.res)
}
parallel::stopCluster(cl)
df.cscore <- bind_rows(coh.results)
return(df.cscore)
}
#' Word cloud visualization of cell-type annotations from the Miko scoring pipeline
#'
#' Given outputs from the Miko scoring pipeline, the top cell-type annotations are visualized using word clouds.
#'
#' @param object Seurat Object.
#' @param object.group name of object meta data field specifying cluster membership. Default is "seurat_clusters".
#' @param score vector of Miko scores
#' @param score.group vector of group memberships
#' @param score.cell.type vector of cell-type names/labels.
#' @param score.p vector of p values.
#' @param score.fdr vector of fdr values. Optional.
#' @param score.coherence.fraction vector of coherence fractions. See coherentFraction(...) for details.
#' @param score.frequent.flier vector of logicals specifying whether score belongs to frequent flier.
#' @param fdr.correction Specify whether p-value should be corrected using Benjamini & Hochberg method. Default is T.
#' @param p.threshold p value threshold. Default is 0.05.
#' @param coherence.threshold Numerical [0,1] specifying minimal coherence required to qualify for visualization. Default is 0.8.
#' @param show.n.terms Maximal number of cell-type terms shown in word cloud. Default is 15.
#' @param verbose Logical, specify whether process is printed. Default is T.
#' @name annotationCloud
#' @author Nicholas Mikolajewicz
#' @return list of ggplot handles
#' @seealso \code{\link{mikoScore}} for miko scoring, \code{\link{coherentFraction}} for coherence scoring
#' @examples
#'
#' df.score_summary <- data.frame(cluster = df.merge$cluster,
#' cell.type = df.merge$gs,
#' miko_score = signif(df.merge$miko_score, 3) ,
#' p = signif(df.merge$p),
#' fdr = signif(df.merge$fdr),
#' coherence_fraction = signif(df.merge$coherence_fraction))
#'
#'
#' plt.cloud <- annotationCloud(object = so.query_scored,
#' object.group = "seurat_clusters",
#' score = df.score_summary$miko_score,
#' score.group = df.score_summary$cluster,
#' score.cell.type = df.score_summary$cell.type,
#' score.p = df.score_summary$p,
#' score.fdr = df.score_summary$fdr,
#' score.coherence.fraction = df.score_summary$coherence_fraction,
#' score.frequent.flier = NULL,
#' fdr.correction = T,
#' p.threshold = 0.05,
#' coherence.threshold = 0.9,
#' show.n.terms = 15,
#' verbose = T)
#'
annotationCloud <- function(object,
object.group = "seurat_clusters",
score,
score.group,
score.cell.type,
score.p,
score.fdr = NULL,
score.coherence.fraction = NULL,
score.frequent.flier = NULL,
fdr.correction = T,
p.threshold = 0.05,
coherence.threshold = 0.8,
show.n.terms = 15,
verbose = T){
require(ggwordcloud)
miko_message("Generating annotation wordclouds...", verbose = verbose)
if (is.null(score.coherence.fraction)) coherence.threshold <- 0
if (is.null(score.frequent.flier)) score.frequent.flier <- F
vst.merge.cloud <- data.frame(
miko_score = score,
cluster = score.group,
cell.type = score.cell.type,
p = score.p,
fdr = score.fdr,
coherence_fraction = score.coherence.fraction,
frequent_flier = score.frequent.flier
)
u.cl <- unique(vst.merge.cloud$cluster)
plt.ww.list <- list()
show.w <- show.n.terms
plt.cluster.umap <- highlightUMAP(object = object, group = object.group,
reduction = "umap", highlight.color = "tomato")
names(plt.cluster.umap) <- as.character(gsub("group_", "", names(plt.cluster.umap)))
u.cl <- object@meta.data[[object.group]]
#
# # get unique clusters
if (object.group == "seurat_clusters"){
u.cl <- unique(as.numeric(as.character((u.cl))))
u.cl <- u.cl[order(u.cl)]
} else {
u.cl <- unique((as.character((u.cl))))
}
n_marker_sets <- ulength(vst.merge.cloud$cell.type)
for (i in 1:length(u.cl)){
vst.merge.subset <- vst.merge.cloud[vst.merge.cloud$cluster %in% u.cl[i], ]
vst.merge.subset$coh.score <- vst.merge.subset$coherence_fraction
vst.merge.subset$coh.score[vst.merge.subset$coh.score < coherence.threshold] <- 0
vst.merge.subset <- vst.merge.subset %>% dplyr::filter(miko_score > 0,
coherence_fraction >= coherence.threshold,
!frequent_flier)
if (nrow(vst.merge.subset) >0){
n.sig.score <- nrow(vst.merge.subset %>% dplyr::filter(fdr < p.threshold, miko_score > 0))
n.coh.score <- nrow(vst.merge.subset %>% dplyr::filter(fdr < p.threshold,miko_score > 0,
coherence_fraction >= coherence.threshold)) #, coh.score
if (fdr.correction){
enrich.label <- paste0(n.sig.score, "/", n_marker_sets, " (", signif(n.sig.score/ ulength(vst.merge.cloud$cell.type), 2)*100, "%) gene sets are enriched (FDR < 5e-2*, 1e-5**, 1e-8***)\nand exceed ", 100*coherence.threshold, "% coherence")
} else {
enrich.label <- paste0(n.sig.score, "/", n_marker_sets, " (", signif(n.sig.score/ ulength(vst.merge.cloud$cell.type), 2)*100, "%) gene sets are enriched (p < 5e-2*, 1e-5**, 1e-8***)\nand exceed ", 100*coherence.threshold, "% coherence")
}
vst.merge.subset$sig.stringent <- vst.merge.subset$fdr < p.threshold &
vst.merge.subset$coherence_fraction > coherence.threshold
df.f1 <- vst.merge.subset %>% dplyr::filter(sig.stringent) %>%
dplyr::filter( miko_score > 0, coherence_fraction >= coherence.threshold) %>% #| (coh.fdr < 0.05)
dplyr::top_n(show.w, miko_score)
if (nrow(df.f1) < show.w){
ndif <- show.w - nrow(df.f1)
df.f1 <- bind_rows(df.f1, vst.merge.subset %>%
dplyr::filter(!sig.stringent) %>%
dplyr::filter(miko_score > 0) %>% #| (coh.fdr < 0.05) #fdr < 0.05,
dplyr::top_n(ndif, miko_score))
}
if (nrow(df.f1) == 0) next
df.f1$cell.type <- gsub("_|-", " ", df.f1$cell.type)
df.f1$cell.type <- stringr::str_wrap(df.f1$cell.type, 40)
maxlim <- max(df.f1$miko_score)
if (maxlim > 30){
maxlim <- 30
}
minlim.coh <- min(df.f1$coh.score)
maxlim.coh <- max(df.f1$coh.score)
df.f1$miko_score[(df.f1$miko_score) > maxlim] <- maxlim
df.f1$coh.score[(df.f1$coh.score) > maxlim.coh] <- maxlim.coh
df.f1$cell.type2 <- df.f1$cell.type
df.f1$cell.type2[df.f1$fdr < p.threshold] <- paste0( df.f1$cell.type[df.f1$fdr < p.threshold], "*")
df.f1$cell.type2[df.f1$fdr < 1e-5] <- paste0( df.f1$cell.type[df.f1$fdr < 1e-5], "**")
df.f1$cell.type2[df.f1$fdr < 1e-8] <- paste0( df.f1$cell.type[df.f1$fdr < 1e-8], "***")
df.f1$cell.type3 <- df.f1$cell.type2
limseq <- unique(round( seq(-log10(p.threshold), maxlim, by = signif((maxlim-(-log10(p.threshold)))/4, 1))))
if (fdr.correction){
col.label <- "-log10(FDR)"
} else {
col.label <- "-log10(p)"
}
w1 <- df.f1 %>%
ggplot(aes(label = cell.type3, color = -log10(fdr), size = abs(miko_score))) +
geom_text_wordcloud(scale_size_area = 40, rm_outside = TRUE, eccentricity = 1, show.legend = T) +
theme_minimal() +
labs(title = "Enriched", subtitle = enrich.label, size = "Score", color = col.label) +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) +
theme(legend.position = "bottom") +
scale_color_gradient2(low = "grey", mid = "grey", high = scales::muted("red"), midpoint = -log10(p.threshold)) +
scale_size(range = c(1, 5)) +
theme( #
legend.title = element_text(color = "black", size = 10),
legend.text = element_text(color = "black", size = 8) ,
legend.box.background = element_rect(colour = "black")
)
plt.ww.list[[as.character(u.cl[i])]] <- cowplot::plot_grid(plt.cluster.umap[[as.character(u.cl[i])]],w1, ncol = 2)
}
}
return(plt.ww.list)
}