Nothing
#' @import dplyr magrittr ggplot2 ggrepel ggpmisc
#' @importFrom R6 R6Class
#' @importFrom sccore plapply checkPackageInstalled
#' @importFrom Matrix t
#' @importFrom ggpubr stat_compare_means
#' @importFrom cowplot plot_grid
#' @importFrom stats setNames relevel
#' @importFrom tidyr pivot_longer replace_na
#' @importFrom ggbeeswarm geom_quasirandom
#' @importFrom tibble add_column
#' @importFrom scales comma
#' @importFrom sparseMatrixStats colSums2 rowSums2
#' @importFrom utils globalVariables
NULL
utils::globalVariables(c("Valid Barcodes","Fraction Reads in Cells"))
#' CRMetrics class object
#'
#' @description Functions to analyze Cell Ranger count data. To initialize a new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required.
#' @export
CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
public = list(
#' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
metadata = NULL,
#' @field data.path character Path(s) to Cell Ranger count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)
data.path = NULL,
#' @field cms list List with count matrices (default = NULL)
cms = NULL,
#' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL)
cms.preprocessed = NULL,
#' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all CBs detected also empty droplets (default = NULL)
cms.raw = NULL,
#' @field summary.metrics data.frame Summary metrics from Cell Ranger (default = NULL)
summary.metrics = NULL,
#' @field detailed.metrics data.frame Detailed metrics, i.e., no. genes and UMIs per cell (default = NULL)
detailed.metrics = NULL,
#' @field comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL)
comp.group = NULL,
#' @field verbose logical Print messages or not (default = TRUE)
verbose = TRUE,
#' @field theme ggplot2 theme (default: theme_bw())
theme = ggplot2::theme_bw(),
#' @field pal Plotting palette (default = NULL)
pal = NULL,
#' @field n.cores numeric Number of cores for calculations (default = 1)
n.cores = 1,
#' Initialize a CRMetrics object
#' @description To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required.
#' @param data.path character Path to directory with Cell Ranger count data, one directory per sample (default = NULL).
#' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
#' @param cms list List with count matrices (default = NULL)
#' @param samples character Sample names. Only relevant is cms is provided (default = NULL)
#' @param unique.names logical Create unique cell names. Only relevant if cms is provided (default = TRUE)
#' @param sep.cells character Sample-cell separator. Only relevant if cms is provided and `unique.names=TRUE` (default = "!!")
#' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL)
#' @param verbose logical Print messages or not (default = TRUE)
#' @param theme ggplot2 theme (default: theme_bw())
#' @param n.cores integer Number of cores for the calculations (default = self$n.cores)
#' @param sep.meta character Separator for metadata file (default = ",")
#' @param raw.meta logical Keep metadata in its raw format. If FALSE, classes will be converted using "type.convert" (default = FALSE)
#' @param pal character Plotting palette (default = NULL)
#' @return CRMetrics object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' }
initialize = function(data.path = NULL,
metadata = NULL,
cms = NULL,
samples = NULL,
unique.names = TRUE,
sep.cells = "!!",
comp.group = NULL,
verbose = TRUE,
theme = theme_bw(),
n.cores = 1,
sep.meta = ",",
raw.meta = FALSE,
pal = NULL) {
if ('CRMetrics' %in% class(data.path)) { # copy constructor
for (n in ls(data.path)) {
if (!is.function(get(n, data.path))) assign(n, get(n, data.path), self)
}
return(NULL)
}
# Check that either data.path or cms is provided
if (is.null(data.path) & is.null(cms)) stop("Either 'data.path' or 'cms' must be provided.")
# Check that last character is slash
if (!is.null(data.path)) {
data.path %<>%
sapply(\(path) {
length.path <- nchar(path)
last.char <- path %>%
substr(length.path, length.path)
if (last.char != "/") paste0(path,"/") else path
})
}
# Write stuff to object
self$n.cores <- as.integer(n.cores)
self$data.path <- data.path
self$verbose <- verbose
self$theme <- theme
self$pal <- pal
# Metadata
if (is.null(metadata)) {
if (!is.null(data.path)) {
self$metadata <- lapply(data.path,
list.dirs,
recursive = FALSE,
full.names = FALSE) %>%
Reduce(c, .) %>%
.[pathsToList(data.path, .) %>% sapply(\(path) file.exists(paste0(path[2],"/",path[1],"/outs")))] %>%
{data.frame(sample = .)}
} else {
if (is.null(names(cms))) {
if (is.null(samples)) {
stop("Either `samples` must be provided, or `cms` must be named.")
} else {
sample.out <- samples
}
} else {
sample.out <- names(cms)
}
self$metadata <- data.frame(sample = sample.out)
}
} else {
if (inherits(metadata, "data.frame")) {
self$metadata <- metadata
} else {
stopifnot(file.exists(metadata))
self$metadata <- read.table(metadata,
header = TRUE,
colClasses = "character",
sep = sep.meta)
}
}
if (!is.null(metadata)) {
if (!raw.meta) self$metadata %<>% lapply(type.convert, as.is = FALSE) %>% bind_cols()
}
# Add CMs
if (!is.null(cms)) {
self$addCms(cms = cms,
samples = samples,
unique.names = unique.names,
sep = sep.cells,
n.cores = self$n.cores,
add.metadata = FALSE,
verbose = verbose)
}
checkCompMeta(comp.group, self$metadata)
# Add summary metrics
if (is.null(cms)) self$summary.metrics <- addSummaryMetrics(data.path = data.path, metadata = self$metadata, n.cores = self$n.cores, verbose = verbose)
},
#' @description Function to read in detailed metrics. This is not done upon initialization for speed.
#' @param cms list List of (sparse) count matrices (default = self$cms)
#' @param min.transcripts.per.cell numeric Minimal number of transcripts per cell (default = 100)
#' @param n.cores integer Number of cores for the calculations (default = self$n.cores).
#' @param verbose logical Print messages or not (default = self$verbose).
#' @return Count matrices
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Run function
#' crm$addDetailedMetrics()
addDetailedMetrics = function(cms = self$cms,
min.transcripts.per.cell = 100,
n.cores = self$n.cores,
verbose = self$verbose) {
# Checks
if (!is.null(self$detailed.metrics)) stop("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function")
if (is.null(cms)) stop("No CMs found, run $addCms first.")
size.check <- cms %>%
sapply(dim) %>%
apply(2, prod) %>%
{. > 2^31-1}
if (any(size.check)) warning(message(paste0("Unrealistic large samples detected that are larger than what can be handled in R. Consider removing ",paste(size.check[size.check] %>% names(), collapse = " "),". If kept, you may experience errors.")))
# Calculate metrics
if (min.transcripts.per.cell > 0) cms %<>% lapply(\(cm) cm[,sparseMatrixStats::colSums2(cm) > min.transcripts.per.cell])
self$detailed.metrics <- addDetailedMetricsInner(cms = cms, verbose = verbose, n.cores = n.cores)
},
#' @description Add comparison group for statistical testing.
#' @param comp.group character Comparison metric (default = self$comp.group).
#' @param metadata data.frame Metadata for samples (default = self$metadata).
#' @return Vector
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Add metadata
#' crm$metadata <- data.frame(sex = c("male","female"))
#'
#' # Add comparison group
#' crm$addComparison(comp.group = "sex")
addComparison = function(comp.group,
metadata = self$metadata) {
checkCompMeta(comp.group, metadata)
self$comp.group <- comp.group
},
#' @description Plot the number of samples.
#' @param comp.group character Comparison metric, must match a column name of metadata (default = self$comp.group).
#' @param h.adj numeric Position of statistics test p value as % of max(y) (default = 0.05).
#' @param exact logical Whether to calculate exact p values (default = FALSE).
#' @param metadata data.frame Metadata for samples (default = self$metadata).
#' @param second.comp.group character Second comparison metric, must match a column name of metadata (default = NULL).
#' @param pal character Plotting palette (default = self$pal)
#' @return ggplot2 object
#' @examples
#' samples <- c("sample1", "sample2")
#'
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#' names(testdata.cms) <- samples
#'
#' # Create metadata
#' metadata <- data.frame(sample = samples,
#' sex = c("male","female"),
#' condition = c("a","b"))
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, metadata = metadata, n.cores = 1)
#'
#' # Plot
#' crm$plotSamples(comp.group = "sex", second.comp.group = "condition")
plotSamples = function(comp.group = self$comp.group,
h.adj = 0.05,
exact = FALSE,
metadata = self$metadata,
second.comp.group = NULL,
pal = self$pal) {
comp.group %<>% checkCompGroup("sample", self$verbose)
if (!is.null(second.comp.group)) {
second.comp.group %<>% checkCompGroup(second.comp.group, self$verbose)
} else {
second.comp.group <- comp.group
}
plot.stats <- ifelse(comp.group == "sample", FALSE, TRUE)
g <- metadata %>%
select(comp.group, second.comp.group) %>%
table() %>%
data.frame() %>%
ggplot(aes(!!sym(comp.group), Freq, fill = !!sym(second.comp.group))) +
geom_bar(stat = "identity", position = "dodge") +
self$theme +
labs(x = comp.group, y = "Freq") +
theme(legend.position = "right")
if (plot.stats) {
g %<>% addPlotStatsSamples(comp.group, metadata, h.adj, exact, second.comp.group)
}
if (!is.null(pal))
g <- g + scale_fill_manual(values = pal)
return(g)
},
#' @description Plot all summary stats or a selected list.
#' @param comp.group character Comparison metric (default = self$comp.group).
#' @param second.comp.group character Second comparison metric, used for the metric "samples per group" or when "comp.group" is a numeric or an integer (default = NULL).
#' @param metrics character Metrics to plot (default = NULL).
#' @param h.adj numeric Position of statistics test p value as % of max(y) (default = 0.05)
#' @param plot.stat logical Show statistics in plot. Will be FALSE if "comp.group" = "sample" or if "comp.group" is a numeric or an integer (default = TRUE)
#' @param stat.test character Statistical test to perform to compare means. Can either be "non-parametric" or "parametric" (default = "non-parametric").
#' @param exact logical Whether to calculate exact p values (default = FALSE).
#' @param metadata data.frame Metadata for samples (default = self$metadata).
#' @param summary.metrics data.frame Summary metrics (default = self$summary.metrics).
#' @param plot.geom character Which geometric is used to plot the data (default = "point").
#' @param se logical For regression lines, show SE (default = FALSE)
#' @param group.reg.lines logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE)
#' @param secondary.testing logical Whether to show post hoc testing (default = TRUE)
#' @param pal character Plotting palette (default = self$pal)
#' @return ggplot2 object
#' @examples
#' \donttest{
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Add summary metrics
#' crm$addSummaryFromCms()
#'
#' crm$plotSummaryMetrics(plot.geom = "point")
#' }
plotSummaryMetrics = function(comp.group = self$comp.group,
second.comp.group = NULL,
metrics = NULL,
h.adj = 0.05,
plot.stat = TRUE,
stat.test = c("non-parametric","parametric"),
exact = FALSE,
metadata = self$metadata,
summary.metrics = self$summary.metrics,
plot.geom = "bar",
se = FALSE,
group.reg.lines = FALSE,
secondary.testing = TRUE,
pal = self$pal) {
# Checks
comp.group %<>% checkCompGroup("sample", self$verbose)
if (is.null(plot.geom)) {
stop("A plot type needs to be defined, can be one of these: 'point', 'bar', 'histogram', 'violin'.")
}
stat.test %<>% match.arg(c("non-parametric","parametric"))
# if no metrics selected, plot all
if (is.null(metrics)) {
metrics <- summary.metrics$metric %>%
unique()
} else {
# check if selected metrics are available
difs <- setdiff(metrics, summary.metrics$metric %>% unique())
if ("samples per group" %in% difs) difs <- difs[difs != "samples per group"]
if (length(difs) > 0) stop(paste0("The following 'metrics' are not valid: ",paste(difs, collapse=" ")))
}
# if samples per group is one of the metrics to plot use the plotSamples function to plot
if ("samples per group" %in% metrics){
sample.plot <- self$plotSamples(comp.group, h.adj, exact, metadata, second.comp.group, pal)
metrics <- metrics[metrics != "samples per group"]
}
# Plot all the other metrics
plotList <- metrics %>%
lapply(function (met) {
tmp <- summary.metrics %>%
filter(metric == met) %>%
merge(metadata, by = "sample")
# Create ggplot object
g <- tmp %>%
ggplot(aes(!!sym(comp.group), value)) +
self$theme
# Add geom + palette
if (is.null(second.comp.group)) {
g %<>%
plotGeom(plot.geom, col = comp.group, pal)
g <- g +
labs(y = met, x = element_blank())
} else {
g %<>%
plotGeom(plot.geom, col = second.comp.group, pal)
g <- g +
labs(y = met, x = comp.group)
}
if (is.numeric(metadata[[comp.group]])) {
if (!group.reg.lines) {
g <- g +
stat_poly_eq(color = "black", aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"))) +
stat_poly_line(color = "black", se = se)
} else {
g <- g +
stat_poly_eq(aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"), col = !!sym(second.comp.group))) +
stat_poly_line(aes(col = !!sym(second.comp.group)), se = se)
}
}
# a legend only makes sense if the comparison is not the samples
if (comp.group != "sample") {
g <- g + theme(legend.position = "right")
} else {
plot.stat <- FALSE
g <- g + theme(legend.position = "none")
}
# Statistical testing
if (plot.stat & !is.numeric(metadata[[comp.group]])) {
if (stat.test == "non-parametric") {
primary.test <- "kruskal.test"
secondary.test <- "wilcox.test"
} else {
primary.test <- "anova"
secondary.test <- "t.test"
}
if (length(unique(metadata[[comp.group]])) < 3) {
primary.test <- secondary.test
secondary.test <- NULL
}
if (!secondary.testing) secondary.test <- NULL
g %<>% addPlotStats(comp.group, metadata, h.adj, primary.test, secondary.test, exact)
}
g <- g + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
return(g)
})
# To return the plots
if (exists("sample.plot")) {
if (length("plotList") > 0){
return(plot_grid(plotlist = plotList,
sample.plot,
ncol = min(length(plotList)+1, 3)))
} else {
return(sample.plot)
}
} else {
if (length(plotList) == 1) {
return(plotList[[1]])
} else {
return(plot_grid(plotlist = plotList, ncol = min(length(plotList), 3)))
}
}
},
#' @description Plot detailed metrics from the detailed.metrics object
#' @param comp.group character Comparison metric (default = self$comp.group).
#' @param detailed.metrics data.frame Object containing the count matrices (default = self$detailed.metrics).
#' @param metadata data.frame Metadata for samples (default = self$metadata).
#' @param metrics character Metrics to plot. NULL plots both plots (default = NULL).
#' @param plot.geom character How to plot the data (default = "violin").
#' @param data.path character Path to Cell Ranger count data (default = self$data.path).
#' @param hline logical Whether to show median as horizontal line (default = TRUE)
#' @param pal character Plotting palette (default = self$pal)
#' @return ggplot2 object
#' @examples
#' \donttest{
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Add detailed metrics
#' crm$addDetailedMetrics()
#'
#' # Plot
#' crm$plotDetailedMetrics()
#' }
plotDetailedMetrics = function(comp.group = self$comp.group,
detailed.metrics = self$detailed.metrics,
metadata = self$metadata,
metrics = NULL,
plot.geom = "violin",
hline = TRUE,
pal = self$pal){
# Checks
if (is.null(detailed.metrics)) stop("'detailed.metrics' not calculated. Please run 'addDetailedMetrics()'.")
comp.group %<>% checkCompGroup("sample", self$verbose)
if (is.null(metrics)) {
metrics <- c("UMI_count","gene_count")
}
# check if selected metrics are available
difs <- setdiff(metrics, self$detailed.metrics$metric %>% unique())
if (length(difs) > 0) stop(paste0("The following 'metrics' are not valid: ",paste(difs, collapse=" ")))
# if no plot type is defined, return a list of options
if (is.null(plot.geom)) {
stop("A plot type needs to be defined, can be one of these: 'point', 'bar', 'histogram', 'violin'.")
}
# Plot all the other metrics
plotList <- metrics %>%
lapply(function (met) {
tmp <- detailed.metrics %>%
filter(metric == met) %>%
merge(metadata, by = "sample")
g <- ggplot(tmp, aes(x = sample, y = value))
g %<>%
plotGeom(plot.geom, comp.group, pal)
g <- g +
{if (plot.geom == "violin") scale_y_log10()} +
{if (hline) geom_hline(yintercept = median(tmp$value))} +
labs(y = met, x = element_blank()) +
self$theme
# a legend only makes sense if the comparison is not the samples
if (comp.group != "sample") {
g <- g + theme(legend.position = "right")
} else {
g <- g + theme(legend.position = "none")
}
g <- g + theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
))
return(g)
})
# To return the plots
if (length(plotList) == 1) {
return(plotList[[1]])
} else {
return(plot_grid(plotlist = plotList, ncol = min(length(plotList), 3)))
}
},
#' @description Plot cells in embedding using Conos and color by depth and doublets.
#' @param depth logical Plot depth or not (default = FALSE).
#' @param doublet.method character Doublet detection method (default = NULL).
#' @param doublet.scores logical Plot doublet scores or not (default = FALSE).
#' @param depth.cutoff numeric Depth cutoff (default = 1e3).
#' @param mito.frac logical Plot mitochondrial fraction or not (default = FALSE).
#' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = 0.05).
#' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")).
#' @param size numeric Dot size (default = 0.3)
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @param pal character Plotting palette (default = NULL)
#' @param ... Plotting parameters passed to `sccore::embeddingPlot`.
#' @return ggplot2 object
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' crm$plotEmbedding()
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
plotEmbedding = function(depth = FALSE,
doublet.method = NULL,
doublet.scores = FALSE,
depth.cutoff = 1e3,
mito.frac = FALSE,
mito.cutoff = 0.05,
species = c("human","mouse"),
size = 0.3,
sep = "!!",
pal = NULL,
...) {
checkPackageInstalled("conos", cran = TRUE)
if (sum(depth, mito.frac, !is.null(doublet.method)) > 1) stop("Only one filter allowed. For multiple filters, use plotFilteredCells(type = 'embedding').")
species %<>%
tolower() %>%
match.arg(c("human","mouse"))
# Check for existing Conos object and preprocessed data
if (is.null(self$con)) {
if (self$verbose) stop("No embedding found, please run createEmbedding.")
}
# Depth
if (depth) {
depths <- self$getDepth() %>%
filterVector("depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep)
if (length(depth.cutoff) > 1) {
main <- "Cells with low depth with sample-specific cutoff"
} else {
main <- paste0("Cells with low depth, < ",depth.cutoff)
}
g <- self$con$plotGraph(colors = (!depths) * 1, title = main, size = size, ...)
}
# Doublets
if (!is.null(doublet.method)) {
dres <- self$doublets[[doublet.method]]$result
if (is.null(dres)) stop("No results found for doublet.method '",doublet.method,"'. Please run doubletDetection(method = '",doublet.method,"'.")
if (doublet.scores) {
doublets <- dres$scores
label <- "scores"
} else {
doublets <- dres$labels * 1
label <- "labels"
}
doublets %<>% setNames(rownames(dres))
g <- self$con$plotGraph(colors = doublets, title = paste(doublet.method,label, collapse = " "), size = size, palette = pal, ...)
}
# Mitochondrial fraction
if (mito.frac) {
mf <- self$getMitoFraction(species = species) %>%
filterVector("mito.cutoff", mito.cutoff, self$con$samples %>% names(), sep)
if (length(mito.cutoff) > 1) {
main <- "Cells with low mito. frac with sample-specific cutoff"
} else {
main <- paste0("Cells with high mito. fraction, > ",mito.cutoff*100,"%")
}
g <- self$con$plotGraph(colors = mf * 1, title = main, size = size, ...)
}
if (!exists("g")) g <- self$con$plotGraph(palette = pal, size = size, ...)
return(g)
},
#' @description Plot the sequencing depth in histogram.
#' @param cutoff numeric The depth cutoff to color the cells in the embedding (default = 1e3).
#' @param samples character Sample names to include for plotting (default = $metadata$sample).
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2")
#' @param filter.col Character Color for density of cells to be filtered (default = "#A65141")
#' @return ggplot2 object
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' # Plot
#' crm$plotDepth()
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
plotDepth = function(cutoff = 1e3,
samples = self$metadata$sample,
sep = "!!",
keep.col = "#E7CDC2",
filter.col = "#A65141"){
# Checks
checkPackageInstalled("conos", cran = TRUE)
if (is.null(self$con)) {
stop("No Conos object found, please run createEmbedding.")
}
if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust."))
depths <- self$getDepth()
# Preparations
tmp <- depths %>%
{data.frame(depth = unname(.), sample = names(.))} %>%
mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>%
split(., .$sample) %>%
.[samples] %>%
lapply(\(z) with(density(z$depth, adjust = 1/10), data.frame(x,y))) %>%
{lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>%
bind_rows()
ncol.plot <- samples %>%
length() %>%
pmin(3)
# Plot
depth.plot <- tmp %>%
pull(sample) %>%
unique() %>%
lapply(\(id) {
tmp.plot <- tmp %>%
filter(sample == id)
xmax <- tmp.plot$x %>%
max() %>%
pmin(2e4)
g <- ggplot(tmp.plot, aes(x,y)) +
self$theme +
geom_line() +
xlim(0,xmax) +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.margin = unit(c(0, 0, 0, 0.5), "cm")) +
labs(title = id, y = "Density [AU]", x = "")
if (length(cutoff) == 1) {
plot.cutoff <- cutoff
} else {
plot.cutoff <- cutoff[names(cutoff) == id]
}
if (all(tmp.plot$x < plot.cutoff)) {
g <- g +
geom_area(fill = filter.col)
} else {
g <- g +
geom_area(fill = filter.col) +
geom_area(data = tmp.plot %>% filter(x > plot.cutoff), aes(x), fill = keep.col)
}
return(g)
}) %>%
plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5)
return(depth.plot)
},
#' @description Plot the mitochondrial fraction in histogram.
#' @param cutoff numeric The mito. fraction cutoff to color the embedding (default = 0.05)
#' @param species character Species to calculate the mitochondrial fraction for (default = "human")
#' @param samples character Sample names to include for plotting (default = $metadata$sample)
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2")
#' @param filter.col Character Color for density of cells to be filtered (default = "#A65141")
#' @return ggplot2 object
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' # Plot
#' crm$plotMitoFraction()
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
plotMitoFraction = function(cutoff = 0.05,
species = c("human","mouse"),
samples = self$metadata$sample,
sep = "!!",
keep.col = "#E7CDC2",
filter.col = "#A65141"){
# Checks
checkPackageInstalled("conos", cran = TRUE)
if (is.null(self$con)) {
stop("No Conos object found, please run createEmbedding.")
}
if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust."))
mf <- self$getMitoFraction(species = species)
mf.zero <- sum(mf == 0) / length(mf) * 100
if (mf.zero > 95) warning(paste0(mf.zero,"% of all cells do not express mitochondrial genes. Plotting may behave unexpected."))
# Preparations
tmp <- mf %>%
{data.frame(mito.frac = unname(.), sample = names(.))} %>%
mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>%
split(., .$sample) %>%
.[samples] %>%
lapply(\(z) with(density(z$mito.frac, adjust = 1/10), data.frame(x,y))) %>%
{lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>%
bind_rows()
ncol.plot <- samples %>%
length() %>%
pmin(3)
# Plot
mf.plot <- tmp %>%
pull(sample) %>%
unique() %>%
lapply(\(id) {
tmp.plot <- tmp %>%
filter(sample == id)
g <- ggplot(tmp.plot, aes(x,y)) +
self$theme +
geom_line() +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.margin = unit(c(0, 0, 0, 0.5), "cm")) +
labs(title = id, y = "Density [AU]", x = "")
if (length(cutoff) == 1) {
plot.cutoff <- cutoff
} else {
plot.cutoff <- cutoff[names(cutoff) == id]
}
if (all(tmp.plot$x < plot.cutoff)) {
g <- g +
geom_area(fill = filter.col)
} else {
g <- g +
geom_area(fill = filter.col) +
geom_area(data = tmp.plot %>% filter(x < plot.cutoff), aes(x), fill = keep.col)
}
return(g)
}) %>%
plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5)
return(mf.plot)
},
#' @description Detect doublet cells.
#' @param method character Which method to use, either `scrublet` or `doubletdetection` (default="scrublet").
#' @param cms list List containing the count matrices (default=self$cms).
#' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
#' @param env character Environment to run python in (default="r-reticulate").
#' @param conda.path character Path to conda environment (default=system("whereis conda")).
#' @param n.cores integer Number of cores to use (default = self$n.cores)
#' @param verbose logical Print messages or not (default = self$verbose)
#' @param args list A list with additional arguments for either `DoubletDetection` or `scrublet`. Please check the respective manuals.
#' @param export boolean Export CMs in order to detect doublets outside R (default = FALSE)
#' @param data.path character Path to write data, only relevant if `export = TRUE`. Last character must be `/` (default = self$data.path)
#' @return data.frame
#' @examples
#' \dontrun{
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#'
#' # Detect doublets
#' crm$detectDoublets(method = "scrublet",
#' conda.path = "/opt/software/miniconda/4.12.0/condabin/conda")
#' }
detectDoublets = function(method = c("scrublet","doubletdetection"),
cms = self$cms,
samples = self$metadata$sample,
env = "r-reticulate",
conda.path = system("whereis conda"),
n.cores = self$n.cores,
verbose = self$verbose,
args = list(),
export = FALSE,
data.path = self$data.path) {
# Checks
method %<>% tolower() %>% match.arg(c("scrublet","doubletdetection"))
if (!is.list(args)) stop("'args' must be a list.")
if (!inherits(cms, "list")) stop("'cms' must be a list")
if (!all(sapply(cms, inherits, "Matrix"))) {
warning("All samples in 'cms' must be a matrix, trying to convert to dgCMatrix...")
cms %<>% lapply(as, "CsparseMatrix")
if (!all(sapply(cms, inherits, "Matrix"))) stop("Could not convert automatically.")
}
if (length(data.path) > 1) data.path <- data.path[1]
if (export && is.null(data.path)) stop("When 'export = TRUE', 'data.path' must be provided.")
# Prepare arguments
if (method == "doubletdetection") {
args.std <- list(boost_rate = 0.25,
clustering_algorithm = "phenograph",
clustering_kwargs = NULL,
n_components = 30,
n_iters = 10,
n_jobs = n.cores,
n_top_var_genes = 10000,
normalizer = NULL,
pseudocount = 0.1,
random_state = 0,
replace = FALSE,
standard_scaling = FALSE,
p_thresh = 1e-7,
voter_thresh = 0.9)
ints <- c("n_components","n_iters","n_jobs","n_top_var_genes","random_state")
} else {
args.std <- list(total_counts = NULL,
sim_doublet_ratio = 2.0,
n_neighbors = NULL,
expected_doublet_rate = 0.1,
stdev_doublet_rate = 0.02,
random_state = 0,
synthetic_doublet_umi_subsampling = 1.0,
use_approx_neighbors = TRUE,
distance_metric = "euclidean",
get_doublet_neighbor_parents = FALSE,
min_counts = 3,
min_cells = 3,
min_gene_variability_pctl = 85,
log_transform = FALSE,
mean_center = TRUE,
normalize_variance = TRUE,
n_prin_comps = 30,
svd_solver = "arpack")
ints <- c("random_state","min_cells","n_prin_comps")
}
# Update arguments based on input
if (length(args) > 0) {
diff <- setdiff(args %>% names(), args.std %>% names())
if (length(diff) > 0) stop(paste0("Argument(s) not recognized: ",paste(diff, collapse = " "),". Please update 'args' and try again."))
for (i in names(args)) {
args.std[[i]] <- args[[i]]
}
}
# Ensure integers
for (i in ints) {
args.std[[i]] <- as.integer(args.std[[i]])
}
if (!export) {
# Prep environment
if (verbose) message(paste0(Sys.time()," Loading prerequisites..."))
checkPackageInstalled("reticulate", cran = TRUE)
reticulate::use_condaenv(condaenv = env, conda = conda.path, required = TRUE)
if (!reticulate::py_module_available(method)) stop(paste0("'",method,"' is not installed in your current conda environment."))
reticulate::source_python(paste(system.file(package="CRMetrics"), paste0(method,".py"), sep ="/"))
if (verbose) message(paste0(Sys.time()," Identifying doublets using '",method,"'..."))
# Calculate
tmp <- cms %>%
names() %>%
lapply(\(cm) {
if (verbose) message(paste0(Sys.time()," Running sample '",cm,"'..."))
args.out <- list(cm = Matrix::t(cms[[cm]])) %>% append(args.std)
if (method == "doubletdetection") {
tmp.out <- do.call("doubletdetection_py", args.out)
} else {
tmp.out <- do.call("scrublet_py", args.out)
}
tmp.out %<>%
setNames(c("labels","scores","output"))
}) %>%
setNames(cms %>% names())
df <- tmp %>%
names() %>%
lapply(\(name) {
tmp[[name]] %>%
.[c("labels","scores")] %>%
bind_rows() %>%
as.data.frame() %>%
mutate(sample = name) %>%
`rownames<-`(cms[[name]] %>% colnames())
}) %>%
bind_rows()
df[is.na(df)] <- FALSE
df %<>% mutate(labels = as.logical(labels))
output <- tmp %>% lapply(`[[`, 3) %>%
setNames(tmp %>% names())
res <- list(result = df,
output = output)
if (verbose) message(paste0(Sys.time()," Detected ",sum(df$labels, na.rm = TRUE)," possible doublets out of ",nrow(df)," cells."))
self$doublets[[method]] <- res
} else {
# Check for existing files
files <- setdiff(samples %>% sapply(paste0, ".mtx"), dir(data.path)) %>%
strsplit(".mtx",) %>%
sapply('[[', 1)
diff <- length(samples) - length(files)
# Save data
if (verbose) message(paste0(Sys.time()," Saving ",length(cms)," CMs"))
if (diff > 0) message("Existing save files already found, skipping ",diff," samples: ",paste(c("",setdiff(samples, files)), collapse = "\n"))
for (i in files) {
cms[[i]] %>%
Matrix::t() %>%
Matrix::writeMM(paste0(data.path,i,".mtx"))
}
if (verbose) message(paste0(Sys.time()," Done! Python script is saved as ",data.path,toupper(method),".py"))
# Create Python script
args.std %<>%
`names<-`(sapply(names(.), paste0, "X")) %>%
lapply(\(x) {
if (is.null(x)) return("None")
if (is.logical(x) && x) return("True")
if (is.logical(x) && !x) return("False")
if (is.character(x)) return(paste0('"',x,'"'))
return(x)
}) %>%
append(list(data.path = data.path,
method = method))
tmp <- readLines(paste(system.file(package="CRMetrics"), paste0(method,"_manual.py"), sep ="/"))
for (i in names(args.std)) {
tmp %<>% gsub(pattern = i, replace = args.std[i], x = .)
}
writeLines(tmp, con=paste0(data.path,toupper(method),".py"))
}
},
#' @description Perform conos preprocessing.
#' @param cms list List containing the count matrices (default = self$cms).
#' @param preprocess character Method to use for preprocessing (default = c("pagoda2","seurat")).
#' @param min.transcripts.per.cell numeric Minimal transcripts per cell (default = 100)
#' @param verbose logical Print messages or not (default = self$verbose).
#' @param n.cores integer Number of cores for the calculations (default = self$n.cores).
#' @param get.largevis logical For Pagoda2, create largeVis embedding (default = FALSE)
#' @param tsne logical Create tSNE embedding (default = FALSE)
#' @param make.geneknn logical For Pagoda2, estimate gene kNN (default = FALSE)
#' @param cluster logical For Seurat, estimate clusters (default = FALSE)
#' @param ... Additional arguments for `Pagaoda2::basicP2Proc` or `conos:::basicSeuratProc`
#' @return Conos object
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Perform preprocessing
#' crm$doPreprocessing(preprocess = "pagoda2")
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
doPreprocessing = function(cms = self$cms,
preprocess = c("pagoda2","seurat"),
min.transcripts.per.cell = 100,
verbose = self$verbose,
n.cores = self$n.cores,
get.largevis = FALSE,
tsne = FALSE,
make.geneknn = FALSE,
cluster = FALSE,
...) {
preprocess %<>%
tolower() %>%
match.arg(c("pagoda2","seurat"))
if (is.null(cms)) {
stop("No count matrices found, please add them using addDetailedMetrics or addCms.")
}
if (preprocess == "pagoda2") {
if (verbose) message('Running preprocessing using pagoda2...')
checkPackageInstalled("pagoda2", cran = TRUE)
tmp <- lapply(
cms,
pagoda2::basicP2proc,
get.largevis = FALSE,
get.tsne = FALSE,
make.geneknn = FALSE,
min.transcripts.per.cell = min.transcripts.per.cell,
n.cores = n.cores,
...)
} else if (preprocess == "seurat") {
if (verbose) message('Running preprocessing using Seurat...')
checkPackageInstalled("conos", cran = TRUE)
tmp <- lapply(
cms,
conos::basicSeuratProc,
do.par = (n.cores > 1),
tsne = FALSE,
cluster = FALSE,
verbose = FALSE,
...)
}
if (verbose) message('Preprocessing done!\n')
self$cms.preprocessed <- tmp
invisible(tmp)
},
#' @description Create Conos embedding.
#' @param cms list List containing the preprocessed count matrices (default = self$cms.preprocessed).
#' @param verbose logical Print messages or not (default = self$verbose).
#' @param n.cores integer Number of cores for the calculations (default = self$n.cores).
#' @param arg.buildGraph list A list with additional arguments for the `buildGraph` function in Conos (default = list())
#' @param arg.findCommunities list A list with additional arguments for the `findCommunities` function in Conos (default = list())
#' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP))
#' @return Conos object
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
createEmbedding = function(cms = self$cms.preprocessed,
verbose = self$verbose,
n.cores = self$n.cores,
arg.buildGraph = list(),
arg.findCommunities = list(),
arg.embedGraph = list(method = "UMAP")) {
checkPackageInstalled("conos", cran = TRUE)
if (is.null(cms)) {
stop("No preprocessed count matrices found, please run doPreprocessing.")
}
if (verbose) message('Creating Conos object... ')
con <- conos::Conos$new(cms, n.cores = n.cores)
if (verbose) message('Building graph... ')
do.call(con$buildGraph, arg.buildGraph)
if (verbose) message('Finding communities... ')
do.call(con$findCommunities, arg.findCommunities)
if (verbose) message('Creating embedding... ')
do.call(con$embedGraph, arg.embedGraph)
self$con <- con
invisible(con)
},
#' @description Filter cells based on depth, mitochondrial fraction and doublets from the count matrix.
#' @param depth.cutoff numeric Depth (transcripts per cell) cutoff (default = NULL).
#' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = NULL).
#' @param doublets character Doublet detection method to use (default = NULL).
#' @param species character Species to calculate the mitochondrial fraction for (default = "human").
#' @param samples.to.exclude character Sample names to exclude (default = NULL)
#' @param verbose logical Show progress (default = self$verbose)
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @param raw boolean Filter on raw, unfiltered count matrices. Usually not intended (default = FALSE)
#' @return list of filtered count matrices
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#'
#' # Filter CMs
#' crm$filterCms(depth.cutoff = 1e3, mito.cutoff = 0.05)
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
filterCms = function(depth.cutoff = NULL,
mito.cutoff = NULL,
doublets = NULL,
species = c("human","mouse"),
samples.to.exclude = NULL,
verbose = self$verbose,
sep = "!!",
raw = FALSE) {
if (verbose) {
filters <- c()
if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff = ",depth.cutoff))
if (!is.null(mito.cutoff)) filters %<>% c(paste0("mito.cutoff = ",mito.cutoff," and species = ",species))
if (!is.null(doublets)) filters %<>% c(paste0("doublet method = ",doublets))
message(paste0("Filtering based on: ",paste(filters, collapse="; ")))
}
# Preparations
species %<>%
tolower() %>%
match.arg(c("human","mouse"))
# Extract CMs
if (!raw) cms <- self$cms else cms <- self$cms.raw
if (is.null(cms)) stop(if (raw) "$cms.raw" else "$cms"," is NULL. filterCms depends on this object. Aborting")
# Exclude samples
if (!is.null(samples.to.exclude)) {
if (!((samples.to.exclude %in% names(cms)) %>% all())) stop("Not all 'samples.to.exclude' found in names of ",if (raw) "self$cms.raw" else "self$cms. Please check and try again.")
if (verbose) message(paste0("Excluding sample(s) ",paste(samples.to.exclude, sep = "\t")))
cms %<>% .[setdiff(names(.), samples.to.exclude)]
}
if (verbose) message(paste0(Sys.time()," Preparing filter"))
# Extract sample names
samples <- cms %>%
names()
# Depth
if (!is.null(depth.cutoff)) {
depth.filter <- self$getDepth() %>%
filterVector("depth.cutoff", depth.cutoff, samples, sep)
} else {
depth.filter <- NULL
}
# Mitochondrial fraction
if (!is.null(mito.cutoff)) {
mito.filter <- self$getMitoFraction(species = species) %>%
filterVector("mito.cutoff", mito.cutoff, samples, sep) %>%
!. # NB, has to be negative
} else {
mito.filter <- NULL
}
# Doublets
if (!is.null(doublets)) {
if (is.null(self$doublets[[doublets]])) stop("Results for doublet detection method '",doublets,"' not found. Please run detectDoublets(method = '",doublets,"'.")
doublets.filter <- self$doublets[[doublets]]$result %>%
mutate(labels = replace_na(labels, FALSE)) %>%
{setNames(!.$labels, rownames(.))}
} else {
doublets.filter <- NULL
}
# Get cell index
cell.idx <- list(names(depth.filter),
names(mito.filter),
names(doublets.filter)) %>%
.[!sapply(., is.null)] %>%
Reduce(intersect, .)
# Create split vector
split.vec <- strsplit(cell.idx, sep) %>%
sapply('[[', 1)
# Filter
filter.list <- list(depth = depth.filter,
mito = mito.filter,
doublets = doublets.filter) %>%
.[!sapply(., is.null)] %>%
lapply(\(filter) filter[cell.idx]) %>% # Ensure same order of cells
bind_cols() %>%
apply(1, all) %>%
split(split.vec)
if (verbose) {
cells.total <- cms %>%
sapply(ncol) %>%
sum()
cells.remove <- sum(!filter.list %>% unlist())
if (!any(is.null(depth.filter), is.null(mito.filter))) cells.remove <- cells.remove + cells.total - nrow(self$con$embedding)
cells.percent <- cells.remove / cells.total * 100
message(paste0(Sys.time()," Removing ",cells.remove," of ", cells.total," cells (",formatC(cells.percent, digits = 3),"%)"))
}
self$cms.filtered <- samples %>%
lapply(\(sample) {
cms[[sample]][,filter.list[[sample]]]
}) %>%
setNames(samples)
},
#' @description Select metrics from summary.metrics
#' @param ids character Metric id to select (default = NULL).
#' @return vector
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Select metrics
#' crm$selectMetrics()
#' selection.metrics <- crm$selectMetrics(c(1:4))
selectMetrics = function(ids = NULL) {
metrics <- self$summary.metrics$metric %>%
unique()
if (is.null(ids)) tmp <- data.frame(no = seq_len(length(metrics)), metrics = metrics) else tmp <- metrics[ids]
return(tmp)
},
#' @description Plot filtered cells in an embedding, in a bar plot, on a tile or export the data frame
#' @param type character The type of plot to use: embedding, bar, tile or export (default = c("embedding","bar","tile","export")).
#' @param depth logical Plot the depth or not (default = TRUE).
#' @param depth.cutoff numeric Depth cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 1e3).
#' @param doublet.method character Method to detect doublets (default = NULL).
#' @param mito.frac logical Plot the mitochondrial fraction or not (default = TRUE).
#' @param mito.cutoff numeric Mitochondrial fraction cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 0.05).
#' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")).
#' @param size numeric Dot size (default = 0.3)
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @param cols character Colors used for plotting (default = c("grey80","red","blue","green","yellow","black","pink","purple"))
#' @param ... Plotting parameters passed to `sccore::embeddingPlot`.
#' @return ggplot2 object or data frame
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' # Plot and extract result
#' crm$plotFilteredCells(type = "embedding")
#' filtered.cells <- crm$plotFilteredCells(type = "export")
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
plotFilteredCells = function(type = c("embedding","bar","tile","export"),
depth = TRUE,
depth.cutoff = 1e3,
doublet.method = NULL,
mito.frac = TRUE,
mito.cutoff = 0.05,
species = c("human","mouse"),
size = 0.3,
sep = "!!",
cols = c("grey80","red","blue","green","yellow","black","pink","purple"),
...) {
type %<>%
tolower() %>%
match.arg(c("embedding","bar","tile","export"))
if (mito.frac) species %<>% tolower() %>% match.arg(c("human","mouse"))
# Prepare data
if (depth) {
depths <- self$getDepth() %>%
filterVector("depth.cutoff", depth.cutoff, depth.cutoff %>% names(), sep) %>%
{ifelse(!., "depth", "")}
} else {
depths <- NULL
}
if (mito.frac) {
mf <- self$getMitoFraction(species = species) %>%
filterVector("mito.cutoff", mito.cutoff, mito.cutoff %>% names(), sep) %>%
{ifelse(., "mito", "")}
} else {
mf <- NULL
}
if (!is.null(doublet.method)) {
tmp.doublets <- self$doublets[[doublet.method]]$result
doublets <- tmp.doublets$labels %>%
ifelse("doublet","") %>%
setNames(rownames(tmp.doublets))
} else {
doublets <- NULL
}
# Get cell index
cell.idx <- self$cms %>%
lapply(colnames) %>%
Reduce(c, .)
# Create data.frame
tmp <- list(depth = depths,
mito = mf,
doublets = doublets) %>%
.[!sapply(., is.null)] %>%
lapply(\(filter) filter[cell.idx]) %>% # Ensure same order of cells
bind_cols() %>%
as.data.frame() %>%
`rownames<-`(cell.idx)
if (type == "embedding" || type == "bar") {
tmp %<>%
mutate(., filter = apply(., 1, paste, collapse=" ")) %>%
mutate(filter = gsub('^\\s+|\\s+$', '', filter) %>%
gsub(" ", " ", ., fixed = TRUE) %>%
gsub(" ", "+", .))
tmp$filter[tmp$filter == ""] <- "kept"
tmp$filter %<>%
factor()
if ("kept" %in% levels(tmp$filter)) {
tmp$filter %<>% relevel(ref = "kept")
colstart <- 1
} else {
colstart <- 2
}
} else {
tmp %<>%
apply(2, \(x) x != "") %>%
{data.frame(. * 1)} %>%
mutate(., sample = rownames(.) %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1),
cell = rownames(.)) %>%
pivot_longer(cols = -c(sample,cell),
names_to = "variable",
values_to = "value")
}
# Embedding plot
if (type == "embedding"){
g <- self$con$plotGraph(groups = tmp$filter %>% setNames(rownames(tmp)), mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Cells to filter", size = size, ...) +
scale_color_manual(values = cols[colstart:(tmp$filter %>% levels() %>% length())])
}
# Bar plot
if (type == "bar") {
g <- tmp %>% mutate(., sample = rownames(.) %>% strsplit(sep) %>% sapply('[[', 1),
filter = ifelse(grepl("+", filter, fixed = TRUE), "combination", as.character(filter))) %>%
group_by(sample,filter) %>%
dplyr::count() %>%
ungroup() %>%
group_by(sample) %>%
mutate(pct = n/sum(n)*100) %>%
ungroup() %>%
filter(filter != "kept") %>%
ggplot(aes(sample, pct, fill = filter)) +
geom_bar(stat = "identity") +
geom_text_repel(aes(label = sprintf("%0.2f", round(pct, digits = 2))),
position = position_stack(vjust = 0.5), direction = "y", size = 2.5) +
self$theme +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "", y = "Percentage cells filtered")
} else if (type == "tile") {
# Tile plot
tmp.plot <- labelsFilter(tmp)
if ("mito" %in% tmp.plot$fraction) {
tmp.plot %<>%
mutate(., fraction = gsub("mito", "mito.fraction", .$fraction))
}
g <- tmp.plot %>%
ggplot(aes(fraction, sample, fill = value)) +
geom_tile(aes(width = 0.7, height = 0.7), color = "black", size = 0.5) +
scale_fill_manual(values = c("green", "orange", "red")) +
self$theme +
labs(x = "", y = "", fill = "")
} else if (type == "export") {
g <- tmp
}
return(g)
},
#' @description Extract sequencing depth from Conos object.
#' @param cms list List of (sparse) count matrices (default = self$cms)
#' @return data frame
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' # Get depth
#' crm$getDepth()
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
getDepth = function(cms = self$cms) {
cms %>%
lapply(\(cm) `names<-`(sparseMatrixStats::colSums2(cm), colnames(cm))) %>%
Reduce(c, .)
},
#' @description Calculate the fraction of mitochondrial genes.
#' @param species character Species to calculate the mitochondrial fraction for (default = "human").
#' @param cms list List of (sparse) count matrices (default = self$cms)
#' @return data frame
#' @examples
#' \donttest{
#' if (requireNamespace("pagoda2", quietly = TRUE)) {
#' if (requireNamespace("conos", quietly = TRUE)) {
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Create embedding
#' crm$doPreprocessing()
#' crm$createEmbedding()
#'
#' # Get mito. fraction
#' crm$getMitoFraction(species = c("human", "mouse"))
#' } else {
#' message("Package 'conos' not available.")
#' }
#' } else {
#' message("Package 'pagoda2' not available.")
#' }
#' }
getMitoFraction = function(species = c("human", "mouse"), cms = self$cms) {
# Checks
species %<>%
match.arg(c("human", "mouse"))
if (is.null(cms)) stop("Cms is NULL, aborting.")
if (species=="human") symb <- "MT-" else if (species=="mouse") symb <- "mt-" else stop("Species must either be 'human' or 'mouse'.")
# Calculate
tmp <- cms %>%
lapply(\(cm) {
tmp.mat <- cm[grep(symb, rownames(cm)),]
if (inherits(tmp.mat, "numeric")) {
nom <- tmp.mat
} else {
nom <- sparseMatrixStats::colSums2(tmp.mat)
}
out <- (nom / sparseMatrixStats::colSums2(cm)) %>%
`names<-`(colnames(cm))
out[is.na(out)] <- 0
return(out)
}) %>%
Reduce(c, .)
return(tmp)
},
#' @description Create plots and script call for CellBender
#' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100)
#' @param show.expected.cells logical Plot line depicting expected number of cells (default = TRUE)
#' @param show.total.droplets logical Plot line depicting total droplets included for CellBender run (default = TRUE)
#' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
#' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
#' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw)
#' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list)
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param verbose logical Show progress (default: stored vector)
#' @param n.cores integer Number of cores (default: stored vector)
#' @param unique.names logical Create unique cell names (default = FALSE)
#' @param sep character Separator for creating unique cell names (default = "!!")
#' @return ggplot2 object and bash script
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data")
#' crm$prepareCellbender()
#' }
prepareCellbender = function(shrinkage = 100,
show.expected.cells = TRUE,
show.total.droplets = TRUE,
expected.cells = NULL,
total.droplets = NULL,
cms.raw = self$cms.raw,
umi.counts = self$cellbender$umi.counts,
data.path = self$data.path,
samples = self$metadata$sample,
verbose = self$verbose,
n.cores = self$n.cores,
unique.names = FALSE,
sep = "!!") {
checkPackageInstalled("sparseMatrixStats", bioc = TRUE)
# Preparations
if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores"))
if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples)
if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples)
# Read CMs from HDF5 files
if (!is.null(cms.raw)) {
if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger outputs. To overwrite, set $cms.raw <- NULL"))
} else {
if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
checkDataPath(data.path)
cms.raw <- read10xH5(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
self$cms.raw <- cms.raw
}
# Get UMI counts
if (!is.null(umi.counts)) {
if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL"))
} else {
if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample"))
umi.counts <- cms.raw[samples] %>%
plapply(\(cm) {
sparseMatrixStats::colSums2(cm) %>%
sort(decreasing = TRUE) %>%
{data.frame(y = .)} %>%
filter(y > 0) %>%
mutate(., x = seq_len(nrow(.)))
}, n.cores = n.cores) %>%
setNames(samples)
self$cellbender$umi.counts <- umi.counts
}
# Create plot
if (verbose) message(paste0(Sys.time()," Plotting"))
data.df <- umi.counts[samples] %>%
names() %>%
lapply(\(sample) {
umi.counts[[sample]] %>%
mutate(sample = sample) %>%
.[seq(1, nrow(.), shrinkage),]
}) %>%
bind_rows()
line.df <- expected.cells %>%
{data.frame(sample = names(.), exp = .)} %>%
mutate(total = total.droplets %>% unname())
g <- ggplot(data.df, aes(x, y)) +
geom_line(color = "red") +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
self$theme +
labs(x = "Droplet ID ranked by count", y = "UMI count per droplet", col = "")
if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells"))
if (show.total.droplets) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total droplets included"))
g <- g + facet_wrap(~ sample, scales = "free")
if (verbose) message(paste0(Sys.time()," Done!"))
return(g)
},
#' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh")
#' @param fpr numeric False positive rate for CellBender (default = 0.01)
#' @param epochs integer Number of epochs for CellBender (default = 150)
#' @param use.gpu logical Use CUDA capable GPU (default = TRUE)
#' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
#' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param args character (optional) Additional parameters for CellBender
#' @return bash script
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' }
saveCellbenderScript = function(file = "cellbender_script.sh",
fpr = 0.01,
epochs = 150,
use.gpu = TRUE,
expected.cells = NULL,
total.droplets = NULL,
data.path = self$data.path,
samples = self$metadata$sample,
args = NULL) {
# Preparations
checkDataPath(data.path)
inputs <- getH5Paths(data.path, samples, "raw")
outputs <- data.path %>%
pathsToList(samples) %>%
sapply(\(sample) paste0(sample[2],sample[1],"/outs/cellbender.h5")) %>%
setNames(samples)
if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples)
if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples)
# Create CellBender shell scripts
script.list <- samples %>%
lapply(\(sample) {
paste0("cellbender remove-background --input ",inputs[sample]," --output ",outputs[sample],if (use.gpu) c(" --cuda ") else c(" "),"--expected-cells ",expected.cells[sample]," --total-droplets-included ",total.droplets[sample]," --fpr ",fpr," --epochs ",epochs," ",if (!is.null(args)) paste(args, collapse = " "))
})
out <- list("#! /bin/sh", script.list) %>%
unlist()
cat(out, file = paste0(data.path,file), sep = "\n")
},
#' @description Extract the expected number of cells per sample based on the Cell Ranger summary metrics
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @return A numeric vector
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Get summary
#' crm$addSummaryFromCms()
#'
#' # Get no. cells
#' crm$getExpectedCells()
getExpectedCells = function(samples = self$metadata$sample) {
expected.cells <- self$summary.metrics %>%
filter(metric == "estimated number of cells") %$%
setNames(value, sample) %>%
.[samples]
return(expected.cells)
},
#' @description Get the total number of droplets included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier.
#' @param samples character Samples names to include (default = self$metadata$sample)
#' @param multiplier numeric Number to multiply expected number of cells with (default = 3)
#' @return A numeric vector
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Add summary
#' crm$addSummaryFromCms()
#'
#' # Get no. droplets
#' crm$getTotalDroplets()
getTotalDroplets = function(samples = self$metadata$sample,
multiplier = 3) {
if (!is.numeric(multiplier)) stop("'multiplier' must be numeric.")
expected.cells <- self$getExpectedCells(samples = samples)
total.droplets <- expected.cells * multiplier
return(total.droplets)
},
#' @description Add a list of count matrices to the CRMetrics object.
#' @param cms list List of (sparse) count matrices (default = NULL)
#' @param data.path character Path to cellranger count data (default = self$data.path).
#' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
#' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)
#' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE)
#' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)
#' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE)
#' @param sep character Separator used to create unique cell names (default = "!!")
#' @param add.metadata boolean Add metadata from cms or not (default = TRUE)
#' @param n.cores integer Number of cores to use (default = self$n.cores)
#' @param verbose boolean Print progress (default = self$verbose)
#' @return Add list of (sparse) count matrices to R6 class object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#'
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' crm$addCms(cms = testdata.cms)
#' }
addCms = function(cms = NULL,
data.path = self$data.path,
samples = self$metadata$sample,
cellbender = FALSE,
raw = FALSE,
symbol = TRUE,
unique.names = TRUE,
sep = "!!",
add.metadata = TRUE,
n.cores = self$n.cores,
verbose = self$verbose) {
# Check
if (is.null(cms) && is.null(data.path)) stop("Either 'cms' or 'data.path' must be provided.")
if (!is.null(self$cms)) stop("CMs already present. To overwrite, set $cms = NULL and rerun this function.")
if (!is.null(cms)) {
# Add from cms argument
## Checks
if (!is.list(cms)) stop("'cms' must be a list of count matrices")
if (verbose) message(paste0("Adding list of ",length(cms)," count matrices."))
sample.class <- sapply(cms, class) %>%
unlist() %>%
sapply(\(x) grepl("Matrix", x))
if (!any(sample.class)) {
warning(paste0("Some samples are not a matrix (maybe they only contain 1 cell). Removing the following samples: ",paste(sample.class[!sample.class] %>% names(), collapse = " ")))
cms %<>% .[sample.class]
}
sample.cells <- sapply(cms, ncol) %>% unlist()
if (any(sample.cells == 0)) {
warning(paste0("Some samples does not contain cells. Removing the following samples: ",paste(sample.cells[sample.cells == 0] %>% names(), collapse=" ")))
cms %<>% .[sample.cells > 0]
}
if (is.null(samples)) samples <- names(cms)
if (is.null(samples)) stop("Either 'cms' must be named or 'samples' cannot be NULL")
if (length(samples) != length(cms)) stop("Length of 'samples' does not match length of 'cms'.")
## Create unique names
if (unique.names) cms %<>% createUniqueCellNames(samples, sep)
} else {
# Add from data.path argument
if (cellbender) {
cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
} else {
cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
}
}
self$cms <- cms
if (add.metadata) {
if (!is.null(self$metadata)) {
warning("Overwriting metadata\n")
}
self$metadata <- data.frame(sample = samples)
}
if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics().\n")
if (!is.null(self$con)) warning("Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding().\n")
if (!is.null(self$doublets)) warning("Consider updating doublet scores by setting $doublets <- NULL and running $detectDoublets().\n")
},
#' @description Plot the results from the CellBender estimations
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param pal character Plotting palette (default = self$pal)
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' ## Run CellBender script
#' crm$plotCbTraining()
#' }
plotCbTraining = function(data.path = self$data.path,
samples = self$metadata$sample,
pal = self$pal) {
checkDataPath(data.path)
checkPackageInstalled("rhdf5", bioc = TRUE)
paths <- getH5Paths(data.path, samples, "cellbender")
train.df <- samples %>%
lapply(\(id) {
rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>%
{data.frame(ELBO = .,
Epoch = seq_len(length(.)),
sample = id)}
}) %>%
setNames(samples) %>%
bind_rows()
test.df <- samples %>%
lapply(\(id) {
path <- paths[id]
data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"),
Epoch = rhdf5::h5read(path, "matrix/test_epoch"),
sample = id)
}) %>%
setNames(samples) %>%
bind_rows()
g <- ggplot() +
geom_point(data = train.df, aes(Epoch, ELBO, col = "Train")) +
geom_line(data = train.df, aes(Epoch, ELBO, col = "Train")) +
geom_point(data = test.df, aes(Epoch, ELBO, col = "Test")) +
geom_line(data = test.df, aes(Epoch, ELBO, col = "Test")) +
self$theme +
labs(col = "") +
facet_wrap(~sample, scales = "free_y")
if (!is.null(pal)) g <- g + scale_color_manual(values = pal)
return(g)
},
#' @description Plot the CellBender assigned cell probabilities
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param low.col character Color for low probabilities (default = "gray")
#' @param high.col character Color for high probabilities (default = "red")
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' ## Run the CellBender script
#' crm$plotCbCellProbs()
#' }
plotCbCellProbs = function(data.path = self$data.path,
samples = self$metadata$sample,
low.col = "gray",
high.col = "red") {
checkDataPath(data.path)
checkPackageInstalled("rhdf5", bioc = TRUE)
paths <- getH5Paths(data.path, samples, "cellbender")
cell.prob <- samples %>%
lapply(\(id) {
rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>%
{data.frame(prob = .,
cell = seq_len(length(.)),
sample = id)}
}) %>%
setNames(samples) %>%
bind_rows()
ggplot(cell.prob, aes(cell, prob, col = prob)) +
geom_point() +
scale_color_gradient(low=low.col, high=high.col) +
self$theme +
labs(x = "Cells", y = "Cell probability", col = "") +
facet_wrap(~sample, scales = "free_x")
},
#' @description Plot the estimated ambient gene expression per sample from CellBender calculations
#' @param cutoff numeric Horizontal line included in the plot to indicate highly expressed ambient genes (default = 0.005)
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' ## Run CellBender script
#' crm$plotCbAmbExp()
#' }
plotCbAmbExp = function(cutoff = 0.005,
data.path = self$data.path,
samples = self$metadata$sample) {
checkDataPath(data.path)
checkPackageInstalled("rhdf5", bioc = TRUE)
paths <- getH5Paths(data.path, samples, "cellbender")
amb <- samples %>%
lapply(\(id) {
rhdf5::h5read(paths[id], "matrix/ambient_expression") %>%
{data.frame(exp = .,
cell = seq_len(length(.)),
gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(),
sample = id)}
}) %>%
setNames(samples) %>%
bind_rows()
g <- ggplot(amb, aes(cell, exp)) +
geom_point() +
geom_hline(yintercept = cutoff) +
geom_label_repel(data = amb[amb$exp > cutoff,], aes(cell, exp, label = gene.names)) +
self$theme +
labs(y = "Ambient expression", x = "Genes") +
facet_wrap(~sample, scales = "free_y")
return(g)
},
#' @description Plot the most abundant estimated ambient genes from the CellBender calculations
#' @param cutoff numeric Cutoff of ambient gene expression to use to extract ambient genes per sample
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param pal character Plotting palette (default = self$pal)
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' ## Run CellBender script
#' crm$plotCbAmbGenes()
#' }
plotCbAmbGenes = function(cutoff = 0.005,
data.path = self$data.path,
samples = self$metadata$sample,
pal = self$pal) {
checkDataPath(data.path)
checkPackageInstalled("rhdf5", bioc = TRUE)
paths <- getH5Paths(data.path, samples, "cellbender")
amb <- samples %>%
lapply(\(id) {
rhdf5::h5read(paths[id], "matrix/ambient_expression") %>%
{data.frame(exp = .,
cell = seq_len(length(.)),
gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(),
sample = id)} %>%
filter(exp >= cutoff)
}) %>%
setNames(samples) %>%
bind_rows() %$%
table(gene.names) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
mutate(Freq = Freq / length(samples),
gene.names = factor(gene.names, levels = gene.names))
g <- ggplot(amb, aes(gene.names, Freq, fill = gene.names)) +
geom_bar(stat = "identity") +
self$theme +
labs(x = "", y = "Proportion") +
theme(axis.text.x = element_text(angle = 90)) +
guides(fill = "none")
if (!is.null(pal)) {
gene.len <- amb$gene.names %>%
unique() %>%
length()
if (length(pal) < gene.len) warning(paste0("Palette has ",length(pal)," colors but there are ",gene.len," genes, omitting palette.")) else g <- g + scale_fill_manual(values = pal)
}
return(g)
},
#' @description Add summary metrics from a list of count matrices
#' @param cms list A list of filtered count matrices (default = self$cms)
#' @param n.cores integer Number of cores to use (default = self$n.cores)
#' @param verbose logical Show progress (default = self$verbose)
#' @return data.frame
#' @examples
#' # Simulate data
#' testdata.cms <- lapply(seq_len(2), \(x) {
#' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
#' out[out < 0] <- 1
#' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
#' sapply(seq_len(1e3), \(x) paste0("cell",x)))
#' return(out)
#' })
#'
#' # Initialize
#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
#'
#' # Add summary
#' crm$addSummaryFromCms()
addSummaryFromCms = function(cms = self$cms,
n.cores = self$n.cores,
verbose = self$verbose) {
checkPackageInstalled("sparseMatrixStats", bioc = TRUE)
if (!is.null(self$summary.metrics)) warning("Overwriting existing summary metrics \n")
if (verbose) message(paste0(Sys.time()," Calculating ",length(cms)," summaries using ", if (n.cores < length(cms)) n.cores else length(cms)," cores"))
self$summary.metrics <- cms %>%
names() %>%
plapply(\(id) {
cm <- cms[[id]]
cm.bin <- cm
cm.bin[cm.bin > 0] <- 1
data.frame(cells = ncol(cm),
median.genes = sparseMatrixStats::colSums2(cm.bin) %>% median(),
median.umi = sparseMatrixStats::colSums2(cm) %>% median(),
total.genes = sum(sparseMatrixStats::rowSums2(cm.bin) > 0),
sample = id)
}, n.cores = n.cores) %>%
bind_rows() %>%
pivot_longer(cols = -c(sample),
names_to = "metric",
values_to = "value") %>%
mutate(metric = factor(metric, labels = c("estimated number of cells",
"median genes per cell",
"median umi counts per cell",
"total genes detected")))
if (verbose) message(paste0(Sys.time()," Done!"))
},
#' @description Run SoupX ambient RNA estimation and correction
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param n.cores numeric Number of cores (default = self$n.cores)
#' @param verbose logical Show progress (default = self$verbose)
#' @param arg.load10X list A list with additional parameters for `SoupX::load10X` (default = list())
#' @param arg.autoEstCont list A list with additional parameters for `SoupX::autoEstCont` (default = list())
#' @param arg.adjustCounts list A list with additional parameters for `SoupX::adjustCounts` (default = list())
#' @return List containing a list with corrected counts, and a data.frame containing plotting estimations
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$runSoupX()
#' }
runSoupX = function(data.path = self$data.path,
samples = self$metadata$sample,
n.cores = self$n.cores,
verbose = self$verbose,
arg.load10X = list(),
arg.autoEstCont = list(),
arg.adjustCounts = list()) {
checkDataPath(data.path)
checkPackageInstalled("SoupX", cran = TRUE)
if (verbose) message(paste0(Sys.time()," Running using ", if (n.cores <- length(samples)) n.cores else length(samples)," cores"))
# Create SoupX objects
if (verbose) message(paste0(Sys.time()," Loading data"))
soupx.list <- data.path %>%
pathsToList(samples) %>%
plapply(\(sample) {
arg <- list(dataDir = paste(sample[2],sample[1],"outs", sep = "/")) %>%
append(arg.load10X)
out <- do.call(SoupX::load10X, arg)
return(out)
}, n.cores = n.cores) %>%
setNames(samples)
# Perform automatic estimation of contamination
if (verbose) message(paste0(Sys.time()," Estimating contamination"))
tmp <- soupx.list %>%
plapply(\(soupx.obj) {
arg <- list(sc = soupx.obj) %>%
append(arg.autoEstCont)
out <- do.call(SoupX::autoEstCont, arg)
return(out)
}, n.cores = n.cores) %>%
setNames(samples)
# Save plot data
if (verbose) message(paste0(Sys.time()," Preparing plot data"))
rhoProbes <- seq(0,1,.001)
self$soupx$plot.df <- samples %>%
plapply(\(id) {
dat <- tmp[[id]]
## The following is taken from the SoupX package
post.rho <- dat$fit$posterior
priorRho <- dat$fit$priorRho
priorRhoStdDev <- dat$fit$priorRhoStdDev
v2 <- (priorRhoStdDev/priorRho)**2
k <- 1+v2**-2/2*(1+sqrt(1+4*v2))
theta <- priorRho/(k-1)
prior.rho <- dgamma(rhoProbes, k, scale=theta)
df <- data.frame(rhoProbes = rhoProbes,
post.rho = post.rho,
prior.rho = prior.rho) %>%
tidyr::pivot_longer(cols = -c("rhoProbes"),
names_to = "variable",
values_to = "value") %>%
mutate(rhoProbes = as.numeric(rhoProbes),
value = as.numeric(value),
sample = id)
return(df)
}, n.cores = n.cores) %>%
setNames(samples) %>%
bind_rows()
# Adjust counts
if (verbose) message(paste0(Sys.time()," Adjusting counts"))
self$soupx$cms.adj <- tmp %>%
plapply(\(sample) {
arg <- list(sc = sample) %>%
append(arg.adjustCounts)
out <- do.call(SoupX::adjustCounts, arg)
return(out)
}, n.cores = n.cores) %>%
setNames(samples)
if (verbose) message(paste0(Sys.time()," Done!"))
},
#' @description Plot the results from the SoupX estimations
#' @param plot.df data.frame SoupX estimations (default = self$soupx$plot.df)
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$runSoupX()
#' crm$plotSoupX()
#' }
plotSoupX = function(plot.df = self$soupx$plot.df) {
if (is.null(plot.df)) stop("No plot data found. Please run $runSoupX first.")
line.df <- plot.df %>%
split(., .$sample) %>%
lapply(\(x) x$rhoProbes[x$value == max(x$value)]) %>%
{lapply(names(.), \(x) data.frame(value = .[[x]], sample = x))} %>%
do.call(rbind, .)
ggplot(plot.df, aes(rhoProbes, value, linetype = variable, col = variable)) +
geom_line(show.legend = FALSE) +
geom_vline(data = line.df, aes(xintercept = value, col = "rho.max", linetype = "rho.max")) +
scale_color_manual(name = "", values = c("post.rho" = "black", "rho.max" = "red", "prior.rho" = "black")) +
scale_linetype_manual(name = "", values = c("post.rho" = "solid", "rho.max" = "solid", "prior.rho" = "dashed")) +
self$theme +
labs(x = "Contamination fraction", y = "Probability density") +
facet_wrap(~sample, scales = "free_y") +
theme(legend.spacing.y = unit(3, "pt")) +
guides(linetype = guide_legend(byrow = TRUE), col = guide_legend(byrow = TRUE))
},
#' @description Plot CellBender cell estimations against the estimated cell numbers from Cell Ranger
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param pal character Plotting palette (default = self$pal)
#' @return A ggplot2 object
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$prepareCellbender()
#' crm$saveCellbenderScript()
#' ## Run CellBender script
#' crm$plotCbCells()
#' }
plotCbCells = function(data.path = self$data.path,
samples = self$metadata$sample,
pal = self$pal) {
checkDataPath(data.path)
checkPackageInstalled("rhdf5", bioc = TRUE)
paths <- getH5Paths(data.path, samples, "cellbender_filtered")
df <- samples %>%
sapply(\(id) rhdf5::h5read(paths[id], "matrix/shape")[2]) %>%
{data.frame(exp = self$getExpectedCells(samples),
cb.cells = .,
sample = samples)} %>%
mutate(diff = cb.cells - exp,
rel = diff / exp) %>%
pivot_longer(cols = c(-sample),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable,
labels = c("CellBender cells",
"Difference to exp. cells",
"Expected cells",
"Relative difference to exp. cells")))
g <- ggplot(df, aes(sample, value, fill = sample)) +
geom_bar(stat = "identity") +
self$theme +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
guides(fill = "none") +
labs(x = "", y = "") +
facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2)
if (!is.null(pal)) g <- g + scale_fill_manual(values = pal)
return(g)
},
#' @description Add doublet results created from exported Python script
#' @param method character Which method to use, either `scrublet` or `doubletdetection` (default is both).
#' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
#' @param samples character Sample names to include (default = self$metadata$sample)
#' @param cms list List containing the count matrices (default = self$cms).
#' @param verbose boolean Print progress (default = self$verbose)
#' @return List of doublet results
#' @examples
#' \dontrun{
#' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
#' crm$detectDoublets(export = TRUE)
#' ## Run Python script
#' crm$addDoublets()
#' }
addDoublets = function(method = c("scrublet","doubletdetection"),
data.path = self$data.path,
samples = self$metadata$sample,
cms = self$cms,
verbose = self$verbose) {
# Check
if (length(data.path) > 1) data.path <- data.path[1]
unk <- setdiff(method, c("doubletdetection","scrublet"))
if (length(unk) > 0) stop(paste0("Method(s) not recognised: ",paste(unk, collapse = " ")))
ex.res <- self$doublets
if (!is.null(names(ex.res))) {
warning(paste0("Results for method(s) '",paste(intersect(names(ex.res), method), collapse = " "),"' found, skipping. Set $doublets$<METHOD> <- NULL and rerun this function to load."))
method <- setdiff(method, names(ex.res))
}
if (length(method) == 0) stop("No method left to load data for.")
# Load data
files <- dir(data.path, full.names = TRUE)
files.list <- method %>%
{`names<-`(lapply(., \(met) files[grepl(paste0(".",met), files, fixed = TRUE)]), .)}
not.found <- names(files.list)[sapply(files.list, length) == 0]
if (length(not.found) > 0) {
warning("No results found for method(s): ",paste(not.found, collapse = " "))
files.list %<>% .[setdiff(names(.), not.found)]
}
if (length(files.list) == 0) stop("No method left to load data for.")
res.list <- files.list %>%
names() %>%
lapply(\(met) {
len <- length(files.list[[met]])
if (len > 0) {
if (verbose) message(paste0("Found ",len," results for ",met))
tmp <- files.list[[met]] %>%
lapply(read.delim, sep = ",", header = TRUE) %>%
bind_rows() %>%
`names<-`(c("scores","labels"))
tmp[is.na(tmp)] <- 0
tmp %<>%
mutate(labels = as.logical(labels)) %>%
`rownames<-`(cms %>% lapply(colnames) %>% unlist() %>% unname())
if (verbose) message(paste0("Detected ",sum(tmp$labels, na.rm = TRUE)," possible doublets out of ",nrow(tmp)," cells."))
out <- list(results = tmp)
return(out)
}
}) %>%
`names<-`(method)
if (!is.null(ex.res)) res.list <- append(ex.res, res.list)
return(invisible(self$doublets <- res.list))
}
))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.