#' Doublet removement using Scrublet
#'
#' GMM is used by Kai Zhang in his Cell ATAC Atlas paper.
#' NormalMixEM is used by Yang Li in his CEMBA 1.0
#'
#' @param mat sparse matrix, cell by feature
#' @param path_to_python string
#' @param expected_doublet_rate double, default is 0.08
#' @param min_counts numeric, default is 3
#' @param min_cells integer, default is 5
#' @param min_conv_pctl numeric, default is 85
#' @param n_pc integer, default is 30
#' @importFrom reticulate r_to_py py_to_r import
#' @return list, five fields
#' - thresGMM: threshold based on GMM for simulation scores
#' - thresScrublet: threshold estimated from Scrublets
#' - probs: probs of real data based on GMM
#' - scrubletScore: doublet scores of real data based on Scrublets
#' - simuScrubletScore: doublet scores of simulation data based on Scrublets
#' - thresMixEM: threshold based normalmixEM for simulation scores.
#' @export
SnapATAC_runScrublet <- function(mat, path_to_python, expected_doublet_rate = 0.08,
min_counts = 3,
min_cells = 5L,
min_conv_pctl = 85,
n_pc = 30L) {
message("Epoch: loading python environments and packages ... \n")
# load library and python env
reticulate::use_python(path_to_python)
message("Use the Python located in:", path_to_python, "\n")
setSessionTimeLimit(cpu = Inf, elapsed = Inf)
message("Epoch: identify potential doublets ... \n")
scr <- import(module = "smmuty", convert = FALSE)
out <- scr$detectDoublet(
## scrublet needs cell-by-feature matrix
counts_matrix = r_to_py(mat),
expected_doublet_rate = expected_doublet_rate,
min_counts = min_counts,
min_cells = as.integer(min_cells),
min_gene_variability_pctl = min_conv_pctl,
n_prin_comps = as.integer(min(n_pc, nrow(mat), ncol(mat)))
)
out <- py_to_r(out)
names(out) <- c("thresGMM", "thresScrublet", "probs", "scrubletScore", "simuScrubletScore")
scores <- out$scrubletScore
ncell <- length(out$scrubletScore)
tgmm <- out$thresGMM
tscrub <- out$thresScrublet
tMixEM <- getScrubletThresholdByMixEM(scrubletSimScores = out$simuScrubletScore,
defaultCutoff = tscrub)
out$thresMixEM <- tMixEM
## summary
message("Epoch: summary doublets detection results ... \n")
message(paste("Threshold from GMM:", round(tgmm, 3)))
message(paste("Threshold from Raw Scrublets", round(tscrub, 3)))
message(paste("Threshold from MixEM", round(tMixEM, 3)))
message(paste("Number of cells in total:"), ncell)
message(paste("Doublet rate based on GMM:", round(sum(scores > tgmm) / ncell, 4)))
message(paste("Doublet rate based on RawScrublet:",round(sum(scores > tscrub) / ncell, 4)))
message(paste("Doublet rate based on MixEM:", round(sum(scores > tMixEM) / ncell, 4)))
## get result
return(out)
}
#' Another way to estimate the threshold for Scrublet
#' Method used by Yang Li in CEMBA 1.0.
#' @param scrubletSimScores numeric vector
#' @param defaultCutoff double, if mixEM failed, use this as threschold.
#' @param prob double, prob of a scrublet score treated as doublet, default is 0.5
#' @return double
#' @importFrom mixtools normalmixEM
#' @export
getScrubletThresholdByMixEM <- function(scrubletSimScores, defaultCutoff,
prob = 0.5) {
x <- scrubletSimScores[scrubletSimScores > 0]
x <- x[is.finite(x)]
model <- normalmixEM(x, k = 2, maxit = 10000)
i <- which.min(model$mu)
myfun <- function(x) {
prob - (model$lambda[i]*dnorm(x, model$mu[i], model$sigma[i]) /
(model$lambda[1]*dnorm(x, model$mu[1], model$sigma[1]) +
model$lambda[2]*dnorm(x, model$mu[2], model$sigma[2])))
}
out <- tryCatch({
stats::uniroot(f = myfun, interval = c(1e-2, 1), maxiter = 5000, trace = 0)$root
}, error = function(cond) {
message(cond)
return(defaultCutoff)
}, warning = function(cond) {
message(cond)
return(defaultCutoff)
}, finally = {
message("\nFinish normalmixEM.")
})
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.