#' @title Cell clustering with Celda
#' @description Clusters the columns of a count matrix containing single-cell
#' data into K subpopulations. The
#' \code{useAssay} \link{assay} slot in the
#' \code{altExpName} \link{altExp} slot will be used if
#' it exists. Otherwise, the \code{useAssay}
#' \link{assay} slot in \code{x} will be used if
#' \code{x} is a \linkS4class{SingleCellExperiment} object.
#' @param x A numeric \link{matrix} of counts or a
#' \linkS4class{SingleCellExperiment}
#' with the matrix located in the assay slot under \code{useAssay} in
#' \code{altExp(x, altExpName)}.
#' Rows represent features and columns represent cells.
#' @param useAssay A string specifying the name of the
#' \link{assay} slot to use. Default "counts".
#' @param altExpName The name for the \link{altExp} slot
#' to use. Default "featureSubset".
#' @param sampleLabel Vector or factor. Denotes the sample label for each cell
#' (column) in the count matrix.
#' @param K Integer. Number of cell populations.
#' @param alpha Numeric. Concentration parameter for Theta. Adds a pseudocount
#' to each cell population in each sample. Default 1.
#' @param beta Numeric. Concentration parameter for Phi. Adds a pseudocount to
#' each feature in each cell population. Default 1.
#' @param algorithm String. Algorithm to use for clustering cell subpopulations.
#' One of 'EM' or 'Gibbs'. The EM algorithm is faster, especially for larger
#' numbers of cells. However, more chains may be required to ensure a good
#' solution is found. If 'EM' is selected, then 'stopIter' will be
#' automatically set to 1. Default 'EM'.
#' @param stopIter Integer. Number of iterations without improvement in the
#' log likelihood to stop inference. Default 10.
#' @param maxIter Integer. Maximum number of iterations of Gibbs sampling or
#' EM to perform. Default 200.
#' @param splitOnIter Integer. On every `splitOnIter` iteration, a heuristic
#' will be applied to determine if a cell population should be reassigned and
#' another cell population should be split into two clusters. To disable
#' splitting, set to -1. Default 10.
#' @param splitOnLast Logical. After `stopIter` iterations have been
#' performed without improvement, a heuristic will be applied to determine if
#' a cell population should be reassigned and another cell population should be
#' split into two clusters. If a split occurs, then `stopIter` will be reset.
#' Default TRUE.
#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility,
#' a default value of 12345 is used. If NULL, no calls to
#' \link[withr]{with_seed} are made.
#' @param nchains Integer. Number of random cluster initializations. Default 3.
#' @param zInitialize Character. One of 'random', 'split', or 'predefined'.
#' With 'random', cells are randomly assigned to a population. With 'split',
#' cells will be split into sqrt(K) populations and then each population will
#' be subsequently split into another sqrt(K) populations. With 'predefined',
#' values in `zInit` will be used to initialize `z`. Default 'split'.
#' @param zInit Integer vector. Sets initial starting values of z. If NULL,
#' starting values for each cell will be randomly sampled from `1:K`. `zInit`
#' can only be used when `zInitialize = 'predefined'`. Default NULL.
#' @param countChecksum Character. An MD5 checksum for the `counts` matrix.
#' Default NULL.
#' @param logfile Character. Messages will be redirected to a file named
#' `logfile`. If NULL, messages will be printed to stdout. Default NULL.
#' @param verbose Logical. Whether to print log messages. Default TRUE.
#' @param ... Ignored. Placeholder to prevent check warning.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object. Function
#' parameter settings are stored in the \link{metadata}
#' \code{"celda_parameters"} slot.
#' Columns \code{celda_sample_label} and \code{celda_cell_cluster} in
#' \link{colData} contain sample labels and celda cell
#' population clusters.
#' @seealso \link{celda_G} for feature clustering and \link{celda_CG} for
#' simultaneous clustering of features and cells. \link{celdaGridSearch} can
#' be used to run multiple values of K and multiple chains in parallel.
#' @examples
#' data(celdaCSim)
#' sce <- celda_C(celdaCSim$counts,
#' K = celdaCSim$K,
#' sampleLabel = celdaCSim$sampleLabel,
#' nchains = 1)
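#'
#' # A minimal sketch (not run) of the SingleCellExperiment workflow. It
#' # assumes selectFeatures() has been used to populate the "featureSubset"
#' # altExp slot before clustering.
#' \dontrun{
#' sce <- SingleCellExperiment::SingleCellExperiment(
#' assays = list(counts = celdaCSim$counts))
#' sce <- selectFeatures(sce)
#' sce <- celda_C(sce,
#' K = celdaCSim$K,
#' sampleLabel = celdaCSim$sampleLabel,
#' nchains = 1)
#' }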
#' @import Rcpp RcppEigen
#' @importFrom withr with_seed
#' @export
setGeneric("celda_C", function(x, ...) {
standardGeneric("celda_C")})
#' @rdname celda_C
#' @export
setMethod("celda_C",
signature(x = "SingleCellExperiment"),
function(x,
useAssay = "counts",
altExpName = "featureSubset",
sampleLabel = NULL,
K,
alpha = 1,
beta = 1,
algorithm = c("EM", "Gibbs"),
stopIter = 10,
maxIter = 200,
splitOnIter = 10,
splitOnLast = TRUE,
seed = 12345,
nchains = 3,
zInitialize = c("split", "random", "predefined"),
countChecksum = NULL,
zInit = NULL,
logfile = NULL,
verbose = TRUE) {
xClass <- "SingleCellExperiment"
if (!altExpName %in% SingleCellExperiment::altExpNames(x)) {
stop(altExpName, " not in 'altExpNames(x)'. Run ",
"selectFeatures(x) first!")
}
altExp <- SingleCellExperiment::altExp(x, altExpName)
if (!useAssay %in% SummarizedExperiment::assayNames(altExp)) {
stop(useAssay, " not in assayNames(altExp(x, altExpName))")
}
counts <- SummarizedExperiment::assay(altExp, i = useAssay)
altExp <- .celdaCWithSeed(counts = counts,
xClass = xClass,
useAssay = useAssay,
sce = altExp,
sampleLabel = sampleLabel,
K = K,
alpha = alpha,
beta = beta,
algorithm = match.arg(algorithm),
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
seed = seed,
nchains = nchains,
zInitialize = match.arg(zInitialize),
countChecksum = countChecksum,
zInit = zInit,
logfile = logfile,
verbose = verbose)
SingleCellExperiment::altExp(x, altExpName) <- altExp
return(x)
}
)
#' @rdname celda_C
#' @export
setMethod("celda_C",
signature(x = "matrix"),
function(x,
useAssay = "counts",
altExpName = "featureSubset",
sampleLabel = NULL,
K,
alpha = 1,
beta = 1,
algorithm = c("EM", "Gibbs"),
stopIter = 10,
maxIter = 200,
splitOnIter = 10,
splitOnLast = TRUE,
seed = 12345,
nchains = 3,
zInitialize = c("split", "random", "predefined"),
countChecksum = NULL,
zInit = NULL,
logfile = NULL,
verbose = TRUE) {
ls <- list()
ls[[useAssay]] <- x
sce <- SingleCellExperiment::SingleCellExperiment(assays = ls)
SingleCellExperiment::altExp(sce, altExpName) <- sce
xClass <- "matrix"
altExp <- .celdaCWithSeed(counts = x,
xClass = xClass,
useAssay = useAssay,
sce = SingleCellExperiment::altExp(sce, altExpName),
sampleLabel = sampleLabel,
K = K,
alpha = alpha,
beta = beta,
algorithm = match.arg(algorithm),
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
seed = seed,
nchains = nchains,
zInitialize = match.arg(zInitialize),
countChecksum = countChecksum,
zInit = zInit,
logfile = logfile,
verbose = verbose)
SingleCellExperiment::altExp(sce, altExpName) <- altExp
return(sce)
}
)
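# Wrapper around .celda_C(): runs the model either directly (when 'seed' is
# NULL) or inside withr::with_seed() for reproducibility, and then stores the
# fitted celda_C model and the run parameters in the SingleCellExperiment via
# .createSCEceldaC().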
.celdaCWithSeed <- function(counts,
xClass,
useAssay,
sce,
sampleLabel,
K,
alpha,
beta,
algorithm,
stopIter,
maxIter,
splitOnIter,
splitOnLast,
seed,
nchains,
zInitialize,
countChecksum,
zInit,
logfile,
verbose) {
.validateCounts(counts)
if (is.null(seed)) {
celdaCMod <- .celda_C(counts = counts,
sampleLabel = sampleLabel,
K = K,
alpha = alpha,
beta = beta,
algorithm = algorithm,
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
nchains = nchains,
zInitialize = zInitialize,
countChecksum = countChecksum,
zInit = zInit,
logfile = logfile,
verbose = verbose,
reorder = TRUE)
} else {
with_seed(seed,
celdaCMod <- .celda_C(counts = counts,
sampleLabel = sampleLabel,
K = K,
alpha = alpha,
beta = beta,
algorithm = algorithm,
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
nchains = nchains,
zInitialize = zInitialize,
countChecksum = countChecksum,
zInit = zInit,
logfile = logfile,
verbose = verbose,
reorder = TRUE))
}
sce <- .createSCEceldaC(celdaCMod = celdaCMod,
sce = sce,
xClass = xClass,
useAssay = useAssay,
algorithm = algorithm,
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
seed = seed,
nchains = nchains,
zInitialize = zInitialize,
zInit = zInit,
logfile = logfile,
verbose = verbose)
return(sce)
}
# celda_C main function
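# Runs 'nchains' independent chains, each with its own initialization of 'z',
# and returns the chain with the highest final log-likelihood as a celda_C
# S4 object. Within a chain, inference alternates Gibbs or EM updates of 'z'
# with periodic split heuristics until 'maxIter' iterations are reached or
# 'stopIter' iterations pass without improvement in the log-likelihood.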
.celda_C <- function(counts,
sampleLabel = NULL,
K,
alpha = 1,
beta = 1,
algorithm = c("EM", "Gibbs"),
stopIter = 10,
maxIter = 200,
splitOnIter = 10,
splitOnLast = TRUE,
nchains = 3,
zInitialize = c("split", "random", "predefined"),
countChecksum = NULL,
zInit = NULL,
logfile = NULL,
verbose = TRUE,
reorder = TRUE) {
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = FALSE,
verbose = verbose)
.logMessages("Starting Celda_C: Clustering cells.",
logfile = logfile,
append = TRUE,
verbose = verbose)
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose)
startTime <- Sys.time()
## Error checking and variable processing
counts <- .processCounts(counts)
if (is.null(countChecksum)) {
countChecksum <- .createCountChecksum(counts)
}
sampleLabel <- .processSampleLabels(sampleLabel, ncol(counts))
s <- as.integer(sampleLabel)
algorithm <- match.arg(algorithm)
if (algorithm == "EM") {
stopIter <- 1
}
algorithmFun <- ifelse(algorithm == "Gibbs",
".cCCalcGibbsProbZ",
".cCCalcEMProbZ"
)
zInitialize <- match.arg(zInitialize)
allChains <- seq(nchains)
bestResult <- NULL
for (i in allChains) {
## Initialize cluster labels
.logMessages(date(),
".. Initializing 'z' in chain",
i,
"with",
paste0("'", zInitialize, "' "),
logfile = logfile,
append = TRUE,
verbose = verbose
)
if (zInitialize == "predefined") {
if (is.null(zInit)) {
stop("'zInit' needs to specified when initilize.z == 'given'.")
}
z <- .initializeCluster(K,
ncol(counts),
initial = zInit,
fixed = NULL
)
} else if (zInitialize == "split") {
z <- .initializeSplitZ(counts,
K = K,
alpha = alpha,
beta = beta
)
} else {
z <- .initializeCluster(K,
ncol(counts),
initial = NULL,
fixed = NULL
)
}
zBest <- z
## Calculate counts one time up front
p <- .cCDecomposeCounts(counts, s, z, K)
nS <- p$nS
nG <- p$nG
nM <- p$nM
mCPByS <- p$mCPByS
nGByCP <- p$nGByCP
nCP <- p$nCP
nByC <- p$nByC
ll <- .cCCalcLL(
mCPByS = mCPByS,
nGByCP = nGByCP,
s = s,
K = K,
nS = nS,
nG = nG,
alpha = alpha,
beta = beta
)
iter <- 1L
numIterWithoutImprovement <- 0L
doCellSplit <- TRUE
while (iter <= maxIter & numIterWithoutImprovement <= stopIter) {
nextZ <- do.call(algorithmFun, list(
counts = counts,
mCPByS = mCPByS,
nGByCP = nGByCP,
nByC = nByC,
nCP = nCP,
z = z,
s = s,
K = K,
nG = nG,
nM = nM,
alpha = alpha,
beta = beta
))
mCPByS <- nextZ$mCPByS
nGByCP <- nextZ$nGByCP
nCP <- nextZ$nCP
z <- nextZ$z
## Perform split on i-th iteration of no improvement in log
## likelihood
tempLl <- .cCCalcLL(
mCPByS = mCPByS,
nGByCP = nGByCP,
s = s,
K = K,
nS = nS,
nG = nG,
alpha = alpha,
beta = beta
)
if (K > 2 & iter != maxIter &
((((numIterWithoutImprovement == stopIter &
!all(tempLl > ll))) & isTRUE(splitOnLast)) |
(splitOnIter > 0 & iter %% splitOnIter == 0 &
isTRUE(doCellSplit)))) {
.logMessages(date(),
" .... Determining if any cell clusters should be split.",
logfile = logfile,
append = TRUE,
sep = "",
verbose = verbose
)
res <- .cCSplitZ(
counts,
mCPByS,
nGByCP,
nCP,
s,
z,
K,
nS,
nG,
alpha,
beta,
zProb = t(nextZ$probs),
maxClustersToTry = K,
minCell = 3
)
.logMessages(res$message,
logfile = logfile,
append = TRUE,
verbose = verbose
)
# Reset convergence counter if a split occurred
if (!isTRUE(all.equal(z, res$z))) {
numIterWithoutImprovement <- 0L
doCellSplit <- TRUE
} else {
doCellSplit <- FALSE
}
## Re-calculate variables
z <- res$z
mCPByS <- res$mCPByS
nGByCP <- res$nGByCP
nCP <- res$nCP
}
## Calculate complete likelihood
tempLl <- .cCCalcLL(
mCPByS = mCPByS,
nGByCP = nGByCP,
s = s,
K = K,
nS = nS,
nG = nG,
alpha = alpha,
beta = beta
)
if ((all(tempLl > ll)) | iter == 1) {
zBest <- z
llBest <- tempLl
numIterWithoutImprovement <- 1L
} else {
numIterWithoutImprovement <- numIterWithoutImprovement + 1L
}
ll <- c(ll, tempLl)
.logMessages(date(),
".... Completed iteration:",
iter,
"| logLik:",
tempLl,
logfile = logfile,
append = TRUE,
verbose = verbose
)
iter <- iter + 1
}
names <- list(
row = rownames(counts),
column = colnames(counts),
sample = levels(sampleLabel)
)
result <- list(
z = zBest,
completeLogLik = ll,
finalLogLik = llBest,
K = K,
sampleLabel = sampleLabel,
alpha = alpha,
beta = beta,
countChecksum = countChecksum,
names = names
)
if (is.null(bestResult) ||
result$finalLogLik > bestResult$finalLogLik) {
bestResult <- result
}
.logMessages(date(),
".. Finished chain",
i,
logfile = logfile,
append = TRUE,
verbose = verbose
)
}
bestResult <- methods::new("celda_C",
clusters = list(z = bestResult$z),
params = list(
K = as.integer(bestResult$K),
alpha = bestResult$alpha,
beta = bestResult$beta,
countChecksum = bestResult$countChecksum
),
sampleLabel = bestResult$sampleLabel,
completeLogLik = bestResult$completeLogLik,
finalLogLik = bestResult$finalLogLik,
names = bestResult$names
)
if (isTRUE(reorder)) {
bestResult <- .reorderCeldaC(counts = counts, res = bestResult)
}
endTime <- Sys.time()
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages("Completed Celda_C. Total time:",
format(difftime(endTime, startTime)),
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
return(bestResult)
}
# Gibbs sampling for the celda_C Model
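# For each cell i, after it is removed from mCPByS, the collapsed update used
# below is (up to an additive constant):
#   log P(z_i = j | .) = log(mCPByS[j, s[i]] + alpha) +
#       sum(lgamma(nGByCP[, j] + counts[, i] + beta)) -
#       lgamma(nCP[j] + nByC[i] + nG * beta) -
#       sum(lgamma(nGByCP[, j] + beta)) +
#       lgamma(nCP[j] + nG * beta)
# where nGByCP and nCP exclude cell i. The 'j == z[i]' branch obtains the same
# quantity by subtracting the cell's counts instead, so counts only need to be
# moved between populations when the sampled label actually changes.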
.cCCalcGibbsProbZ <- function(counts,
mCPByS,
nGByCP,
nByC,
nCP,
z,
s,
K,
nG,
nM,
alpha,
beta,
doSample = TRUE) {
## Set variables up front outside of loop
probs <- matrix(NA, ncol = nM, nrow = K)
ix <- sample(seq(nM))
for (i in ix) {
## Subtract cell counts from current population assignment
# nGByCP1 <- nGByCP
# nGByCP1[, z[i]] <- nGByCP[, z[i]] - counts[, i]
# nGByCP1 <- .colSums(lgamma(nGByCP1 + beta), nrow(nGByCP), ncol(nGByCP))
# nCP1 <- nCP
# nCP1[z[i]] <- nCP1[z[i]] - nByC[i]
# nCP1 <- lgamma(nCP1 + (nG * beta))
## Add cell counts to all other populations
# nGByCP2 <- nGByCP
# otherIx <- seq(K)[-z[i]]
# nGByCP2[, otherIx] <- nGByCP2[, otherIx] + counts[, i]
# nGByCP2 <- .colSums(lgamma(nGByCP2 + beta), nrow(nGByCP), ncol(nGByCP))
# nCP2 <- nCP
# nCP2[otherIx] <- nCP2[otherIx] + nByC[i]
# nCP2 <- lgamma(nCP2 + (nG * beta))
mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L
## Calculate probabilities for each state
## When considering a specific cluster for this cell, there is no
## need to recalculate the contributions of cells in other clusters
for (j in seq_len(K)) {
# otherIx <- seq(K)[-j]
if (j != z[i]) { # when j is not current population assignment
## Theta simplified
probs[j, i] <- log(mCPByS[j, s[i]] + alpha) +
# if adding this cell -- Phi Numerator
sum(lgamma(nGByCP[, j] + counts[, i] + beta)) -
# if adding this cell -- Phi Denominator
lgamma(nCP[j] + nByC[i] + nG * beta) -
# if without this cell -- Phi Numerator
sum(lgamma(nGByCP[, j] + beta)) +
# if without this cell -- Phi Denominator
lgamma(nCP[j] + nG * beta)
# sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells)
# nGByCP2[j] - ## Phi Numerator (current cell)
# sum(nCP1[otherIx]) - ## Phi Denominator (other cells)
# nCP2[j] - ## Phi Denominator (current cell)
} else { # when j is current population assignment
## Theta simplified
probs[j, i] <- log(mCPByS[j, s[i]] + alpha) +
sum(lgamma(nGByCP[, j] + beta)) -
lgamma(nCP[j] + nG * beta) -
sum(lgamma(nGByCP[, j] - counts[, i] + beta)) +
lgamma(nCP[j] - nByC[i] + nG * beta)
}
}
## Sample next state and add back counts
prevZ <- z[i]
if (isTRUE(doSample)) {
z[i] <- .sampleLl(probs[, i])
}
if (prevZ != z[i]) {
nGByCP[, prevZ] <- nGByCP[, prevZ] - counts[, i]
nGByCP[, z[i]] <- nGByCP[, z[i]] + counts[, i]
nCP[prevZ] <- nCP[prevZ] - nByC[i]
nCP[z[i]] <- nCP[z[i]] + nByC[i]
}
mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] + 1L
}
return(list(
mCPByS = mCPByS,
nGByCP = nGByCP,
nCP = nCP,
z = z,
probs = probs
))
}
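# EM updates for the celda_C model. The E-step computes log(theta) (population
# by sample) and log(phi) (feature by population) from the current count
# matrices; the M-step assigns each cell to the population with the largest
# score,
#   probs[j, i] = log(theta[j, s[i]]) + sum(counts[, i] * log(phi[, j])),
# with the sum over features computed in C++ by eigenMatMultInt().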
.cCCalcEMProbZ <- function(counts,
mCPByS,
nGByCP,
nByC,
nCP,
z,
s,
K,
nG,
nM,
alpha,
beta,
doSample = TRUE) {
## Expectation given current cell population labels
theta <- fastNormPropLog(mCPByS, alpha)
phi <- fastNormPropLog(nGByCP, beta)
## Maximization to find best label for each cell
probs <- eigenMatMultInt(phi, counts) + theta[, s]
if (isTRUE(doSample)) {
zPrevious <- z
z <- apply(probs, 2, which.max)
## Recalculate counts based on new label
p <- .cCReDecomposeCounts(counts, s, z, zPrevious, nGByCP, K)
mCPByS <- p$mCPByS
nGByCP <- p$nGByCP
nCP <- p$nCP
}
return(list(
mCPByS = mCPByS,
nGByCP = nGByCP,
nCP = nCP,
z = z,
probs = probs
))
}
# Calculate log-likelihood for celda_C model
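# The value returned is the collapsed (marginal) log-likelihood with Theta and
# Phi integrated out, up to terms that do not depend on z. The "Theta"
# component computed below equals
#   sum over samples j of [ lgamma(K * alpha) -
#       lgamma(sum(mCPByS[, j]) + K * alpha) +
#       sum(lgamma(mCPByS[, j] + alpha) - lgamma(alpha)) ]
# and the "Phi" component is the analogous expression over populations with
# nGByCP, beta, and nG in place of mCPByS, alpha, and K.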
.cCCalcLL <- function(mCPByS,
nGByCP,
s,
z,
K,
nS,
nG,
alpha,
beta) {
## Calculate for "Theta" component
a <- nS * lgamma(K * alpha)
b <- sum(lgamma(mCPByS + alpha))
c <- -nS * K * lgamma(alpha)
d <- -sum(lgamma(colSums(mCPByS + alpha)))
thetaLl <- a + b + c + d
## Calculate for "Phi" component
a <- K * lgamma(nG * beta)
b <- sum(lgamma(nGByCP + beta))
c <- -K * nG * lgamma(beta)
d <- -sum(lgamma(colSums(nGByCP + beta)))
phiLl <- a + b + c + d
final <- thetaLl + phiLl
return(final)
}
# Takes raw counts matrix and converts it to a series of matrices needed for
# log likelihood calculation
# @param counts Integer matrix. Rows represent features and columns represent
# cells.
# @param s Integer vector. Contains the sample label for each cell (column) in
# the count matrix.
# @param z Numeric vector. Denotes cell population labels.
# @param K Integer. Number of cell populations.
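# @return A list of the sufficient statistics used throughout inference:
# mCPByS (K x nS, number of cells per population within each sample), nGByCP
# (nG x K, feature counts summed over the cells in each population), nCP
# (total counts per population), nByC (total counts per cell), and the
# dimensions nS, nG, and nM.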
.cCDecomposeCounts <- function(counts, s, z, K) {
nS <- length(unique(s))
nG <- nrow(counts)
nM <- ncol(counts)
mCPByS <- matrix(as.integer(table(factor(z, levels = seq(K)), s)),
ncol = nS
)
nGByCP <- .colSumByGroup(counts, group = z, K = K)
nCP <- as.integer(colSums(nGByCP))
nByC <- as.integer(colSums(counts))
return(list(
mCPByS = mCPByS,
nGByCP = nGByCP,
nCP = nCP,
nByC = nByC,
nS = nS,
nG = nG,
nM = nM
))
}
.cCReDecomposeCounts <- function(counts, s, z, previousZ, nGByCP, K) {
## Recalculate counts based on new label
nGByCP <- .colSumByGroupChange(counts, nGByCP, z, previousZ, K)
nS <- length(unique(s))
mCPByS <- matrix(as.integer(table(factor(z, levels = seq(K)), s)),
ncol = nS
)
nCP <- as.integer(colSums(nGByCP))
return(list(
mCPByS = mCPByS,
nGByCP = nGByCP,
nCP = nCP
))
}
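# Relabels cell populations so that the cluster numbering follows the order of
# a complete-linkage hierarchical clustering of the cosine distances between
# the columns of fm$posterior$module (the per-population feature
# probabilities). Only applied when K > 2 and more than one population is
# actually used.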
.reorderCeldaC <- function(counts, res) {
if (params(res)$K > 2 & isTRUE(length(unique(celdaClusters(res)$z)) > 1)) {
res@clusters$z <- as.integer(as.factor(celdaClusters(res)$z))
fm <- factorizeMatrix(counts, res)
uniqueZ <- sort(unique(celdaClusters(res)$z))
d <- .cosineDist(fm$posterior$module[, uniqueZ])
h <- stats::hclust(d, method = "complete")
res <- .recodeClusterZ(res,
from = h$order,
to = seq(length(h$order))
)
}
return(res)
}
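# Subsamples and normalizes the counts prior to dimensionality reduction. At
# most 'maxCells' cells are retained, removing cells proportionally from the
# larger clusters so that each cluster keeps at least 'minClusterSize' cells.
# Returns the normalized matrix (cells as rows) together with the column
# indices of the retained cells.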
.prepareCountsForDimReductionCeldaC <- function(sce,
useAssay,
maxCells,
minClusterSize,
normalize,
scaleFactor,
transformationFun) {
counts <- SummarizedExperiment::assay(sce, i = useAssay)
counts <- .processCounts(counts)
## Checking if maxCells and minClusterSize will work
if (!is.null(maxCells)) {
if ((maxCells < ncol(counts)) &
(maxCells / minClusterSize <
S4Vectors::metadata(sce)$celda_parameters$K)) {
stop("Cannot distribute ",
maxCells,
" cells among ",
S4Vectors::metadata(sce)$celda_parameters$K,
" clusters while maintaining a minumum of ",
minClusterSize,
" cells per cluster. Try increasing 'maxCells' or decreasing",
" 'minClusterSize'.")
}
} else {
maxCells <- ncol(counts)
}
## Select a subset of cells to sample if greater than 'maxCells'
totalCellsToRemove <- ncol(counts) - maxCells
zInclude <- rep(TRUE, ncol(counts))
if (totalCellsToRemove > 0) {
zTa <- tabulate(SummarizedExperiment::colData(sce)$celda_cell_cluster,
S4Vectors::metadata(sce)$celda_parameters$K)
## Number of cells that can be sampled from each cluster without
## going below the minimum threshold
clusterCellsToSample <- zTa - minClusterSize
clusterCellsToSample[clusterCellsToSample < 0] <- 0
## Number of cells to sample after excluding smaller clusters
## Rounding can cause number to be off by a few, so ceiling is
## used with a second round of subtraction
clusterNToSample <- ceiling((clusterCellsToSample /
sum(clusterCellsToSample)) * totalCellsToRemove)
diff <- sum(clusterNToSample) - totalCellsToRemove
clusterNToSample[which.max(clusterNToSample)] <-
clusterNToSample[which.max(clusterNToSample)] - diff
## Perform sampling for each cluster
for (i in which(clusterNToSample > 0)) {
zInclude[sample(which(
SummarizedExperiment::colData(sce)$celda_cell_cluster == i),
clusterNToSample[i])] <- FALSE
}
}
cellIx <- which(zInclude)
norm <- t(normalizeCounts(counts[, cellIx],
normalize = normalize,
scaleFactor = scaleFactor,
transformationFun = transformationFun))
return(list(norm = norm, cellIx = cellIx))
}
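# Stores the results of a celda_C run in the SingleCellExperiment: the model
# and run parameters go into metadata(sce)$celda_parameters, and the sample
# labels and cell population assignments are added to colData(sce) as
# 'celda_sample_label' and 'celda_cell_cluster'.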
.createSCEceldaC <- function(celdaCMod,
sce,
xClass,
useAssay,
algorithm,
stopIter,
maxIter,
splitOnIter,
splitOnLast,
seed,
nchains,
zInitialize,
zInit,
logfile,
verbose) {
# add metadata
S4Vectors::metadata(sce)[["celda_parameters"]] <- list(
model = "celda_C",
xClass = xClass,
useAssay = useAssay,
sampleLevels = celdaCMod@names$sample,
K = celdaCMod@params$K,
alpha = celdaCMod@params$alpha,
beta = celdaCMod@params$beta,
algorithm = algorithm,
stopIter = stopIter,
maxIter = maxIter,
splitOnIter = splitOnIter,
splitOnLast = splitOnLast,
seed = seed,
nchains = nchains,
zInitialize = zInitialize,
countChecksum = celdaCMod@params$countChecksum,
zInit = zInit,
logfile = logfile,
verbose = verbose,
completeLogLik = celdaCMod@completeLogLik,
finalLogLik = celdaCMod@finalLogLik,
cellClusterLevels = sort(unique(celdaClusters(celdaCMod)$z)))
SummarizedExperiment::rowData(sce)["rownames"] <- celdaCMod@names$row
SummarizedExperiment::colData(sce)["colnames"] <-
celdaCMod@names$column
SummarizedExperiment::colData(sce)["celda_sample_label"] <-
celdaCMod@sampleLabel
SummarizedExperiment::colData(sce)["celda_cell_cluster"] <-
celdaClusters(celdaCMod)$z
return(sce)
}