# R/initialize_clusters.R In celda: CEllular Latent Dirichlet Allocation

#### Defines functions .initializeSplitY.initializeSplitZ.initializeCluster

```.initializeCluster <- function(N,
len,
z = NULL,
initial = NULL,
fixed = NULL) {

# If initial values are given, then they will not be randomly initialized
if (!is.null(initial)) {
initValues <- sort(unique(initial))
if (length(unique(initial)) != N || length(initial) != len ||
!all(initValues %in% seq(N))) {
stop(
"'initial' needs to be a vector of length 'len'",
" containing N unique values."
)
}
z <- as.integer(as.factor(initial))
} else {
z <- rep(NA, len)
}

# Set any values that need to be fixed during sampling
if (!is.null(fixed)) {
fixedValues <- sort(unique(fixed))
if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
stop(
"'fixed' to be a vector of length 'len' where each entry is",
" one of N unique values or NA."
)
}
fixedIx <- !is.na(fixed)
z[fixedIx] <- fixed[fixedIx]
zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
} else {
zNotUsed <- seq(N)
fixedIx <- rep(FALSE, len)
}

# Randomly sample remaining values
zNa <- which(is.na(z))
if (length(zNa) > 0) {
z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
}

# Check to ensure each value is in the vector at least once
missing <- setdiff(seq(N), z)
for (i in missing) {
ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
if (ta[1] == 1) {
stop("'len' is not long enough to accomodate 'N' unique values")
}
ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
z[sample(ix, 1)] <- i
}

return(z)
}

.initializeSplitZ <- function(counts,
K,
KSubcluster = NULL,
alpha = 1,
beta = 1,
minCell = 3) {
s <- rep(1, ncol(counts))
if (is.null(KSubcluster)) {
KSubcluster <- ceiling(sqrt(K))
}

# Initialize the model with KSubcluster clusters
res <- .celda_C(
counts,
K = min(KSubcluster, ncol(counts)),
maxIter = 20,
zInitialize = "random",
alpha = alpha,
beta = beta,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE
)
overallZ <- as.integer(as.factor(celdaClusters(res)\$z))
currentK <- max(overallZ)

counter <- 0
while (currentK < K & counter < 25) {
# Determine which clusters are split-able
# KRemaining <- K - currentK
KPerCluster <- min(ceiling(K / currentK), KSubcluster)
KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)

zTa <- tabulate(overallZ, max(overallZ))

zToSplit <- which(zTa > minCell & zTa > KToUse)
if (length(zToSplit) > 1) {
zToSplit <- sample(zToSplit)
} else if (length(zToSplit) == 0) {
break
}

# Cycle through each splitable cluster and split it up into
# K.sublcusters
for (i in zToSplit) {
clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
K = KToUse,
zInitialize = "random",
alpha = alpha,
beta = beta,
maxIter = 20,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE)
tempZ <- as.integer(as.factor(celdaClusters(clustLabel)\$z))

# Reassign clusters with label > 1
splitIx <- tempZ > 1
ix <- overallZ == i
newZ <- overallZ[ix]
newZ[splitIx] <- currentK + tempZ[splitIx] - 1

overallZ[ix] <- newZ
currentK <- max(overallZ)

# Ensure that the maximum number of clusters does not get too large'
if (currentK > K + 10) {
break
}
}
counter <- counter + 1
}

# Decompose counts for likelihood calculation
p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
nS <- p\$nS
nG <- p\$nG
nM <- p\$nM
mCPByS <- p\$mCPByS
nGByCP <- p\$nGByCP
nCP <- p\$nCP
nByC <- p\$nByC

# Remove clusters 1-by-1 until K is reached
while (currentK > K) {
# Find second best assignment give current assignments for each cell
probs <- .cCCalcEMProbZ(counts,
s = s,
z = overallZ,
K = currentK,
mCPByS = mCPByS,
nGByCP = nGByCP,
nByC = nByC,
nCP = nCP,
nG = nG,
nM = nM,
alpha = alpha,
beta = beta,
doSample = FALSE)
zProb <- t(probs\$probs)
zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
zSecond <- apply(zProb, 1, which.max)

zTa <- tabulate(overallZ, currentK)
zNonEmpty <- which(zTa > 0)

# Find worst cluster by logLik to remove
previousZ <- overallZ
llShuffle <- rep(NA, currentK)
for (i in zNonEmpty) {
ix <- overallZ == i
newZ <- overallZ
newZ[ix] <- zSecond[ix]

p <- .cCReDecomposeCounts(
counts,
s,
newZ,
previousZ,
nGByCP,
currentK)
nGByCP <- p\$nGByCP
mCPByS <- p\$mCPByS
llShuffle[i] <- .cCCalcLL(
mCPByS,
nGByCP,
s,
newZ,
currentK,
nS,
nG,
alpha,
beta)
previousZ <- newZ
}

# Remove the cluster which had the the largest likelihood after removal
zToRemove <- which.max(llShuffle)

ix <- overallZ == zToRemove
overallZ[ix] <- zSecond[ix]

p <- .cCReDecomposeCounts(counts,
s,
overallZ,
previousZ,
nGByCP,
currentK)
nGByCP <- p\$nGByCP[, -zToRemove, drop = FALSE]
mCPByS <- p\$mCPByS[-zToRemove, , drop = FALSE]
overallZ <- as.integer(as.factor(overallZ))
currentK <- currentK - 1
}
return(overallZ)
}

.initializeSplitY <- function(counts,
L,
LSubcluster = NULL,
tempK = 100,
beta = 1,
delta = 1,
gamma = 1,
minFeature = 3) {
if (is.null(LSubcluster)) {
LSubcluster <- ceiling(sqrt(L))
}

# Collapse cells to managable number of clusters
if (!is.null(tempK) && ncol(counts) > tempK) {
z <- .initializeSplitZ(counts, K = tempK)
counts <- .colSumByGroup(counts, z, length(unique(z)))
}

# Initialize the model with KSubcluster clusters
res <- .celda_G(counts,
L = LSubcluster,
maxIter = 10,
yInitialize = "random",
beta = beta,
delta = delta,
gamma = gamma,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE)
overallY <- as.integer(as.factor(celdaClusters(res)\$y))
currentL <- max(overallY)

counter <- 0
while (currentL < L & counter < 25) {
# Determine which clusters are split-able
yTa <- tabulate(overallY, max(overallY))
yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))

if (length(yToSplit) == 0) {
break
}

# Cycle through each splitable cluster and split it up into
# LSublcusters
for (i in yToSplit) {
# make sure the colSums of subset counts is not 0
countsY <- counts[overallY == i, , drop = FALSE]
countsY <- countsY[, !(colSums(countsY) == 0)]

if (ncol(countsY) == 0) {
next
}

clustLabel <- .celda_G(
countsY,
L = min(LSubcluster, nrow(countsY)),
yInitialize = "random",
beta = beta,
delta = delta,
gamma = gamma,
maxIter = 20,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE
)
tempY <- as.integer(as.factor(celdaClusters(clustLabel)\$y))

# Reassign clusters with label > 1
splitIx <- tempY > 1
ix <- overallY == i
newY <- overallY[ix]
newY[splitIx] <- currentL + tempY[splitIx] - 1

overallY[ix] <- newY
currentL <- max(overallY)

# Ensure that the maximum number of clusters does not get too large
if (currentL > L + 10) {
break
}
}
counter <- counter + 1
}

## Decompose counts for likelihood calculation
p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
nTSByC <- p\$nTSByC
nByG <- p\$nByG
nByTS <- p\$nByTS
nGByTS <- p\$nGByTS
nM <- p\$nM
nG <- p\$nG
rm(p)

# Pre-compute lgamma values
lgbeta <- lgamma((seq(0, max(.colSums(
counts, nrow(counts),
ncol(counts)
)))) + beta)
lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))

# Remove clusters 1-by-1 until L is reached
while (currentL > L) {
# Find second best assignment give current assignments for each cell
probs <- .cGCalcGibbsProbY(
counts = counts,
y = overallY,
L = currentL,
nTSByC = nTSByC,
nByTS = nByTS,
nGByTS = nGByTS,
nByG = nByG,
nG = nG,
beta = beta,
delta = delta,
gamma = gamma,
lgbeta = lgbeta,
lggamma = lggamma,
lgdelta = lgdelta,
doSample = FALSE)
yProb <- t(probs\$probs)
yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
ySecond <- apply(yProb, 1, which.max)

yTa <- tabulate(overallY, currentL)
yNonEmpty <- which(yTa > 0)

# Find worst cluster by logLik to remove
previousY <- overallY
llShuffle <- rep(NA, currentL)
for (i in yNonEmpty) {
ix <- overallY == i
newY <- overallY
newY[ix] <- ySecond[ix]

# Move arounds counts for likelihood calculation
p <- .cGReDecomposeCounts(
counts,
newY,
previousY,
nTSByC,
nByG,
currentL)
nTSByC <- p\$nTSByC
nGByTS <- p\$nGByTS
nByTS <- p\$nByTS
llShuffle[i] <- .cGCalcLL(
nTSByC,
nByTS,
nByG,
nGByTS,
nM,
nG,
currentL,
beta,
delta,
gamma)
previousY <- newY
}

# Remove the cluster which had the the largest likelihood after removal
yToRemove <- which.max(llShuffle)

ix <- overallY == yToRemove
overallY[ix] <- ySecond[ix]

# Move around counts and remove module
p <- .cGReDecomposeCounts(
counts,
overallY,
previousY,
nTSByC,
nByG,
currentL)
nTSByC <- p\$nTSByC[-yToRemove, , drop = FALSE]
nGByTS <- p\$nGByTS[-yToRemove]
nByTS <- p\$nByTS[-yToRemove]
overallY <- as.integer(as.factor(overallY))
currentL <- currentL - 1
}
return(overallY)
}
```

## Try the celda package in your browser

Any scripts or data that you put into this service are public.

celda documentation built on Nov. 8, 2020, 8:24 p.m.