R/initialize_clusters.R
In celda: CEllular Latent Dirichlet Allocation

.initializeCluster <- function(N,
                               len,
                               z = NULL,
                               initial = NULL,
                               fixed = NULL) {

  # If initial values are given, then they will not be randomly initialized
  if (!is.null(initial)) {
    initValues <- sort(unique(initial))
    if (length(unique(initial)) != N || length(initial) != len ||
      !all(initValues %in% seq(N))) {
      stop(
        "'initial' needs to be a vector of length 'len'",
        " containing N unique values."
      )
    }
    z <- as.integer(as.factor(initial))
  } else {
    z <- rep(NA, len)
  }

  # Set any values that need to be fixed during sampling
  if (!is.null(fixed)) {
    fixedValues <- sort(unique(fixed))
    if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
      stop(
        "'fixed' to be a vector of length 'len' where each entry is",
        " one of N unique values or NA."
      )
    }
    fixedIx <- !is.na(fixed)
    z[fixedIx] <- fixed[fixedIx]
    zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
  } else {
    zNotUsed <- seq(N)
    fixedIx <- rep(FALSE, len)
  }

  # Randomly sample remaining values
  zNa <- which(is.na(z))
  if (length(zNa) > 0) {
    z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
  }

  # Check to ensure each value is in the vector at least once
  missing <- setdiff(seq(N), z)
  for (i in missing) {
    ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
    if (ta[1] == 1) {
      stop("'len' is not long enough to accomodate 'N' unique values")
    }
    ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
    z[sample(ix, 1)] <- i
  }

  return(z)
}


.initializeSplitZ <- function(counts,
                              K,
                              KSubcluster = NULL,
                              alpha = 1,
                              beta = 1,
                              minCell = 3) {
  s <- rep(1, ncol(counts))
  if (is.null(KSubcluster)) {
    KSubcluster <- ceiling(sqrt(K))
  }

  # Initialize the model with KSubcluster clusters
  res <- .celda_C(
    counts,
    K = min(KSubcluster, ncol(counts)),
    maxIter = 20,
    zInitialize = "random",
    alpha = alpha,
    beta = beta,
    splitOnIter = -1,
    splitOnLast = FALSE,
    verbose = FALSE,
    reorder = FALSE
  )
  overallZ <- as.integer(as.factor(celdaClusters(res)$z))
  currentK <- max(overallZ)

  counter <- 0
  while (currentK < K & counter < 25) {
    # Determine which clusters are split-able
    # KRemaining <- K - currentK
    KPerCluster <- min(ceiling(K / currentK), KSubcluster)
    KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)

    zTa <- tabulate(overallZ, max(overallZ))

    zToSplit <- which(zTa > minCell & zTa > KToUse)
    if (length(zToSplit) > 1) {
      zToSplit <- sample(zToSplit)
    } else if (length(zToSplit) == 0) {
      break
    }

    # Cycle through each splitable cluster and split it up into
    # K.sublcusters
    for (i in zToSplit) {
      clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
        K = KToUse,
        zInitialize = "random",
        alpha = alpha,
        beta = beta,
        maxIter = 20,
        splitOnIter = -1,
        splitOnLast = FALSE,
        verbose = FALSE)
      tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z))

      # Reassign clusters with label > 1
      splitIx <- tempZ > 1
      ix <- overallZ == i
      newZ <- overallZ[ix]
      newZ[splitIx] <- currentK + tempZ[splitIx] - 1

      overallZ[ix] <- newZ
      currentK <- max(overallZ)

      # Ensure that the maximum number of clusters does not get too large'
      if (currentK > K + 10) {
        break
      }
    }
    counter <- counter + 1
  }

  # Decompose counts for likelihood calculation
  p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
  nS <- p$nS
  nG <- p$nG
  nM <- p$nM
  mCPByS <- p$mCPByS
  nGByCP <- p$nGByCP
  nCP <- p$nCP
  nByC <- p$nByC

  # Remove clusters 1-by-1 until K is reached
  while (currentK > K) {
    # Find second best assignment give current assignments for each cell
    probs <- .cCCalcEMProbZ(counts,
      s = s,
      z = overallZ,
      K = currentK,
      mCPByS = mCPByS,
      nGByCP = nGByCP,
      nByC = nByC,
      nCP = nCP,
      nG = nG,
      nM = nM,
      alpha = alpha,
      beta = beta,
      doSample = FALSE)
    zProb <- t(probs$probs)
    zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
    zSecond <- apply(zProb, 1, which.max)

    zTa <- tabulate(overallZ, currentK)
    zNonEmpty <- which(zTa > 0)

    # Find worst cluster by logLik to remove
    previousZ <- overallZ
    llShuffle <- rep(NA, currentK)
    for (i in zNonEmpty) {
      ix <- overallZ == i
      newZ <- overallZ
      newZ[ix] <- zSecond[ix]

      p <- .cCReDecomposeCounts(
        counts,
        s,
        newZ,
        previousZ,
        nGByCP,
        currentK)
      nGByCP <- p$nGByCP
      mCPByS <- p$mCPByS
      llShuffle[i] <- .cCCalcLL(
        mCPByS,
        nGByCP,
        s,
        newZ,
        currentK,
        nS,
        nG,
        alpha,
        beta)
      previousZ <- newZ
    }

    # Remove the cluster which had the the largest likelihood after removal
    zToRemove <- which.max(llShuffle)

    ix <- overallZ == zToRemove
    overallZ[ix] <- zSecond[ix]

    p <- .cCReDecomposeCounts(counts,
      s,
      overallZ,
      previousZ,
      nGByCP,
      currentK)
    nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
    mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
    overallZ <- as.integer(as.factor(overallZ))
    currentK <- currentK - 1
  }
  return(overallZ)
}


.initializeSplitY <- function(counts,
                              L,
                              LSubcluster = NULL,
                              tempK = 100,
                              beta = 1,
                              delta = 1,
                              gamma = 1,
                              minFeature = 3) {
  if (is.null(LSubcluster)) {
    LSubcluster <- ceiling(sqrt(L))
  }

  # Collapse cells to managable number of clusters
  if (!is.null(tempK) && ncol(counts) > tempK) {
    z <- .initializeSplitZ(counts, K = tempK)
    counts <- .colSumByGroup(counts, z, length(unique(z)))
  }

  # Initialize the model with KSubcluster clusters
  res <- .celda_G(counts,
    L = LSubcluster,
    maxIter = 10,
    yInitialize = "random",
    beta = beta,
    delta = delta,
    gamma = gamma,
    splitOnIter = -1,
    splitOnLast = FALSE,
    verbose = FALSE,
    reorder = FALSE)
  overallY <- as.integer(as.factor(celdaClusters(res)$y))
  currentL <- max(overallY)

  counter <- 0
  while (currentL < L & counter < 25) {
    # Determine which clusters are split-able
    yTa <- tabulate(overallY, max(overallY))
    yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))

    if (length(yToSplit) == 0) {
      break
    }

    # Cycle through each splitable cluster and split it up into
    # LSublcusters
    for (i in yToSplit) {
      # make sure the colSums of subset counts is not 0
      countsY <- counts[overallY == i, , drop = FALSE]
      countsY <- countsY[, !(colSums(countsY) == 0)]

      if (ncol(countsY) == 0) {
        next
      }

      clustLabel <- .celda_G(
        countsY,
        L = min(LSubcluster, nrow(countsY)),
        yInitialize = "random",
        beta = beta,
        delta = delta,
        gamma = gamma,
        maxIter = 20,
        splitOnIter = -1,
        splitOnLast = FALSE,
        verbose = FALSE
      )
      tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y))

      # Reassign clusters with label > 1
      splitIx <- tempY > 1
      ix <- overallY == i
      newY <- overallY[ix]
      newY[splitIx] <- currentL + tempY[splitIx] - 1

      overallY[ix] <- newY
      currentL <- max(overallY)

      # Ensure that the maximum number of clusters does not get too large
      if (currentL > L + 10) {
        break
      }
    }
    counter <- counter + 1
  }

  ## Decompose counts for likelihood calculation
  p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
  nTSByC <- p$nTSByC
  nByG <- p$nByG
  nByTS <- p$nByTS
  nGByTS <- p$nGByTS
  nM <- p$nM
  nG <- p$nG
  rm(p)

  # Pre-compute lgamma values
  lgbeta <- lgamma((seq(0, max(.colSums(
    counts, nrow(counts),
    ncol(counts)
  )))) + beta)
  lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
  lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))

  # Remove clusters 1-by-1 until L is reached
  while (currentL > L) {
    # Find second best assignment give current assignments for each cell
    probs <- .cGCalcGibbsProbY(
      counts = counts,
      y = overallY,
      L = currentL,
      nTSByC = nTSByC,
      nByTS = nByTS,
      nGByTS = nGByTS,
      nByG = nByG,
      nG = nG,
      beta = beta,
      delta = delta,
      gamma = gamma,
      lgbeta = lgbeta,
      lggamma = lggamma,
      lgdelta = lgdelta,
      doSample = FALSE)
    yProb <- t(probs$probs)
    yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
    ySecond <- apply(yProb, 1, which.max)

    yTa <- tabulate(overallY, currentL)
    yNonEmpty <- which(yTa > 0)

    # Find worst cluster by logLik to remove
    previousY <- overallY
    llShuffle <- rep(NA, currentL)
    for (i in yNonEmpty) {
      ix <- overallY == i
      newY <- overallY
      newY[ix] <- ySecond[ix]

      # Move arounds counts for likelihood calculation
      p <- .cGReDecomposeCounts(
        counts,
        newY,
        previousY,
        nTSByC,
        nByG,
        currentL)
      nTSByC <- p$nTSByC
      nGByTS <- p$nGByTS
      nByTS <- p$nByTS
      llShuffle[i] <- .cGCalcLL(
        nTSByC,
        nByTS,
        nByG,
        nGByTS,
        nM,
        nG,
        currentL,
        beta,
        delta,
        gamma)
      previousY <- newY
    }

    # Remove the cluster which had the the largest likelihood after removal
    yToRemove <- which.max(llShuffle)

    ix <- overallY == yToRemove
    overallY[ix] <- ySecond[ix]

    # Move around counts and remove module
    p <- .cGReDecomposeCounts(
      counts,
      overallY,
      previousY,
      nTSByC,
      nByG,
      currentL)
    nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE]
    nGByTS <- p$nGByTS[-yToRemove]
    nByTS <- p$nByTS[-yToRemove]
    overallY <- as.integer(as.factor(overallY))
    currentL <- currentL - 1
  }
  return(overallY)
}