SeuratAddon: Unsupervised evaluation of cell identities in Seurat

Documented in ClusterCellsKmeans

#' Cluster cells with K-means clustering and mini-batch K-means clustering
#'
#' Perform K-means clustering on cells, to obtain cell identities.
#' Based on \code{Seurat::DoKMeans}.
#'
#' K-means clustering is performed on either scaled data or reduced dimensions,
#' with a number of subpopulations set to \code{k.cells}.
#' Scaled data is used when \code{dims.use} is NULL; otherwise the cell
#' embeddings of \code{reduction.type} restricted to \code{dims.use} are used.
#' After the initial K-means clustering is performed,
#' one can use \code{EvaluateIdent} to test individual cell identities, as
#' given by K-means clustering.
#'
#' @param object Seurat object
#' @param genes.use Genes to use for clustering (default is
#' \code{object@@var.genes}). Only used when clustering scaled data; genes
#' absent from the scaled data are dropped.
#' @param reduction.type Name of dimensional reduction technique to use in
#' k-means clustering. Required when \code{dims.use} is supplied and ignored
#' when \code{dims.use} is NULL. (default is NULL)
#' @param dims.use A vector of the dimensions to use in k-means clustering
#' (e.g. To use the first 10 PCs, pass 1:10). If NULL, scaled data for
#' \code{genes.use} is clustered instead. (default is NULL)
#' @param num_init A number of times the algorithm will be run with different
#' centroid seeds; passed to \code{ClusterR::KMeans_rcpp} only. (default is 5)
#' @param center Center each cell by subtracting its mean over the selected
#' genes or dimensions before clustering (default is TRUE)
#' @param k.cells K value to use for clustering cells. Must be at least 2.
#' @param k.seed Random seed, set immediately before clustering
#' @param do.plot Call \code{KMeansHeatmap} on the resulting object
#' (default is FALSE).
#' @param data.cut Clip all z-scores to have an absolute value below this.
#' Reduces the effect of huge outliers in the data. Only applied when
#' clustering scaled data. (default is NULL)
#' @param k.cols Color palette for heatmap. Currently not referenced by the
#' function body.
#' @param set.ident Set the cell identity class to its K-means cluster
#' (default is TRUE)
#' @param minibatch FALSE by default. If TRUE, use the mini-batch K-means
#' clustering implemented in the ClusterR package. Ignored when
#' \code{do.constrained} is TRUE.
#' @param do.constrained FALSE by default. If TRUE, use the constrained K-means
#' function implemented in the tclust package.
#' @param assay.type Assay whose scaled data is clustered (default is RNA),
#' but can be changed for multimodal analyses.
#' @param \dots Additional parameters passed to the selected clustering
#' function (\code{tclust::tkmeans}, \code{ClusterR::MiniBatchKmeans} or
#' \code{ClusterR::KMeans_rcpp})
#'
#' @importFrom methods new
#' @importFrom stats kmeans
#' @importFrom tclust tkmeans
#' @importFrom ClusterR MiniBatchKmeans
#' @importFrom ClusterR predict_MBatchKMeans
#' @importFrom ClusterR KMeans_rcpp
#'
#' @return Seurat object where the k-means result for cells is stored in
#' object@@kmeans (as \code{cell.kmeans.obj}). The cluster for each cell is
#' stored in the meta.data column \code{kmeans.<k.cells>.ident}
#' (e.g. \code{kmeans.3.ident} for \code{k.cells = 3}) and also used as the
#' cell identity (if set.ident=TRUE)
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Cluster single cells
#' pbmc_small <- ClusterCellsKmeans(pbmc_small, k.cells = 3)
ClusterCellsKmeans <- function(
  object,
  genes.use = NULL,
  reduction.type = NULL,
  dims.use = NULL,
  num_init = 5,
  center = TRUE,
  k.cells = NULL,
  k.seed = 1,
  do.plot = FALSE,
  data.cut = NULL,
  k.cols = PurpleAndYellow(),
  set.ident = TRUE,
  minibatch = FALSE,
  do.constrained = FALSE,
  assay.type = "RNA",
  ...
) {
  # `||` short-circuits, so a NULL k.cells reaches stop() instead of making
  # `if` fail on a zero-length condition.
  if (is.null(k.cells) || k.cells < 2) {
    stop("Set the proper number of clusters for cells, for evaluation of cell identities.")
  }

  if (is.null(x = dims.use)) {
    message("Using scaled data.")
    if (is.null(genes.use)) {
      genes.use <- object@var.genes
    }
    # rows: genes and cols: cells
    data.use <- GetAssayData(
      object = object,
      assay.type = assay.type,
      slot = "scale.data"
    )
    genes.use <- genes.use[genes.use %in% rownames(x = data.use)]
    if (length(genes.use) == 0) {
      stop("None of the genes to use are present in the scaled data.")
    }
    # drop = FALSE keeps a matrix even when a single gene is selected.
    data.use <- data.use[genes.use, , drop = FALSE]
    if (center) {
      # scale() works column-wise, i.e. it centers each cell here.
      data.use <- scale(data.use, center = TRUE, scale = FALSE)
    }
    # rows: cells and cols: genes
    data.use <- t(data.use)
    if (!is.null(data.cut)) {
      data.use <- MinMax(
        data = data.use,
        min = data.cut * (-1),
        max = data.cut
      )
    }
  } else {
    if (is.null(reduction.type)) {
      stop("Set reduction.type when dims.use is supplied.")
    }
    message("Using ", reduction.type, ".")
    # rows: cells and cols: dimensions
    data.use <- GetCellEmbeddings(
      object = object,
      reduction.type = reduction.type,
      dims.use = dims.use
    )
    if (center) {
      # Transpose so that scale() centers each cell, then transpose back.
      data.use <- t(scale(t(data.use), center = TRUE, scale = FALSE))
    }
  }

  message("Clustering ", nrow(data.use), " cells.")
  # k-means clustering cells; the seed is shared by all three back ends.
  set.seed(seed = k.seed)
  if (do.constrained) {
    message("Truncated K-means clustering.")
    Seurat:::PackageCheck("tclust")
    kmeans.cell <- tclust::tkmeans(x = data.use, k = k.cells, ...)
  } else if (minibatch) {
    message("Mini-batch K-means clustering.")
    Seurat:::PackageCheck("ClusterR")
    kmeans.cell <- ClusterR::MiniBatchKmeans(
      data = data.use,
      clusters = k.cells,
      ...
    )
    # MiniBatchKmeans only fits centroids; assign every cell to its centroid.
    kmeans.cell$cluster <- ClusterR::predict_MBatchKMeans(
      data = data.use,
      CENTROIDS = kmeans.cell$centroids
    )
  } else {
    message("K-means clustering with K-means++.")
    kmeans.cell <- ClusterR::KMeans_rcpp(
      data = data.use,
      clusters = k.cells,
      num_init = num_init,
      ...
    )
    # KMeans_rcpp names its assignment vector `clusters`; expose it as
    # `cluster` like the other branches rather than relying on `$` partial
    # matching.
    if (is.null(kmeans.cell[["cluster"]])) {
      kmeans.cell[["cluster"]] <- kmeans.cell[["clusters"]]
    }
  }

  # Label assignments with the cell names of the matrix that was clustered;
  # fall back to the object's cell names if that matrix has no row names.
  cell.ids <- rownames(x = data.use)
  if (is.null(cell.ids)) {
    cell.ids <- object@cell.names
  }
  names(x = kmeans.cell$cluster) <- cell.ids

  object@kmeans <- new(
    Class = "kmeans.info",
    cell.kmeans.obj = kmeans.cell
  )

  # Meta data column is named after k, e.g. "kmeans.3.ident".
  kmeans.code <- paste("kmeans", k.cells, "ident", sep = ".")
  object@meta.data[names(x = kmeans.cell$cluster), kmeans.code] <-
    kmeans.cell$cluster

  if (set.ident) {
    object <- Seurat::SetIdent(
      object = object,
      cells.use = names(x = kmeans.cell$cluster),
      ident.use = kmeans.cell$cluster
    )
  }
  if (do.plot) {
    KMeansHeatmap(object = object)
  }
  object
}