SeuratAddon: Unsupervised evaluation of cell identities in Seurat

Documented in EvaluateIdent EvaluateIdentKmeans

#' Evaluate the cell identities using clustering
#'
#' This function is a simple wrapper for \code{EvaluateIdentKmeans},
#' facilitating unsupervised evaluation of cell identities.
#'
#' First, run K-means clustering or mini-batch K-means clustering using \code{ClusterCellsKmeans},
#' which computes computationally defined cell identities, \code{ident} in the Seurat object.
#' Second, run this function with the aforementioned Seurat object.
#' Ensure that the same parameters (\code{reduction.type}, \code{dims.use}, \code{k.param}, ...)
#' are used.
#'
#' @param object Seurat object
#' @param genes.use Genes to use for clustering
#' @param clustering Clustering method to use
#' @param data.cut Clip all z-scores to have an absolute value below this
#' Reduces the effect of huge outliers in the data. (default is NULL)
#' @param do.plot Draw histograms of statistics (default is FALSE)
#' @param seed Random seed
#' @param \dots Additional parameters passed to \code{EvaluateIdentKmeans} and related functions
#'
#' @seealso EvaluateIdentKmeans
#'
#' @importFrom tidyr gather
#' @import ggplot2
#' @importFrom methods new
#' @importFrom stats kmeans
#'
#' @return Seurat object where the unsupervised evaluation for cell identities is stored in
#' object@ident_prob. Furthermore, chosen probabilities (PIP, adjusted p-values, or etc) are
#' stored in object@meta.data["ident_prob"]. See the optional argument \code{prob.use}.
#'
#' @export EvaluateIdent
#'
EvaluateIdent <- function(
  object,
  genes.use = NULL,
  clustering = "kmeans",
  data.cut = NULL,
  do.plot = FALSE,
  seed = 1,
  ...
) {
  if("kmeans" %in% tolower(clustering)) {
    return(EvaluateIdentKmeans(
      object = object,
      genes.use = genes.use,
      seed = seed,
      data.cut = data.cut,
      do.plot = do.plot,
      minibatch = FALSE,
      ...
    ))
  } else if(any(c("minibatch","minibatchkmeans") %in% tolower(clustering))) {
    return(EvaluateIdentKmeans(
      object = object,
      genes.use = genes.use,
      seed = seed,
      data.cut = NULL,
      do.plot = do.plot,
      minibatch = TRUE,
      ...
    ))
  } else {
    stop("Please select clustering algorithms that are supported by SeuratAddon.")
  }
}

#' Evaluate the cell identities using K-means clustering
#'
#' Evaluate the cell identities that were determined from k-means clustering
#'
#' Use \code{data.cut} to trim outliers, as done in \code{DoKMeans2}.
#' Ensure that these parameters are identical to the original clustering step.
#'
#' @param object Seurat object
#' @param prob.use Probabilities saved to object@meta.data["ident_prob"]; To be used in downstream analysis by default
#' @param p.adjust.methods A p-value correction method for multiple comparisons
#' @param genes.use Genes to use for clustering
#' @param reduction.type Name of dimensional reduction technique (e.g. "pca", "ica")
#' @param dims.use A vector of the dimensions to use (e.g. To use the first 10 PCs, pass 1:10)
#' @param center Center the cells
#' @param data.cut Clip all z-scores to have an absolute value below this.
#' Reduces the effect of huge outliers in the data. (default is NULL)
#' @param minibatch FALSE by default. If TRUE, use the mini-batch K-means clustering implemented in the ClusterR package.
#' @param assay.type Type of assay to fetch data for (default is RNA)
#' @param do.plot Draw histograms of statistics (default is FALSE)
#' @param seed Random seed
#' @param verbose Print the computational progress. TRUE, by default.
#' @param return.jackstraw FALSE by default. If TRUE, return a list of jackstraw-related statistics, instead of a Seurat object,
#' @param \dots Additional parameters passed to jackstraw.kmeans
#'
#' @seealso DoKMeans2
#'
#' @importFrom tidyr gather
#' @import ggplot2
#' @importFrom methods new
#' @importFrom stats kmeans
#'
#' @return Seurat object where the unsupervised evaluation for cell identities is stored in
#' object@ident_prob. Furthermore, chosen probabilities (PIP, adjusted p-values, or etc) are
#' stored in object@meta.data["ident_prob"]. See the optional argument \code{prob.use}.
#'
#' @export EvaluateIdentKmeans
#'
#' @examples
#' \dontrun{
#' set.seed(1234)
#' require(Seurat)
#'   pbmc_small <- DoKMeans2(pbmc_small, k.cells = 3)
#'   pbmc_small2 <- EvaluateIdentKmeans(pbmc_small)
#' }
#'
EvaluateIdentKmeans <- function(
  object,
  prob.use = "pip",
  p.adjust.methods = "BH",
  genes.use = NULL,
  reduction.type = NULL,
  dims.use = NULL,
  center = TRUE,
  data.cut = NULL,
  minibatch = FALSE,
  assay.type="RNA",
  do.plot = TRUE,
  seed = 1,
  verbose = TRUE,
  return.jackstraw = FALSE,
  ...
) {
  # Make sure k-means clustering has been applied
  if(is.null(object@kmeans)) {
    stop("Run k-means clustering on the Seurat object using DoKMeans2() with k.cells > 0.")
  } else {
    if(is.null(object@kmeans@cell.kmeans.obj)) {
      stop("Run k-means clustering on the Seurat object using DoKMeans2() with k.cells > 0.")
    }
  }
  k.cells = length(unique(object@kmeans@cell.kmeans.obj$cluster))

  # Get and prepare data
  if(is.null(genes.use)) { genes.use <- Seurat:::SetIfNull(x = genes.use, default = object@var.genes) }
  if (is.null(x = dims.use)) {
    message("Using scaled data.")
    data.use <- GetAssayData(
      object = object,
      assay.type = assay.type,
      slot = "scale.data"
    )
    # rows: genes and cols: cells
    genes.use <- genes.use[genes.use %in% rownames(x = data.use)]
    data.use <- data.use[genes.use,]
    # rows: cells and cols: genes
    if(center) {
      data.use <- t(scale(data.use, center = TRUE, scale = FALSE))
    } else {
      data.use <- t(data.use)
    }
    if(!is.null(data.cut)) { data.use <- MinMax(data = data.use, min = data.cut * (-1), max = data.cut) }
  } else {
    message(paste0("Using ", reduction.type, "."))
    data.use <- GetCellEmbeddings(object = object,
                                  reduction.type = reduction.type,
                                  dims.use = dims.use)
    # rows: cells and cols: genes
    if(center) {
      data.use <- t(scale(t(data.use), center = TRUE, scale = FALSE))
    }
  }

  if(minibatch) {
    jackstraw.ident = jackstraw_MiniBatchKmeans(data.use, MiniBatchKmeans.output=object@kmeans@cell.kmeans.obj, verbose=verbose, center=center, ...)
  } else {
    jackstraw.ident = jackstraw_kmeanspp(data.use, kmeans.dat=object@kmeans@cell.kmeans.obj, verbose=verbose, center=center, ...)
  }
  jackstraw.ident$pi0 = tryCatch(pi0est(jackstraw.ident$p.F$pi0),
    error = function(e) {
      message(paste("Default pi0 estimation failed. Setting pi0 = 1 as the conservative option."));
      1}
  )

  jackstraw.ident$pip <- pip(jackstraw.ident$p.F, pi0=jackstraw.ident$pi0)
  jackstraw.ident$p.adj <- p.adjust(jackstraw.ident$p.F, method=p.adjust.methods)

  if (any(c("PIP","pip","posterior") %in% prob.use)) {
    object@meta.data["ident_prob"] = unlist(jackstraw.ident$pip)
  } else if (any(c("p.adjust","p.adj") %in% prob.use)) {
    object@meta.data["ident_prob"] = unlist(jackstraw.ident$p.adj)
  } else if (any(c("p.value","pvalue") %in% prob.use)) {
    object@meta.data["ident_prob"] = unlist(jackstraw.ident$p.F)
  }

  if (do.plot) {
    plot_data <- gather(data.frame(AdjustedPvalue = jackstraw.ident$p.adj, PIP = jackstraw.ident$pip))
    ggplot(plot_data) + geom_histogram(aes(value)) + facet_wrap(~ key) + theme_bw()
  }

  if(return.jackstraw) {
    return(jackstraw.ident)
  } else {
    return(object)
  }
}