iMUBAC: Integration of Multi-Batch Cytometry datasets

Documented in clustering clustering_with_masking

#' @title Clustering
#'
#' @description
#' \code{clustering} currently implements two methods. The first approach groups
#' cells into \code{xdim}x\code{ydim} clusters using \pkg{FlowSOM}, and then
#' performs metaclustering with \pkg{ConsensusClusterPlus} into \code{maxK} clusters.
#' The second approach performs dimention reduction through UMAP, constructs
#' shared nearest-neighbor graphs using \pkg{scran}, and then performs community
#' detection using the Leuvein algorithm in \pkg{igraph}. In both cases,
#' an \code{SingleCellExperiment} object with a '\code{cluster_id}' column is
#' returned.\cr
#' \code{clustering_with_masking} performs a preliminary clustering and prompts
#' the user to provide the clusters to be masked. The median expression
#' value heatmap will be generated by \code{plotClusterHeatmap}. Then the function
#' performs the main clustering only using the non-masked cells. The masked cells
#' will be assigned to a cluster ID of zero.
#'
#' @param sce A \code{\link[SingleCellExperiment]{SingleCellExperiment}} object.
#' @param features A character vector.
#'   Specifies which antigens to use for clustering.
#' @param features_masking A character vector.
#'   Specifies which antigens to use for the preliminary masking.
#' @param by_exprs_values A character string.
#'   Specifies which assay data to use for clustering.
#' @param method A character string.
#'   Specifies which clustering method to use. Currently, either "FlowSOM" or "SNNGraph" is supported.
#' @param xdim,ydim
#'   Numeric. Specifies the grid size of the self-orginizing map.
#'   For example, 20x20 grid will yield 400 clusters.
#' @param maxK
#'   Numeric. Specifies the maximum number of clusters to evaluate
#'   in the metaclustering. For \code{maxK = 40}, for example,
#'   metaclustering will be performed to obtain 40 clusters.
#' @param n_components,n_neighbors,min_dist
#'   Specifies parameters for UMAP and SNN-graph construction.
#' @param seed Numeric. Sets a random seed.
#'
#' @return A \code{\link{SingleCellExperiment-class}} object with a
#' '\code{cluster_id}' column. Masked cells are assigned to the cluster 0.

#' @export
#' @rdname clustering
#' @name clustering
clustering <- function(
  sce,
  features=rownames(sce),
  by_exprs_values="normexprs",
  method="FlowSOM",
  xdim=20,
  ydim=20,
  maxK=40,
  n_components=min(c(10, length(features))),
  n_neighbors=10,
  min_dist=0.1,
  seed=12345
){
  # Set the random seed
  set.seed(seed)

  # FlowSOM-guided clustering & metaclustering
  if(identical(method, "FlowSOM")){
    ## FlowSOM clustering
    cat("1. FlowSOM clustering...\n")
    som <- FlowSOM::ReadInput(flowCore::flowFrame(t(assay(sce, by_exprs_values))))
    som <- FlowSOM::BuildSOM(
      som,
      colsToUse=features,
      xdim=xdim,
      ydim=ydim,
      silent=T
    )

    ## Metaclustering
    cat("2. ConsensusClusterPlus metaclustering...\n")
    pdf(NULL)
    mc <- suppressMessages(
      ConsensusClusterPlus::ConsensusClusterPlus(
        t(som$map$codes),
        maxK=maxK,
        reps=100,
        distance="euclidean",
        seed=seed,
        plot=NULL
      )
    )
    dev.off()

    ## Retrieve cluster metadata
    k <- xdim*ydim
    mcs <- seq_len(maxK)[-1]
    clstCodeDF <- data.frame(seq_len(k), purrr::map(mc[-1], "consensusClass"))
    clstCodeDF <- dplyr::mutate_all(clstCodeDF, function(u){factor(u, levels=sort(unique(u)))})
    colnames(clstCodeDF) <- c(sprintf("som%s", k), sprintf("meta%s", mcs))

    ## Convert original FlowSOM cluster codes into final metacluster IDs
    clstIDs <- som$map$mapping[,1]
    clstCodeDF <- data.frame(
      "FlowSOM"=clstCodeDF[[paste0("som", k)]],
      "Metaclustering"=clstCodeDF[[paste0("meta", maxK)]],
      stringsAsFactors=F
    )
    clstCodeDF <- clstCodeDF[which(clstCodeDF$"FlowSOM" %in% unique(clstIDs)),]

    ## Put the final metacluster IDs back into the original SCE object
    sce$"cluster_id" <- factor(
      clstIDs,
      levels=clstCodeDF$"FlowSOM",
      labels=clstCodeDF$"Metaclustering"
    )
  }

  # Shared nearest-neighbor graph-based clustering
  if(identical(method, "SNNGraph")){
    cat("1. Dimention reduction through UMAP...\n")
    SingleCellExperiment::reducedDim(sce, type="UMAP_SNN", withDimnames=T) <- uwot::umap(
      t(assay(sce, by_exprs_values)),
      n_components=n_components,
      n_neighbors=n_neighbors,
      min_dist=min_dist,
      scale=T,
      n_threads=parallel::detectCores(logical=F),
      verbose=T
    ) %>%
      magrittr::set_colnames(paste0("UMAP_SNN", 1:n_components))
    cat("2. SNN graph construction...\n")
    g <- scran::buildSNNGraph(
      sce,
      use.dimred="UMAP_SNN"
    )
    cat("3. Graph-based clustering...\n")
    g_comm <- igraph::cluster_louvain(g)
    sce$"cluster_id" <- factor(g_comm$"membership")
  }

  # Output
  return(sce)
}

#' @export
#' @rdname clustering
#' @name clustering
clustering_with_masking <-  function(
  sce,
  features=rownames(sce),
  features_masking=rownames(sce),
  by_exprs_values="normexprs",
  method="FlowSOM",
  xdim=20,
  ydim=20,
  maxK=40,
  n_components=min(c(10, length(features))),
  n_neighbors=10,
  min_dist=0.1,
  seed=12345
){
  # Set the random seed
  set.seed(seed)

  # A preliminary clustering
  cat("A preliminary clustering...\n")
  sce <- clustering(
    sce,
    features=features_masking,
    by_exprs_values=by_exprs_values,
    method=method,
    xdim=xdim,
    ydim=ydim,
    maxK=maxK,
    n_components=min(c(n_components, length(features_masking))),
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    seed=seed
  )
  plt <- plotClusterHeatmap(
    sce,
    features=rownames(sce),
    clusters=sce$"cluster_id",
    by_exprs_values=by_exprs_values,
    fun="median",
    cluster_anno=T,
    scale=T,
    draw_dend=T,
    draw_freqs=T
  )
  print(plt)

  # Prompt the user to provide the cluster IDs to be masked
  cat("Enter the cluster IDs to be masked (separated by comma without space)\n")
  rm <- readline(prompt="Cluster IDs: ")
  rm <- as.numeric(unlist(stringr::str_split(rm, ",")))
  sce$"cluster_masked" <- F
  sce$"cluster_masked"[sce$"cluster_id" %in% rm] <- T

  # The main clustering
  cat("The main clustering...\n")
  clusterIDs <- rep(0, ncol(sce)) ## 0 for masked clusters
  sce_sub <- clustering(
    sce[,sce$"cluster_masked"==F],
    features=features,
    by_exprs_values=by_exprs_values,
    method=method,
    xdim=xdim,
    ydim=ydim,
    maxK=maxK,
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    seed=seed
  )
  clusterIDs[which(sce$"cluster_masked"==F)] <- sce_sub$"cluster_id"
  sce$"cluster_id" <- factor(
    clusterIDs,
    levels=sort(unique(clusterIDs))
  ) ## re-factorize
  return(sce)
}