airpart: Differential cell-type-specific allelic imbalance

Documented in geneCluster

#' Gene clustering based on allelic ratio matrix with pseudo-count
#'
#' @param sce SingleCellExperiment containing assays \code{"ratio_pseudo"} and
#' colData factor \code{"x"}
#' @param method the method to do gene clustering. The default is the Gaussian
#' Mixture Modeling which
#' is likely to be more accurate. \code{"hierarchical"} represents
#' automatic hierarchical clustering which is faster to compute.
#' @param minClusterSize Minimum cluster size of \code{"hierarchical"} method.
#' @param plot logical, whether to make a PCA plot
#' @param G An integer vector specifying the numbers of clusters for which the
#' BIC is to be calculated.
#'  The default is G=c(8, 12, 16, 20, 24).
#' @param ... Catches unused arguments in indirect or list calls via do.call
#' as described in \code{\link[mclust]{Mclust}}
#'
#' @references
#'
#' This function leverages Mclust from the mclust package, or hclust.
#'
#' For mclust see:
#' Luca Scrucca and Michael Fop and T. Brendan Murphy, Adrian E. Raftery
#' "mclust 5: clustering, classification and density
#' estimation using {G}aussian finite mixture models"
#' 2016. The R Journal. doi: 10.32614/RJ-2016-021
#'
#' @seealso \code{\link[mclust]{Mclust}}
#'
#' @return gene cluster IDs are stored in the rowData column \code{cluster}
#' and a table of gene cluster is returned in metadata \code{geneCluster}
#'
#' @examples
#'
#' sce <- makeSimulatedData()
#' sce <- preprocess(sce)
#' sce <- geneCluster(sce, G = seq_len(4))
#' @importFrom mclust Mclust hc hcEII mclustBIC
#' @import ggplot2
#' @importFrom dynamicTreeCut cutreeDynamic
#' @importFrom stats prcomp hclust cutree dist as.dist median mad
#' @importFrom grDevices rainbow
#' @importFrom graphics par
#' @importFrom rlang .data
#'
#' @export
geneCluster <- function(sce, G, method = c("GMM", "hierarchical"),
                        minClusterSize = 3, plot = TRUE, ...) {
  method <- match.arg(method, c("GMM", "hierarchical"))
  if (missing(G)) {
    G <- c(8, 12, 16, 20, 24)
  }
  if (!"x" %in% names(colData(sce))) {
    stop('require a vector of annotated cell types "x" in colData')
  }
  if (!"ratio_pseudo" %in% assayNames(sce)) {
    stop('require an assay "ratio_pseudo"')
  }
  if (!is.factor(sce$x)) {
    sce$x <- as.factor(sce$x)
  }
  nct <- nlevels(sce$x)
  # PCA first
  pca <- prcomp(assays(sce)[["ratio_pseudo"]], rank. = 2 * nct) # use 2*nct
  ratio_pca <- as.matrix(pca$x)

  if (method == "GMM") {
    fit <- gmmCluster(ratio_pca, G, ...)
  } else if (method == "hierarchical") {
    fit <- hierCluster(ratio_pca, minClusterSize)
  }
  nclust <- fit$nclust
  my_clusters <- fit$my_clusters

  if (plot) {
    dat <- as.data.frame(ratio_pca)
    dat$GeneCluster <- factor(my_clusters)
    cols <- unname(rainbow(nclust + 1)[-1])
    p <- ggplot(dat, aes(.data$PC1, .data$PC2, col = .data$GeneCluster)) +
      geom_point() +
      scale_color_manual(values = cols) +
      theme_minimal()
    print(p)
  }

  rowData(sce)$cluster <- my_clusters
  metadata(sce)$geneCluster <- unname(table(my_clusters))
  return(sce)
}

gmmCluster <- function(ratio_pca, G, ...) {
  init <- list(hcPairs = mclust::hc(ratio_pca,
    modelName = "EII", use = "VARS"
  ))
  d_clust <- mclust::Mclust(ratio_pca,
    G = G, modelNames = "EII",
    initialization = init, ...
  )
  nclust <- dim(d_clust$z)[2]
  my_clusters <- unname(d_clust$classification)
  cat("model-based optimal number of clusters:", nclust, "\n")
  list(nclust = nclust, my_clusters = my_clusters)
}

hierCluster <- function(ratio_pca, minClusterSize) {
  my_dist <- dist(ratio_pca, method = "manhattan")
  my_tree <- hclust(my_dist, method = "ward.D2")
  my_clusters <- unname(
    cutreeDynamic(my_tree,
      distM = as.matrix(my_dist),
      verbose = 0, minClusterSize = minClusterSize
    )
  )
  nclust <- max(my_clusters)
  list(nclust = nclust, my_clusters = my_clusters)
}