diceR: Diverse Cluster Ensemble in R

Documented in sigclust

#' Significant Testing of Clustering Results
#'
#' Uses the SigClust K-Means algorithm to assess significance of clustering
#' results.
#'
#' This function is a wrapper for the original [sigclust::sigclust()], except
#' that an additional parameter `k` is allows testing against any number of
#' clusters. In addition, the default type of covariance estimation is also
#' different.
#'
#' @param x data matrix, samples are rows and features are columns
#' @param k cluster size to test against
#' @param nsim number of simulations
#' @param nrep See [sigclust::sigclust()] for details.
#' @param labflag See [sigclust::sigclust()] for details.
#' @param label true class label. See [sigclust::sigclust()] for details.
#' @param icovest type of covariance matrix estimation
#'
#' @return An object of class `sigclust`. See [sigclust::sigclust()] for
#'   details.
#' @author Hanwen Huang: \email{hanwenh@email.unc.edu}; Yufeng Liu:
#'   \email{yfliu@email.unc.edu}; J. S. Marron: \email{marron@email.unc.edu}
#' @references Liu, Yufeng, Hayes, David Neil, Nobel, Andrew and Marron, J. S,
#'   2008, *Statistical Significance of Clustering for High-Dimension,
#'   Low-Sample Size Data*, *Journal of the American Statistical Association*
#'   **103**(483) 1281--1293.
#' @export
#'
#' @examplesIf rlang::is_installed("sigclust")
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' nk <- 4
#' cc <- consensus_cluster(dat, nk = nk, reps = 5, algorithms = "pam",
#' progress = FALSE)
#' cl.mat <- consensus_combine(cc, element = "class")
#' lab <- cl.mat$`4`[, 1]
#' set.seed(1)
#' str(sigclust(x = dat, k = nk, nsim = 50, labflag = 1, label = lab))
sigclust <- function(x, k, nsim, nrep = 1, labflag = 0, label = 0,
                     icovest = 2) {
  n <- dim(x)[1]
  p <- dim(x)[2]
  if (n > 1) {
    if (!requireNamespace("sigclust", quietly = TRUE)) {
      stop("Package \"sigclust\" is needed. Please install it.",
           call. = FALSE)
    } else {
      x <- as.matrix(x)
      if (labflag == 0) {
        xclust <- .cluster(x, k)
        for (i in 1:nrep) {
          clust.temp <- .cluster(x, k)
          if (clust.temp$cindex < xclust$cindex)
            xclust <- clust.temp
          xcindex <- xclust$cindex
        }
      }
      xvareigen <- sigclust::.vareigen(x, n, p, icovest)
      simcindex <- rep(0, nsim)
      for (i in 1:nsim) {
        xsim <- .simnull(xvareigen$vsimeigval, n, p, k)
        simcindex[i] <- xsim$cindex
      }
      if (labflag == 0) {
        index <- simcindex <= xclust$cindex
        mindex <- mean(simcindex)
        sindex <- stats::sd(simcindex)
        pval <- sum(index) / nsim
        pvalnorm <- stats::pnorm(xclust$cindex, mindex, sindex)
      }
      if (labflag == 1) {
        meanpl <- vapply(sort(unique(label)),
                         function(y) colMeans(x[label == y, , drop = FALSE]),
                         FUN.VALUE = double(p))
        txdiffl <- lapply(sort(unique(label)),
                          function(y) t(x[label == y, ]) - meanpl[, y])
        withinsum <- sum(vapply(txdiffl,
                                function(y) sum(y ^ 2),
                                FUN.VALUE = double(1)))
        meanp <- colMeans(x)
        tx <- t(x)
        txdiff <- tx - meanp
        totalsum <- sum(txdiff ^ 2)
        cindexlab <- withinsum / totalsum
        index <- simcindex <= cindexlab
        mindex <- mean(simcindex)
        sindex <- stats::sd(simcindex)
        pval <- sum(index) / nsim
        pvalnorm <- stats::pnorm(cindexlab, mindex, sindex)
        xcindex <- cindexlab
      }
      return(methods::new("sigclust", raw.data = x, veigval = xvareigen$veigval,
                          vsimeigval = xvareigen$vsimeigval,
                          simbackvar = xvareigen$simbackvar,
                          icovest = icovest, nsim = nsim, simcindex = simcindex,
                          pval = pval, pvalnorm = pvalnorm, xcindex = xcindex))
    }
  }
  else {
    print("Only one sample left, no need for clustering!")
    return(0)
  }
}

#' @noRd
.cluster <- function(x, k) {
  clust <- stats::kmeans(x, k)
  withinsum <- sum(clust$withinss)
  meanp <- colMeans(x)
  tx <- t(x)
  txdiff <- tx - meanp
  totalsum <- sum(txdiff ^ 2)
  cindex <- withinsum / totalsum
  list(clust = clust, cindex = cindex)
}

#' @noRd
.simnull <- function(vsimeigval, n, p, k) {
  simnorm <- t(replicate(n, stats::rnorm(p, sd = sqrt(vsimeigval))))
  simclust <- .cluster(simnorm, k)
  list(cindex = simclust$cindex)
}