R/cStability.R
In cstab: Selection of Number of Clusters via Normalized Clustering Instability

Documented in cStability

#' Selection of number of clusters via clustering instability
#'
#' Selection of number of clusters via \emph{model-based} or \emph{model-free},
#'   \emph{normalized} or \emph{unnormalized} clustering instability.
#'
#' @param data a n x p data matrix of type numeric.
#' @param kseq a vector with considered numbers clusters k > 1
#' @param nB an integer specifying the number of bootstrap comparisons.
#' @param norm logical specifying whether the instability path should be
#'    normalized. If TRUE, the instability path is normalized, accounting for a
#'    trivial decrease in instability due to a increasing k (see Haslbeck & Wulff,
#'    2016).
#' @param predict boolean specifying whether the model-based or the model-free
#'    variant should be used (see Haslbeck & Wulff, 2016).
#' @param method character string specifying the clustering algorithm. 'kmeans' for
#'    the k-means algorithm, 'hierarchical' for hierarchical clustering.
#' @param linkage character specifying the linkage criterion, in case
#'    \code{type='hierarchical'}. The available options are "single", "complete",
#'    "average", "mcquitty", "ward.D", "ward.D2", "centroid" or "median". See
#'    \link[stats]{hclust}.
#' @param kmIter integer specifying the the number of restarts of the k-means algorithm
#'    in order to avoid local minima.
#' @param pbar logical
#'
#' @return a list that contains the optimal k selected by the unnormalized and
#'    normalized instability method. It also includes a vector containing the averaged
#'    instability path (over bootstrap samples) and a matrix containing the instability
#'    path of each bootstrap sample for both the normalized and the unnormalized method.
#'
#' @references
#'    Ben-Hur, A., Elisseeff, A., & Guyon, I. (2001). A stability based method for
#'    discovering structure in clustered data. \emph{Pacific symposium on biocomputing,
#'    7}, 6-17.
#'
#'    Tibshirani, R., & Walther, G. (2005). Cluster validation by prediction strength.
#'    \emph{Journal of Computational and Graphical Statistics, 14}(3), 511-528.
#'
#' @author
#'    Dirk U. Wulff <dirk.wulff@gmail.com>
#'    Jonas M. B. Haslbeck <jonas.haslbeck@gmail.com>
#'
#' @examples
#' \dontrun{
#'   # Generate Data from Gaussian Mixture
#'   s <- .1
#'   n <- 50
#'   data <- rbind(cbind(rnorm(n, 0, s), rnorm(n, 0, s)),
#'                 cbind(rnorm(n, 1, s), rnorm(n, 1, s)),
#'                 cbind(rnorm(n, 0, s), rnorm(n, 1, s)),
#'                 cbind(rnorm(n, 1, s), rnorm(n, 0, s)))
#'   plot(data)
#'
#'   # Selection of Number of Clusters using Instability-based Measures
#'   stab_obj <- cStability(data, kseq=2:10)
#'   print(stab_obj)
#'   }
#'
#' @export

cStability <- function(data, # n x p data matrix
                       kseq = 2:20, # sequence of ks tested
                       nB   = 10, # number of bootstrap comparisons
                       norm = TRUE, # norm over pw equal assign,FALSE=as in Wang etal
                       predict = TRUE, # use prediction approach, if FALSE, use brute pair in equal cluster approach
                       method    = 'kmeans', # or 'hierarchical'
                       linkage = 'complete', # or average, or ...
                       kmIter  = 5,
                       pbar = TRUE) # number of reruns of k-means algorithm
{


  # ---------- Input Checks ----------

  # On Data
  if(sum(is.na(data))>0) stop('No missing values permitted!')

  # On k-sequence
  if(1 %in% kseq) stop('Please select a k sequence starting with 2: {2,3,...K}!')

  # On B
  if(round(nB)!=nB) stop('The number of bootstrap comparison has to be a positive integer value.')

  # On type
  if(method %in% c('kmeans', 'hierarchical'))


  # ---------- Create Containers ----------

  n_obj <- nrow(data)
  m_instab <- matrix(NA, nB, length(kseq)) # storage: no normalization
  m_instab_norm <- matrix(NA, nB, length(kseq)) # storage: normalization


  # ---------- Draw bootstrap samples ----------

  ind <- list()
  share <- list()
  for(b in 1:nB) {
    tmp_1 <- sample(1:n_obj, n_obj, replace=T)
    tmp_2 <- sample(1:n_obj, n_obj, replace=T)
    tmp_1 <- tmp_1[order(tmp_1)]
    tmp_2 <- tmp_2[order(tmp_2)]
    ind[[b]] <- list(tmp_1,tmp_2)
    intersct <- intersect(tmp_1,tmp_2)
    share[[b]] <- list(tmp_1 %in% intersct & !duplicated(tmp_1), tmp_2 %in% intersct & !duplicated(tmp_2)) # leave duplicates in ?
    }


  # ---------- Calculate distance matrix ----------

  distm = fast_dist(data) # maybe here?

  # ----- Loop over B comparisons -----

  if(pbar)  pb <- utils::txtProgressBar(min=0, max=nB, style = 2)

  for(b in 1:nB) {
    for(k in kseq) {

        # kmeans
        if(method == 'kmeans') {
          kms_1 <- list()
          kms_2 <- list()
          for(km_i in 1:kmIter) {
            kms_1[[km_i]] = stats::kmeans(data[ind[[b]][[1]],], centers = k)
            kms_2[[km_i]] = stats::kmeans(data[ind[[b]][[2]],], centers = k) # this has to be 2
            }
          km_1 = kms_1[[which.min(sapply(kms_1,function(x) mean(x$tot.withinss)))]]
          km_2 = kms_2[[which.min(sapply(kms_1,function(x) mean(x$tot.withinss)))]]
          if(predict == TRUE) {
            cl_1 = kmeans_predict(km_1, data = data)
            cl_2 = kmeans_predict(km_2, data = data)
            } else {
            cl_1 = km_1$cluster[share[[b]][[1]]]
            cl_2 = km_2$cluster[share[[b]][[2]]]
            }
          }

        if(method == 'hierarchical') {
          #t = proc.time()[3]
          hc_1 = fastcluster::hclust(stats::as.dist(distm[ind[[b]][[1]],ind[[b]][[1]]]), method = linkage)
          hc_2 = fastcluster::hclust(stats::as.dist(distm[ind[[b]][[2]],ind[[b]][[2]]]), method = linkage)
          cl_1 = stats::cutree(hc_1, k)[share[[b]][[1]]]
          cl_2 = stats::cutree(hc_2, k)[share[[b]][[2]]]
          #print(proc.time()[3] - t)
          }

      # check for equality of clusterings
      eq_1 = equal(cl_1)
      eq_2 = equal(cl_2)
      InStab <- mean(eq_1 != eq_2)

      # Normalize = FALSE
      m_instab[b, which(kseq==k)] <- InStab

      # Normalize = TRUE
      norm_val <- instabLookup(table(cl_1), table(cl_2))
      m_instab_norm[b, which(kseq==k)] <- InStab / norm_val

    } # end for k

    if(pbar) utils::setTxtProgressBar(pb, b)

  } # end for B

  # taking the mean
  m_instab_M      = colMeans(m_instab)
  m_instab_norm_M = colMeans(m_instab_norm)

  kopt_instab  = which(m_instab_M == min(m_instab_M))+(min(kseq)-1)
  kopt_instabN = which(m_instab_norm_M == min(m_instab_norm_M))+(min(kseq)-1)

  # replicate function call
  f_call <- list('kseq'=kseq,
                 'nB'=nB,
                 'norm'=norm,
                 'prediction'=predict,
                 'method'=method,
                 'linkage'=linkage,
                 'kmIter'=kmIter)

  outlist <- list("k_instab"=kopt_instab,
                  "k_instab_norm"=kopt_instabN,
                  "instab_path"=m_instab_M,
                  "instab_path_norm"=m_instab_norm_M,
                  "instab_path_matrix"=m_instab,
                  "instab_path_nrom_matrix"=m_instab_norm,
                  'call'=f_call)

  class(outlist) <- c('list', 'cstab', 'cStability')

  return(outlist)

} # EoF