R/clusGapGen.R
In cluster: "Finding Groups in Data": Cluster Analysis Extended Rousseeuw et al.

### From Master Thesis of Emmanuel Profumo (w/ M.Maechler) Autumn 2016--March 2017
### "Generalized clusGap()" :  We cannot be 100% compatible to clusGap()

#' @param x the data, can be a data frame or a matrix
#' @param algo, a clustering algorithm function taking the prepared data and
#' a number of clusters as arguments
#' @param index, a function taking a clustering vector and the prepared data
#' which returns the value of a validity index. Index can also be a list of such
#' functions to obtain results for different indices.
#' For coherence with the originally proposed Gap Statistic in Tibshirani et al's
#' a LOWER value of the validity index implies a better clustering quality, so
#' indices such as average silhouette width should be added a minus sign.
#' This can be changed by setting the argument low=FALSE.
#' @param modelH0, a function which takes as argument at least the data x,
#' parameters estimated from the data, and further arguments in ...
#' @param K.max, number of different clusters for which the index should be
#' evaluated.
#' @param B, the number of bootstraps sample.
#' @param transformData, a function which takes the data x as argument and
#' processed it for clustering
#' @param modelH0Param, a function which takes as argument the data x and returns
#' a list of modelH0 parameters with matching names.
#' @param low, logical, if FALSE a HIGHER value of the index or the indices in the
#' user provided list implies a better clustering quality
#'
#' @return if index is just one function a list with components:
#' Indks, the bootstrap validity plots
#' Ind, the validity plot corresponding to the data
#' E.Ind, the sample mean of the bootstrap validity plots
#' gap, the calibrated validity plot, difference between E.Ind and Ind
#' gapHen, the gap divided by the standard deviation of the bootstraps validity plots
#' SE.sim, the sample standard error of the bootstrap validity plots with a correction
#' term for bootstrap estimation
#' SE, the sample standard error
#' if index is a list of index then each of the components above are lists with values
#' for each index
clusGapGen <- function(x, algo, index, modelH0, K.max, B = 100,
                       transformData = identity,
                       modelH0Param = function(y) list(),
                       low=TRUE, verbose = interactive(), ...)
{
  ind.isList <- is.list(index)
  if (is.function(index))
    index <- list(index)
  else if (!ind.isList || !all(vapply(index, is.function, NA)))
    stop("index has to be a function or a list of function")

  Ind <- E.Ind <- SE.sim <- SE <- index
  for (i in seq_along(index)) Ind[[i]] <- E.Ind[[i]] <- SE.sim[[i]] <- numeric(K.max)

  if(verbose) cat("Clustering k = 1,2,..., K.max (= ",K.max,"): .. ", sep='')
  xt <- transformData(x)
  for(k in 1:K.max){
    cls <- algo(xt,k)
    for (i in seq_along(index))
      Ind[[i]][k] <- index[[i]](cls,xt)
  }
  if(verbose) cat("done\n")
  Indks <- index
  for (i in seq_along(index)) Indks[[i]] <- matrix(0, B, K.max)

  param <- modelH0Param(x)
  if(verbose) cat("Bootstrapping, b = 1,2,..., B (= ", B,
                  ")  [one \".\" per sample]:\n", sep="")
  for (b in 1:B) {
    z <- do.call(modelH0,c(list(x=x),param,list(...)))
    zt <- transformData(z)
    for(k in 1:K.max) {
      cls <- algo(zt,k)
      for (i in seq_along(index))
        Indks[[i]][b,k] <- index[[i]](cls,zt)
    }
    if(verbose) cat(".", if(b %% 50 == 0) paste(b,"\n"))
  }
  if(verbose && (B %% 50 != 0)) cat("",B,"\n")
  gap <- gapHen <- index
  for (i in seq_along(index)){
    E.Ind[[i]] <- colMeans(Indks[[i]])
    var.i <- apply(Indks[[i]], 2, var)
    SE[[i]] <- sqrt(var.i)
    SE.sim[[i]] <- sqrt((1 + 1/B) * var.i)
    gap[[i]] <- gap.i <- E.Ind[[i]] - Ind[[i]]
    gapHen[[i]] <- gap.i/SE[[i]]
    if (!low) {
      gap[[i]] <- -gap[[i]]
      gapHen[[i]] <- -gapHen[[i]]
    }
  }
  ## TODO: really? make distinction of *list* of indices vs 1 index?
  ## ---   well maybe, keep it: *The* usual case = _one_ index (or not?)
  if (ind.isList) {
    list(Indks=Indks,Ind=Ind, E.Ind=E.Ind, gap = gap,
         gapHen = gapHen ,SE.sim=SE.sim,SE=SE)
  }
  else {list(Indks=Indks[[1]],Ind=Ind[[1]], E.Ind=E.Ind[[1]], gap = gap[[1]],
             gapHen = gapHen[[1]] ,SE.sim=SE.sim[[1]],SE=SE[[1]])
  }
}


#' @param clusGapRes, a list returned by a call of function clusGapGen
#' @param main, the main title to the plots
#' @param divBySd, logical, if TRUE plot for the standardize version of the gap

clusGapGen.plot <- function(clusGapRes,divBySd=FALSE,main=""){


  if (!is.list(clusGapRes$Ind)) clusGapRes <- lapply(clusGapRes,
                                                     function(el) list(el))
  for (i in seq_along(clusGapRes$Ind)){
    B <- nrow(clusGapRes$Indks[[i]])
    K.max <- ncol(clusGapRes$Indks[[i]])
    std <- t(replicate(B,rep(1,K.max)))
    ylm <- range(rbind(clusGapRes$Indks[[i]],clusGapRes$Ind[[i]]),na.rm=TRUE)
    ylb <- names(clusGapRes$Ind[i])
    if (is.null(ylb)) ylb <- paste("Index",as.character(i))
    gp <- "gap"
    namegap <- paste(gp,ylb)
    if (divBySd) {
      std <- t(replicate(B,clusGapRes$SE.sim[[i]]))
      gp <- "gapHen"
      namegap <- paste(gp,ylb)
    }
    matplot(replicate(B,1:K.max),t(clusGapRes$Indks[[i]]),
            pch = "-",xlab = "k",ylab = ylb,
            type="l",ylim=ylm,main=main
    )

    lines(1:K.max,clusGapRes$E.Ind[[i]],type="l",col="white",lwd=2)
    lines(1:K.max,clusGapRes$Ind[[i]],lwd=2)

    boxplot((clusGapRes$Indks[[i]]-t(replicate(B,clusGapRes$Ind[[i]])))/std,
            pch = "*",xlab = "k", ylab = namegap,type="l",col=c("light blue"),
            notch=TRUE, border="grey",main=main
    )

    lines(1:K.max,clusGapRes[[gp]][[i]],type="l",xlab="k",ylab="",
          col="orangered",lwd=1.5)

  }
}