R/DiscreteClusGapFuns.R

Defines functions clusGapDiscr

Documented in clusGapDiscr

#' Discrete application of clusGap - core function.
#'
#' Based on the implementation of the function found in the `cluster` R package.
#'
#' @param x A matrix object specifying category attributes in the columns and observations in the rows.
#' @param FUNcluster a function that accepts as first argument a matrix like `x`; second argument specifies number of `k` (k=>2) clusters
#' This function should return a list with a component named `cluster`, a vector of length `n=nrow(x)` of integers from `1:k` indicating observation cluster assignment.
#' Make sure `FUNcluster` and `Input2Alg` agree.
#' @param K.max Integer. Maximum number of clusters `k` to consider
#' @param value.range String, character vector or a list of character vectors with the length matching the number of columns (nQ) of the array.
#' A vector with all categories to consider when bootstrapping the null distribution sample (KS: Known Support option).
#' By DEFAULT vals=NULL, meaning unique range of categories found in the data will be used when drawing the null (DS: Data Support option).
#' If a character vector of categories is provided, these values would be used for the null distribution drawing across the array.
#' If a list with category character vectors is provided, it has to have the same number of columns as the input array. The order of list element corresponds to the array's columns.
#' @param verbose Integer or logical. Determines if “progress” output should be printed. The default prints one bit per bootstrap sample.
#' @param distName String. Name of categorical distance to apply.
#' Available distances: 'bhattacharyya', 'chisquare', 'cramerV', 'hamming' and 'hellinger'.
#' @param Input2Alg Specifies the kind of input provided to the algorithm function in `FUNcluster`.
#' For algorithms that only accept a distance matrix use `'distMatr'` option (default).
#' For algorithms that require the dataset and a prespecified distance function (e.g. `stats::dist`) use the `'distFun'` option.
#' This case the distance function is defined internally and determined by parameter `distName`.
#' @param B Number of bootstrap samples. By default B = nrow(x).
#' @param verbose Integer or logical. Determines whether progress output should printed while running. By DEFAULT one bit is printed per bootstrap sample.
#' @param useLog Logical. Use log function after estimating `W.k`. Following the original formulation `useLog=TRUE` by default.
#' @param ... optionally further arguments for `FUNcluster()`
#'
#' @return a matrix with K.max rows and 4 columns, named "logW", "E.logW", "gap", and "SE.sim",
#' where gap = E.logW - logW, and SE.sim correspond to the standard error of `gap`.

clusGapDiscr0 <- function (x,
                          FUNcluster,
                          K.max,
                          B = nrow(x),
                          value.range = "DS",
                          verbose = interactive(),
                          distName = "hamming",
                          useLog = TRUE,
                          Input2Alg = 'distMatr',
                          ...){

   stopifnot(is.function(FUNcluster),
             length(dim(x)) == 2,
             K.max >= 2,
             (n <- nrow(x)) >= 1,
             ncol(x) >= 1)

   if(!is.matrix(x))
      x <- as.matrix(x)

   uniLevs <- unique(as.vector(x))
   uniLevs <- sort(uniLevs)

   if (is.numeric(x)) {
      message(paste0("x array is numerical and has ", length(uniLevs),
                     " levels."))
      stopifnot(length(uniLevs) < 10)
      message("Array values would be transformed to categorical levels: ")
      myCats <- paste0(paste0(uniLevs, " -> c", uniLevs), collapse = ", ")
      message(myCats)
      x <- apply(x, 1, function(y) paste0('c', y)) %>% t ## Transformed matrix
   }
   else {
      message(paste0("Found levels: ", paste0(uniLevs, collapse = ", ")))
   }

   if (B != (B. <- as.integer(B)) || (B <- B.) <= 0)
      stop("'B' has to be a positive integer")
   cl. <- match.call

   ii <- seq_len(n)

   if(Input2Alg == 'distMatr'){

      W.k <- function(X, kk) {
         clus <- if (kk > 1) {
            Xdist <- distancematrix(X, d = distName)
            FUNcluster(Xdist, kk)$cluster
         }
         else {
            rep.int(1L, nrow(X))
         }
         mySplit <- split(ii, clus)
         mySplit <- mySplit[sapply(mySplit, length) != 1]
         # 0.5 * sum(vapply(mySplit, function(I) {
         sum(vapply(mySplit, function(I) {
            xs <- X[I, , drop = FALSE]
            sum(distancematrix(xs, d = distName)/nrow(xs), na.rm = TRUE)
         }, 0))
      }

   }else if(Input2Alg == 'distFun'){

      PWdistFun <- function(x, y){
         x <- unlist(x)
         y <- unlist(y)
         distancematrix(rbind(x, y), d = distName)[1]
      }

      myFUNcluster <- function(X, kk){
         ## Condition: FUNcluster is expecting a distFun parameter
         FUNcluster(X,
                    kk,
                    distFun = function(x, y) PWdistFun(x, y) )
      }

      W.k <- function(X, kk){
         clus <- if (kk > 1) {
            ## Xdist <- distancematrix(X, d = distName)
            myFUNcluster(X, kk)$cluster
         }
         else {
            rep.int(1L, nrow(X))
         }
         mySplit <- split(ii, clus) ## Split into a list with potential clusters
         mySplit <- mySplit[sapply(mySplit, length) != 1] ## Removes singletons
         # 0.5 * sum(vapply(mySplit, function(I) {
         sum(vapply(mySplit, function(I) {
            xs <- X[I, , drop = FALSE]
            sum(distancematrix(xs, d = distName)/nrow(xs), na.rm = TRUE)
         }, 0))
      }

   }

   logW <- E.logW <- SE.sim <- numeric(K.max)
   if (verbose)
      cat("Clustering k = 1,2,..., K.max (= ", K.max, "): .. ",
          sep = "")

   for (k in 1:K.max){
      if(useLog){
         logW[k] <- log(W.k(x, k))
      }else{
         logW[k] <- W.k(x, k)
      }
   }

   ## If all distances are 0; option 1.
   ## if(any(!is.finite(logW)))
   ##     logW[!is.finite(logW)] <- min(logW[is.finite(logW)])

   if (verbose)
      cat("done\n")

   if(is.numeric(value.range)){
      ## If value.range are numeric, turn them into c0, c1, ...
      message('value.range cannot be numerical and will be transformed to characters.')
      vals <- as.character(paste0('c', value.range))
   }else if (is.character(value.range) &
             !is.list(value.range) &
             value.range[1] == "DS" & length(value.range) == 1) {
      ## Data-Support null option
      vals <- NULL
      rng.x1 <- lapply(1:ncol(x), function(i) unique(x[, i]))
   }
   else if (all(is.character(value.range)) &
            !is.list(value.range) &
            (length(value.range) > 1)) {
      ## User-defined single range
      vals <- value.range
   }
   else if (!is.character(value.range) & is.list(value.range)) {
      ## When user defines the value range for each question
      stopifnot(ncol(x) == length(value.range))
      vals <- value.range
   }

   logWks <- matrix(0, B, K.max)
   if (verbose)
      cat("Bootstrapping, b = 1,2,..., B (= ", B, ")  [one \".\" per sample]:\n",
          sep = "")

   for (b in 1:B) {
      if (is.null(vals)) {
         ## Data-Support case. Notice that rng.x1 is a list with individual values ranges.
         z <- lapply(rng.x1, function(M, nn) sample(size = nn,
                                                    x = M,
                                                    replace = TRUE),
                     nn = n) %>%
            do.call(what = cbind)
      }
      else if (!is.list(vals) & length(vals) > 2) {
         ## KS: Known-Support User-specified categories as vector.
         z <- matrix(sample(x = vals, size = nrow(x) * ncol(x),
                            replace = TRUE), nrow = nrow(x))
      }
      else if (is.list(vals)) {
         ## User-defined categories as list (length nQ)
         z <- sapply(1:length(vals),
                     function(i, nn) sample(size = nn,
                                            x = vals[[i]],
                                            replace = TRUE),
                     nn = n)
      }

      for (k in 1:K.max) {
         if(useLog){
            logWks[b, k] <- log(W.k(z, k))
         }else{
            logWks[b, k] <- W.k(z, k)
         }
      }
      if (verbose)
         cat(".", if (b%%50 == 0)
            paste(b, "\n"))
   }
   if (verbose && (B%%50 != 0))
      cat("", B, "\n")
   E.logW <- colMeans(logWks)
   SE.sim <- sqrt((1 + 1/B) * apply(logWks, 2, stats::var))

   return(structure(class = "clusGap",
                     list(Tab = cbind(logW, E.logW,
                                      gap = E.logW - logW,
                                      SE.sim),
                          call = cl.,
                          n = n,
                          B = B,
                          FUNcluster = FUNcluster,
                          useLog = useLog,
                          distName = distName,
                          value.range = value.range)) )
}

#' Discrete application of clusGap
#'
#' Based on the implementation of the function found in the `cluster` R package.
#'
#' @param x A matrix object specifying category attributes in the columns and observations in the rows.
#' @param clusterFUN Character string with one of the available clustering implementations.
#' Available options are: 'pam' (default) from `cluster::pam`, 'diana' from `cluster::diana`, 'fanny' from `cluster::fanny`,
#' 'agnes-\{average, single, complete, ward, weighted\}' from `cluster::fanny`,
#' 'hclust-\{ward.D, ward.D2, single, complete, average, mcquitty, median, centroid\}' from `stats::hclust`,
#' 'kmodes' from `klar::kmodes` (`iter.max = 10`, `weighted = FALSE` and `fast= TRUE`).
#' 'kmodes-N' enables to run the `kmodes` algorithm with a given number N of iterations where `iter.max = N`.
#' @param K.max Integer. Maximum number of clusters `k` to consider
#' @param value.range String character vector or a list of character vector with the length matching the number of columns (nQ) of the array.
#' A vector with all categories to consider when bootstrapping the null distribution sample (KS: Known Support option).
#' By DEFAULT vals=NULL, meaning unique range of categories found in the data will be used when drawing the null (DS: Data Support option).
#' If a character vector of categories is provided, these values would be used for the null distribution drawing across the array.
#' If a list with category character vectors is provided, it has to have the same number of columns as the input array. The order of list element corresponds to the array's columns.
#' @param verbose Integer or logical. Determines if “progress” output should be printed. The default prints one bit per bootstrap sample.
#' @param distName String. Name of categorical distance to apply.
#' Available distances: 'bhattacharyya', 'chisquare', 'cramerV', 'hamming' and 'hellinger'.
#' @param B Number of bootstrap samples. By default B = nrow(x).
#' @param verbose Integer or logical. Determines whether progress output should printed while running. By DEFAULT one bit is printed per bootstrap sample.
#' @param useLog Logical. Use log function after estimating `W.k`. Following the original formulation `useLog=TRUE` by default.
#' @param ... optionally further arguments for `FUNcluster()`
#'
#' @return a matrix with K.max rows and 4 columns, named "logW", "E.logW", "gap", and "SE.sim",
#' where gap = E.logW - logW, and SE.sim correspond to the standard error of `gap`.
#' @export

clusGapDiscr <- function(x,
                         clusterFUN,
                         K.max,
                         B = nrow(x),
                         value.range = "DS",
                         verbose = interactive(),
                         distName = "hamming",
                         useLog = TRUE,
                         ...){

   if(clusterFUN == 'pam'){
      clusGapDiscr0(x = x,
                    FUNcluster = cluster::pam,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                     ...)

   }else if(clusterFUN == 'fanny'){
      clusGapDiscr0(x = x,
                    FUNcluster = cluster::fanny,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                    ...)

   }else if(clusterFUN == 'diana'){
      dianaK <- function(d, k){
         out <- cluster::diana(x = d) %>%
                 stats::cutree(k = k)
         list(cluster = out)
      }

      clusGapDiscr0(x = x,
                    FUNcluster = dianaK,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                    ...)

   }else if(grepl(pattern = '^agnes-.+', x = clusterFUN)){

      cMeth <- sub('agnes-', '', clusterFUN)
      agnesK <- function(d = d, k = k){
         out <- cluster::agnes(x = d,
                               diss = TRUE,
                               method = cMeth) %>%
            stats::cutree(k = k)
         list(cluster = out)
      }

      clusGapDiscr0(x = x,
                    FUNcluster = agnesK,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                    ...)

   }else if(grepl(pattern = '^hclust-.+', x = clusterFUN)){

      cMeth <- sub('hclust-', '', clusterFUN)
      hClustK <- function(d = d, k = k){
            out <- stats::hclust(d = d, method = cMeth) %>%
               stats::cutree(k = k)
            list(cluster = out)
         }

      clusGapDiscr0(x = x,
                    FUNcluster = hClustK,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                    ...)

   }else if(grepl(pattern = '^kmodes.*', x = clusterFUN)){

      cIter <- ifelse(grepl(pattern = 'kmodes-.+',  x = clusterFUN),
                      as.numeric(sub('kmodes-', '', clusterFUN)),
                      10)

      kmodesD_ <- function(x, k, distFun){
         kmodesD(data = x, modes = k, distFun = distFun, iter.max = cIter)
      }

      clusGapDiscr0(x = x,
                    FUNcluster = kmodesD_,
                    K.max = K.max,
                    B = B,
                    value.range = value.range,
                    verbose = verbose,
                    distName = distName,
                    useLog = useLog,
                    Input2Alg = 'distFun',
                    ...)
   }else
      message('Clustering algorithm not available.')

}

#' Criteria to determine number of clusters k
#'
#' Same function as found in `cluster` package.
#'
#' @import dplyr
#' @param cG_obj Output object obtained from `clusGapDiscr`
#' @param meth Method to use to determine optimal k number of clusters.
#' @return A numerical value from 1 to K.max, contained in the input `cG_obj` object.
#' @export

findK <- function (cG_obj, meth = "Tibs2001SEmax"){
   logW <- gap <- SE.sim <- NULL
   if (!meth %in% c("minSE", "minGap", "maxChange")) {

      ## Tibs2001 SEmax criterion
      # myTab <- data.frame(cG_obj$Tab) %>%
      #    mutate(nClus = 1:nrow(cG_obj$Tab) ) %>%
      #    subset(is.finite(logW) & !is.na(gap) & !is.na(SE.sim))

      outDF <- data.frame(cG_obj['Tab'])
      outDF[, 'nClus'] <- 1:nrow(outDF)
      colnames(outDF) <- c('logW', 'E.logW', 'gap', 'SE.sim', 'nClus')

      myTab <- outDF %>%
         subset(is.finite(logW) & !is.na(gap) & !is.na(SE.sim))
      selK <- with(myTab,
                   cluster::maxSE(f = gap,
                                  SE.f = SE.sim,
                                  method = meth) )
      myTab[selK, 'nClus']

   }
   else if (meth == "minSE") {
      which.min(cG_obj$Tab[, "SE.sim"])
   }
   else if (meth == "minGap") {
      which.min(cG_obj$Tab[, "gap"])
   }
   else if (meth == "maxChange") {
      which.max(abs(cG_obj$Tab[, "gap"])) + 1
   }
}

Try the DiscreteGapStatistic package in your browser

Any scripts or data that you put into this service are public.

DiscreteGapStatistic documentation built on April 3, 2025, 9:22 p.m.