R/fviz_nbclust.R

Defines functions .viz_NbClust .get_withinSS .get_ave_sil_width fviz_gap_stat

Documented in fviz_gap_stat

#' @include  hcut.R
 NULL
#' Dertermining and Visualizing the Optimal Number of Clusters
#' @description Partitioning methods, such as k-means clustering require the 
#'   users to specify the number of clusters to be generated. \itemize{
#'   \item fviz_nbclust(): Determines and visualizes the optimal number of
#'   clusters using different methods: \strong{within cluster sums of squares},
#'   \strong{average silhouette} and \strong{gap statistics}.
#'   \item fviz_gap_stat(): Visualizes the gap statistic generated by the
#'   function \code{\link[cluster]{clusGap}}() [in cluster package]. The optimal
#'   number of clusters is specified using the "firstmax" method
#'   (?cluster::clustGap). }
#'   
#'   Read more: 
#'   \href{https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/}{Determining
#'    the optimal number of clusters}
#'   
#' @param x numeric matrix or data frame. In the function fviz_nbclust(), x can 
#'   be the results of the function NbClust().
#' @param method the method to be used for estimating the optimal number of 
#'   clusters. Possible values are "silhouette" (for average silhouette width), 
#'   "wss" (for total within sum of square) and "gap_stat" (for gap statistics).
#' @param FUNcluster a partitioning function which accepts as first argument a 
#'   (data) matrix like x, second argument, say k, k >= 2, the number of 
#'   clusters desired, and returns a list with a component named cluster which 
#'   contains the grouping of observations. Allowed values include: kmeans,
#'   cluster::pam, cluster::clara, cluster::fanny, hcut, etc. This argument is
#'   not required when x is an output of the function 
#'   \code{NbClust::NbClust()}.
#' @param diss dist object as produced by dist(), i.e.: diss = dist(x, method = 
#'   "euclidean"). Used to compute the average silhouette width of clusters, the
#'   within sum of square and hierarchical clustering. If NULL, dist(x) is 
#'   computed with the default method = "euclidean"
#' @param k.max the maximum number of clusters to consider, must be at least two.
#' @param nboot integer, number of Monte Carlo ("bootstrap") samples. Used only for determining the number of clusters 
#' using gap statistic.
#' @param verbose logical value. If TRUE, the result of progress is printed.
#' @param barfill,barcolor fill color and outline color for bars
#' @param linecolor color for lines
#' @param print.summary logical value. If true, the optimal number of clusters 
#'   are printed in fviz_nbclust().
#' @param ... optionally further arguments:
#'   arguments for FUNcluster() in "wss"/"silhouette" modes; arguments for
#'   \code{\link[cluster]{clusGap}}() in "gap_stat" mode. A \code{maxSE} list
#'   can also be supplied in "gap_stat" mode and is forwarded to
#'   \code{fviz_gap_stat()}.
#'   
#' @return \itemize{ \item fviz_nbclust, fviz_gap_stat: return a ggplot2 }
#' @seealso \code{\link{fviz_cluster}}, \code{\link{eclust}}
#' @author Alboukadel Kassambara \email{alboukadel.kassambara@@gmail.com}
#'   
#' @examples 
#' set.seed(123)
#' 
#' # Data preparation
#' # +++++++++++++++
#' data("iris")
#' head(iris)
#' # Remove species column (5) and scale the data
#' iris.scaled <- scale(iris[, -5])
#' 
#' 
#' # Optimal number of clusters in the data
#' # ++++++++++++++++++++++++++++++++++++++
#' # Examples are provided only for kmeans, but
#' # you can also use cluster::pam (for pam) or
#' #  hcut (for hierarchical clustering)
#'  
#' ### Elbow method (look at the knee)
#' # Elbow method for kmeans
#' fviz_nbclust(iris.scaled, kmeans, method = "wss") +
#' geom_vline(xintercept = 3, linetype = 2)
#' 
#' # Average silhouette for kmeans
#' fviz_nbclust(iris.scaled, kmeans, method = "silhouette")
#' 
#' ### Gap statistic
#' library(cluster)
#' set.seed(123)
#' # Compute gap statistic for kmeans
#' # we used B = 10 for demo. Recommended value is ~500
#' gap_stat <- clusGap(iris.scaled, FUN = kmeans, nstart = 25,
#'  K.max = 10, B = 10)
#'  print(gap_stat, method = "firstmax")
#' fviz_gap_stat(gap_stat)
#'  
#' # Gap statistic for hierarchical clustering
#' gap_stat <- clusGap(iris.scaled, FUN = hcut, K.max = 10, B = 10)
#' fviz_gap_stat(gap_stat)
#' 
#'  
#' @name fviz_nbclust
#' @rdname fviz_nbclust
#' @export
fviz_nbclust <- function (x, FUNcluster = NULL, method = c("silhouette", "wss", "gap_stat"),
                          diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
                          barfill="steelblue", barcolor="steelblue", 
                          linecolor = "steelblue", print.summary = TRUE,  ...) 
  {
  if(k.max < 2) stop("k.max must bet > = 2")
  method = match.arg(method)
  if(!inherits(x, c("data.frame", "matrix")) && !("Best.nc" %in% names(x)))
    stop("x should be an object of class matrix/data.frame or ",
         "an object created by the function NbClust() [NbClust package].")
  
  # x is an object created by the function NbClust() [NbClust package]
  if(inherits(x, "list") && "Best.nc" %in% names(x)){
      best_nc <- x$Best.nc
      # FIX: R 4.0.0+ deprecation - class() can return multiple values for matrices
      # Using inherits() or is.X() functions instead of class() == comparison
      # See: https://github.com/kassambara/factoextra/issues/171
      if(is.numeric(best_nc) && !is.matrix(best_nc)) print(best_nc)
      else if(is.matrix(best_nc))
        .viz_NbClust(x, print.summary, barfill, barcolor)
  }
  else if(is.null(FUNcluster)) stop("The argument FUNcluster is required. ",
                                    "Possible values are kmeans, pam, hcut, clara, ...")
  else if(!is.function(FUNcluster)){
    stop(
      "The argument FUNcluster should be a function. ",
      "Check if you're not overriding the specified function name somewhere."
      )
  }
  else if(method %in% c("silhouette", "wss")) {

      if (is.data.frame(x)) x <- as.matrix(x)
      if(is.null(diss)) diss <- stats::dist(x)
      
      v <- rep(0, k.max)
      if(method == "silhouette"){
        for(i in 2:k.max){
          clust <- FUNcluster(x, i, ...)
          v[i] <- .get_ave_sil_width(diss, clust$cluster)
        }
      }
      else if(method == "wss"){
        for(i in 1:k.max){
          clust <- FUNcluster(x, i, ...)
          v[i] <- .get_withinSS(diss, clust$cluster)
        }
        
      }
      
      df <- data.frame(clusters = as.factor(1:k.max), y = v)
      
      ylab <- "Total Within Sum of Square"
      main_title <- "Optimal number of clusters"
      if(method == "silhouette") {
        ylab <- "Average silhouette width"
        main_title <- "Optimal number of clusters (method = \"silhouette\")"
      }

      p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
                          color = linecolor, ylab = ylab,
                          xlab = "Number of clusters k",
                          main = main_title
                          )
      if(method == "silhouette")
        p <- p + geom_vline(xintercept = which.max(v), linetype=2, color = linecolor)
      
      return(p) 
  }
  
  else if(method == "gap_stat"){
    extra_args <- list(...)
    if(!is.null(extra_args$maxSE)) {
      maxSE <- extra_args$maxSE
      extra_args$maxSE <- NULL
    } else {
      maxSE <- list(method = "firstSEmax", SE.factor = 1)
    }
    gap_stat <- do.call(cluster::clusGap, c(list(x = x, FUNcluster = FUNcluster, K.max = k.max, B = nboot,
                                                  verbose = verbose), extra_args))
    p <- fviz_gap_stat(gap_stat,  linecolor = linecolor, maxSE = maxSE)
    return(p) 
  }
  

}

#' @rdname fviz_nbclust
#' @param gap_stat an object of class "clusGap" returned by the function
#'   clusGap() [in cluster package]
#' @param maxSE a list containing the parameters (method and SE.factor) for
#'   determining the location of the maximum of the gap statistic (Read the
#'   documentation ?cluster::maxSE). Allowed values for maxSE$method include:
#'   \itemize{ \item "globalmax": simply corresponds to the global maximum,
#'   i.e., is which.max(gap) \item "firstmax": gives the location of the first
#'   local maximum \item "Tibs2001SEmax": uses the criterion, Tibshirani et al
#'   (2001) proposed: "the smallest k such that gap(k) >= gap(k+1) - s(k+1)".
#'   It's also possible to use "the smallest k such that gap(k) >= gap(k+1) -
#'   SE.factor*s(k+1)" where SE.factor is a numeric value which can be 1
#'   (default), 2, 3, etc. \item "firstSEmax": location of the first f() value
#'   which is not larger than the first local maximum minus SE.factor * SE.f,
#'   i.e, within an "f S.E." range of that maximum. \item
#'   see ?cluster::maxSE for more options }
#'
#' @section Method selection for gap statistic:
#'   The default method "firstSEmax" (developed by Martin Maechler, 2012) is
#'   recommended as a robust alternative to "Tibs2001SEmax". The original
#'   Tibshirani method can be overly conservative and often returns k=1 when
#'   standard deviations are large relative to gap differences. The "firstSEmax"
#'   method finds the smallest k within one standard error of the first local
#'   maximum, providing more stable results in practice.
#'
#'
#' @export
fviz_gap_stat <- function(gap_stat,  linecolor = "steelblue",
                          maxSE = list(method = "firstSEmax", SE.factor = 1)){
  if(!inherits(gap_stat, "clusGap"))
    stop("Only an object of class clusGap is allowed. (cluster package)")
  if(is.list(maxSE)){
    if(is.null(maxSE$method)) maxSE$method = "firstmax"
    if(is.null(maxSE$SE.factor)) maxSE$SE.factor = 1
  }
  else stop("The argument maxSE must be a list containing the parameters method and SE.factor")
  
  # first local max
  gap <- gap_stat$Tab[, "gap"]
  se <- gap_stat$Tab[, "SE.sim"]
  k <- .maxSE(gap, se, method = maxSE$method, SE.factor = maxSE$SE.factor)

  df <- as.data.frame(gap_stat$Tab)
  df$clusters <- as.factor(seq_len(nrow(df)))
  df$ymin <- gap-se
  df$ymax <- gap + se
  p <- ggpubr::ggline(df, x = "clusters", y = "gap", group = 1, color = linecolor)+
    # FIX: ggplot2 3.0.0+ deprecation - aes_string() replaced with aes() + .data pronoun
    # See: https://github.com/kassambara/factoextra/issues/190
    ggplot2::geom_errorbar(aes(ymin = .data[["ymin"]], ymax = .data[["ymax"]]), width=.2, color = linecolor)+
    geom_vline(xintercept = k, linetype=2, color = linecolor)+
    labs(y = "Gap statistic (k)", x = "Number of clusters k",
         title = "Optimal number of clusters (method = \"gap_stat\")")
  p
}




# Get the average silhouette width
# ++++++++++++++++++++++++++
# Cluster package required
# d: dist object
# cluster: cluster number of observation
# Returns NA if silhouette cannot be computed (e.g., k <= 1 or k >= n)
# Fixes GitHub issues #113 and #147
.get_ave_sil_width <- function(d, cluster){
  if (!requireNamespace("cluster", quietly = TRUE)) {
    stop("cluster package needed for this function to work. Please install it.")
  }
  ss <- cluster::silhouette(cluster, d)
  # Handle case where silhouette() returns NA (when k <= 1 or k >= n)
  if (length(ss) == 1 && is.na(ss)) {
    return(NA_real_)
  }
  mean(ss[, 3])
}

# Get total within sum of square
# +++++++++++++++++++++++++++++
# d: dist object
# cluster: cluster number of observation
#
# OPTIMIZATION: Replaced explicit loops with vapply() for better performance
# - Pre-compute cluster indices once using split()
# - Use vapply for type-safe vectorized computation
# - Avoid repeated subsetting of distance matrix
# - Maintains identical econometric results
.get_withinSS <- function(d, cluster){
  d <- stats::as.dist(d)
  dmat <- as.matrix(d)

  # Handle cluster renumbering if needed
  clusterf <- as.factor(cluster)
  clusterl <- levels(clusterf)
  cn <- length(clusterl)

  if (max(cluster) != cn) {
    warning("cluster renumbered because maximum != number of clusters")
    cluster <- as.integer(clusterf)
  }

  # OPTIMIZED: Pre-compute cluster membership indices
  # split() creates a list of indices for each cluster - computed once
  cluster_indices <- split(seq_along(cluster), cluster)

  # OPTIMIZED: Compute within-cluster SS using vapply (faster than for loop)
  # For each cluster: sum(d[i,j]^2) / cluster_size for all pairs in cluster
  within_ss <- vapply(cluster_indices, function(idx) {
    if (length(idx) <= 1) return(0)
    # Extract submatrix for this cluster and compute sum of squared distances
    di <- dmat[idx, idx, drop = FALSE]
    # Only use lower triangle (as.dist) to avoid double counting
    sum(di[lower.tri(di)]^2) / length(idx)
  }, FUN.VALUE = numeric(1))

  sum(within_ss)
}





# Visualization of the output returned by the function
# NbClust()
# x : an object generated by the function NbClust()
.viz_NbClust <- function(x, print.summary = TRUE,
                         barfill = "steelblue", barcolor = "steelblue")
  {
     best_nc <- x$Best.nc
    # FIX: R 4.0.0+ deprecation - class() can return multiple values for matrices
    # Using inherits() or is.X() functions instead of class() == comparison
    # See: https://github.com/kassambara/factoextra/issues/171
    if(is.numeric(best_nc) && !is.matrix(best_nc)) print(best_nc)
     else if(is.matrix(best_nc)){
    best_nc <- as.data.frame(t(best_nc))
    best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
    ss <- summary(best_nc$Number_clusters)
    
    # Summary
    if(print.summary){
      cat ("Among all indices: \n===================\n")
      for(i in seq_along(ss)){
        cat("*", ss[i], "proposed ", names(ss)[i], "as the best number of clusters\n" )
      }
      cat("\nConclusion\n=========================\n")
      cat("* According to the majority rule, the best number of clusters is ",
          names(which.max(ss)),  ".\n\n")
    }
    # Fix for Issue #131: ensure numeric ordering of clusters (not alphabetical)
    # When clusters > 9, alphabetical ordering would put "10" before "2"
    cluster_names <- names(ss)
    cluster_order <- order(as.numeric(cluster_names))
    df <- data.frame(
      Number_clusters = factor(cluster_names, levels = cluster_names[cluster_order]),
      freq = ss,
      stringsAsFactors = FALSE
    )
    p <- ggpubr::ggbarplot(df,  x = "Number_clusters", y = "freq", fill = barfill, color = barcolor)+
      labs(x = "Number of clusters k", y = "Frequency among all indices",
           title = paste0("Optimal number of clusters - k = ", names(which.max(ss)) ))
    
    return(p)
  }
}


#  Determines the location of the maximum of f see ?cluster::maxSE
# +++++++++++++++++++++++++++++++++++++++++++
# f: numeric vector containing the gap statistic
# SE.f : standard error of the gap statistic
# method : character string indicating how the "optimal" number of clusters, k, 
  # is computed from the gap statistics (and their standard deviations), 
  # or more generally how the location k^ of the maximum of f[k] should be determined.
# SE.factor:  Determining the optimal number of clusters, Tibshirani et al. proposed the "1 S.E."-rule.
.maxSE <- function (f, SE.f, method = c("firstSEmax", "Tibs2001SEmax", 
                                        "globalSEmax", "firstmax", "globalmax"), SE.factor = 1) 
{
  method <- match.arg(method)
  stopifnot((K <- length(f)) >= 1, K == length(SE.f), SE.f >= 
              0, SE.factor >= 0)
  fSE <- SE.factor * SE.f
  switch(method, firstmax = {
    decr <- diff(f) <= 0
    if (any(decr)) which.max(decr) else K
  }, globalmax = {
    which.max(f)
  }, Tibs2001SEmax = {
    g.s <- f - fSE
    if (any(mp <- f[-K] >= g.s[-1])) which.max(mp) else K
  }, firstSEmax = {
    decr <- diff(f) <= 0
    nc <- if (any(decr)) which.max(decr) else K
    if (any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc])) which(mp)[1] else nc
  }, globalSEmax = {
    nc <- which.max(f)
    if (any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc])) which(mp)[1] else nc
  })
}

Try the factoextra package in your browser

Any scripts or data that you put into this service are public.

factoextra documentation built on March 3, 2026, 5:08 p.m.