Nothing
#' @include hcut.R
NULL
#' Dertermining and Visualizing the Optimal Number of Clusters
#' @description Partitioning methods, such as k-means clustering require the
#' users to specify the number of clusters to be generated. \itemize{
#' \item fviz_nbclust(): Determines and visualizes the optimal number of
#' clusters using different methods: \strong{within cluster sums of squares},
#' \strong{average silhouette} and \strong{gap statistics}.
#' \item fviz_gap_stat(): Visualizes the gap statistic generated by the
#' function \code{\link[cluster]{clusGap}}() [in cluster package]. The optimal
#' number of clusters is specified using the "firstmax" method
#' (?cluster::clustGap). }
#'
#' Read more:
#' \href{https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/}{Determining
#' the optimal number of clusters}
#'
#' @param x numeric matrix or data frame. In the function fviz_nbclust(), x can
#' be the results of the function NbClust().
#' @param method the method to be used for estimating the optimal number of
#' clusters. Possible values are "silhouette" (for average silhouette width),
#' "wss" (for total within sum of square) and "gap_stat" (for gap statistics).
#' @param FUNcluster a partitioning function which accepts as first argument a
#' (data) matrix like x, second argument, say k, k >= 2, the number of
#' clusters desired, and returns a list with a component named cluster which
#' contains the grouping of observations. Allowed values include: kmeans,
#' cluster::pam, cluster::clara, cluster::fanny, hcut, etc. This argument is
#' not required when x is an output of the function
#' \code{NbClust::NbClust()}.
#' @param diss dist object as produced by dist(), i.e.: diss = dist(x, method =
#' "euclidean"). Used to compute the average silhouette width of clusters, the
#' within sum of square and hierarchical clustering. If NULL, dist(x) is
#' computed with the default method = "euclidean"
#' @param k.max the maximum number of clusters to consider, must be at least two.
#' @param nboot integer, number of Monte Carlo ("bootstrap") samples. Used only for determining the number of clusters
#' using gap statistic.
#' @param verbose logical value. If TRUE, the result of progress is printed.
#' @param barfill,barcolor fill color and outline color for bars
#' @param linecolor color for lines
#' @param print.summary logical value. If true, the optimal number of clusters
#' are printed in fviz_nbclust().
#' @param ... optionally further arguments:
#' arguments for FUNcluster() in "wss"/"silhouette" modes; arguments for
#' \code{\link[cluster]{clusGap}}() in "gap_stat" mode. A \code{maxSE} list
#' can also be supplied in "gap_stat" mode and is forwarded to
#' \code{fviz_gap_stat()}.
#'
#' @return \itemize{ \item fviz_nbclust, fviz_gap_stat: return a ggplot2 }
#' @seealso \code{\link{fviz_cluster}}, \code{\link{eclust}}
#' @author Alboukadel Kassambara \email{alboukadel.kassambara@@gmail.com}
#'
#' @examples
#' set.seed(123)
#'
#' # Data preparation
#' # +++++++++++++++
#' data("iris")
#' head(iris)
#' # Remove species column (5) and scale the data
#' iris.scaled <- scale(iris[, -5])
#'
#'
#' # Optimal number of clusters in the data
#' # ++++++++++++++++++++++++++++++++++++++
#' # Examples are provided only for kmeans, but
#' # you can also use cluster::pam (for pam) or
#' # hcut (for hierarchical clustering)
#'
#' ### Elbow method (look at the knee)
#' # Elbow method for kmeans
#' fviz_nbclust(iris.scaled, kmeans, method = "wss") +
#' geom_vline(xintercept = 3, linetype = 2)
#'
#' # Average silhouette for kmeans
#' fviz_nbclust(iris.scaled, kmeans, method = "silhouette")
#'
#' ### Gap statistic
#' library(cluster)
#' set.seed(123)
#' # Compute gap statistic for kmeans
#' # we used B = 10 for demo. Recommended value is ~500
#' gap_stat <- clusGap(iris.scaled, FUN = kmeans, nstart = 25,
#' K.max = 10, B = 10)
#' print(gap_stat, method = "firstmax")
#' fviz_gap_stat(gap_stat)
#'
#' # Gap statistic for hierarchical clustering
#' gap_stat <- clusGap(iris.scaled, FUN = hcut, K.max = 10, B = 10)
#' fviz_gap_stat(gap_stat)
#'
#'
#' @name fviz_nbclust
#' @rdname fviz_nbclust
#' @export
fviz_nbclust <- function (x, FUNcluster = NULL, method = c("silhouette", "wss", "gap_stat"),
diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
barfill="steelblue", barcolor="steelblue",
linecolor = "steelblue", print.summary = TRUE, ...)
{
if(k.max < 2) stop("k.max must bet > = 2")
method = match.arg(method)
if(!inherits(x, c("data.frame", "matrix")) && !("Best.nc" %in% names(x)))
stop("x should be an object of class matrix/data.frame or ",
"an object created by the function NbClust() [NbClust package].")
# x is an object created by the function NbClust() [NbClust package]
if(inherits(x, "list") && "Best.nc" %in% names(x)){
best_nc <- x$Best.nc
# FIX: R 4.0.0+ deprecation - class() can return multiple values for matrices
# Using inherits() or is.X() functions instead of class() == comparison
# See: https://github.com/kassambara/factoextra/issues/171
if(is.numeric(best_nc) && !is.matrix(best_nc)) print(best_nc)
else if(is.matrix(best_nc))
.viz_NbClust(x, print.summary, barfill, barcolor)
}
else if(is.null(FUNcluster)) stop("The argument FUNcluster is required. ",
"Possible values are kmeans, pam, hcut, clara, ...")
else if(!is.function(FUNcluster)){
stop(
"The argument FUNcluster should be a function. ",
"Check if you're not overriding the specified function name somewhere."
)
}
else if(method %in% c("silhouette", "wss")) {
if (is.data.frame(x)) x <- as.matrix(x)
if(is.null(diss)) diss <- stats::dist(x)
v <- rep(0, k.max)
if(method == "silhouette"){
for(i in 2:k.max){
clust <- FUNcluster(x, i, ...)
v[i] <- .get_ave_sil_width(diss, clust$cluster)
}
}
else if(method == "wss"){
for(i in 1:k.max){
clust <- FUNcluster(x, i, ...)
v[i] <- .get_withinSS(diss, clust$cluster)
}
}
df <- data.frame(clusters = as.factor(1:k.max), y = v)
ylab <- "Total Within Sum of Square"
main_title <- "Optimal number of clusters"
if(method == "silhouette") {
ylab <- "Average silhouette width"
main_title <- "Optimal number of clusters (method = \"silhouette\")"
}
p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
color = linecolor, ylab = ylab,
xlab = "Number of clusters k",
main = main_title
)
if(method == "silhouette")
p <- p + geom_vline(xintercept = which.max(v), linetype=2, color = linecolor)
return(p)
}
else if(method == "gap_stat"){
extra_args <- list(...)
if(!is.null(extra_args$maxSE)) {
maxSE <- extra_args$maxSE
extra_args$maxSE <- NULL
} else {
maxSE <- list(method = "firstSEmax", SE.factor = 1)
}
gap_stat <- do.call(cluster::clusGap, c(list(x = x, FUNcluster = FUNcluster, K.max = k.max, B = nboot,
verbose = verbose), extra_args))
p <- fviz_gap_stat(gap_stat, linecolor = linecolor, maxSE = maxSE)
return(p)
}
}
#' @rdname fviz_nbclust
#' @param gap_stat an object of class "clusGap" returned by the function
#' clusGap() [in cluster package]
#' @param maxSE a list containing the parameters (method and SE.factor) for
#' determining the location of the maximum of the gap statistic (Read the
#' documentation ?cluster::maxSE). Allowed values for maxSE$method include:
#' \itemize{ \item "globalmax": simply corresponds to the global maximum,
#' i.e., is which.max(gap) \item "firstmax": gives the location of the first
#' local maximum \item "Tibs2001SEmax": uses the criterion, Tibshirani et al
#' (2001) proposed: "the smallest k such that gap(k) >= gap(k+1) - s(k+1)".
#' It's also possible to use "the smallest k such that gap(k) >= gap(k+1) -
#' SE.factor*s(k+1)" where SE.factor is a numeric value which can be 1
#' (default), 2, 3, etc. \item "firstSEmax": location of the first f() value
#' which is not larger than the first local maximum minus SE.factor * SE.f,
#' i.e, within an "f S.E." range of that maximum. \item
#' see ?cluster::maxSE for more options }
#'
#' @section Method selection for gap statistic:
#' The default method "firstSEmax" (developed by Martin Maechler, 2012) is
#' recommended as a robust alternative to "Tibs2001SEmax". The original
#' Tibshirani method can be overly conservative and often returns k=1 when
#' standard deviations are large relative to gap differences. The "firstSEmax"
#' method finds the smallest k within one standard error of the first local
#' maximum, providing more stable results in practice.
#'
#'
#' @export
fviz_gap_stat <- function(gap_stat, linecolor = "steelblue",
maxSE = list(method = "firstSEmax", SE.factor = 1)){
if(!inherits(gap_stat, "clusGap"))
stop("Only an object of class clusGap is allowed. (cluster package)")
if(is.list(maxSE)){
if(is.null(maxSE$method)) maxSE$method = "firstmax"
if(is.null(maxSE$SE.factor)) maxSE$SE.factor = 1
}
else stop("The argument maxSE must be a list containing the parameters method and SE.factor")
# first local max
gap <- gap_stat$Tab[, "gap"]
se <- gap_stat$Tab[, "SE.sim"]
k <- .maxSE(gap, se, method = maxSE$method, SE.factor = maxSE$SE.factor)
df <- as.data.frame(gap_stat$Tab)
df$clusters <- as.factor(seq_len(nrow(df)))
df$ymin <- gap-se
df$ymax <- gap + se
p <- ggpubr::ggline(df, x = "clusters", y = "gap", group = 1, color = linecolor)+
# FIX: ggplot2 3.0.0+ deprecation - aes_string() replaced with aes() + .data pronoun
# See: https://github.com/kassambara/factoextra/issues/190
ggplot2::geom_errorbar(aes(ymin = .data[["ymin"]], ymax = .data[["ymax"]]), width=.2, color = linecolor)+
geom_vline(xintercept = k, linetype=2, color = linecolor)+
labs(y = "Gap statistic (k)", x = "Number of clusters k",
title = "Optimal number of clusters (method = \"gap_stat\")")
p
}
# Get the average silhouette width
# ++++++++++++++++++++++++++
# Cluster package required
# d: dist object
# cluster: cluster number of observation
# Returns NA if silhouette cannot be computed (e.g., k <= 1 or k >= n)
# Fixes GitHub issues #113 and #147
.get_ave_sil_width <- function(d, cluster){
if (!requireNamespace("cluster", quietly = TRUE)) {
stop("cluster package needed for this function to work. Please install it.")
}
ss <- cluster::silhouette(cluster, d)
# Handle case where silhouette() returns NA (when k <= 1 or k >= n)
if (length(ss) == 1 && is.na(ss)) {
return(NA_real_)
}
mean(ss[, 3])
}
# Get total within sum of square
# +++++++++++++++++++++++++++++
# d: dist object
# cluster: cluster number of observation
#
# OPTIMIZATION: Replaced explicit loops with vapply() for better performance
# - Pre-compute cluster indices once using split()
# - Use vapply for type-safe vectorized computation
# - Avoid repeated subsetting of distance matrix
# - Maintains identical econometric results
.get_withinSS <- function(d, cluster){
d <- stats::as.dist(d)
dmat <- as.matrix(d)
# Handle cluster renumbering if needed
clusterf <- as.factor(cluster)
clusterl <- levels(clusterf)
cn <- length(clusterl)
if (max(cluster) != cn) {
warning("cluster renumbered because maximum != number of clusters")
cluster <- as.integer(clusterf)
}
# OPTIMIZED: Pre-compute cluster membership indices
# split() creates a list of indices for each cluster - computed once
cluster_indices <- split(seq_along(cluster), cluster)
# OPTIMIZED: Compute within-cluster SS using vapply (faster than for loop)
# For each cluster: sum(d[i,j]^2) / cluster_size for all pairs in cluster
within_ss <- vapply(cluster_indices, function(idx) {
if (length(idx) <= 1) return(0)
# Extract submatrix for this cluster and compute sum of squared distances
di <- dmat[idx, idx, drop = FALSE]
# Only use lower triangle (as.dist) to avoid double counting
sum(di[lower.tri(di)]^2) / length(idx)
}, FUN.VALUE = numeric(1))
sum(within_ss)
}
# Visualization of the output returned by the function
# NbClust()
# x : an object generated by the function NbClust()
.viz_NbClust <- function(x, print.summary = TRUE,
barfill = "steelblue", barcolor = "steelblue")
{
best_nc <- x$Best.nc
# FIX: R 4.0.0+ deprecation - class() can return multiple values for matrices
# Using inherits() or is.X() functions instead of class() == comparison
# See: https://github.com/kassambara/factoextra/issues/171
if(is.numeric(best_nc) && !is.matrix(best_nc)) print(best_nc)
else if(is.matrix(best_nc)){
best_nc <- as.data.frame(t(best_nc))
best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
ss <- summary(best_nc$Number_clusters)
# Summary
if(print.summary){
cat ("Among all indices: \n===================\n")
for(i in seq_along(ss)){
cat("*", ss[i], "proposed ", names(ss)[i], "as the best number of clusters\n" )
}
cat("\nConclusion\n=========================\n")
cat("* According to the majority rule, the best number of clusters is ",
names(which.max(ss)), ".\n\n")
}
# Fix for Issue #131: ensure numeric ordering of clusters (not alphabetical)
# When clusters > 9, alphabetical ordering would put "10" before "2"
cluster_names <- names(ss)
cluster_order <- order(as.numeric(cluster_names))
df <- data.frame(
Number_clusters = factor(cluster_names, levels = cluster_names[cluster_order]),
freq = ss,
stringsAsFactors = FALSE
)
p <- ggpubr::ggbarplot(df, x = "Number_clusters", y = "freq", fill = barfill, color = barcolor)+
labs(x = "Number of clusters k", y = "Frequency among all indices",
title = paste0("Optimal number of clusters - k = ", names(which.max(ss)) ))
return(p)
}
}
# Determines the location of the maximum of f see ?cluster::maxSE
# +++++++++++++++++++++++++++++++++++++++++++
# f: numeric vector containing the gap statistic
# SE.f : standard error of the gap statistic
# method : character string indicating how the "optimal" number of clusters, k,
# is computed from the gap statistics (and their standard deviations),
# or more generally how the location k^ of the maximum of f[k] should be determined.
# SE.factor: Determining the optimal number of clusters, Tibshirani et al. proposed the "1 S.E."-rule.
.maxSE <- function (f, SE.f, method = c("firstSEmax", "Tibs2001SEmax",
"globalSEmax", "firstmax", "globalmax"), SE.factor = 1)
{
method <- match.arg(method)
stopifnot((K <- length(f)) >= 1, K == length(SE.f), SE.f >=
0, SE.factor >= 0)
fSE <- SE.factor * SE.f
switch(method, firstmax = {
decr <- diff(f) <= 0
if (any(decr)) which.max(decr) else K
}, globalmax = {
which.max(f)
}, Tibs2001SEmax = {
g.s <- f - fSE
if (any(mp <- f[-K] >= g.s[-1])) which.max(mp) else K
}, firstSEmax = {
decr <- diff(f) <= 0
nc <- if (any(decr)) which.max(decr) else K
if (any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc])) which(mp)[1] else nc
}, globalSEmax = {
nc <- which.max(f)
if (any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc])) which(mp)[1] else nc
})
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.