R/hkmeans.R

#' Hierarchical k-means clustering
#' 
#' @description 
#' The final k-means clustering solution is very sensitive to the initial random selection 
#' of cluster centers. This function provides a solution using an hybrid approach by combining 
#' the hierarchical clustering and the k-means methods. The procedure is explained in "Details" section. Read more:
#'   \href{http://www.sthda.com/english/wiki/hybrid-hierarchical-k-means-clustering-for-optimizing-clustering-outputs-unsupervised-machine-learning}{Hybrid hierarchical k-means clustering for optimizing clustering outputs}.
#' \itemize{
#' \item hkmeans(): compute hierarchical k-means clustering
#' \item print.hkmeans(): prints the result of hkmeans
#' \item hkmeans_tree(): plots the initial dendrogram
#' }
#' 
#' @param x a numeric matrix, data frame or vector
#' @param k the number of clusters to be generated
#' @param hc.metric the distance measure to be used. Possible values are "euclidean", "maximum", "manhattan", 
#' "canberra", "binary" or "minkowski" (see ?dist).
#' @param hc.method the agglomeration method to be used. Possible values include "ward.D", "ward.D2", "single", 
#' "complete", "average", "mcquitty", "median"or "centroid" (see ?hclust).
#' @param iter.max the maximum number of iterations allowed for k-means.
#' @param km.algorithm the algorithm to be used for kmeans (see ?kmeans).
#' @details 
#' The procedure is as follow:   
#'      
#' 1. Compute hierarchical clustering
#'    
#' 2. Cut the tree in k-clusters
#'    
#' 3. compute the center (i.e the mean) of each cluster
#'      
#' 4. Do k-means by using the set of cluster centers (defined in step 3) as the initial cluster centers
#' @return hkmeans returns an object of class "hkmeans" containing the following components:
#' \itemize{
#' \item The elements returned by the standard function kmeans() (see ?kmeans)
#' \item data: the data used for the analysis
#' \item hclust: an object of class "hclust" generated by the function hclust()
#' }
#' @examples
#' \donttest{
#' # Load data
#' data(USArrests)
#' # Scale the data
#' df <- scale(USArrests)
#' 
#' # Compute hierarchical k-means clustering
#' res.hk <-hkmeans(df, 4)
#' 
#' # Elements returned by hkmeans()
#' names(res.hk)
#' 
#' # Print the results
#' res.hk
#' 
#' # Visualize the tree
#' hkmeans_tree(res.hk, cex = 0.6)
#' # or use this
#' fviz_dend(res.hk, cex = 0.6)
#' 
#' 
#' # Visualize the hkmeans final clusters
#' fviz_cluster(res.hk, frame.type = "norm", frame.level = 0.68)
#' }
#' @name hkmeans
#' @rdname hkmeans
#' @export
hkmeans <- function(x, k, hc.metric = "euclidean", hc.method = "ward.D2",
                    iter.max = 10, km.algorithm = "Hartigan-Wong"){
  
  res.hc <- stats::hclust(stats::dist(x, method = "euclidean"), method = hc.method)
  grp <- stats::cutree(res.hc, k = k)
  clus.centers <- stats::aggregate(x, list(grp), mean)[, -1]
  res.km <- kmeans(x, centers = clus.centers, 
                   iter.max = iter.max, algorithm = km.algorithm)
  class(res.km) <- "hkmeans"
  res.km$data <- x
  res.km$hclust <- res.hc
  res.km
}

 

#' @rdname hkmeans
#' @export
print.hkmeans <- function (x, ...) 
  {
  cat("Hierarchical K-means clustering with ", length(x$size), " clusters of sizes ", 
      paste(x$size, collapse = ", "), "\n", sep = "")
  cat("\nCluster means:\n")
  print(x$centers, ...)
  cat("\nClustering vector:\n")
  print(x$cluster, ...)
  cat("\nWithin cluster sum of squares by cluster:\n")
  print(x$withinss, ...)
  ratio <- sprintf(" (between_SS / total_SS = %5.1f %%)\n", 
                   100 * x$betweenss/x$totss)
  cat(sub(".", getOption("OutDec"), ratio, fixed = TRUE), "Available components:\n", 
      sep = "\n")
  print(names(x))
  if (!is.null(x$ifault) && x$ifault == 2L) 
    cat("Warning: did *not* converge in specified number of iterations\n")
  invisible(x)
}


#' @rdname hkmeans
#' @param hkmeans an object of class hkmeans (returned by the function hkmeans())
#' @param rect.col Vector with border colors for the rectangles around clusters in dendrogram
#' @param ... others arguments to be passed to the function plot.hclust(); (see ? plot.hclust) 
#' @export
hkmeans_tree <- function(hkmeans, rect.col = NULL, ...)
  {
  res.hk <- hkmeans
  if(is.null(rect.col)) rect.col <- unique(res.hk$cluster)
  plot(res.hk$hclust, hang = -1, sub = "", xlab = "", ...)
  k <- length(unique(res.hk$cluster))
  stats::rect.hclust(res.hk$hclust, k = k, border = rect.col)
}
YTLogos/factoextra documentation built on May 3, 2019, 9:03 p.m.