R/clustHVT.R

Defines functions clustHVT

Documented in clustHVT

#' @name clustHVT
#' @title Performing Hierarchical Clustering Analysis
#' @description This is the main function to perform hierarchical clustering
#' analysis which determines optimal number of clusters, perform AGNES clustering
#' and plot the 2D cluster hvt plot.
#' @param data Data frame. A data frame intended for performing hierarchical clustering analysis.
#' @param trainHVT_results List.  A list object which is obtained as a result of trainHVT function.
#' @param scoreHVT_results List. A list object which is obtained as a result of scoreHVT function.
#' @param clusters_k Character.  A parameter that specifies the number of clusters for the provided data. 
#' The options include “champion,” “challenger,” or any integer between 1 and 20. 
#' Selecting “champion” will use the highest number of clusters recommended by the ‘NbClust’ function,
#' while “challenger” will use the second-highest recommendation. If a numerical value from 1 to 20
#' is provided, that exact number will be used as the number of clusters.
#' @param indices Character. The indices used for determining the optimal number of clusters in NbClust function.
#'  By default it uses 20 different indices.
#' @param clustering_method Character. The method used for clustering in both NbClust and hclust function. Defaults to ‘ward.D2’.
#' @param type Character. The type of output required. Default is 'default'. Other option is 'plot' which
#'  will return only the clustered heatmap.
#' @param domains.column Character. A vector of cluster names for the clustered heatmap.
#' Used only when type is 'plot'.
#' @return A list object that contains the hierarchical clustering results.
#' \item{[[1]] }{Summary of k suggested by all indices with plots} 
#' \item{[[2]] }{A dendogram plot with the selected number of clusters} 
#' \item{[[3]] }{A 2D Cluster HVT Plotly visualization that colors cells according to clusters derived from AGNES clustering results. 
#' It is interactive, allowing users to view cell contents by hovering over them}
#' @author Vishwavani <vishwavani@@mu-sigma.com>
#' @keywords Clustering_Analysis
#' @include clusterPlot.R
#' @importFrom utils data head tail
#' @examples 
#'data("EuStockMarkets")
#'dataset <- data.frame(t = as.numeric(time(EuStockMarkets)),
#'                      DAX = EuStockMarkets[, "DAX"],
#'                      SMI = EuStockMarkets[, "SMI"],
#'                      CAC = EuStockMarkets[, "CAC"],
#'                      FTSE = EuStockMarkets[, "FTSE"])
#'rownames(EuStockMarkets) <- dataset$t
#'hvt.results<- trainHVT(dataset[-1],n_cells = 30, depth = 1, quant.err = 0.1,
#'                       distance_metric = "L1_Norm", error_metric = "max",
#'                       normalize = TRUE,quant_method = "kmeans")
#'scoring <- scoreHVT(dataset, hvt.results, analysis.plots = TRUE, names.column = dataset[,1])
#'centroid_data <- scoring$centroidData
#'hclust_data_1 <- centroid_data[,2:3]
#'clust.results <- clustHVT(data = hclust_data_1, 
#'                          trainHVT_results = hvt.results,
#'                          scoreHVT_results = scoring, 
#'                          clusters_k = 'champion', indices = 'hartigan')
#' @export clustHVT




clustHVT <- function(data, trainHVT_results, scoreHVT_results, clustering_method = 'ward.D2',
                     indices, clusters_k = "champion", type = "default",domains.column) {
  requireNamespace('NbClust')
  
  if (type == "plot"){
    
    if (is.null(data) || is.null(trainHVT_results) || is.null(domains.column)) {
      stop("For type 'plot',  the arguments `data`, `trainHVT_results` and `domains.column` are required.")
    }
    
    plot_a <- clusterPlot(dataset= data, hvt.results  = trainHVT_results, domains.column = domains.column )
    
    return(plot_a)
    
  } 
  else if (type == "default"){
    
  
  hclust_data <- data
  
  results_df <- c()
  
  
  # Function to run NbClust and extract results
  print_nbclust_results <- function(hclust_data, indices) {
    get_nbclust_result <- function(index) {
      res <- NbClust::NbClust(hclust_data, method = clustering_method, index = index)$Best.nc
      if (!is.null(res)) {
        return(data.frame(Index = index, 
                          Number_clusters = res["Number_clusters"], 
                          Value_Index = res["Value_Index"]))
      } else {
        return(data.frame(Index = index, 
                          Number_clusters = NA, 
                          Value_Index = NA))
      }
    }
    
    results_list <- lapply(indices, get_nbclust_result)
    results_df <<- do.call(rbind, results_list)
    
    cluster_summary <- table(results_df$Number_clusters)
    cat("** Among all indices:\n")
    lapply(seq_along(cluster_summary), function(i) {
      cat(sprintf("** * %d proposed %d as the best number of clusters\n", 
                  cluster_summary[i], as.numeric(names(cluster_summary)[i])))})
    
    
    best_clusters <- as.numeric(names(cluster_summary)[which.max(cluster_summary)])
    cat("*******************************************************************\n")
    cat("***** Conclusion *****\n")
    cat(sprintf("** * According to the majority rule, the best number of clusters is %d\n", best_clusters))
    cat("*******************************************************************\n")
    cat("                         Index     Number_clusters    Value_Index\n")
    cat("-------------------------------------------------------------------\n")
    apply(results_df, 1, function(row) {
      cat(sprintf("%24s  %15.4f  %15.4f\n", 
                  row["Index"], 
                  as.numeric(row["Number_clusters"]), 
                  as.numeric(row["Value_Index"])))
    })
    cat("*******************************************************************\n")
  }
  
  
  # Run NbClust
  indices <- c("kl", "ch", "hartigan", "cindex", "db", "silhouette", "ratkowsky", "ball","hubert","dindex",
               "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", "sdindex", "sdbw")
  print_nbclust_results(hclust_data, indices)
  
  #to fetch champion and challenger
  cluster_summary <- table(results_df$Number_clusters)
  
  # Determine the number of clusters based on user input
  if (clusters_k == "champion") {
    no_of_clusters <- as.numeric(names(cluster_summary)[which.max(cluster_summary)])
  } else if (clusters_k == "challenger") {
    sorted_summary <- sort(cluster_summary, decreasing = TRUE)
    no_of_clusters <- as.numeric(names(sorted_summary)[2])
  } else if (is.numeric(clusters_k) && clusters_k >= 1 && clusters_k <= 20) {
    no_of_clusters <- clusters_k
  } else {
    stop("Invalid input for clusters_k. Use 'champion', 'challenger', or a numeric value between 1 and 20.")
  }
  #browser()  
  # Perform hierarchical clustering
  hc <- stats::hclust(dist(hclust_data), clustering_method)
  clusters <- stats::cutree(hc, k = no_of_clusters)
  
#browser()  
  # Replace the existing plot_dendrogram function with this:
  plot_dendrogram <- function(hc_1, no_of_clusters_1) {
    function() {
      plot(hc_1, xlab = "Clusters", ylab = "Distance", sub ="")
      stats::rect.hclust(hc_1, k = no_of_clusters_1, border = grDevices::rainbow(no_of_clusters_1))
    }
  }
  
  a <- plot_dendrogram(hc_1 = hc, no_of_clusters_1 = no_of_clusters)

    
#browser()  
  
  # Prepare data for clusterPlotly
  cluster_data <- scoreHVT_results$centroidData %>% 
    dplyr::select("Cell.ID", "names.column") %>%
    mutate(clusters =  clusters)
  
  #cluster_data <- cluster_data[order(cluster_data$Cell.ID), ]
  
  
  b <- clusterPlot(dataset= cluster_data, hvt.results  = trainHVT_results, domains.column = "clusters" )
  
  output_list <- list(
    hc = hc,
    clusters = clusters,
    cluster_data =cluster_data,
    dendogram = a,
    clusterplot = b
  )
  
  return(output_list)
}
}

Try the HVT package in your browser

Any scripts or data that you put into this service are public.

HVT documentation built on April 3, 2025, 8:45 p.m.