R/getOptimalCentroids.R

#' getOptimalCentroids
#'
#' Get Optimal Centroids
#'
#' The raw data is first scaled and this scaled data is supplied as input to
#' the vector quantization algorithm. Vector quantization technique uses a
#' parameter called quantization error. This parameter acts as a threshold and
#' determines the number of levels in the hierarchy. It means that, if there
#' are 'n' number of levels in the hierarchy, then all the clusters formed till
#' this level will have quantization error equal or greater than the threshold
#' quantization error. The user can define the number of clusters in the first
#' level of hierarchy and then each cluster in first level is sub-divided into
#' the same number of clusters as there are in the first level. This process
#' continues and each group is divided into smaller clusters as long as the
#' threshold quantization error is met. The output of this technique will be
#' hierarchically arranged vector quantized data.
#'
#' @param x Data Frame. A dataframe of multivariate data. Each row corresponds to an
#' observation, and each column corresponds to a variable. Missing values are
#' not accepted.
#' @param n_cells Numeric. Indicating the number of nodes per hierarchy.
#' @param depth Numeric. Indicating the hierarchy depth (or) the depth of the
#' tree (1 = no hierarchy, 2 = 2 levels, etc..)
#' @param quant.err Numeric. The quantization error for the algorithm.
#' @param algorithm String. The type of algorithm used for quantization.
#' Available algorithms are Hartigan and Wong, "Lloyd", "Forgy", "MacQueen".
#' (default is "Hartigan-Wong")
#' @param distance_metric Character. The distance metric to calculate inter point distance. It can be 'L1_Norm" or "L2_Norm". L1_Norm is selected by default.
#' @param function_to_calculate_distance_metric Function. The function is to find 'L1_Norm" or "L2_Norm" distances. L1_Norm is selected by default.
#' @param function_to_calculate_error_metric Character. The error metric can be "mean" or "max". mean is selected by default
#' @param quant_method Character. The quant_method can be "kmeans" or "kmedoids". kmeans is selected by default
#' @return \item{values}{ List. A list showing observations assigned to a cluster.}
#' \item{maxQE}{ List. A list corresponding to maximum QE values for each cell. }
#' \item{meanQE}{ List. A list corresponding to mean QE values for each cell. }
#' \item{centers}{ List. A list of quantization error for all levels and nodes. } 
#' \item{nsize}{ List. A list corresponding to number of observations in respective groups. } 
#' @author Shubhra Prakash <shubhra.prakash@@mu-sigma.com>, Sangeet Moy Das <sangeet.das@@mu-sigma.com>
#' @keywords internal

getOptimalCentroids <-
  function (x, 
            iter.max, 
            algorithm, 
            n_cells, 
            function_to_calculate_distance_metric, 
            function_to_calculate_error_metric=c("mean","max"), 
            quant.err,
            distance_metric = "L1_Norm",
            quant_method=c("kmeans","kmedoids"),...
  ){
    # browser()
    if(quant_method == "kmeans"){
      
      options(warn = -1)
      # Start with splitting data into three clusters
      nclust_iter <- 3
      outkinit <- list(centers = numeric(),maxQE = numeric(),meanQE = numeric(), values = logical() , nsize = numeric())
      quantok <- rep(T, n_cells)
      
      # Check if 3 <= No of clusters AND No of cells in a cluster > 3 AND flag to check QE for all clusters
      while(nclust_iter <= n_cells & nrow(x) > nclust_iter & (sum(quantok,na.rm = T) > 0)) {
        resplt <- list()
        #outkinit will have centroids and datapoints and size of the cluster
        set.seed(100)
        kout<-stats::kmeans(x, nclust_iter, iter.max=10^5, algorithm=algorithm)
        result <- getCentroids_for_opti(x,kout, nclust_iter,function_to_calculate_distance_metric,function_to_calculate_error_metric)
        
        outkinit[[1]] <- result[[1]]
        outkinit[[2]] <- result[[2]]
        outkinit[[3]] <- result[[3]]
        #flag to check for quantization error
        resplt <- unlist(outkinit$centers) > quant.err
        quantok <- unlist(resplt)
        nclust_iter <- nclust_iter + 1      
      }
      clusts <- nclust_iter-1
      outkinit[[4]] <- c(1:clusts) %>% purrr::map(~x[kout$cluster==.x,])
      outkinit[[5]] <- as.list(kout$size)
      outkinit[["centroid_val"]] <- kout$centers
      
      dummy_iter = n_cells - nclust_iter + 1
      
      if(dummy_iter !=0){
        outkinit[["centers"]] <- c(outkinit[["centers"]],as.list(rep(NA,dummy_iter)))
        outkinit[["maxQE"]] <- c(outkinit[["maxQE"]],as.list(rep(NA,dummy_iter)))
        outkinit[["meanQE"]] <- c(outkinit[["meanQE"]],as.list(rep(NA,dummy_iter)))
        outkinit[["values"]] <- c(outkinit[["values"]],as.list(rep(NA,dummy_iter)))
        outkinit[["nsize"]] <- c(outkinit[["nsize"]],as.list(rep(0,dummy_iter)))
        temprow <- matrix(c(rep.int(NA,length(outkinit[["centroid_val"]][1,]))),nrow=length(nclust_iter:n_cells),ncol=length(outkinit[["centroid_val"]][1,]))
        outkinit[["centroid_val"]] <- rbind(outkinit[["centroid_val"]],temprow)
      }
      # return centroids, datapoints and size of each cluster
      return(outkinit)
    }else if(quant_method == "kmedoids"){
      options(warn = -1)
      # Start with splitting data into three clusters
      # browser()
      nclust_iter <- 3
      outkinit <- list(centers = numeric(),maxQE = numeric(),meanQE = numeric(), values = logical() , nsize = numeric())
      quantok <- rep(T, n_cells)
      if(distance_metric == "L1_Norm") {
        distance_metric = "manhattan"
      }else if(distance_metric == "L2_Norm") {
        distance_metric = "euclidean"
      }
      kout=list()
      while(nclust_iter <= n_cells & nrow(x) > nclust_iter & (sum(quantok,na.rm = T) > 0)) {
        resplt <- list()
        #outkinit will have centroids and datapoints and size of the cluster
        set.seed(100)
        
        kmedoids_model <-
          cluster::pam(
            x = cluster::daisy(x, metric = distance_metric),
            k = nclust_iter,
            diss = TRUE,
            keep.data = F
          )
        kout$cluster=kmedoids_model[["clustering"]]
        result<- getCentroids_for_opti(x,kout, nclust_iter,function_to_calculate_distance_metric,function_to_calculate_error_metric)
        outkinit[[1]] <-result[[1]]
        outkinit[[2]]<- result[[2]]
        outkinit[[3]]<- result[[3]]
        
        
        #flag to check for quantization error
        resplt <- unlist(outkinit$centers) > quant.err
        quantok <- unlist(resplt)
        nclust_iter <- nclust_iter + 1      
      }
      kout$size=as.numeric(table(kmedoids_model[["clustering"]]))
      kout$centers=as.matrix(x[kmedoids_model[["medoids"]],])
      clusts <- nclust_iter-1
      outkinit[[4]] <- c(1:clusts) %>% purrr::map(~x[kout$cluster==.x,])
      outkinit[[5]] <- as.list(kout$size)
      outkinit[["centroid_val"]] <- kout$centers
      
      dummy_iter = n_cells - nclust_iter + 1
      
      if(dummy_iter !=0){
        outkinit[["centers"]] <- c(outkinit[["centers"]],as.list(rep(NA,dummy_iter)))
        outkinit[["maxQE"]] <- c(outkinit[["maxQE"]],as.list(rep(NA,dummy_iter)))
        outkinit[["meanQE"]] <- c(outkinit[["meanQE"]],as.list(rep(NA,dummy_iter)))
        outkinit[["values"]] <- c(outkinit[["values"]],as.list(rep(NA,dummy_iter)))
        outkinit[["nsize"]] <- c(outkinit[["nsize"]],as.list(rep(0,dummy_iter)))
        temprow <- matrix(c(rep.int(NA,length(outkinit[["centroid_val"]][1,]))),nrow=length(nclust_iter:n_cells),ncol=length(outkinit[["centroid_val"]][1,]))
        outkinit[["centroid_val"]] <- rbind(outkinit[["centroid_val"]],temprow)
        
      }
      # return centroids, datapoints and size of each cluster
      return(outkinit)
    }
    
  }

Try the muHVT package in your browser

Any scripts or data that you put into this service are public.

muHVT documentation built on March 7, 2023, 6:38 p.m.