#' hvq
#'
#' Hierarchical Vector Quantization
#'
#' The raw data is first scaled and the scaled data is supplied as input to
#' the vector quantization algorithm. The algorithm uses a parameter called
#' quantization error, which acts as a threshold and determines the number of
#' levels in the hierarchy: if there are 'n' levels in the hierarchy, then
#' every cluster formed up to that level has a quantization error greater
#' than or equal to the threshold. The user defines the number of clusters in
#' the first level of the hierarchy; each first-level cluster is then
#' sub-divided into the same number of clusters, and the process repeats,
#' splitting each group into smaller clusters for as long as the threshold
#' quantization error is exceeded. The output is hierarchically arranged
#' vector quantized data.
#'
#' @param x Data Frame. A dataframe of multivariate data. Each row corresponds to an
#' observation, and each column corresponds to a variable. Missing values are
#' not accepted.
#' @param min_compression_perc Numeric. An integer indicating the minimum
#' percent compression rate to be achieved for the dataset.
#' @param n_cells Numeric. The number of cells per level of the hierarchy.
#' @param depth Numeric. The depth of the hierarchy (1 = no hierarchy, 2 =
#' two levels, etc.).
#' @param quant.err Numeric. The quantization error threshold for the
#' algorithm.
#' @param algorithm String. The type of algorithm used for quantization.
#' Available algorithms are "Hartigan-Wong", "Lloyd", "Forgy" and "MacQueen"
#' (default is "Hartigan-Wong").
#' @param distance_metric Character. The distance metric, either "L1_Norm" or
#' "L2_Norm". "L1_Norm" is selected by default.
#' @param error_metric Character. The error metric, either "mean" or "max".
#' "mean" is selected by default.
#' @param quant_method Character. The quantization method, either "kmeans" or
#' "kmedoids". "kmeans" is selected by default.
#' @return \item{clusters}{List. A list showing the IDs assigned to each
#' cluster.} \item{nodes.clust}{List. A list corresponding to nodes'
#' details.} \item{idnodes}{List. A list of IDs and segments similar to
#' \code{nodes.clust}, with additional columns for node IDs.}
#' \item{error.quant}{List. A list of quantization errors for all levels and
#' nodes.} \item{plt.clust}{List. A list of logical values indicating whether
#' each cell's quantization error exceeds the threshold (i.e. whether it is
#' split further).} \item{summary}{Data frame. Output table with segment
#' level, parent, child, cell size, quantization error and centroid
#' coordinates.}
#' @author Shubhra Prakash <shubhra.prakash@@mu-sigma.com>, Sangeet Moy Das <sangeet.das@@mu-sigma.com>
#' @seealso \code{\link{hvtHmap}}
#' @importFrom magrittr %>%
#' @importFrom stats complete.cases
#' @examples
#'
#' data("USArrests",package="datasets")
#' hvqOutput = hvq(USArrests, n_cells = 5, depth = 2, quant.err = 0.2,
#' distance_metric='L1_Norm',error_metric='mean',quant_method="kmeans")
#'
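#' # An illustrative alternative: supply min_compression_perc instead of
#' # n_cells, and hvq searches for the smallest number of cells that achieves
#' # the requested compression (depth is then forced to 1).
#' hvqOutput2 = hvq(USArrests, min_compression_perc = 75, quant.err = 0.2,
#' distance_metric='L1_Norm',error_metric='mean',quant_method="kmeans")
#'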
#' @export hvq
#' @keywords internal
hvq <- function(x,
min_compression_perc = NA,
n_cells = NA,
depth = 3,
quant.err = 10,
algorithm = "Hartigan-Wong",
distance_metric = c("L1_Norm", "L2_Norm"),
error_metric = c("mean", "max"),
quant_method = c("kmeans", "kmedoids")) {
requireNamespace("dplyr")
# browser()
rescl <- list()
resid <- list()
resm <- list()
maxqe <- list()
meanqe <- list()
resplt <- list()
ztab3up <- list()
ztab1 <- ztab2 <- ztabn <- NULL
ztab11 <- ztab12 <- ztab13 <- NULL
zdepth <- depth
if(!is.na(n_cells)){
quantinit <- rep(F, n_cells)
ztab3upc<-matrix(0, nrow = ncol(x), n_cells)
std<-matrix(0, nrow = ncol(x), n_cells)
}
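#fix the RNG seed so the k-means initialisation is reproducible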
set.seed(300)
# flog.info("Parameters are initialized")
#outkinit will have centroids and datapoints and size of the cluster
# outkinit <- getOptimalCentroids(x, iter.max=100, algorithm=algorithm, n_cells,distance_metric=distance_metric,error_metric=error_metric,quant.err=quant.err)
colSd <- function (x, na.rm=FALSE) apply(X=x, MARGIN=2, FUN=stats::sd, na.rm=na.rm)
#distance of each observation from its cluster centroid, normalized by the
#number of columns
calculate_euclidean_distance_for_each_cluster <- function(x){
sqrt(rowSums(scale(x, center = T, scale = F)^2))/ncol(x)
}
calculate_manhattan_distance_for_each_cluster <- function(x){
rowSums(abs(scale(x, center = T, scale = F)))/ncol(x)
}
## for distance metric, i.e., Manhattan (L1) or Euclidean (L2)
if(distance_metric == "L1_Norm"){
function_to_calculate_distance_metric <- calculate_manhattan_distance_for_each_cluster
} else if(distance_metric == "L2_Norm"){
function_to_calculate_distance_metric <- calculate_euclidean_distance_for_each_cluster
} else{
stop('distance_metric must be "L1_Norm" (Manhattan) or "L2_Norm" (Euclidean)')
}
## for error metric, i.e., mean or max
if(error_metric %in% c("mean","max")){
function_to_calculate_error_metric <- error_metric
} else{
stop('error_metric must be "mean" or "max"')
}
if(!is.na(min_compression_perc)){
depth <- 1
compress_percent <- 0
n_cells_compress_perc <- 2
num_data_points <- rep(F, n_cells_compress_perc)
n_cells_quant_err_list <- list()
n_cells <- NA
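#starting from 2 cells, keep adding one cell at a time until the share of
#cells whose quantization error falls below quant.err reaches min_compression_perc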
while(n_cells_compress_perc < nrow(x) & compress_percent <= min_compression_perc){
quantinit <- rep(F, n_cells_compress_perc)
ztab3upc<-matrix(0, nrow = ncol(x), n_cells_compress_perc)
std<-matrix(0, nrow = ncol(x), n_cells_compress_perc)
outkinit <- getCentroids(x=x,
kout = stats::kmeans(x, n_cells_compress_perc, iter.max=10^5, algorithm=algorithm),
n_cells=n_cells_compress_perc,
function_to_calculate_distance_metric=function_to_calculate_distance_metric,
function_to_calculate_error_metric=function_to_calculate_error_metric,
distance_metric=distance_metric,
quant_method = quant_method)
n_cells_quant_err_list <- unlist(outkinit$cent)
noOfCells = sum(!is.na(n_cells_quant_err_list))
noOfCellsBelowQuantizationError = sum(n_cells_quant_err_list < quant.err,na.rm = TRUE)
compress_percent = round((noOfCellsBelowQuantizationError/noOfCells) * 100, 2) #percentOfCellsBelowQuantizationErrorThreshold
n_cells_compress_perc <- n_cells_compress_perc + 1
}
n_cells_optimal <- n_cells_compress_perc - 1
print(paste0("For the given dataset ",min_compression_perc,"% compression rate is achieved at n_cells : ", n_cells_optimal))
}else{
outkinit <- getCentroids(x=x,
kout = stats::kmeans(x, n_cells, iter.max=10^5, algorithm=algorithm),
n_cells=n_cells,
function_to_calculate_distance_metric=function_to_calculate_distance_metric,
function_to_calculate_error_metric=function_to_calculate_error_metric,
distance_metric=distance_metric,
quant_method = quant_method)
n_cells_optimal <- n_cells
}
rescl[[1]] <- outkinit$val
tet <- lapply(outkinit$val, row.names)
#length(tet) = number of first-level clusters
for (k in seq_along(tet)){
tet[[k]] <- data.frame(tet[[k]], 1, 1, k)
}
names(tet) <- paste(seq_along(tet))
#ID for each datapoint
resid[[1]] <- tet
#quantization error for each first-level cluster
resm[[1]] <- outkinit$cent
maxqe[[1]] <- outkinit$maxQE
meanqe[[1]] <- outkinit$meanQE
#flag to check for quantization error
resplt[[1]] <- unlist(outkinit$cent) > quant.err
#initial values for next-level quantization
initclust <- outkinit$values
if (depth > 1) {
# flog.info("HVQ calculation started")
i <- 1
while (i < depth) {
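#at level i+1, attempt to split each of the n_cells^i cells produced at level i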
ijclust <- NULL
#stores values of datapoints
ijrescl <- list()
#ids of the datapoints
ijresid <- list()
#to store the centroids
ijresm <- list()
# to store max qe as radius for anomalies
ijmaxqe <- list()
ijmeanqe <- list()
#flag to check the quantization error
ijresplt <- list()
#number of datapoints in a cluster
ijresnsize <- list()
ijztab3up <- list()
#flag which checks quantization error
quantok <- unlist(resplt[[i]])
j <- 1
## iterate over the n_cells^i cells of the current level
while (j < (n_cells^i) + 1) {
#split this cell further if its quantization error exceeds the threshold and it contains more than 3 datapoints
if (quantok[j] & NROW(initclust[[j]]) > 3) {
#quantize initclust[[j]] to obtain the next level of clustering (sub-clusters)
z = data.frame(initclust[[j]])
outk <- getOptimalCentroids(z, iter.max = 10^5, algorithm = algorithm, n_cells, function_to_calculate_distance_metric, function_to_calculate_error_metric, quant.err = quant.err, distance_metric = distance_metric, quant_method=quant_method)
#store the datapoints
ijrescl[[j]] <- outk$val
tet <- lapply(outk$val, row.names)
#create IDs for each datapoint
for (k in seq_along(tet)){
if(!is.null(tet[[k]])){
tet[[k]] <- data.frame(tet[[k]], i + 1, j, k)
} else{
tet[[k]] <- data.frame(numeric(0))
}
}
ijresid[[j]] <- tet
#store the quantization error of each sub-cell
ijresm[[j]] <- outk$centers
ijmaxqe[[j]] <- outk$maxQE
ijmeanqe[[j]] <- outk$meanQE
#size of each sub-cluster
ijresnsize[[j]] <- outk$nsize
#centroid coordinates (variables in rows, cells in columns)
ijztab3up[[j]] <- t(outk$centroid_val)
#flag to check if the quantization error threshold is exceeded
ijresplt[[j]] <- unlist(outk$centers) > quant.err
#store the datapoints
ijclust <- c(ijclust, outk$values)
}else {
ijrescl[[j]] <- rep(NA, n_cells)
ijresid[[j]] <- NULL
ijresm[[j]] <- rep(NA, n_cells)
ijmaxqe[[j]] <- rep(NA, n_cells)
ijmeanqe[[j]] <- rep(NA, n_cells)
ijresnsize[[j]] <- rep(0, n_cells)
ijztab3up[[j]] <- matrix(NA, ncol(x), n_cells)
ijresplt[[j]] <- rep(F, n_cells)
ijclust <- c(ijclust, rep(NA, n_cells))
#this cell is not split, but it still contributes n_cells placeholder
#entries; reset tet so the bookkeeping below stays consistent
tet <- vector("list", n_cells)
}
n_cells_at_j <- length(tet)
ztab1 <- c(ztab1, paste(i + 1, j, 1:n_cells_at_j, sep = ","))
ztab11 <- c(ztab11, rep(i + 1, n_cells_at_j))
ztab12 <- c(ztab12, rep(j, n_cells_at_j))
ztab13 <- c(ztab13, 1: n_cells_at_j)
j <- j + 1
}
rescl[[i + 1]] <- ijrescl
resid[[i + 1]] <- ijresid
resm[[i + 1]] <- ijresm
maxqe[[i + 1]] <- ijmaxqe
meanqe[[i + 1]] <- ijmeanqe
resplt[[i + 1]] <- ijresplt
ztab2 <- c(ztab2, unlist(ijresnsize))
ztab3up[[i + 1]] <- data.frame(ijztab3up)
ztabn <- c(ztabn, unlist(ijresm))
initclust <- ijclust
# flog.info("Output for Level %s is ready", i)
i <- i + 1
#if no cell at this level needs further splitting, stop descending
if (!is.element(T, unlist(ijresplt))) {
zdepth <- i
i <- depth
}
}
# flog.info("HVQ for user-defined depth has been calculated")
#ztab is the output which contains the datapoints and their IDs.
#initialize ztab
ztab <- data.frame(matrix(0, nrow = sum(n_cells^(1:zdepth)),
ncol = (ncol(x) + 5)))
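#rows: n_cells cells at level 1, n_cells^2 at level 2, ..., n_cells^zdepth at the last level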
#Segment Level
ztab[1:n_cells, 1] <- rep(1, n_cells)
#Segment Parent
ztab[1:n_cells, 2] <- rep(1, n_cells)
#Segment Child
ztab[1:n_cells, 3] <- 1: n_cells
#Size of the cluster
ztab[1:n_cells, 4] <- unlist(outkinit$nsize)
#Centroid/Quantization error of the cluster
ztab[1:n_cells, 5] <- unlist(outkinit$cent)
for(a in 1: n_cells){
for(b in 1: ncol(x)){
ztab3upc[b, a] <- as.matrix(mean(outkinit$val[[a]][, b], na.rm = T))
rownames(ztab3upc) <- colnames(x)
# Calculating sd
std[b, a] <- as.matrix(stats::sd(outkinit$val[[a]][, b], na.rm = T))
rownames(std) <- colnames(x)
}
}
for (l in 1:length(ztab3up)){
ztab3upc <- cbind(ztab3upc, ztab3up[[l]])
#NOTE: deeper levels carry centroid means here as well, since per-level
#standard deviations are not computed beyond the first level
std <- cbind(std, ztab3up[[l]])
}
ztab[(n_cells + 1):sum(n_cells^(1:zdepth)), 1] <- ztab11
ztab[(n_cells + 1):sum(n_cells^(1:zdepth)), 2] <- ztab12
ztab[(n_cells + 1):sum(n_cells^(1:zdepth)), 3] <- ztab13
ztab[(n_cells + 1):sum(n_cells^(1:zdepth)), 4] <- ztab2
ztab[, 6: ncol(ztab)] <- t(ztab3upc)
ztab[(n_cells + 1): sum(n_cells^(1: zdepth)), 5] <- ztabn
names(ztab) <- c("Segment.Level", "Segment.Parent", "Segment.Child",
"n", "Quant.Error", colnames(x))
} else {
# depth = 1: single-level output
ztab <- data.frame(matrix(0, nrow = n_cells_optimal, ncol = (ncol(x) + 5)))
ztab[, 1] <- rep(1, n_cells_optimal)
ztab[, 2] <- rep(1, n_cells_optimal)
ztab[, 3] <- 1:n_cells_optimal
ztab[, 4] <- unlist(outkinit$nsize)
ztab[, 5] <- unlist(outkinit$cent)
ztab_mean <- t(sapply(outkinit$val, colMeans,
na.rm = T))
std <- t(sapply(outkinit$val, colSd,
na.rm = T))
if(quant_method == "kmeans"){
ztab[, 6:ncol(ztab)] <- ztab_mean # mean values
} else {
ztab[, 6:ncol(ztab)] <- outkinit[["sum_val"]] # Medoid/median values
}
names(ztab) <- c("Segment.Level", "Segment.Parent", "Segment.Child",
"n", "Quant.Error", colnames(x))
}
if(depth > 1){
meanCol = t(ztab3upc)
std = t(std)
}else{
meanCol = ztab_mean
std = data.frame(std)
}
## Calculate compress percentage
compression_summary <- ztab %>%
dplyr::group_by(Segment.Level) %>%
dplyr::summarise(noOfCells = sum(!is.na(Quant.Error)),
noOfCellsBelowQuantizationError = sum(Quant.Error < quant.err, na.rm = TRUE)) %>%
dplyr::mutate(percentOfCellsBelowQuantizationErrorThreshold = (noOfCellsBelowQuantizationError/noOfCells))
colnames(compression_summary)[1] <- "segmentLevel"
if(any(is.nan(compression_summary$percentOfCellsBelowQuantizationErrorThreshold))){
compression_summary <- compression_summary[stats::complete.cases(compression_summary),]
}
compression_summary$parameters <- paste("n_cells:", n_cells_optimal, "quant.err:", quant.err,
"distance_metric:", distance_metric[1], "error_metric:", error_metric[1],
"quant_method:", quant_method[1])
return(list(clusters = initclust, nodes.clust = rescl, idnodes = resid,
error.quant = resm, max_QE = maxqe, plt.clust = resplt, summary = ztab, compression_summary = compression_summary,
meanClust = meanCol, sdClust = std,mean_QE = meanqe ))
}