R/VQ_codebookSplit.R
In muHVT: Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

Documented in VQ_codebookSplit

requireNamespace("deldir")   

#======================================================================================
# Global variables
#======================================================================================

#quant.err <- 80
errorCode <- 0

#______________________________________________________________________________________
# Splitting the codebooks
#______________________________________________________________________________________

split_codevec = function(codeVec,epsilon){
  #errorValue=errorValue/100
  codeVec <- plyr::llply(codeVec, .fun= function(X) return(list((1+epsilon)*X, (1-epsilon)*X)))
  return(codeVec)  
}

#_____________________________________________________________________________________
# Checks if the MAPE for a given cluster and its codebook is lesser/greater
# than the specified treshold
#_____________________________________________________________________________________

findMape <- function(cluster, centroid){
  mape <- 1/nrow(cluster) * sum(apply(cluster, MARGIN = 1, FUN= function(X){ return(abs(sum((X-centroid), na.rm=TRUE)/length(X)))}))
  return(mape)
}

mapeTest = function(cluster, centroid,errorValue){
  
  if(is.null(dim(cluster))){
    return(TRUE)
  }
  
  mape <- findMape(cluster, centroid)
  #print(paste("Quantization error:", mape))
  
  if(mape < errorValue){
    return(TRUE)
  } else {
    return(FALSE)
  }
}

#_____________________________________________________________________________________
# allocate members to their centroids
#_____________________________________________________________________________________
findMembers <- function(dataset, centroid){
  
  # compute the distance of every point in the data from a centroid
  #err <- lapply(centroid, FUN = function(X){ return(apply(dataset, MARGIN=1, FUN= function(Y)sqrt(sum((Y-X)^2))))})
  err = lapply(centroid, FUN=function(x) sqrt(colSums((t(dataset) - x)^2)))
  err <- do.call(rbind, err)
  
  # Allocate the membership of a datum to its closest centroid
  clust.list <- apply(err, MARGIN=2, FUN = function(X) { return(which(X==min(X), arr.ind=T))})
  
  return(clust.list)
}

#_____________________________________________________________________________________
# Function to assign cluster membership of data points
#_____________________________________________________________________________________

findSeque <- function(dataset, clusters){
  
  tryCatch({
    rownames(dataset) <- NULL
    clusters <- lapply(clusters, unname)
    clusters <- lapply(clusters, as.matrix)
    clusters <- lapply(clusters , function(X){colnames(X)<- colnames(dataset); return(X)})
    oneRowIndices <- which(lapply(lapply(clusters, dim), function(X)return(X[2]))==1)
    clusters[oneRowIndices] <- lapply(clusters[oneRowIndices], t)
    pointPos <- lapply(clusters, FUN=function(X)rownames(plyr::match_df(dataset, as.data.frame(X))))
    pointPos <- lapply(pointPos, as.numeric)
    sequeDf <- as.data.frame(1:nrow(dataset))
    lapply(1:length(pointPos), FUN=function(X){sequeDf[pointPos[[X]],]<<- X})
    clust.list <- sequeDf[,1]
  },
           error=function(err){print(err)}, 
           finally={})
  
  return(clust.list)
}

#_____________________________________________________________________________________
# allocate members to their centroids
#_____________________________________________________________________________________

allocateMembers <- function(dataset, centroid){ 
  
  clust.list <- findMembers(dataset, centroid)
  
  return(lapply(unique(clust.list), FUN=function(X)which(clust.list==X)))
}

#_____________________________________________________________________________________
# Nearest neighbour condition - LBGVQ function
#_____________________________________________________________________________________

lbgVQ = function(dataset,quant.err,epsilon){
  
  # mean of the entire data set
  cnew <- list(apply(dataset, MARGIN = 2, FUN= function(X)mean(X, na.rm=TRUE)))
  cFin <- c()
  finalClusters <- c()
  
  # Set the quantization error
  mape <- 1/nrow(dataset) * sum(apply(dataset, MARGIN = 1, FUN= function(X){ return(abs(sum((X-unlist(cnew)), na.rm=TRUE))/length(X))}))
  if(mape != 0){
    errorValue <- mape*quant.err
  } else {
    errorValue <- quant.err
  }
  
  # clust.list is the indices of the members of a cluster
  clust.list <- list(1:nrow(dataset))
  
  while(T){
    
    # cluster is the actual data points in the cluster
    cluster <- lapply(clust.list, FUN=function(Y) { dataset[Y,] })
    
    # codesToSplit is the codebooks which have q error greater than the treshold 
    tryCatch({codesToSplit <- cnew[which(!unlist(lapply(1:length(cluster), FUN= function(i){mapeTest(cluster[[i]], cnew[[i]],errorValue)})))]}, 
             error=function(err){print(err)}, 
             finally={})
    
    # split the codebooks
    tryCatch({cSplit <- unlist(split_codevec(codesToSplit,epsilon), recursive=F)}, 
             error=function(err){print(err)}, 
             finally={})
    
    # remove the final codebook from the list whose q error is lesser than the treshold
    doneFlags <- !(cnew %in% codesToSplit)
    cFin <- c(cFin, cnew[doneFlags])
    if(length(which(doneFlags))!=0){
      
      # Update the finalClusters
      finalClusters <- c(finalClusters, cluster[which(!cnew %in% codesToSplit)])
      cluster[which(!cnew %in% codesToSplit)] <- NULL
      
      
      # Remove the clustered data from the original dataset
      dataset <- dataset[-unlist(clust.list[which(doneFlags)]),]
      
      # Start afresh
      cnew <- list(apply(dataset, MARGIN = 2, FUN= mean))
      clust.list <- list(1:nrow(dataset))
      
      if(length(codesToSplit) == 0){
        break
      }
      next
    }
    
    # update the clust.list with the new split codebooks and re-compute the mean codebook
    tryCatch({clust.list <- allocateMembers(dataset, cSplit)}, 
             error=function(err){print(err)}, 
             finally={})
    
    cnew <- lapply(clust.list, 
                   FUN = function(X) 
                     if(length(X)==1){ return(dataset[X,]) 
                     } else { return(apply(dataset[unlist(X),], MARGIN=2, FUN=mean))} )
  }
  
  cFin <- lapply(cFin, FUN = as.numeric)
  
  return(list(clusters = finalClusters, codebooks = cFin,errorValue=errorValue))
}

#_____________________________________________________________________________________
# Functions to merge the clusters that satisfy the quantization treshold
#_____________________________________________________________________________________

euDistance <- function(index, codebooks){
  return((codebooks[[index[1]]] - codebooks[[index[2]]])^2)
}

#_____________________________________________________________________________________
# Function checks if the clusters should be merged based on the quantization error
#_____________________________________________________________________________________

shouldMergeClusters <- function(X, clusters,errorValue){ 
  
  # merge the two clusters and compute the new mean
  newCluster <- do.call(rbind, list(clusters[[X[1]]], clusters[[X[2]]]))
  meanCb <- apply(newCluster, MARGIN = 2, FUN= mean)
  
  # check for the quantization error
  tryCatch({mapeFlag <- mapeTest(newCluster, meanCb,errorValue)}, 
           error=function(err){print(err)}, 
           finally={})
  
  return(mapeFlag)
}


mergeClusters <- function(codebooks, clusters,errorValue){
  
  # find the index of the centroids that are closest (X)
  pairs <- t(utils::combn(length(codebooks),2))
  
  tryCatch({euD <- sqrt(colSums(apply(pairs, MARGIN=1, FUN=function(X){euDistance(X, codebooks)})))},
           error=function(err){print(err)}, 
           finally={})
  
  X<- pairs[which(euD==min(euD)),]
  
  
  if(shouldMergeClusters(X, clusters,errorValue)){
    # merge the two clusters and compute the new mean
    newCluster <- do.call(rbind, list(clusters[[X[1]]], clusters[[X[2]]]))
    meanCb <- list(apply(newCluster, MARGIN = 2, FUN= mean))
    
    # Update the codebooks and the clusters
    clusters[[X[1]]] <- newCluster
    clusters[[X[2]]] <- NULL
    codebooks[[X[1]]] <- meanCb[[1]]
    codebooks[[X[2]]] <- NULL
    
    # recurse to find the next merge-able codebook pair
    mergeClusters(codebooks, clusters,errorValue)
  } else {
    return(list(clusters =clusters, codebooks = codebooks,errorValue=errorValue))
  }
  
}

#_______________________________________________________________________________________
# Main Function to quantize
#______________________________________________________________________________________

vectorQuantize <- function(dataset,quant.err,epsilon){
  
  tryCatch({lbgvq <- lbgVQ(dataset,quant.err,epsilon)}, 
           error=function(err){print(err)}, 
           finally={})
  
  if(length(lbgvq$codebooks) > 1){
    
    tryCatch({model <- mergeClusters(lbgvq$codebooks, lbgvq$clusters,lbgvq$errorValue)}, 
             error=function(err){print(err)},
             finally={})
    

    return(model)
    
  } else {
    return(lbgvq)
  }
  
}

#==================================================================
# Function to Trilaterate
# Input : a distance matrix or a 'dist' object
# Output : a set of trilaterated points
#==================================================================

# trilaterate <- function(d){
#   if(class(d)=='dist'){
#     d <- as.matrix(d)
#   }

trilaterate <- function(d){
  # if(is(d, "dist")){
  if(inherits(d, "dist")){
    d <- as.matrix(d)
  }
  

  # p1, p2, p3 are the initial triangulation points
  p1 <- c(0,0)
  p2 <- c(0, d[1,2])
  
  if(dim(d)[1]==3){  # If there are only 3 points to trilaterate
    
    p3y <- (d[1,3]^2 + d[1,2]^2 - d[2,3]^2)/(2*d[1,2])
    p3x <- sqrt(d[1,3]^2 - p3y^2)
    p3 <- c(p3x,p3y)  
    pdash <- rbind(p1,p2,p3)	
    colnames(pdash) = c('x', 'y')
    return(pdash)
    
  } else if(dim(d)[1] > 3){ # If there are more than 3 points to trilaterate
    
    p3y <- (d[1,3]^2 + d[1,2]^2 - d[2,3]^2)/(2*d[1,2])
    p3x <- sqrt(d[1,3]^2 - p3y^2)
    p3 <- c(p3x,p3y)
    
    # iterator
    i <- 4:dim(d)[1]
    # compute the remaining points
    pdash <- lapply(i, FUN= function(ind) {
      y <- (d[1,ind]^2 + d[1,2]^2 - d[2,ind]^2)/(2*d[1,2])
      x <- (d[1,ind]^2 + d[1,3]^2 - d[3,ind]^2 - 2*y*p3[2])/(2*p3[1])
      return(c(x, y))
    })
    
    pdash<- do.call(rbind, pdash)
    pdash <- rbind(p1,p2, p3,pdash)
    colnames(pdash) = c('x', 'y')
    return(pdash)
  } else {  # If there are one or 2 points to trilaterate
    pdash <- rbind(p1,p2)
    colnames(pdash) = c('x', 'y')
    return(pdash)
  }
}



#===========================================================================
# Main function to call
#===========================================================================







#' VQ_codebookSplit
#' 
#' Vector Quantization by codebook split method
#' 
#' Performs Vector Quantization by codebook split method. Initially, the entire
#' dataset is considered to be one cluster where the codebook is the mean of
#' the cluster. The quantization criteria is checked and the codebook is split
#' such that the new codebooks are (codebook+epsilon) and (codebook-epsilon).
#' The observations are reassigned to these new codebooks based on the nearest
#' neighbour condition and the means recomputed for the new clusters. This is
#' done iteratively until all the clusters meet the quantization criteria.
#' 
#' @param dataset Matrix. A matrix of multivariate data. Each row corresponds
#' to an observation, and each column corresponds to a variable. Missing values
#' are not accepted.
#' @param quant.err Numeric. The quantization error for the algorithm.
#' @param epsilon Numeric. The value to offset the codebooks during the
#' codebook split. Default is NULL, in which case the value is set to quant.err
#' parameter.
#' @return \item{clusters}{ List. A list showing each ID assigned to a cluster.
#' } \item{nodes.clust}{ List. A list corresponding to nodes' details. }
#' \item{idnodes}{ List. A list of ID and segments similar to
#' \code{nodes.clust} with additional columns for nodes ID. }
#' \item{error.quant}{ List. A list of quantization error for all levels and
#' nodes. } \item{plt.clust}{ List. A list of logical values indicating if the
#' quantization error was met. } \item{summary}{ Summary. Output table with
#' summary. }
#' @author Sangeet Moy Das <sangeet.das@@mu-sigma.com>
#' @seealso \code{\link{hvtHmap}}
#' @examples
#' 
#' 
#' data("iris",package="datasets")
#' iris <- iris[,1:2]
#' 
#' vqOutput = VQ_codebookSplit(iris, quant.err = 0.5)
#' 
#' @export VQ_codebookSplit
#' @keywords internal
VQ_codebookSplit <- function(dataset,quant.err=0.5,epsilon=NULL){
  
  model <- list()
  
  if(is.null(epsilon)){
    epsilon = quant.err
  }
  
  if(!is.null(dataset)){
    # Run VQ
    tryCatch({system.time(model <- vectorQuantize(dataset,quant.err,epsilon))}, 
             error=function(err){print(err)}, 
             finally={})
    
  } else {
    #flog.error("Data is missing", name=logNS)	      
  }
  
  if(exists('model')){	  
    
    # Reduce the dimensionality of the codebooks to 2
    codebooks <- do.call(rbind, model$codebooks)
    distances <- stats::dist(codebooks)
    if(length(distances) != 0){
      tryCatch({twoDCodebooks <- trilaterate(distances)}, error=function(err){print(err)}, finally={})
    } else {
      twoDCodebooks <- model$codebooks[[1]][1:2]
    }
    mapes <- lapply(1:length(model$clusters), FUN=function(i){findMape(model$clusters[[i]],model$codebooks[[i]])})
	
    ztab <- data.frame('Segment.Level' = rep(1, length(model$clusters)), 'segment.Parent' = rep(1, length(model$clusters)), 'Segment.Child'= 1:length(model$clusters), 'n'=  unlist(lapply(model$clusters, nrow)),'Quant.Error'= unlist(mapes))
    ztab <- cbind(ztab, do.call(rbind, model$codebooks))
    
    idnodes <- lapply(model$clusters, rownames)
    idnodes <- list(lapply(1:length(idnodes), FUN=function(i){data.frame('tet..k..'=idnodes[[i]], 'X1'=rep(1, length(idnodes[[i]])), 'X1.1'=rep(1, length(idnodes[[i]])), 'k'= i)}))
    
    return(list('clusters'=model$clusters, 'nodes.clust'= list(model$clusters), 'idnodes'= idnodes, 'error.quant'= list(mapes), plt.clust= list(mapes<model$errorValue), 'summary'= ztab))
    
       
  } else {
    return('error')
  }
  
}

Any scripts or data that you put into this service are public.

muHVT documentation built on March 7, 2023, 6:38 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

muHVT
Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

R/VQ_codebookSplit.R
In muHVT: Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

Defines functions VQ_codebookSplit trilaterate vectorQuantize mergeClusters shouldMergeClusters euDistance lbgVQ allocateMembers findSeque findMembers mapeTest findMape split_codevec

Documented in VQ_codebookSplit

Try the muHVT package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

muHVT Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

R/VQ_codebookSplit.R In muHVT: Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

Defines functions VQ_codebookSplit trilaterate vectorQuantize mergeClusters shouldMergeClusters euDistance lbgVQ allocateMembers findSeque findMembers mapeTest findMape split_codevec

Documented in VQ_codebookSplit

Try the muHVT package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

muHVT
Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis

R/VQ_codebookSplit.R
In muHVT: Constructing Hierarchical Voronoi Tessellations and Overlay Heatmap for Data Analysis