R/VQ_codebookSplit.R

Defines functions VQ_codebookSplit trilaterate vectorQuantize mergeClusters shouldMergeClusters euDistance lbgVQ allocateMembers findSeque findMembers mapeTest findMape split_codevec

Documented in VQ_codebookSplit

#' @importFrom utils combn
requireNamespace("deldir")

# ======================================================================================
# Global variables
# ======================================================================================

# quant.err <- 80
errorCode <- 0

# ______________________________________________________________________________________
# Splitting the codebooks
# ______________________________________________________________________________________

split_codevec <- function(codeVec, epsilon) {
  # errorValue=errorValue/100
  codeVec <- plyr::llply(codeVec, .fun = function(X) {
    return(list((1 + epsilon) * X, (1 - epsilon) * X))
  })
  return(codeVec)
}

# _____________________________________________________________________________________
# Checks if the MAPE for a given cluster and its codebook is lesser/greater
# than the specified treshold
# _____________________________________________________________________________________

findMape <- function(cluster, centroid) {
  mape <- 1 / nrow(cluster) * sum(apply(cluster, MARGIN = 1, FUN = function(X) {
    return(abs(sum((X - centroid), na.rm = TRUE) / length(X)))
  }))
  return(mape)
}

mapeTest <- function(cluster, centroid, errorValue) {
  if (is.null(dim(cluster))) {
    return(TRUE)
  }

  mape <- findMape(cluster, centroid)
  # print(paste("Quantization error:", mape))

  if (mape < errorValue) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}

# _____________________________________________________________________________________
# allocate members to their centroids
# _____________________________________________________________________________________
findMembers <- function(dataset, centroid) {
  # compute the distance of every point in the data from a centroid
  # err <- lapply(centroid, FUN = function(X){ return(apply(dataset, MARGIN=1, FUN= function(Y)sqrt(sum((Y-X)^2))))})
  err <- lapply(centroid, FUN = function(x) sqrt(colSums((t(dataset) - x)^2)))
  err <- do.call(rbind, err)

  # Allocate the membership of a datum to its closest centroid
  clust.list <- apply(err, MARGIN = 2, FUN = function(X) {
    return(which(X == min(X), arr.ind = TRUE))
  })

  return(clust.list)
}

# _____________________________________________________________________________________
# Function to assign cluster membership of data points
# _____________________________________________________________________________________

findSeque <- function(dataset, clusters) {
  tryCatch(
    {
      rownames(dataset) <- NULL
      clusters <- lapply(clusters, unname)
      clusters <- lapply(clusters, as.matrix)
      clusters <- lapply(clusters, function(X) {
        colnames(X) <- colnames(dataset)
        return(X)
      })
      oneRowIndices <- which(lapply(lapply(clusters, dim), function(X) {
        return(X[2])
      }) == 1)
      clusters[oneRowIndices] <- lapply(clusters[oneRowIndices], t)
      pointPos <- lapply(clusters, FUN = function(X) rownames(plyr::match_df(dataset, as.data.frame(X))))
      pointPos <- lapply(pointPos, as.numeric)
      sequeDf <- as.data.frame(1:nrow(dataset))
      lapply(1:length(pointPos), FUN = function(X) {
        sequeDf[pointPos[[X]], ] <<- X
      })
      clust.list <- sequeDf[, 1]
    },
    error = function(err) {
      warning(err)
    },
    finally = {}
  )

  return(clust.list)
}

# _____________________________________________________________________________________
# allocate members to their centroids
# _____________________________________________________________________________________

allocateMembers <- function(dataset, centroid) {
  clust.list <- findMembers(dataset, centroid)

  return(lapply(unique(clust.list), FUN = function(X) which(clust.list == X)))
}

# _____________________________________________________________________________________
# Nearest neighbour condition - LBGVQ function
# _____________________________________________________________________________________

lbgVQ <- function(dataset, quant.err, epsilon) {
  # mean of the entire data set
  cnew <- list(apply(dataset, MARGIN = 2, FUN = function(X) mean(X, na.rm = TRUE)))
  cFin <- c()
  finalClusters <- c()

  # Set the quantization error
  mape <- 1 / nrow(dataset) * sum(apply(dataset, MARGIN = 1, FUN = function(X) {
    return(abs(sum((X - unlist(cnew)), na.rm = TRUE)) / length(X))
  }))
  if (mape != 0) {
    errorValue <- mape * quant.err
  } else {
    errorValue <- quant.err
  }

  # clust.list is the indices of the members of a cluster
  clust.list <- list(1:nrow(dataset))

  while (T) {
    # cluster is the actual data points in the cluster
    cluster <- lapply(clust.list, FUN = function(Y) {
      dataset[Y, ]
    })

    # codesToSplit is the codebooks which have q error greater than the treshold
    tryCatch(
      {
        codesToSplit <- cnew[which(!unlist(lapply(1:length(cluster), FUN = function(i) {
          mapeTest(cluster[[i]], cnew[[i]], errorValue)
        })))]
      },
      error = function(err) {
        warning(err)
      },
      finally = {}
    )

    # split the codebooks
    tryCatch(
      {
        cSplit <- unlist(split_codevec(codesToSplit, epsilon), recursive = FALSE)
      },
      error = function(err) {
        warning(err)
      },
      finally = {}
    )

    # remove the final codebook from the list whose q error is lesser than the treshold
    doneFlags <- !(cnew %in% codesToSplit)
    cFin <- c(cFin, cnew[doneFlags])
    if (length(which(doneFlags)) != 0) {
      # Update the finalClusters
      finalClusters <- c(finalClusters, cluster[which(!cnew %in% codesToSplit)])
      cluster[which(!cnew %in% codesToSplit)] <- NULL


      # Remove the clustered data from the original dataset
      dataset <- dataset[-unlist(clust.list[which(doneFlags)]), ]

      # Start afresh
      cnew <- list(apply(dataset, MARGIN = 2, FUN = mean))
      clust.list <- list(1:nrow(dataset))

      if (length(codesToSplit) == 0) {
        break
      }
      next
    }

    # update the clust.list with the new split codebooks and re-compute the mean codebook
    tryCatch(
      {
        clust.list <- allocateMembers(dataset, cSplit)
      },
      error = function(err) {
        warning(err)
      },
      finally = {}
    )

    cnew <- lapply(clust.list,
      FUN = function(X) {
        if (length(X) == 1) {
          return(dataset[X, ])
        } else {
          return(apply(dataset[unlist(X), ], MARGIN = 2, FUN = mean))
        }
      }
    )
  }

  cFin <- lapply(cFin, FUN = as.numeric)

  return(list(clusters = finalClusters, codebooks = cFin, errorValue = errorValue))
}

# _____________________________________________________________________________________
# Functions to merge the clusters that satisfy the quantization treshold
# _____________________________________________________________________________________

euDistance <- function(index, codebooks) {
  return((codebooks[[index[1]]] - codebooks[[index[2]]])^2)
}

# _____________________________________________________________________________________
# Function checks if the clusters should be merged based on the quantization error
# _____________________________________________________________________________________

shouldMergeClusters <- function(X, clusters, errorValue) {
  # merge the two clusters and compute the new mean
  newCluster <- do.call(rbind, list(clusters[[X[1]]], clusters[[X[2]]]))
  meanCb <- apply(newCluster, MARGIN = 2, FUN = mean)

  # check for the quantization error
  tryCatch(
    {
      mapeFlag <- mapeTest(newCluster, meanCb, errorValue)
    },
    error = function(err) {
      warning(err)
    },
    finally = {}
  )

  return(mapeFlag)
}


mergeClusters <- function(codebooks, clusters, errorValue) {
  # find the index of the centroids that are closest (X)
  pairs <- t(combn(length(codebooks), 2))

  tryCatch(
    {
      euD <- sqrt(colSums(apply(pairs, MARGIN = 1, FUN = function(X) {
        euDistance(X, codebooks)
      })))
    },
    error = function(err) {
      warning(err)
    },
    finally = {}
  )

  X <- pairs[which(euD == min(euD)), ]


  if (shouldMergeClusters(X, clusters, errorValue)) {
    # merge the two clusters and compute the new mean
    newCluster <- do.call(rbind, list(clusters[[X[1]]], clusters[[X[2]]]))
    meanCb <- list(apply(newCluster, MARGIN = 2, FUN = mean))

    # Update the codebooks and the clusters
    clusters[[X[1]]] <- newCluster
    clusters[[X[2]]] <- NULL
    codebooks[[X[1]]] <- meanCb[[1]]
    codebooks[[X[2]]] <- NULL

    # recurse to find the next merge-able codebook pair
    mergeClusters(codebooks, clusters, errorValue)
  } else {
    return(list(clusters = clusters, codebooks = codebooks, errorValue = errorValue))
  }
}

# _______________________________________________________________________________________
# Main Function to quantize
# ______________________________________________________________________________________

vectorQuantize <- function(dataset, quant.err, epsilon) {
  tryCatch(
    {
      lbgvq <- lbgVQ(dataset, quant.err, epsilon)
    },
    error = function(err) {
      warning(err)
    },
    finally = {}
  )

  if (length(lbgvq$codebooks) > 1) {
    tryCatch(
      {
        model <- mergeClusters(lbgvq$codebooks, lbgvq$clusters, lbgvq$errorValue)
      },
      error = function(err) {
        warning(err)
      },
      finally = {}
    )


    return(model)
  } else {
    return(lbgvq)
  }
}

# ==================================================================
# Function to Trilaterate
# Input : a distance matrix or a 'dist' object
# Output : a set of trilaterated points
# ==================================================================

# trilaterate <- function(d){
#   if(class(d)=='dist'){
#     d <- as.matrix(d)
#   }

trilaterate <- function(d) {
  # if(is(d, "dist")){
  if (inherits(d, "dist")) {
    d <- as.matrix(d)
  }


  # p1, p2, p3 are the initial triangulation points
  p1 <- c(0, 0)
  p2 <- c(0, d[1, 2])

  if (dim(d)[1] == 3) { # If there are only 3 points to trilaterate

    p3y <- (d[1, 3]^2 + d[1, 2]^2 - d[2, 3]^2) / (2 * d[1, 2])
    p3x <- sqrt(d[1, 3]^2 - p3y^2)
    p3 <- c(p3x, p3y)
    pdash <- rbind(p1, p2, p3)
    colnames(pdash) <- c("x", "y")
    return(pdash)
  } else if (dim(d)[1] > 3) { # If there are more than 3 points to trilaterate

    p3y <- (d[1, 3]^2 + d[1, 2]^2 - d[2, 3]^2) / (2 * d[1, 2])
    p3x <- sqrt(d[1, 3]^2 - p3y^2)
    p3 <- c(p3x, p3y)

    # iterator
    i <- 4:dim(d)[1]
    # compute the remaining points
    pdash <- lapply(i, FUN = function(ind) {
      y <- (d[1, ind]^2 + d[1, 2]^2 - d[2, ind]^2) / (2 * d[1, 2])
      x <- (d[1, ind]^2 + d[1, 3]^2 - d[3, ind]^2 - 2 * y * p3[2]) / (2 * p3[1])
      return(c(x, y))
    })

    pdash <- do.call(rbind, pdash)
    pdash <- rbind(p1, p2, p3, pdash)
    colnames(pdash) <- c("x", "y")
    return(pdash)
  } else { # If there are one or 2 points to trilaterate
    pdash <- rbind(p1, p2)
    colnames(pdash) <- c("x", "y")
    return(pdash)
  }
}



# ===========================================================================
# Main function to call
# ===========================================================================







#' VQ_codebookSplit
#'
#' Vector Quantization by codebook split method
#'
#' Performs Vector Quantization by codebook split method. Initially, the entire
#' dataset is considered to be one cluster where the codebook is the mean of
#' the cluster. The quantization criteria is checked and the codebook is split
#' such that the new codebooks are (codebook+epsilon) and (codebook-epsilon).
#' The observations are reassigned to these new codebooks based on the nearest
#' neighbour condition and the means recomputed for the new clusters. This is
#' done iteratively until all the clusters meet the quantization criteria.
#'
#' @param dataset Matrix. A matrix of multivariate data. Each row corresponds
#' to an observation, and each column corresponds to a variable. Missing values
#' are not accepted.
#' @param quant.err Numeric. The quantization error for the algorithm.
#' @param epsilon Numeric. The value to offset the codebooks during the
#' codebook split. Default is NULL, in which case the value is set to quant.err
#' parameter.
#' @return \item{clusters}{ List. A list showing each ID assigned to a cluster.
#' } \item{nodes.clust}{ List. A list corresponding to nodes' details. }
#' \item{idnodes}{ List. A list of ID and segments similar to
#' \code{nodes.clust} with additional columns for nodes ID. }
#' \item{error.quant}{ List. A list of quantization error for all levels and
#' nodes. } \item{plt.clust}{ List. A list of logical values indicating if the
#' quantization error was met. } \item{summary}{ Summary. Output table with
#' summary. }
#' @author Sangeet Moy Das <sangeet.das@@mu-sigma.com>
#' @seealso \code{\link{plotHVT}}
#' @examples
#'
#'
#' data("iris", package = "datasets")
#' iris <- iris[, 1:2]
#'
#' vqOutput <- VQ_codebookSplit(iris, quant.err = 0.5)
#'
#' @export VQ_codebookSplit
#' @keywords internal
VQ_codebookSplit <- function(dataset, quant.err = 0.5, epsilon = NULL) {
  model <- list()

  if (is.null(epsilon)) {
    epsilon <- quant.err
  }

  if (!is.null(dataset)) {
    # Run VQ
    tryCatch(
      {
        system.time(model <- vectorQuantize(dataset, quant.err, epsilon))
      },
      error = function(err) {
        warning(err)
      },
      finally = {}
    )
  } else {
    # flog.error("Data is missing", name=logNS)
  }

  if (exists("model")) {
    # Reduce the dimensionality of the codebooks to 2
    codebooks <- do.call(rbind, model$codebooks)
    distances <- stats::dist(codebooks)
    if (length(distances) != 0) {
      tryCatch({
        twoDCodebooks <- trilaterate(distances)
      }, error = function(err) {
        warning(err)
      }, finally = {})
    } else {
      twoDCodebooks <- model$codebooks[[1]][1:2]
    }
    mapes <- lapply(1:length(model$clusters), FUN = function(i) {
      findMape(model$clusters[[i]], model$codebooks[[i]])
    })

    ztab <- data.frame(
      "Segment.Level" = rep(1, length(model$clusters)),
      "segment.Parent" = rep(1, length(model$clusters)),
      "Segment.Child" = 1:length(model$clusters),
      "n" = unlist(lapply(model$clusters, nrow)),
      "Quant.Error" = unlist(mapes)
    )
    ztab <- cbind(ztab, do.call(rbind, model$codebooks))

    idnodes <- lapply(model$clusters, rownames)
    idnodes <- list(lapply(1:length(idnodes), FUN = function(i) {
      data.frame(
        "tet..k.." = idnodes[[i]],
        "X1" = rep(1, length(idnodes[[i]])),
        "X1.1" = rep(1, length(idnodes[[i]])),
        "k" = i
      )
    }))

    return(list(
      "clusters" = model$clusters,
      "nodes.clust" = list(model$clusters),
      "idnodes" = idnodes,
      "error.quant" = list(mapes),
      plt.clust = list(mapes < model$errorValue),
      "summary" = ztab
    ))
  } else {
    return("error")
  }
}

Try the HVT package in your browser

Any scripts or data that you put into this service are public.

HVT documentation built on April 3, 2025, 8:45 p.m.