# R/KMeansClustering.R

#' K-Means Clustering
#'
#' This function can be used to train a k-means clustering  machine learning model in R.
#'
#' @details
#' K-Means Clustering is a widely used unsupervised machine learning algorithm. This function can be used to perform unsupervised clustering or labelling based on the k-means algorithm. This package imports the mini-batch k-means function from
#' the ClusterR package, which is developed and written in C++; therefore it is computationally very fast.
#'
#' @export

KMeansClustering <- R6Class("KMeansClustering", public = list(

  #' @field clusters the number of clusters to fit; this is a hyperparameter and must be tuned.
  clusters = NA,
  #' @field b_size the size of the mini batches used while fitting the model.
  b_size = 10,
  #' @field num_rep the number of times the algorithm is run, each time with different randomly chosen centroid seeds.
  num_rep = 1,
  #' @field max_iterations the maximum number of iterations (epochs) performed for clustering.
  max_iterations = 100,
  #' @field init_fraction the fraction of the data used to initialise the random centroids; it applies if initializer is set to kmeans++. It shall be a float in the range 0 to 1.
  init_fraction = 1,
  #' @field initializer the method used to initialise the centroids. It can take the values kmeans++, optimal_init or quantile_init; usually kmeans++ is used.
  initializer = "kmeans++",
  #' @field early_stop_iterations the number of additional iterations the algorithm keeps running after the best within-cluster-sum-of-squared-error has been found.
  early_stop_iterations = 10,
  #' @field verbose logical, either TRUE or FALSE, indicating whether progress is printed to the console.
  verbose = FALSE,
  #' @field centroids a matrix of initial cluster centroids. The number of columns shall equal the number of features in the data and the number of rows shall equal the number of clusters.
  centroids = NULL,
  #' @field tolerance a float. If, for an iteration with iteration > 1 and iteration < max_iterations, the tolerance is greater than the squared norm of the centroids, the k-means algorithm is considered to have converged.
  tolerance = 1e-04,
  #' @field tolerance_optimal_init the tolerance value for the optimal_init initializer; a greater value indicates well separated clusters.
  tolerance_optimal_init = 0.3,
  #' @field seed an integer value for the random number generator.
  seed = 1,
  #' @field model used internally to store the fitted model; `NA` until `fit()` has been called.
  model = NA,
  #' @field max_clusters either a single numeric value, or a contiguous or non-contiguous numeric vector specifying the search space for the number of clusters.
  max_clusters = NA,

  #' @details
  #' Create a new `KMeansClustering` object.
  #'
  #' @param clusters It shall be of type numeric, the number of clusters to fit. May be omitted when `fit()` is called with `find_optimal = TRUE`.
  #' @param b_size It shall be of type numeric, the mini batch size passed to the mini-batch k-means implementation.
  #' @param num_rep It shall be of type integer, the number of times the algorithm is run, each time with different randomly chosen centroid seeds.
  #' @param max_iterations It shall be of type integer, the maximum number of iterations to be performed.
  #' @param init_fraction It shall be of type float in the range 0 to 1, the fraction of the data used to initialise the random centroids; it applies if initializer is set to kmeans++.
  #' @param initializer It shall be of type character, the initializer for the centroids; the most common is kmeans++.
  #' @param early_stop_iterations It shall be of type integer, the number of iterations the algorithm keeps running after the best within-cluster-sum-of-squared-error has been achieved.
  #' @param verbose It shall be of type logical, either TRUE or FALSE, indicating whether progress is printed to the console during calculations.
  #' @param centroids It shall be a matrix with entries of type integer or float, the initial cluster centroids.
  #' @param tolerance It shall be of type float. If, for an iteration with iteration > 1 and iteration < max_iterations, the tolerance is greater than the squared norm of the centroids, the k-means algorithm is considered to have converged.
  #' @param tolerance_optimal_init It shall be of type float, the tolerance value for the optimal_init initializer; a greater value indicates well separated clusters.
  #' @param seed It shall be of type integer, the value for the random number generator.
  #' @param max_clusters Either a single numeric value, or a contiguous or non-contiguous numeric vector specifying the search space for the number of clusters.
  #' @return A `KMeansClustering` object.
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 3)),
  #'              replicate(30, rnorm(1e4, -1)),
  #'              replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  initialize = function(clusters,
                        b_size = 10,
                        num_rep = 1,
                        max_iterations = 100,
                        init_fraction = 1,
                        initializer = "kmeans++",
                        early_stop_iterations = 10,
                        verbose = FALSE,
                        centroids = NULL,
                        tolerance = 1e-04,
                        tolerance_optimal_init = 0.3,
                        seed = 1,
                        max_clusters = NA) {
    # Only overwrite a field when the caller supplied the argument, so the
    # field defaults declared on the class stay in effect otherwise.
    if (!missing(clusters)) self$clusters <- clusters
    if (!missing(b_size)) self$b_size <- b_size
    # Store the supplied value (previously a literal 1 was stored, silently
    # discarding whatever the caller passed).
    if (!missing(num_rep)) self$num_rep <- num_rep
    if (!missing(max_iterations)) self$max_iterations <- max_iterations
    if (!missing(init_fraction)) self$init_fraction <- init_fraction
    if (!missing(initializer)) self$initializer <- initializer
    if (!missing(early_stop_iterations)) {
      self$early_stop_iterations <- early_stop_iterations
    }
    if (!missing(verbose)) self$verbose <- verbose
    if (!missing(centroids)) self$centroids <- centroids
    if (!missing(tolerance)) self$tolerance <- tolerance
    if (!missing(tolerance_optimal_init)) {
      self$tolerance_optimal_init <- tolerance_optimal_init
    }
    if (!missing(seed)) self$seed <- seed
    if (!missing(max_clusters)) self$max_clusters <- max_clusters
    superml::check_package("ClusterR")
  },

  #' @details
  #' Fits the KMeansClustering model and stores it in the `model` field.
  #'
  #' @param X_data X_data shall be either a data.frame or a matrix containing the features of interest.
  #' @param y y is unused and defaults to NULL; it is kept only so the signature matches the general superml `fit(X, y)` convention.
  #' @param find_optimal find_optimal shall be logical; when TRUE the number of clusters is searched automatically over `max_clusters` and overwrites the `clusters` field.
  #' @return The fitted model object, invisibly. The method is called for its side effect of setting the `model` field.
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 3)),
  #'              replicate(30, rnorm(1e4, -1)),
  #'              replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  #' km$fit(data_set, find_optimal = FALSE)
  fit = function(X_data, y = NULL, find_optimal = FALSE) {

    # X_data must be a matrix or a data.frame; `||` because this is a scalar
    # condition inside `if`.
    if (!(is.matrix(X_data) || is.data.frame(X_data))) {
      stop("The Data Set provided Shall be a R Data Frame or Matrix")
    }

    if (isTRUE(find_optimal)) {
      # The search needs a search space; fail early with a clear message
      # instead of passing NA down to ClusterR.
      if (is.null(self$max_clusters) || anyNA(self$max_clusters)) {
        stop("max_clusters must be set when find_optimal = TRUE")
      }
      message('Working to Find the Optimal Clusters Based on Variance Explianed')
      fm <- ClusterR::Optimal_Clusters_KMeans(X_data, max_clusters = self$max_clusters)
      # Picks the position of the largest criterion value, ignoring the first
      # entry, and shifts it back by one to undo the dropped element.
      # NOTE(review): this index-to-cluster-count mapping presumably assumes
      # max_clusters is a single number (search space 1..max_clusters);
      # confirm the behaviour for a non-contiguous max_clusters vector.
      self$clusters <- which.max(fm[-1]) + 1
    }

    # Without a usable cluster count the fit cannot proceed.
    if (is.null(self$clusters) || anyNA(self$clusters)) {
      stop("clusters must be set, or fit() must be called with find_optimal = TRUE")
    }

    message(sprintf('Using %d Clusters to fit to the Gievn data:: Learning in Progress', self$clusters))
    self$model <- ClusterR::MiniBatchKmeans(
      data = X_data,
      clusters = self$clusters,
      batch_size = self$b_size,
      num_init = self$num_rep,
      max_iters = self$max_iterations,
      init_fraction = self$init_fraction,
      initializer = self$initializer,
      early_stop_iter = self$early_stop_iterations,
      verbose = self$verbose,
      CENTROIDS = self$centroids,
      tol = self$tolerance,
      tol_optimal_init = self$tolerance_optimal_init,
      seed = self$seed
    )
    # Same value the trailing assignment used to return invisibly, made explicit.
    invisible(self$model)
  },

  #' @details
  #' Returns the prediction on the provided data, using the centroids of the fitted model.
  #'
  #' @param X_data it shall be an R Data Frame or Matrix
  #' @return a vector containing predictions
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 2)),
  #'              replicate(30, rnorm(1e4, -1)),
  #'              replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  #' km$fit(data_set, find_optimal = FALSE)
  #' preds <- km$predict(data_set)
  predict = function(X_data) {
    # `model` is NA until fit() has run; give a clear error instead of the
    # opaque "$ operator is invalid for atomic vectors".
    if (!is.list(self$model) || is.null(self$model$centroids)) {
      stop("The model has not been fitted yet; call fit() before predict()")
    }
    ClusterR::predict_MBatchKMeans(X_data, CENTROIDS = self$model$centroids)
  })

)
# MalikShahidSultan/machinelearning documentation built on May 9, 2022, 8:32 p.m.