#' K-Means Clustering
#'
#' This function can be used to train a k-means clustering machine learning
#' model in R.
#'
#' @details
#' K-Means Clustering is a widely used unsupervised machine learning
#' algorithm. This class can be used to perform unsupervised clustering or
#' labelling based on the k-means algorithm. It relies on the mini-batch
#' k-means function from the ClusterR package, which is written in C++ and is
#' therefore computationally very fast.
#'
#' @export
KMeansClustering <- R6Class("KMeansClustering", public = list(
  #' @field clusters the number of clusters; this is a hyperparameter and
  #'   must be tuned.
  clusters = NA,
  #' @field b_size the size of the mini batches used while fitting the model.
  b_size = 10,
  #' @field num_rep the number of times the algorithm is run, each time with
  #'   different randomly chosen centroid seeds.
  num_rep = 1,
  #' @field max_iterations the maximum number of epochs performed for
  #'   clustering.
  max_iterations = 100,
  #' @field init_fraction the fraction of the data used to initialise the
  #'   random centroid points; it applies if initializer is set to kmeans++.
  #'   It shall be a float in the range 0 to 1.
  init_fraction = 1,
  #' @field initializer the method used to initialise the centroids. It can
  #'   take the values kmeans++, optimal_init or quantile_init; usually
  #'   kmeans++ is used.
  initializer = "kmeans++",
  #' @field early_stop_iterations the number of further iterations the
  #'   algorithm keeps running after finding the best
  #'   within-cluster-sum-of-squared-error.
  early_stop_iterations = 10,
  #' @field verbose whether progress shall be printed on the console; it
  #'   shall be logical, either TRUE or FALSE.
  verbose = FALSE,
  #' @field centroids a matrix of initial cluster centroids. The number of
  #'   columns shall equal the number of features in the data and the number
  #'   of rows shall equal the number of centroids (clusters).
  centroids = NULL,
  #' @field tolerance a floating point number; if, for an iteration with
  #'   1 < iteration < max_iterations, the tolerance is greater than the
  #'   squared norm of the centroids, the k-means algorithm is considered to
  #'   have converged.
  tolerance = 1e-04,
  #' @field tolerance_optimal_init the tolerance value for the optimal_init
  #'   initializer; a greater value indicates well separated clusters.
  tolerance_optimal_init = 0.3,
  #' @field seed an integer value for the random number generator.
  seed = 1,
  #' @field model used internally by superml; holds the fitted model after
  #'   `fit()` has been called.
  model = NA,
  #' @field max_clusters either a single numeric value, or a contiguous or
  #'   non-contiguous numeric vector specifying the search space of the
  #'   clusters.
  max_clusters = NA,

  #' @details
  #' Create a new `KMeansClustering` object.
  #'
  #' @param clusters It shall be of type numeric, the number of clusters to
  #'   fit. It may be omitted when `fit()` is called with
  #'   `find_optimal = TRUE`.
  #' @param b_size It shall be of type numeric, the mini batch size passed
  #'   to the mini-batch k-means routine.
  #' @param num_rep It shall be of type integer, the number of times the
  #'   algorithm is run, each time with different randomly chosen centroid
  #'   seeds.
  #' @param max_iterations It shall be of type integer, the maximum number
  #'   of iterations to be performed.
  #' @param init_fraction It shall be of type float in the range 0 to 1, the
  #'   fraction of the data used to initialise the random centroid points;
  #'   it applies if initializer is set to kmeans++.
  #' @param initializer It shall be of type character, the initializer for
  #'   the centroids; the most commonly used one is kmeans++.
  #' @param early_stop_iterations It shall be of type integer, the number of
  #'   further iterations to run after the best
  #'   within-cluster-sum-of-squared-error has been achieved.
  #' @param verbose It shall be of type logical, either TRUE or FALSE,
  #'   indicating whether progress shall be printed to the console during
  #'   calculations.
  #' @param centroids It shall be a matrix with entries of type integer or
  #'   float, the initial cluster centroids.
  #' @param tolerance It shall be of type float; if, for an iteration with
  #'   1 < iteration < max_iterations, the tolerance is greater than the
  #'   squared norm of the centroids, k-means is considered to have
  #'   converged.
  #' @param tolerance_optimal_init It shall be of type float, the tolerance
  #'   value for the optimal_init initializer; a greater value indicates
  #'   well separated clusters.
  #' @param seed It shall be of type integer, the value for the random
  #'   number generator.
  #' @param max_clusters either a single numeric value, or a contiguous or
  #'   non-contiguous numeric vector specifying the search space of the
  #'   clusters.
  #' @return A `KMeansClustering` object.
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 3)),
  #'                   replicate(30, rnorm(1e4, -1)),
  #'                   replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  initialize = function(clusters,
                        b_size = 10,
                        num_rep = 1,
                        max_iterations = 100,
                        init_fraction = 1,
                        initializer = "kmeans++",
                        early_stop_iterations = 10,
                        verbose = FALSE,
                        centroids = NULL,
                        tolerance = 1e-04,
                        tolerance_optimal_init = 0.3,
                        seed = 1,
                        max_clusters = NA) {
    # Only overwrite a field when the caller supplied the argument, so the
    # field defaults declared above stay in effect otherwise.
    if (!missing(clusters)) self$clusters <- clusters
    if (!missing(b_size)) self$b_size <- b_size
    # Store the supplied value (previously a hard-coded 1 was stored here,
    # silently ignoring the caller's num_rep).
    if (!missing(num_rep)) self$num_rep <- num_rep
    if (!missing(max_iterations)) self$max_iterations <- max_iterations
    if (!missing(init_fraction)) self$init_fraction <- init_fraction
    if (!missing(initializer)) self$initializer <- initializer
    if (!missing(early_stop_iterations)) {
      self$early_stop_iterations <- early_stop_iterations
    }
    if (!missing(verbose)) self$verbose <- verbose
    if (!missing(centroids)) self$centroids <- centroids
    if (!missing(tolerance)) self$tolerance <- tolerance
    if (!missing(tolerance_optimal_init)) {
      self$tolerance_optimal_init <- tolerance_optimal_init
    }
    if (!missing(seed)) self$seed <- seed
    if (!missing(max_clusters)) self$max_clusters <- max_clusters
    superml::check_package("ClusterR")
  },

  #' @details
  #' This function fits the KMeansClustering model.
  #'
  #' @param X_data X_data shall be either a data.frame or a matrix
  #'   containing the features of interest.
  #' @param y y is unused and defaults to NULL; it is kept only for
  #'   consistency with the general superml `fit(X, y)` interface.
  #' @param find_optimal find_optimal shall be logical; when TRUE the number
  #'   of clusters is searched for automatically over `max_clusters` and
  #'   stored in the `clusters` field.
  #' @return Invisibly, the fitted model returned by
  #'   `ClusterR::MiniBatchKmeans()`; it is also stored in the `model` field.
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 3)),
  #'                   replicate(30, rnorm(1e4, -1)),
  #'                   replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  #' km$fit(data_set, find_optimal = FALSE)
  fit = function(X_data, y = NULL, find_optimal = FALSE) {
    # X should be a matrix or a data.frame
    if (!(is.matrix(X_data) || is.data.frame(X_data))) {
      stop("The Data Set provided Shall be a R Data Frame or Matrix")
    }

    if (isTRUE(find_optimal)) {
      # The search needs a search space; fail early with a clear message
      # instead of passing the NA default down to ClusterR.
      if (is.null(self$max_clusters) || anyNA(self$max_clusters)) {
        stop("max_clusters must be set when find_optimal = TRUE")
      }
      message('Working to Find the Optimal Clusters Based on Variance Explianed')
      fm <- ClusterR::Optimal_Clusters_KMeans(X_data, max_clusters = self$max_clusters)
      # Drop the first criterion value and pick the position of the largest
      # remaining one; + 1 compensates for the dropped element.
      # NOTE(review): this assumes fm[i] corresponds to i clusters, which
      # presumably does not hold for a non-contiguous max_clusters vector
      # - confirm against ClusterR.
      self$clusters <- which.max(fm[-1]) + 1
    }

    # Without a usable cluster count the fit below cannot succeed.
    if (length(self$clusters) != 1 || is.na(self$clusters)) {
      stop("clusters must be a single non-missing number; ",
           "set it in the constructor or use find_optimal = TRUE")
    }

    message(sprintf('Using %d Clusters to fit to the Gievn data:: Learning in Progress', self$clusters))
    self$model <- ClusterR::MiniBatchKmeans(
      data = X_data,
      clusters = self$clusters,
      batch_size = self$b_size,
      num_init = self$num_rep,
      max_iters = self$max_iterations,
      init_fraction = self$init_fraction,
      initializer = self$initializer,
      early_stop_iter = self$early_stop_iterations,
      verbose = self$verbose,
      CENTROIDS = self$centroids,
      tol = self$tolerance,
      tol_optimal_init = self$tolerance_optimal_init,
      seed = self$seed
    )
  },

  #' @details
  #' Returns the prediction on the provided data.
  #'
  #' @param X_data it shall be an R Data Frame or Matrix
  #' @return a vector containing predictions
  #'
  #' @examples
  #' data_set <- rbind(replicate(30, rnorm(1e4, 2)),
  #'                   replicate(30, rnorm(1e4, -1)),
  #'                   replicate(30, rnorm(1e4, 5)))
  #' km <- KMeansClustering$new(clusters=2, b_size=30, max_clusters=6)
  #' km$fit(data_set, find_optimal = FALSE)
  #' preds <- km$predict(data_set)
  predict = function(X_data) {
    if (!(is.matrix(X_data) || is.data.frame(X_data))) {
      stop("The Data Set provided Shall be a R Data Frame or Matrix")
    }
    # model is NA until fit() has stored a fitted model; guard so the caller
    # gets a clear error rather than "$ operator is invalid for atomic
    # vectors".
    if (!is.list(self$model) || is.null(self$model$centroids)) {
      stop("The model has not been fitted yet; call fit() before predict()")
    }
    ClusterR::predict_MBatchKMeans(X_data, CENTROIDS = self$model$centroids)
  })
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.