inst/doc/the_clusterR_package.R

## ---- eval = T----------------------------------------------------------------

library(ClusterR)

data(dietary_survey_IBS)

dim(dietary_survey_IBS)

X = dietary_survey_IBS[, -ncol(dietary_survey_IBS)]   # data (excluding the response variable)

y = dietary_survey_IBS[, ncol(dietary_survey_IBS)]    # the response variable

dat = center_scale(X, mean_center = T, sd_scale = T)  # centering and scaling the data

## ---- eval = F----------------------------------------------------------------
#  gmm = GMM(dat, 2, dist_mode = "maha_dist", seed_mode = "random_subset", km_iter = 10,
#            em_iter = 10, verbose = F)
#  
#  # predict centroids, covariance matrix and weights
#  pr = predict(gmm, newdata = dat)
#  

## ---- eval = F----------------------------------------------------------------
#  
#  opt_gmm = Optimal_Clusters_GMM(dat, max_clusters = 10, criterion = "BIC",
#  
#                                 dist_mode = "maha_dist", seed_mode = "random_subset",
#  
#                                 km_iter = 10, em_iter = 10, var_floor = 1e-10,
#  
#                                 plot_data = T)
#  

## ---- eval = F----------------------------------------------------------------
#  
#  res = external_validation(dietary_survey_IBS$class, pr$cluster_labels,
#  
#                            method = "adjusted_rand_index", summary_stats = T)
#  
#  res
#  
#  ##
#  ## ----------------------------------------
#  ## purity                         : 1
#  ## entropy                        : 0
#  ## normalized mutual information  : 1
#  ## variation of information       : 0
#  ## ----------------------------------------
#  ## specificity                    : 1
#  ## sensitivity                    : 1
#  ## precision                      : 1
#  ## recall                         : 1
#  ## F-measure                      : 1
#  ## ----------------------------------------
#  ## accuracy OR rand-index         : 1
#  ## adjusted-rand-index            : 1
#  ## jaccard-index                  : 1
#  ## fowlkes-mallows-index          : 1
#  ## mirkin-metric                  : 0
#  ## ----------------------------------------
#  

## ---- eval = F----------------------------------------------------------------
#  
#  pca_dat = stats::princomp(dat)$scores[, 1:2]
#  
#  km = KMeans_arma(pca_dat, clusters = 2, n_iter = 10, seed_mode = "random_subset",
#  
#                   verbose = T, CENTROIDS = NULL)
#  
#  pr = predict_KMeans(pca_dat, km)
#  
#  table(dietary_survey_IBS$class, pr)
#  
#  class(km) = 'matrix'
#  
#  plot_2d(data = pca_dat, clusters = as.vector(pr), centroids_medoids = as.matrix(km))

## ----fig.width = 3.0, fig.height = 3.0, echo = T, eval = T--------------------

library(OpenImageR)

im = readImage('elephant.jpg')

# first resize the image to reduce the dimensions
im = resizeImage(im, 75, 75, method = 'bilinear')            

imageShow(im)                                                # plot the original image

im2 = apply(im, 3, as.vector)                                # vectorize RGB

## ----eval = T-----------------------------------------------------------------

# perform KMeans_rcpp clustering

km_rc = KMeans_rcpp(im2, clusters = 5, num_init = 5, max_iters = 100, 
                    initializer = 'optimal_init', verbose = F)

km_rc$between.SS_DIV_total.SS
pr = predict(km_rc, newdata = im2)


## ----fig.width = 3.0, fig.height = 3.0, echo = T, eval = T--------------------
getcent = km_rc$centroids

getclust = km_rc$clusters

new_im = getcent[getclust, ]     # each observation is associated with the nearby centroid

dim(new_im) = c(nrow(im), ncol(im), 3)        # back-convertion to a 3-dimensional image

imageShow(new_im)

## ----fig.width = 5.0, fig.height = 5.0, echo = T, eval = T--------------------

opt = Optimal_Clusters_KMeans(im2, max_clusters = 10, plot_clusters = T,
                              
                              criterion = 'distortion_fK', fK_threshold = 0.85,
                              
                              initializer = 'optimal_init', tol_optimal_init = 0.2)


## ----fig.width = 3.0, fig.height = 3.0, echo = T, eval = T--------------------

im_d = readImage('dog.jpg')

# first resize the image to reduce the dimensions
im_d = resizeImage(im_d, 350, 350, method = 'bilinear')            

imageShow(im_d)                                                # plot the original image

im3 = apply(im_d, 3, as.vector)                                # vectorize RGB

dim(im3)                                              # initial dimensions of the data

## ----fig.width = 3.0, fig.height = 3.0, echo = T, eval = T--------------------

start = Sys.time()

km_init = KMeans_rcpp(im3, clusters = 5, num_init = 5, max_iters = 100, 
                      initializer = 'kmeans++', verbose = F)

end = Sys.time()

t = end - start
  
cat('time to complete :', t, attributes(t)$units, '\n')

getcent_init = km_init$centroids

getclust_init = km_init$clusters

new_im_init = getcent_init[getclust_init, ]  # each observation is associated with the nearby centroid

dim(new_im_init) = c(nrow(im_d), ncol(im_d), 3)     # back-convertion to a 3-dimensional image

imageShow(new_im_init)

## ----fig.width = 3.0, fig.height = 3.0, echo = T, eval = T--------------------

start = Sys.time()

km_mb = MiniBatchKmeans(im3, clusters = 5, batch_size = 20, num_init = 5, max_iters = 100, 
                        init_fraction = 0.2, initializer = 'kmeans++', early_stop_iter = 10,
                        verbose = F)

pr_mb = predict(object = km_mb, newdata = im3)

end = Sys.time()

t = end - start
  
cat('time to complete :', t, attributes(t)$units, '\n')

getcent_mb = km_mb$centroids

new_im_mb = getcent_mb[pr_mb, ]   # each observation is associated with the nearby centroid

dim(new_im_mb) = c(nrow(im_d), ncol(im_d), 3)     # back-convertion to a 3-dimensional image

imageShow(new_im_mb)

## ---- eval = T----------------------------------------------------------------

data(mushroom)

X = mushroom[, -1]

y = as.numeric(mushroom[, 1])            # convert the labels to numeric

gwd = FD::gowdis(X)           # calculate the 'gower' distance for the factor variables

gwd_mat = as.matrix(gwd)                 # convert the distances to a matrix

cm = Cluster_Medoids(gwd_mat, clusters = 2, swap_phase = TRUE, verbose = F)


## ---- eval = T, echo = F------------------------------------------------------

 knitr::kable(data.frame(adusted_rand_index = external_validation(y, cm$clusters, method = "adjusted_rand_index", summary_stats = F), avg_silhouette_width = mean(cm$silhouette_matrix[, 'silhouette_widths'])), caption = "Non-Weigthed-K-medoids", align = 'l')

## ---- eval = T, echo = F------------------------------------------------------

 knitr::kable(data.frame(predictors = c("cap_shape", "cap_surface", "cap_color", "bruises", "odor", 
"gill_attachment", "gill_spacing", "gill_size", "gill_color", 
"stalk_shape", "stalk_root", "stalk_surface_above_ring", "stalk_surface_below_ring", 
"stalk_color_above_ring", "stalk_color_below_ring", "veil_type", 
"veil_color", "ring_number", "ring_type", "spore_print_color", 
"population", "habitat"), weights = c(4.626, 38.323, 55.899, 34.028, 169.608, 6.643, 42.08, 57.366, 
37.938, 33.081, 65.105, 18.718, 76.165, 27.596, 26.238, 0.0, 1.507, 
37.314, 32.685, 127.87, 64.019, 44.519)), align = 'l')

## ---- eval = T----------------------------------------------------------------

weights = c(4.626, 38.323, 55.899, 34.028, 169.608, 6.643, 42.08, 57.366, 37.938, 
            
            33.081, 65.105, 18.718, 76.165, 27.596, 26.238, 0.0, 1.507, 37.314, 
            
            32.685, 127.87, 64.019, 44.519)

gwd_w = FD::gowdis(X, w = weights)       # 'gower' distance using weights

gwd_mat_w = as.matrix(gwd_w)                 # convert the distances to a matrix

cm_w = Cluster_Medoids(gwd_mat_w, clusters = 2, swap_phase = TRUE, verbose = F)


## ---- eval = T, echo = F------------------------------------------------------

 knitr::kable(data.frame(adusted_rand_index = external_validation(y, cm_w$clusters, method = "adjusted_rand_index", summary_stats = F), avg_silhouette_width = mean(cm_w$silhouette_matrix[, 'silhouette_widths'])), caption = "Weigthed-K-medoids", align = 'l')

## ---- eval = T----------------------------------------------------------------

cl_X = X        # copy initial data 

# the Clara_Medoids function allows only numeric attributes
# so first convert to numeric

for (i in 1:ncol(cl_X)) { cl_X[, i] = as.numeric(cl_X[, i]) }

start = Sys.time()

cl_f = Clara_Medoids(cl_X, clusters = 2, distance_metric = 'hamming', samples = 5, 
                     sample_size = 0.2, swap_phase = TRUE, verbose = F, threads = 1)

end = Sys.time()

t = end - start
  
cat('time to complete :', t, attributes(t)$units, '\n')

## ---- eval = T, echo = F------------------------------------------------------

 knitr::kable(data.frame(adusted_rand_index = external_validation(y, cl_f$clusters, method = "adjusted_rand_index", summary_stats = F), avg_silhouette_width = mean(cl_f$silhouette_matrix[, 'silhouette_widths'])), caption = "hamming-Clara-Medoids", align = 'l')

## ---- eval = T----------------------------------------------------------------

start = Sys.time()

cl_e = Cluster_Medoids(cl_X, clusters = 2, distance_metric = 'hamming', swap_phase = TRUE,  
                       verbose = F, threads = 1)

end = Sys.time()

t = end - start
  
cat('time to complete :', t, attributes(t)$units, '\n')

## ---- eval = T, echo = F------------------------------------------------------

 knitr::kable(data.frame(adusted_rand_index = external_validation(y, cl_e$clusters, method = "adjusted_rand_index", summary_stats = F), avg_silhouette_width = mean(cl_e$silhouette_matrix[, 'silhouette_widths'])), caption = "hamming-Cluster-Medoids", align = 'l')

## ---- eval = T----------------------------------------------------------------

# Silhouette Plot for the "Clara_Medoids" object

Silhouette_Dissimilarity_Plot(cl_f, silhouette = TRUE)


## ---- eval = T----------------------------------------------------------------

# Silhouette Plot for the "Cluster_Medoids" object

Silhouette_Dissimilarity_Plot(cl_e, silhouette = TRUE)

Try the ClusterR package in your browser

Any scripts or data that you put into this service are public.

ClusterR documentation built on April 30, 2023, 1:08 a.m.