tests/testthat/test-kmeans.R

#============================================================

# data

data(dietary_survey_IBS)

dat = dietary_survey_IBS[, -ncol(dietary_survey_IBS)]

X = center_scale(dat)

# tbl = tibble::as.tibble(X)             # see line 356 [ it works, however I didn't add this test case because tibble has many dependencies -- I guess not sensible for a single test case ]


#=============================================================


context('k-means and mini-batch-k-means')


#############################
# error handling KMeans_arma
#############################


testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  tmp_x = list(X)

  testthat::expect_error( KMeans_arma(tmp_x, clusters = 2, n_iter = 10, "random_subset", verbose = F) )
})


testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {

  tmp_m = data.frame(1)

  testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})


testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {

  tmp_m = c(1,2)

  testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})


testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {

  tmp_m = 0

  testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})


testthat::test_that("in case that the n_iter parameter is less than 0, it returns an error", {

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = -1, "random_subset", verbose = F) )
})


testthat::test_that("in case that the seed_mode parameter is invalid, it returns an error", {

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "invalid", verbose = F) )
})


testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS has invalid columns, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr) )
})

testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS has invalid rows, it returns an error", {

  NROW = 3
  cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr) )
})


testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS is NULL, it returns an error", {

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = NULL) )
})


testthat::test_that("in case that the seed_mode parameter does not equal 'keep_existing' and the CENTROIDS is not NULL, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "static_subset", verbose = F, CENTROIDS = cntr) )
})


testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {

  testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "static_subset", verbose = 'invalid') )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = NaN

  testthat::expect_error( KMeans_arma(tmp_dat, clusters = 2, n_iter = 10, "random_subset", verbose = F)  )
})



#######################
# KMeans_arma function
#######################


testthat::test_that("in case that the data is a matrix the result is a CENTROIDS-matrix and the class is 'k-means clustering' ", {

  km = KMeans_arma(X, clusters = 2, n_iter = 10, "random_subset", verbose = F)

  testthat::expect_true( is.matrix(km) && inherits(km, "k-means clustering") )
})


testthat::test_that("in case that the data is a data frame the result is a CENTROIDS-matrix and the class is 'k-means clustering' ", {

  km = KMeans_arma(dat, clusters = 2, n_iter = 10, "random_subset", verbose = F)

  testthat::expect_true( is.matrix(km) && inherits(km, "k-means clustering") )
})


testthat::test_that("it returns a CENTROID-matrix of class 'k-means clustering' when the CENTROIDS parameter is not NULL ", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  res = KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr)

  testthat::expect_true( is.matrix(res) && inherits(res, 'k-means clustering') )
})


testthat::test_that("it returns a matrix of class 'k-means clustering' for different seed modes", {

  parms = c('static_subset','random_subset','static_spread','random_spread')

  res_vec = rep(NA, length(parms))

  for (i in 1:length(parms)) {

    tmp_km = KMeans_arma(X, clusters = 2, n_iter = 5, parms[i], verbose = F)

    res_vec[i] =  (is.matrix(tmp_km) && inherits(tmp_km, 'k-means clustering'))
  }

  testthat::expect_true( sum(res_vec) == length(parms) )
})



#############################
# error handling KMeans_rcpp
#############################


testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  tmp_x = list(X)

  testthat::expect_error( KMeans_rcpp(tmp_x, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})


testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {

  tmp_m = data.frame(1)

  testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})


testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {

  tmp_m = c(1,2)

  testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})


testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {

  tmp_m = 0

  testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})


testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 0, max_iters = 100, initializer = 'optimal_init') )
})


testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 0, initializer = 'optimal_init') )
})


testthat::test_that("in case that the initializer parameter is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'invalid') )
})


testthat::test_that("in case that the fuzzy parameter is not logical, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', fuzzy = 'invalid') )
})


testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', verbose = 'invalid') )
})


testthat::test_that("in case that CENTROIDS has invalid columns, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', CENTROIDS = cntr) )
})


testthat::test_that("in case that CENTROIDS has invalid rows, it returns an error", {

  NROW = 3
  cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', CENTROIDS = cntr) )
})


testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', tol = 0.0) )
})


testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', tol_optimal_init = 0.0) )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = Inf

  testthat::expect_error( KMeans_rcpp(tmp_dat, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init')  )
})



#######################
# KMeans_rcpp function
#######################

test_KMeansCluster <- function(km, nclust) {
  expect_true(all(c("call", "clusters", "centroids", "total_SSE", "best_initialization",
                    "WCSS_per_cluster", "obs_per_cluster", "between.SS_DIV_total.SS") %in%
                  names(km)))
  expect_is(km$clusters, "numeric")
  expect_length(km$clusters, nrow(X))
  expect_is(km$between.SS_DIV_total.SS, "numeric")
  expect_length(km$between.SS_DIV_total.SS, 1)
  if (!is.null(km$fuzzy_clusters)) {
    expect_is(km$fuzzy_clusters, "matrix")
    expect_equal(ncol(km$fuzzy_clusters), nclust)
  }
  expect_equal(nrow(km$centroids), nclust)
  expect_equal(ncol(km$centroids), ncol(X))
  expect_length(km$total_SSE, 1)
  expect_is(km$total_SSE, "numeric")
  expect_length(km$best_initialization, 1)
  expect_is(km$best_initialization, "integer")
  expect_equal(ncol(km$WCSS_per_cluster), nclust)
  expect_equal(ncol(km$obs_per_cluster), nclust)
  expect_s3_class(km, "KMeansCluster")
}

testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {
  nclust <- 2
  km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 100, initializer = 'optimal_init', fuzzy = TRUE)
  test_KMeansCluster(km, nclust)
})


testthat::test_that("in case that the data is a data frame the result is a list and the class is 'k-means clustering' ", {
  nclust <- 2
  km <- KMeans_rcpp(dat, clusters = nclust, num_init = 5, max_iters = 100, initializer = 'optimal_init')
  test_KMeansCluster(km, nclust)
})


testthat::test_that("KMeans_rcpp returns the correct output for the initializers", {
  nclust <- 2
  res <- rep(NA, 4)
  count <- 1
  set.seed(1)
  for (i in c('kmeans++', 'random', 'optimal_init', 'quantile_init')) {
    km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 10, initializer = i, tol_optimal_init = 0.2)
    test_KMeansCluster(km, nclust)
  }
})



testthat::test_that("KMeans_rcpp returns the correct output if CENTROIDS is user-defined ", {
  nclust <- 2
  cntr <- matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
  km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 100, CENTROIDS = cntr)
  test_KMeansCluster(km, nclust)
})



# testthat::test_that("in case that the data is of type 'tibble' the result is a list and the class is 'k-means clustering' ", {
#
#   clust = 2
#
#   km = KMeans_rcpp(tbl, clusters = clust, num_init = 5, max_iters = 100, initializer = 'optimal_init', fuzzy = TRUE)
#
#   testthat::expect_true( names(km) %in% c("clusters", "centroids", "total_SSE", "best_initialization", "WCSS_per_cluster", "obs_per_cluster", "between.SS_DIV_total.SS")  && is.matrix(km$fuzzy_clusters) &&
#
#                            is.vector(km$clusters) && length(km$clusters) == nrow(tbl) && is.numeric(km$between.SS_DIV_total.SS) && length(km$between.SS_DIV_total.SS) == 1 && ncol(km$fuzzy_clusters) == clust &&
#
#                            nrow(km$centroids) == clust && ncol(km$centroids) == ncol(tbl) && length(km$total_SSE) == 1 && is.numeric(km$total_SSE) && length(km$best_initialization) == 1 &&
#
#                            is.numeric(km$best_initialization) && ncol(km$WCSS_per_cluster) == clust && ncol(km$obs_per_cluster) == clust && inherits(km, "k-means clustering")  )
# })


testthat::test_that("the 'kmeans_pp_init()' function (i.e. the 'kmeans++' initializer) does not return duplicated centroids (see the Github issue https://github.com/mlampros/ClusterR/issues/25)", {

  data = matrix(data = c(0,0,0,1,1,0,1,1,2,2,3,1,4,2,6,2), ncol = 2, byrow = TRUE)
  runs = 10
  nansum = sse = rep(NA_real_, runs)

  for (i in 1:runs) {

    L = KMeans_rcpp(data = data,
                    clusters = 6,
                    num_init = 10,
                    max_iters = 100,
                    initializer = "kmeans++",
                    seed = 1,                # keep the seed always the same for reproducibility (otherwise it is possible that I receive a slightly worse 'sse' but not higher than 1.333333), fact is I don't want NA's in the 'nansum' vector
                    verbose = FALSE)

    nansum[i] = sum(is.na(L$centroids))
    sse[i] = sum(L$WCSS_per_cluster)
  }

  testthat::expect_true(all(nansum == 0) & !any(is.na(nansum)) & all(sse == 1.0))
})



################################
# error handling predict_KMeans
################################


testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  tmp_x = list(X)

  testthat::expect_error( predict_KMeans(tmp_x, CENTROIDS = cntr) )
})


testthat::test_that("in case that the CENTROIDS is not a matrix, it returns an error", {

  cntr = as.data.frame(matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X)))

  testthat::expect_error( predict_KMeans(X, CENTROIDS = cntr) )
})


testthat::test_that("in case that the columns of the CENTROIDS is not equal to the columns of the data, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)

  testthat::expect_error( predict_KMeans(X, CENTROIDS = cntr) )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = Inf

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  testthat::expect_error( predict_KMeans(tmp_dat, CENTROIDS = cntr) )
})


##########################
# predict_KMeans function
##########################


testthat::test_that("predict_KMeans returns the correct output if the input is a data frame AND if the CENTROIDS is a matrix and has the correct dimensions ", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  km = predict_KMeans(dat, CENTROIDS = cntr)
  testthat::expect_true(length(km) == nrow(X))

  km = KMeans_rcpp(dat, 5)
  testthat::expect_equal(predict_KMeans(dat, CENTROIDS = km$centroids),
                         predict(km, dat))
})


testthat::test_that("predict_KMeans returns the correct output if the input is a matrix AND if the CENTROIDS is a matrix and has the correct dimensions ", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  km = predict_KMeans(X, CENTROIDS = cntr)

  testthat::expect_true( length(km) == nrow(X) )
})


testthat::test_that("the predict_KMeans works using the CENTROIDS of the KMeans_rcpp function", {

  km = KMeans_rcpp(X, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init')

  km_preds = predict_KMeans(X, CENTROIDS = km$centroids)

  testthat::expect_true( length(km_preds) == nrow(X) )
})


testthat::test_that("the predict_KMeans works using the CENTROIDS of the KMeans_arma function", {

  km = KMeans_arma(X, clusters = 2, n_iter = 10, "random_subset", verbose = F)

  km_preds = predict_KMeans(X, CENTROIDS = km)

  testthat::expect_true(length(km_preds) == nrow(X))
  ## testthat::expect_equal(km_preds, predict(km, X))
})


testthat::test_that("the unified predict() function returns the same result as the hard clustering when the parameter fuzzy is TRUE", {

  km = KMeans_rcpp(X, clusters = 2, num_init = 5, max_iters = 100)
  km_hard_clusts = predict(object = km, newdata = X, fuzzy = FALSE)
  km_soft_clusts = predict(object = km, newdata = X, fuzzy = TRUE)
  conv_soft_to_hard = apply(km_soft_clusts, 1, which.max)

  testthat::expect_true(all.equal(target = km_hard_clusts, current = conv_soft_to_hard))
})




#########################################
# error handling Optimal_Clusters_KMeans
#########################################


testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  tmp_x = list(X)

  testthat::expect_error( Optimal_Clusters_KMeans(tmp_x, max_clusters = 10, criterion = 'distortion_fK', plot_clusters = FALSE) )
})


testthat::test_that("in case that the max_clusters parameter is not numeric, it returns an error", {

  tmp_m = data.frame(1)

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = tmp_m, criterion = 'distortion_fK', plot_clusters = FALSE) )
})


testthat::test_that("in case that the max_clusters parameter is less than 1, it returns an error", {

  tmp_m = 0

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = tmp_m, criterion = 'distortion_fK', plot_clusters = FALSE) )
})


testthat::test_that("if the criterion is not one of c('variance_explained', 'WCSSE', 'dissimilarity', 'silhouette', 'distortion_fK', 'AIC', 'BIC', 'Adjusted_Rsquared'), it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'invalid', plot_clusters = FALSE) )
})


testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', num_init = 0, plot_clusters = FALSE) )
})


testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', max_iters = 0, plot_clusters = FALSE) )
})


testthat::test_that("if the initializer is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, initializer = 'invalid', plot_clusters = FALSE) )
})


testthat::test_that("in case that the threads parameter is less than 1, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', threads = 0, plot_clusters = FALSE) )
})


testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', tol = 0.0, plot_clusters = FALSE) )
})


testthat::test_that("in case that the plot_clusters parameter is not logical, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', plot_clusters = 'FALSE') )
})


testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', verbose = 'FALSE', plot_clusters = FALSE) )
})


testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', tol_optimal_init = 0.0, plot_clusters = FALSE) )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = -Inf

  testthat::expect_error( Optimal_Clusters_KMeans(tmp_dat, max_clusters = 5, criterion = 'distortion_fK', plot_clusters = FALSE) )
})


testthat::test_that("in case that the 'mini_batch_params' is not NULL and the named list is not valid it returns an error", {

  params_mbkm = list(invalid = 10, init_fraction = 0.3, early_stop_iter = 10)

  testthat::expect_error( Optimal_Clusters_KMeans(dat, max_clusters = 10, criterion = "distortion_fK",

                                                  plot_clusters = FALSE, mini_batch_params = params_mbkm) )
})


testthat::test_that("in case that the 'mini_batch_params' is not NULL and the criterion is 'variance_explained' it returns an error", {

  params_mbkm = list(batch_size = 10, init_fraction = 0.3, early_stop_iter = 10)

  testthat::expect_error( Optimal_Clusters_KMeans(dat, max_clusters = 10, criterion = "variance_explained",

                                                  plot_clusters = FALSE, mini_batch_params = params_mbkm) )
})



###################################
# Optimal_Clusters_KMeans function   [ in case that the 'max_clusters' parameter is of length 1 ]
###################################


testthat::test_that("Optimal_Clusters_KMeans returns the correct output if the input is a data frame ", {

  nr_clust = 10

  res =  Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = 'distortion_fK', plot_clusters = FALSE, tol_optimal_init = 0.2)

  testthat::expect_true( length(res) == nr_clust )
})



testthat::test_that("Optimal_Clusters_KMeans returns the correct output for different criteria", {

  vec = c('variance_explained', 'WCSSE', 'dissimilarity', 'silhouette', 'AIC', 'BIC', 'distortion_fK', 'Adjusted_Rsquared')

  out = rep(NA, length(vec))

  nr_clust = 5

  count = 1

  for (i in vec) {

    res =  Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = i, plot_clusters = T, tol_optimal_init = 0.2)

    out[count] = (length(res) == nr_clust)

    count = count + 1
  }

  testthat::expect_true( sum(out) == length(vec) )
})


testthat::test_that("Optimal_Clusters_KMeans returns the correct output if the 'mini_batch_params' is not NULL", {

  nr_clust = 10

  params_mbkm = list(batch_size = 10, init_fraction = 0.3, early_stop_iter = 10)

  res = Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = "distortion_fK",

                                plot_clusters = FALSE, mini_batch_params = params_mbkm)

  testthat::expect_true( length(res) == nr_clust  )
})


###################################
# Optimal_Clusters_KMeans function         [ in case that the 'max_clusters' parameter is a contiguous or non-contiguous vector ] [ here I tested only the 'KMeans_rcpp' function but the same applies to 'MiniBatchKmeans' ]
###################################


testthat::test_that("max_clusters-vector for 'variance_explained'", {

  subs = 2:3

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:3, criterion = 'variance_explained')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'variance_explained')

  testthat::expect_true( all(res1[subs] == res2)  )
})


testthat::test_that("max_clusters-vector for 'WCSSE'", {

  subs = c(2,4)

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'WCSSE')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'WCSSE')

  testthat::expect_true( all(res1[subs] == res2)  )
})


testthat::test_that("max_clusters-vector for 'dissimilarity'", {

  subs = c(1,3)

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'dissimilarity')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'dissimilarity')

  testthat::expect_true( all(res1[subs] == res2)  )
})


testthat::test_that("max_clusters-vector for 'silhouette'", {

  subs = c(2,3)

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:3, criterion = 'silhouette')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'silhouette')

  testthat::expect_true( all(res1[subs] == res2)  )
})


testthat::test_that("max_clusters-vector for 'AIC'", {

  subs = c(2,4)

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'AIC')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'AIC')

  testthat::expect_true( all(res1[subs] == res2)  )
})


testthat::test_that("max_clusters-vector for 'BIC'", {

  subs = c(1,4)

  res1 =  Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'BIC')
  res2 =  Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'BIC')

  testthat::expect_true( all(res1[subs] == res2)  )
})



#####################################################
# test-case for the 'silhouette_of_clusters' function
#####################################################


testthat::test_that("'silhouette_of_clusters' function returns the correct output", {

  clusters = 2
  km = KMeans_rcpp(data = X, clusters = clusters, num_init = 5, max_iters = 100, initializer = 'kmeans++')
  silh = silhouette_of_clusters(data = X, clusters = km$clusters)

  # receive summary per cluster
  silh_summary_from_matrix = lapply(1:clusters, function(x) {
    IDX = which(as.vector(silh$silhouette_matrix[, 'cluster']) == x)
    clust_subs = silh$silhouette_matrix[IDX, , drop = F]
    data.frame(list(cluster = unique(clust_subs[, 'cluster']),
                    size = nrow(clust_subs),
                    avg_intra_dissim = mean(clust_subs[, 'intra_cluster_dissim'], na.rm = TRUE),
                    avg_silhouette = mean(clust_subs[, 'silhouette'], na.rm = TRUE)))
  })

  silh_summary_from_matrix = do.call(rbind, silh_summary_from_matrix)
  silh_summary = silh$silhouette_summary

  testthat::expect_true( all.equal(silh_summary_from_matrix, silh_summary, tolerance = sqrt(.Machine$double.eps))  )
})


################################
# error handling MiniBatchKmeans
################################

testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  tmp_x = list(X)

  testthat::expect_error( MiniBatchKmeans(tmp_x, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})


testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {

  tmp_m = data.frame(1)

  testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})


testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {

  tmp_m = c(1,2)

  testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})


testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {

  tmp_m = 0

  testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})


testthat::test_that("in case that the batch_size parameter is less than 1, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 0, num_init = 5, early_stop_iter = 10) )
})



testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 0, early_stop_iter = 10) )
})



testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, max_iters = 0, early_stop_iter = 10) )
})


testthat::test_that("in case that the init_fraction parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, init_fraction = 0.0, early_stop_iter = 10) )
})


testthat::test_that("in case that the initializer parameter is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, initializer = 'invalid', early_stop_iter = 10) )
})


testthat::test_that("in case that the early_stop_iter parameter is less than 1, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 0) )
})


testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, verbose = 'FALSE') )
})


testthat::test_that("in case that CENTROIDS has invalid columns, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, CENTROIDS = cntr) )
})


testthat::test_that("in case that CENTROIDS has invalid rows, it returns an error", {

  NROW = 3
  cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, CENTROIDS = cntr) )
})


testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, tol = 0.0) )
})


testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {

  testthat::expect_error(  MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, tol_optimal_init = 0.0) )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = Inf

  testthat::expect_error( MiniBatchKmeans(tmp_dat, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10) )
})


##########################
# MiniBatchKmeans function
##########################


testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {

  clust = 2

  numinit = 5

  km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, tol_optimal_init = 0.2)

  testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                           ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                           is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering")  )
})


testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {

  clust = 2

  numinit = 5

  km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, tol_optimal_init = 0.2)

  testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                           ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                           is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering")  )
})



testthat::test_that("for different parameter settings it returns the correct output", {

  clust = 2

  numinit = 5

  inits = c('kmeans++', 'random', 'optimal_init', 'quantile_init')

  res = rep(NA, length(inits))

  for (i in 1:length(inits)) {

    km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, num_init = numinit, initializer = inits[i], early_stop_iter = 10, tol_optimal_init = 0.2)

    res[i] = ( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering"))
  }

  testthat::expect_true( sum(res) == length(inits) )
})



testthat::test_that("it returns the correct output if the CENTROIDS parameter is not NULL ", {

  clust = 2

  cntr = matrix(runif(clust * (ncol(X))), nrow = clust, ncol = ncol(dat))

  km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, early_stop_iter = 10, CENTROIDS = cntr, tol_optimal_init = 0.2)

  testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                           ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                           is.matrix(km$iters_per_initialization) && inherits(km, "k-means clustering")  )
})



testthat::test_that("in case that the init_fraction is greater than 0.0 and the intializer equals to 'kmeans++' it returns the correct output ", {

  clust = 2

  numinit = 5

  km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, init_fraction = 0.4, initializer = 'kmeans++')

  testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                           ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                           is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering")  )
})


testthat::test_that("in case that the init_fraction is greater than 0.0 and the intializer equals to 'quantile_init' it returns the correct output ", {

  clust = 2

  numinit = 5

  km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, init_fraction = 0.4, initializer = "quantile_init", tol_optimal_init = 0.2)

  testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization"))  && is.matrix(km$centroids) && nrow(km$centroids) == clust &&

                           ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&

                           is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering")  )
})

#####################################
# error handling predict_MBatchKMeans
#####################################


testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {

  tmp_x = list(X)

  MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)

  testthat::expect_error( predict_MBatchKMeans(tmp_x, MbatchKm$centroids, fuzzy = FALSE) )
})


testthat::test_that("in case that the CENTROIDS is not a matrix, it returns an error", {

  cntr = as.data.frame(matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X)))

  testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr, fuzzy = FALSE) )
})


testthat::test_that("in case that the columns of the CENTROIDS is not equal to the columns of the data, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)

  testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr) )
})


testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {

  tmp_dat = X

  tmp_dat[1,1] = Inf

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  testthat::expect_error( predict_MBatchKMeans(tmp_dat, CENTROIDS = cntr) )
})


testthat::test_that("in case that the fuzzy parameter is not logical, it returns an error", {

  cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))

  testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr, fuzzy = 'FALSE') )
})


################################
# predict_MBatchKMeans function
################################



testthat::test_that("in case that the data is a matrix (fuzzy = TRUE) the result is a list and the class is 'k-means clustering' ", {

  MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)

  km = predict_MBatchKMeans(X, MbatchKm$centroids, fuzzy = TRUE)

  testthat::expect_true( all(names(km) %in% c("clusters", "fuzzy_clusters"))  && is.matrix(km$fuzzy_clusters) && nrow(km$fuzzy_clusters) == nrow(X) && ncol(km$fuzzy_clusters) == 2 &&

                           is.vector(km$clusters) && length(km$clusters) == nrow(X) && inherits(km, "k-means clustering")  )
})



testthat::test_that("in case that the data is a data frame (fuzzy = TRUE) the result is a list and the class is 'k-means clustering' ", {

  MbatchKm = MiniBatchKmeans(dat, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)

  km = predict_MBatchKMeans(dat, MbatchKm$centroids, fuzzy = TRUE)

  testthat::expect_true( all(names(km) %in% c("clusters", "fuzzy_clusters"))  && is.matrix(km$fuzzy_clusters) && nrow(km$fuzzy_clusters) == nrow(X) && ncol(km$fuzzy_clusters) == 2 &&

                           is.vector(km$clusters) && length(km$clusters) == nrow(X) && inherits(km, "k-means clustering")  )
})


testthat::test_that("in case that the data is a matrix (fuzzy = FALSE) the result is a vector and the class is 'k-means clustering' ", {

  MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)

  km = predict_MBatchKMeans(X, MbatchKm$centroids, fuzzy = FALSE)

  testthat::expect_true( is.numeric(km) && length(km) == nrow(X) )
})


testthat::test_that("the unified predict() function returns the same result as the hard clustering when the parameter fuzzy is TRUE", {

  MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
  mbkm_hard_clusts = predict(object = MbatchKm, newdata = X, fuzzy = FALSE)
  mbkm_soft_clusts = predict(object = MbatchKm, newdata = X, fuzzy = TRUE)
  conv_soft_to_hard = apply(mbkm_soft_clusts, 1, which.max)

  testthat::expect_true(all.equal(target = mbkm_hard_clusts, current = conv_soft_to_hard))
})

Try the ClusterR package in your browser

Any scripts or data that you put into this service are public.

ClusterR documentation built on June 22, 2024, 10:28 a.m.