Nothing
#============================================================
# data
data(dietary_survey_IBS)
dat = dietary_survey_IBS[, -ncol(dietary_survey_IBS)]
X = center_scale(dat)
# tbl = tibble::as.tibble(X) # see line 356 [ it works, however I didn't add this test case because tibble has many dependencies -- I guess not sensible for a single test case ]
#=============================================================
context('k-means and mini-batch-k-means')
#############################
# error handling KMeans_arma
#############################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
tmp_x = list(X)
testthat::expect_error( KMeans_arma(tmp_x, clusters = 2, n_iter = 10, "random_subset", verbose = F) )
})
testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {
tmp_m = data.frame(1)
testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})
testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {
tmp_m = c(1,2)
testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})
testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {
tmp_m = 0
testthat::expect_error( KMeans_arma(X, clusters = tmp_m, n_iter = 10, "random_subset", verbose = F) )
})
testthat::test_that("in case that the n_iter parameter is less than 0, it returns an error", {
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = -1, "random_subset", verbose = F) )
})
testthat::test_that("in case that the seed_mode parameter is invalid, it returns an error", {
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "invalid", verbose = F) )
})
testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS has invalid columns, it returns an error", {
cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr) )
})
testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS has invalid rows, it returns an error", {
NROW = 3
cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr) )
})
testthat::test_that("in case that the seed_mode parameter equals 'keep_existing' and the CENTROIDS is NULL, it returns an error", {
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = NULL) )
})
testthat::test_that("in case that the seed_mode parameter does not equal 'keep_existing' and the CENTROIDS is not NULL, it returns an error", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "static_subset", verbose = F, CENTROIDS = cntr) )
})
testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {
testthat::expect_error( KMeans_arma(X, clusters = 2, n_iter = 5, "static_subset", verbose = 'invalid') )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = NaN
testthat::expect_error( KMeans_arma(tmp_dat, clusters = 2, n_iter = 10, "random_subset", verbose = F) )
})
#######################
# KMeans_arma function
#######################
testthat::test_that("in case that the data is a matrix the result is a CENTROIDS-matrix and the class is 'k-means clustering' ", {
km = KMeans_arma(X, clusters = 2, n_iter = 10, "random_subset", verbose = F)
testthat::expect_true( is.matrix(km) && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the data is a data frame the result is a CENTROIDS-matrix and the class is 'k-means clustering' ", {
km = KMeans_arma(dat, clusters = 2, n_iter = 10, "random_subset", verbose = F)
testthat::expect_true( is.matrix(km) && inherits(km, "k-means clustering") )
})
testthat::test_that("it returns a CENTROID-matrix of class 'k-means clustering' when the CENTROIDS parameter is not NULL ", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
res = KMeans_arma(X, clusters = 2, n_iter = 5, "keep_existing", verbose = F, CENTROIDS = cntr)
testthat::expect_true( is.matrix(res) && inherits(res, 'k-means clustering') )
})
testthat::test_that("it returns a matrix of class 'k-means clustering' for different seed modes", {
parms = c('static_subset','random_subset','static_spread','random_spread')
res_vec = rep(NA, length(parms))
for (i in 1:length(parms)) {
tmp_km = KMeans_arma(X, clusters = 2, n_iter = 5, parms[i], verbose = F)
res_vec[i] = (is.matrix(tmp_km) && inherits(tmp_km, 'k-means clustering'))
}
testthat::expect_true( sum(res_vec) == length(parms) )
})
#############################
# error handling KMeans_rcpp
#############################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
tmp_x = list(X)
testthat::expect_error( KMeans_rcpp(tmp_x, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})
testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {
tmp_m = data.frame(1)
testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})
testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {
tmp_m = c(1,2)
testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})
testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {
tmp_m = 0
testthat::expect_error( KMeans_rcpp(X, clusters = tmp_m, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})
testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 0, max_iters = 100, initializer = 'optimal_init') )
})
testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 0, initializer = 'optimal_init') )
})
testthat::test_that("in case that the initializer parameter is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'invalid') )
})
testthat::test_that("in case that the fuzzy parameter is not logical, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', fuzzy = 'invalid') )
})
testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', verbose = 'invalid') )
})
testthat::test_that("in case that CENTROIDS has invalid columns, it returns an error", {
cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', CENTROIDS = cntr) )
})
testthat::test_that("in case that CENTROIDS has invalid rows, it returns an error", {
NROW = 3
cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', CENTROIDS = cntr) )
})
testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', tol = 0.0) )
})
testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( KMeans_rcpp(X, clusters = 2, num_init = 1, max_iters = 1, initializer = 'optimal_init', tol_optimal_init = 0.0) )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = Inf
testthat::expect_error( KMeans_rcpp(tmp_dat, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init') )
})
#######################
# KMeans_rcpp function
#######################
test_KMeansCluster <- function(km, nclust) {
expect_true(all(c("call", "clusters", "centroids", "total_SSE", "best_initialization",
"WCSS_per_cluster", "obs_per_cluster", "between.SS_DIV_total.SS") %in%
names(km)))
expect_is(km$clusters, "numeric")
expect_length(km$clusters, nrow(X))
expect_is(km$between.SS_DIV_total.SS, "numeric")
expect_length(km$between.SS_DIV_total.SS, 1)
if (!is.null(km$fuzzy_clusters)) {
expect_is(km$fuzzy_clusters, "matrix")
expect_equal(ncol(km$fuzzy_clusters), nclust)
}
expect_equal(nrow(km$centroids), nclust)
expect_equal(ncol(km$centroids), ncol(X))
expect_length(km$total_SSE, 1)
expect_is(km$total_SSE, "numeric")
expect_length(km$best_initialization, 1)
expect_is(km$best_initialization, "integer")
expect_equal(ncol(km$WCSS_per_cluster), nclust)
expect_equal(ncol(km$obs_per_cluster), nclust)
expect_s3_class(km, "KMeansCluster")
}
testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {
nclust <- 2
km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 100, initializer = 'optimal_init', fuzzy = TRUE)
test_KMeansCluster(km, nclust)
})
testthat::test_that("in case that the data is a data frame the result is a list and the class is 'k-means clustering' ", {
nclust <- 2
km <- KMeans_rcpp(dat, clusters = nclust, num_init = 5, max_iters = 100, initializer = 'optimal_init')
test_KMeansCluster(km, nclust)
})
testthat::test_that("KMeans_rcpp returns the correct output for the initializers", {
nclust <- 2
res <- rep(NA, 4)
count <- 1
set.seed(1)
for (i in c('kmeans++', 'random', 'optimal_init', 'quantile_init')) {
km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 10, initializer = i, tol_optimal_init = 0.2)
test_KMeansCluster(km, nclust)
}
})
testthat::test_that("KMeans_rcpp returns the correct output if CENTROIDS is user-defined ", {
nclust <- 2
cntr <- matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
km <- KMeans_rcpp(X, clusters = nclust, num_init = 5, max_iters = 100, CENTROIDS = cntr)
test_KMeansCluster(km, nclust)
})
# testthat::test_that("in case that the data is of type 'tibble' the result is a list and the class is 'k-means clustering' ", {
#
# clust = 2
#
# km = KMeans_rcpp(tbl, clusters = clust, num_init = 5, max_iters = 100, initializer = 'optimal_init', fuzzy = TRUE)
#
# testthat::expect_true( names(km) %in% c("clusters", "centroids", "total_SSE", "best_initialization", "WCSS_per_cluster", "obs_per_cluster", "between.SS_DIV_total.SS") && is.matrix(km$fuzzy_clusters) &&
#
# is.vector(km$clusters) && length(km$clusters) == nrow(tbl) && is.numeric(km$between.SS_DIV_total.SS) && length(km$between.SS_DIV_total.SS) == 1 && ncol(km$fuzzy_clusters) == clust &&
#
# nrow(km$centroids) == clust && ncol(km$centroids) == ncol(tbl) && length(km$total_SSE) == 1 && is.numeric(km$total_SSE) && length(km$best_initialization) == 1 &&
#
# is.numeric(km$best_initialization) && ncol(km$WCSS_per_cluster) == clust && ncol(km$obs_per_cluster) == clust && inherits(km, "k-means clustering") )
# })
testthat::test_that("the 'kmeans_pp_init()' function (i.e. the 'kmeans++' initializer) does not return duplicated centroids (see the Github issue https://github.com/mlampros/ClusterR/issues/25)", {
data = matrix(data = c(0,0,0,1,1,0,1,1,2,2,3,1,4,2,6,2), ncol = 2, byrow = TRUE)
runs = 10
nansum = sse = rep(NA_real_, runs)
for (i in 1:runs) {
L = KMeans_rcpp(data = data,
clusters = 6,
num_init = 10,
max_iters = 100,
initializer = "kmeans++",
seed = 1, # keep the seed always the same for reproducibility (otherwise it is possible that I receive a slightly worse 'sse' but not higher than 1.333333), fact is I don't want NA's in the 'nansum' vector
verbose = FALSE)
nansum[i] = sum(is.na(L$centroids))
sse[i] = sum(L$WCSS_per_cluster)
}
testthat::expect_true(all(nansum == 0) & !any(is.na(nansum)) & all(sse == 1.0))
})
################################
# error handling predict_KMeans
################################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
tmp_x = list(X)
testthat::expect_error( predict_KMeans(tmp_x, CENTROIDS = cntr) )
})
testthat::test_that("in case that the CENTROIDS is not a matrix, it returns an error", {
cntr = as.data.frame(matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X)))
testthat::expect_error( predict_KMeans(X, CENTROIDS = cntr) )
})
testthat::test_that("in case that the columns of the CENTROIDS is not equal to the columns of the data, it returns an error", {
cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)
testthat::expect_error( predict_KMeans(X, CENTROIDS = cntr) )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = Inf
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
testthat::expect_error( predict_KMeans(tmp_dat, CENTROIDS = cntr) )
})
##########################
# predict_KMeans function
##########################
testthat::test_that("predict_KMeans returns the correct output if the input is a data frame AND if the CENTROIDS is a matrix and has the correct dimensions ", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
km = predict_KMeans(dat, CENTROIDS = cntr)
testthat::expect_true(length(km) == nrow(X))
km = KMeans_rcpp(dat, 5)
testthat::expect_equal(predict_KMeans(dat, CENTROIDS = km$centroids),
predict(km, dat))
})
testthat::test_that("predict_KMeans returns the correct output if the input is a matrix AND if the CENTROIDS is a matrix and has the correct dimensions ", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
km = predict_KMeans(X, CENTROIDS = cntr)
testthat::expect_true( length(km) == nrow(X) )
})
testthat::test_that("the predict_KMeans works using the CENTROIDS of the KMeans_rcpp function", {
km = KMeans_rcpp(X, clusters = 2, num_init = 5, max_iters = 100, initializer = 'optimal_init')
km_preds = predict_KMeans(X, CENTROIDS = km$centroids)
testthat::expect_true( length(km_preds) == nrow(X) )
})
testthat::test_that("the predict_KMeans works using the CENTROIDS of the KMeans_arma function", {
km = KMeans_arma(X, clusters = 2, n_iter = 10, "random_subset", verbose = F)
km_preds = predict_KMeans(X, CENTROIDS = km)
testthat::expect_true(length(km_preds) == nrow(X))
## testthat::expect_equal(km_preds, predict(km, X))
})
testthat::test_that("the unified predict() function returns the same result as the hard clustering when the parameter fuzzy is TRUE", {
km = KMeans_rcpp(X, clusters = 2, num_init = 5, max_iters = 100)
km_hard_clusts = predict(object = km, newdata = X, fuzzy = FALSE)
km_soft_clusts = predict(object = km, newdata = X, fuzzy = TRUE)
conv_soft_to_hard = apply(km_soft_clusts, 1, which.max)
testthat::expect_true(all.equal(target = km_hard_clusts, current = conv_soft_to_hard))
})
#########################################
# error handling Optimal_Clusters_KMeans
#########################################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
tmp_x = list(X)
testthat::expect_error( Optimal_Clusters_KMeans(tmp_x, max_clusters = 10, criterion = 'distortion_fK', plot_clusters = FALSE) )
})
testthat::test_that("in case that the max_clusters parameter is not numeric, it returns an error", {
tmp_m = data.frame(1)
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = tmp_m, criterion = 'distortion_fK', plot_clusters = FALSE) )
})
testthat::test_that("in case that the max_clusters parameter is less than 1, it returns an error", {
tmp_m = 0
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = tmp_m, criterion = 'distortion_fK', plot_clusters = FALSE) )
})
testthat::test_that("if the criterion is not one of c('variance_explained', 'WCSSE', 'dissimilarity', 'silhouette', 'distortion_fK', 'AIC', 'BIC', 'Adjusted_Rsquared'), it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'invalid', plot_clusters = FALSE) )
})
testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', num_init = 0, plot_clusters = FALSE) )
})
testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', max_iters = 0, plot_clusters = FALSE) )
})
testthat::test_that("if the initializer is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, initializer = 'invalid', plot_clusters = FALSE) )
})
testthat::test_that("in case that the threads parameter is less than 1, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', threads = 0, plot_clusters = FALSE) )
})
testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', tol = 0.0, plot_clusters = FALSE) )
})
testthat::test_that("in case that the plot_clusters parameter is not logical, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', plot_clusters = 'FALSE') )
})
testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', verbose = 'FALSE', plot_clusters = FALSE) )
})
testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( Optimal_Clusters_KMeans(X, max_clusters = 5, criterion = 'distortion_fK', tol_optimal_init = 0.0, plot_clusters = FALSE) )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = -Inf
testthat::expect_error( Optimal_Clusters_KMeans(tmp_dat, max_clusters = 5, criterion = 'distortion_fK', plot_clusters = FALSE) )
})
testthat::test_that("in case that the 'mini_batch_params' is not NULL and the named list is not valid it returns an error", {
params_mbkm = list(invalid = 10, init_fraction = 0.3, early_stop_iter = 10)
testthat::expect_error( Optimal_Clusters_KMeans(dat, max_clusters = 10, criterion = "distortion_fK",
plot_clusters = FALSE, mini_batch_params = params_mbkm) )
})
testthat::test_that("in case that the 'mini_batch_params' is not NULL and the criterion is 'variance_explained' it returns an error", {
params_mbkm = list(batch_size = 10, init_fraction = 0.3, early_stop_iter = 10)
testthat::expect_error( Optimal_Clusters_KMeans(dat, max_clusters = 10, criterion = "variance_explained",
plot_clusters = FALSE, mini_batch_params = params_mbkm) )
})
###################################
# Optimal_Clusters_KMeans function [ in case that the 'max_clusters' parameter is of length 1 ]
###################################
testthat::test_that("Optimal_Clusters_KMeans returns the correct output if the input is a data frame ", {
nr_clust = 10
res = Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = 'distortion_fK', plot_clusters = FALSE, tol_optimal_init = 0.2)
testthat::expect_true( length(res) == nr_clust )
})
testthat::test_that("Optimal_Clusters_KMeans returns the correct output for different criteria", {
vec = c('variance_explained', 'WCSSE', 'dissimilarity', 'silhouette', 'AIC', 'BIC', 'distortion_fK', 'Adjusted_Rsquared')
out = rep(NA, length(vec))
nr_clust = 5
count = 1
for (i in vec) {
res = Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = i, plot_clusters = T, tol_optimal_init = 0.2)
out[count] = (length(res) == nr_clust)
count = count + 1
}
testthat::expect_true( sum(out) == length(vec) )
})
testthat::test_that("Optimal_Clusters_KMeans returns the correct output if the 'mini_batch_params' is not NULL", {
nr_clust = 10
params_mbkm = list(batch_size = 10, init_fraction = 0.3, early_stop_iter = 10)
res = Optimal_Clusters_KMeans(dat, max_clusters = nr_clust, criterion = "distortion_fK",
plot_clusters = FALSE, mini_batch_params = params_mbkm)
testthat::expect_true( length(res) == nr_clust )
})
###################################
# Optimal_Clusters_KMeans function [ in case that the 'max_clusters' parameter is a contiguous or non-contiguous vector ] [ here I tested only the 'KMeans_rcpp' function but the same applies to 'MiniBatchKmeans' ]
###################################
testthat::test_that("max_clusters-vector for 'variance_explained'", {
subs = 2:3
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:3, criterion = 'variance_explained')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'variance_explained')
testthat::expect_true( all(res1[subs] == res2) )
})
testthat::test_that("max_clusters-vector for 'WCSSE'", {
subs = c(2,4)
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'WCSSE')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'WCSSE')
testthat::expect_true( all(res1[subs] == res2) )
})
testthat::test_that("max_clusters-vector for 'dissimilarity'", {
subs = c(1,3)
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'dissimilarity')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'dissimilarity')
testthat::expect_true( all(res1[subs] == res2) )
})
testthat::test_that("max_clusters-vector for 'silhouette'", {
subs = c(2,3)
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:3, criterion = 'silhouette')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'silhouette')
testthat::expect_true( all(res1[subs] == res2) )
})
testthat::test_that("max_clusters-vector for 'AIC'", {
subs = c(2,4)
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'AIC')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'AIC')
testthat::expect_true( all(res1[subs] == res2) )
})
testthat::test_that("max_clusters-vector for 'BIC'", {
subs = c(1,4)
res1 = Optimal_Clusters_KMeans(dat, max_clusters = 1:4, criterion = 'BIC')
res2 = Optimal_Clusters_KMeans(dat, max_clusters = subs, criterion = 'BIC')
testthat::expect_true( all(res1[subs] == res2) )
})
#####################################################
# test-case for the 'silhouette_of_clusters' function
#####################################################
testthat::test_that("'silhouette_of_clusters' function returns the correct output", {
clusters = 2
km = KMeans_rcpp(data = X, clusters = clusters, num_init = 5, max_iters = 100, initializer = 'kmeans++')
silh = silhouette_of_clusters(data = X, clusters = km$clusters)
# receive summary per cluster
silh_summary_from_matrix = lapply(1:clusters, function(x) {
IDX = which(as.vector(silh$silhouette_matrix[, 'cluster']) == x)
clust_subs = silh$silhouette_matrix[IDX, , drop = F]
data.frame(list(cluster = unique(clust_subs[, 'cluster']),
size = nrow(clust_subs),
avg_intra_dissim = mean(clust_subs[, 'intra_cluster_dissim'], na.rm = TRUE),
avg_silhouette = mean(clust_subs[, 'silhouette'], na.rm = TRUE)))
})
silh_summary_from_matrix = do.call(rbind, silh_summary_from_matrix)
silh_summary = silh$silhouette_summary
testthat::expect_true( all.equal(silh_summary_from_matrix, silh_summary, tolerance = sqrt(.Machine$double.eps)) )
})
################################
# error handling MiniBatchKmeans
################################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
tmp_x = list(X)
testthat::expect_error( MiniBatchKmeans(tmp_x, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})
testthat::test_that("in case that the clusters parameter is not numeric, it returns an error", {
tmp_m = data.frame(1)
testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})
testthat::test_that("in case that the length of the clusters parameter is not 1, it returns an error", {
tmp_m = c(1,2)
testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})
testthat::test_that("in case that the clusters parameter is less than 1, it returns an error", {
tmp_m = 0
testthat::expect_error( MiniBatchKmeans(X, clusters = tmp_m, batch_size = 20, num_init = 5, early_stop_iter = 10) )
})
testthat::test_that("in case that the batch_size parameter is less than 1, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 0, num_init = 5, early_stop_iter = 10) )
})
testthat::test_that("in case that the num_init parameter is less than 1, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 0, early_stop_iter = 10) )
})
testthat::test_that("in case that the max_iters parameter is less than 1, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, max_iters = 0, early_stop_iter = 10) )
})
testthat::test_that("in case that the init_fraction parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, init_fraction = 0.0, early_stop_iter = 10) )
})
testthat::test_that("in case that the initializer parameter is not one of c('kmeans++', 'random', 'optimal_init', 'quantile_init'), it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, initializer = 'invalid', early_stop_iter = 10) )
})
testthat::test_that("in case that the early_stop_iter parameter is less than 1, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 0) )
})
testthat::test_that("in case that the verbose parameter is not logical, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, verbose = 'FALSE') )
})
testthat::test_that("in case that CENTROIDS has invalid columns, it returns an error", {
cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, CENTROIDS = cntr) )
})
testthat::test_that("in case that CENTROIDS has invalid rows, it returns an error", {
NROW = 3
cntr = matrix(runif(NROW * (ncol(X))), nrow = NROW, ncol = ncol(X))
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, CENTROIDS = cntr) )
})
testthat::test_that("in case that the tol parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, tol = 0.0) )
})
testthat::test_that("in case that the tol_optimal_init parameter is less than or equal to 0.0, it returns an error", {
testthat::expect_error( MiniBatchKmeans(X, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10, tol_optimal_init = 0.0) )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = Inf
testthat::expect_error( MiniBatchKmeans(tmp_dat, clusters = 2, batch_size = 10, num_init = 1, early_stop_iter = 10) )
})
##########################
# MiniBatchKmeans function
##########################
testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {
clust = 2
numinit = 5
km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, tol_optimal_init = 0.2)
testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the data is a matrix the result is a list and the class is 'k-means clustering' ", {
clust = 2
numinit = 5
km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, tol_optimal_init = 0.2)
testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering") )
})
testthat::test_that("for different parameter settings it returns the correct output", {
clust = 2
numinit = 5
inits = c('kmeans++', 'random', 'optimal_init', 'quantile_init')
res = rep(NA, length(inits))
for (i in 1:length(inits)) {
km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, num_init = numinit, initializer = inits[i], early_stop_iter = 10, tol_optimal_init = 0.2)
res[i] = ( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering"))
}
testthat::expect_true( sum(res) == length(inits) )
})
testthat::test_that("it returns the correct output if the CENTROIDS parameter is not NULL ", {
clust = 2
cntr = matrix(runif(clust * (ncol(X))), nrow = clust, ncol = ncol(dat))
km = MiniBatchKmeans(dat, clusters = clust, batch_size = 20, early_stop_iter = 10, CENTROIDS = cntr, tol_optimal_init = 0.2)
testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the init_fraction is greater than 0.0 and the intializer equals to 'kmeans++' it returns the correct output ", {
clust = 2
numinit = 5
km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, init_fraction = 0.4, initializer = 'kmeans++')
testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the init_fraction is greater than 0.0 and the intializer equals to 'quantile_init' it returns the correct output ", {
clust = 2
numinit = 5
km = MiniBatchKmeans(X, clusters = clust, batch_size = 20, num_init = numinit, early_stop_iter = 10, init_fraction = 0.4, initializer = "quantile_init", tol_optimal_init = 0.2)
testthat::expect_true( all(names(km) %in% c("centroids", "WCSS_per_cluster", "best_initialization", "iters_per_initialization")) && is.matrix(km$centroids) && nrow(km$centroids) == clust &&
ncol(km$centroids) == ncol(X) && is.matrix(km$WCSS_per_cluster) && ncol(km$WCSS_per_cluster) == clust && is.numeric(km$best_initialization) && length(km$best_initialization) == 1 &&
is.matrix(km$iters_per_initialization) && ncol(km$iters_per_initialization) == numinit && inherits(km, "k-means clustering") )
})
#####################################
# error handling predict_MBatchKMeans
#####################################
testthat::test_that("in case that the data is not a matrix or a data frame, it returns an error", {
tmp_x = list(X)
MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
testthat::expect_error( predict_MBatchKMeans(tmp_x, MbatchKm$centroids, fuzzy = FALSE) )
})
testthat::test_that("in case that the CENTROIDS is not a matrix, it returns an error", {
cntr = as.data.frame(matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X)))
testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr, fuzzy = FALSE) )
})
testthat::test_that("in case that the columns of the CENTROIDS is not equal to the columns of the data, it returns an error", {
cntr = matrix(runif(2 * (ncol(X) - 1)), nrow = 2, ncol = ncol(X) - 1)
testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr) )
})
testthat::test_that("in case that the data includes NaN or Inf values, it returns an error", {
tmp_dat = X
tmp_dat[1,1] = Inf
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
testthat::expect_error( predict_MBatchKMeans(tmp_dat, CENTROIDS = cntr) )
})
testthat::test_that("in case that the fuzzy parameter is not logical, it returns an error", {
cntr = matrix(runif(2 * (ncol(X))), nrow = 2, ncol = ncol(X))
testthat::expect_error( predict_MBatchKMeans(X, CENTROIDS = cntr, fuzzy = 'FALSE') )
})
################################
# predict_MBatchKMeans function
################################
testthat::test_that("in case that the data is a matrix (fuzzy = TRUE) the result is a list and the class is 'k-means clustering' ", {
MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
km = predict_MBatchKMeans(X, MbatchKm$centroids, fuzzy = TRUE)
testthat::expect_true( all(names(km) %in% c("clusters", "fuzzy_clusters")) && is.matrix(km$fuzzy_clusters) && nrow(km$fuzzy_clusters) == nrow(X) && ncol(km$fuzzy_clusters) == 2 &&
is.vector(km$clusters) && length(km$clusters) == nrow(X) && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the data is a data frame (fuzzy = TRUE) the result is a list and the class is 'k-means clustering' ", {
MbatchKm = MiniBatchKmeans(dat, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
km = predict_MBatchKMeans(dat, MbatchKm$centroids, fuzzy = TRUE)
testthat::expect_true( all(names(km) %in% c("clusters", "fuzzy_clusters")) && is.matrix(km$fuzzy_clusters) && nrow(km$fuzzy_clusters) == nrow(X) && ncol(km$fuzzy_clusters) == 2 &&
is.vector(km$clusters) && length(km$clusters) == nrow(X) && inherits(km, "k-means clustering") )
})
testthat::test_that("in case that the data is a matrix (fuzzy = FALSE) the result is a vector and the class is 'k-means clustering' ", {
MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
km = predict_MBatchKMeans(X, MbatchKm$centroids, fuzzy = FALSE)
testthat::expect_true( is.numeric(km) && length(km) == nrow(X) )
})
testthat::test_that("the unified predict() function returns the same result as the hard clustering when the parameter fuzzy is TRUE", {
MbatchKm = MiniBatchKmeans(X, clusters = 2, batch_size = 20, num_init = 5, early_stop_iter = 10)
mbkm_hard_clusts = predict(object = MbatchKm, newdata = X, fuzzy = FALSE)
mbkm_soft_clusts = predict(object = MbatchKm, newdata = X, fuzzy = TRUE)
conv_soft_to_hard = apply(mbkm_soft_clusts, 1, which.max)
testthat::expect_true(all.equal(target = mbkm_hard_clusts, current = conv_soft_to_hard))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.