# Global knitr chunk options used when rendering this README:
# source and output are shown together, output lines are prefixed with "#>",
# figures are written under man/figures/ with a "README-" prefix, and
# figures are displayed at full width.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
The goal of sbclust is to provide an approach to clustering large datasets through a sample-based bagged clustering algorithm. It is effectively the `bclust` function from the e1071 package
with one small edit that allows a sample of the dataset, rather than the whole dataset, to be drawn for each run.
You can install the development version of sbclust from GitHub with:
# Install the development version of sbclust from GitHub.
# Uncomment the next line first if devtools is not installed yet.
# install.packages("devtools")
devtools::install_github("sdesabbata/sbclust")
This is a basic example of clustering a large dataset using `sbclust`:
# Load the package under demonstration.
library(sbclust)

# Set seed so that the randomly generated example data is reproducible.
set.seed(731)
Create a test dataset containing three million cases with two attributes and five clusters, using the approach suggested by Wang et al. (2008).
# Build a synthetic two-attribute dataset made of five Gaussian clusters.
# Results: `test_dataset` (columns x, y, cluster) plus the generation
# parameters `centres_prop`, `centres_mean` and `centres_sd`.

# Size
test_dataset_size <- 3000000

# Proportions of the five clusters
centres_prop <- c(0.2, 0.2, 0.3, 0.1, 0.2)

# Clusters' centers: one row per cluster, column 1 = x, column 2 = y
# (array() fills column by column).
centres_mean <- array(
  c(-3, 0, 4, 5, -5, -3, 0, 3, -2, 4),
  c(5, 2)
)

# Clusters standard deviation, same layout as centres_mean
centres_sd <- array(
  c(1, 1, 0.1, 0.5, 0.2, 0.2, 1, 1, 1, 1),
  c(5, 2)
)

# Number of cases per cluster. round() protects against floating-point
# products such as 2.9999999 being silently truncated by rnorm()/rep().
cluster_sizes <- as.integer(round(test_dataset_size * centres_prop))
stopifnot(sum(cluster_sizes) == test_dataset_size)

# Values dataset
# Each cluster's values are stored in a preallocated list and concatenated
# once at the end, instead of growing vectors with c() inside the loop.
n_clusters <- length(centres_prop)
x_parts <- vector("list", n_clusters)
y_parts <- vector("list", n_clusters)
for (i in seq_len(n_clusters)) {
  # x is drawn before y for every cluster, so the random number stream is
  # consumed in the same order as a per-cluster x-then-y generation.
  x_parts[[i]] <- rnorm(
    n = cluster_sizes[i],
    mean = centres_mean[i, 1],
    sd = centres_sd[i, 1]
  )
  y_parts[[i]] <- rnorm(
    n = cluster_sizes[i],
    mean = centres_mean[i, 2],
    sd = centres_sd[i, 2]
  )
}
values_x <- unlist(x_parts)
values_y <- unlist(y_parts)
# True cluster label of every case, stored as character ("1" to "5")
values_cluster <- rep(as.character(seq_len(n_clusters)), times = cluster_sizes)

# Create dataset
test_dataset <- data.frame(
  x = values_x,
  y = values_y,
  cluster = values_cluster
)
Plot the example dataset.
# Scatter plot of the generated cases, coloured by their true cluster label.
# Small solid points (pch = 19, cex = 0.2) keep three million cases legible.
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  col = test_dataset$cluster,
  pch = 19,
  cex = 0.2
)
Run the clustering algorithm and save the results as a new column of the dataset.
# Clustering
# Cluster the two numeric attributes into five clusters and time the run.
# NOTE(review): iter.max is presumably forwarded to the base clustering
# method, as in e1071::bclust -- confirm against the sbclust documentation.
start_time <- Sys.time()
clustering_result <- sbclust(
  test_dataset[, c("x", "y")],
  centers = 5,
  iter.max = 5000
)
end_time <- Sys.time()

# Check time lapsed (printed as a difftime; its unit is chosen automatically)
end_time - start_time

# Save results: one cluster id per case, stored as a new column
test_dataset["sbclust"] <- clustering_result$cluster
Plot the clustering results.
# Scatter plot of the cases, coloured by the cluster assigned by sbclust.
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  col = test_dataset$sbclust,
  pch = 19,
  cex = 0.2
)
Compare the list of test centers with the results of the clustering procedure.
# Comparison
# Print the centers used to generate the data, then the centers found by
# sbclust. NOTE(review): the cluster numbering in the result presumably does
# not follow the row order of centres_mean -- compare the rows by value.
centres_mean
clustering_result$centers
e1071::bclust
Run the `e1071::bclust` algorithm for comparison.
# Clustering
# Run the original e1071::bclust on the same two attributes, with the same
# arguments as the sbclust run, and time it for comparison.
e1071_bclust_start_time <- Sys.time()
e1071_bclust_result <- e1071::bclust(
  test_dataset[, c("x", "y")],
  centers = 5,
  iter.max = 5000
)
e1071_bclust_end_time <- Sys.time()

# Check time lapsed (printed as a difftime; its unit is chosen automatically)
e1071_bclust_end_time - e1071_bclust_start_time

# Save results: one cluster id per case, stored as a new column
test_dataset["e1071_bclust"] <- e1071_bclust_result$cluster
Plot the clustering results.
# Scatter plot of the cases, coloured by the cluster assigned by
# e1071::bclust.
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  col = test_dataset$e1071_bclust,
  pch = 19,
  cex = 0.2
)
Compare the list of test centers with the results of the clustering procedure.
# Comparison
# Print the centers used to generate the data, then the centers found by
# e1071::bclust. NOTE(review): the cluster numbering in the result presumably
# does not follow the row order of centres_mean -- compare the rows by value.
centres_mean
e1071_bclust_result$centers
# Cross-tabulate the cluster ids assigned by the two algorithms
# (rows: sbclust, columns: e1071::bclust).
table(test_dataset[c("sbclust", "e1071_bclust")])
<!-- Both elapsed times are converted to seconds explicitly: a plain difftime picks its unit automatically (secs, mins, hours, ...), so dividing two of them directly can mix units and report a wrong percentage. -->
The examples above illustrate how, when working with large datasets, `sbclust` can achieve results similar to `e1071::bclust` in a fraction (`r round(as.numeric(difftime(end_time, start_time, units = "secs")) / as.numeric(difftime(e1071_bclust_end_time, e1071_bclust_start_time, units = "secs")) * 100, digits = 2)`%) of the time.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.