# Chunk defaults for the rendered README: collapse source and output into
# a single block, prefix output lines with "#>", write figures under
# man/figures/ with a "README-" prefix, and stretch them to full width
knitr::opts_chunk$set(
  out.width = "100%",
  fig.path = "man/figures/README-",
  comment = "#>",
  collapse = TRUE
)

sbclust

The goal of sbclust is to provide an approach to clustering large datasets through a sample-based bagged clustering algorithm. It is effectively the bclust function from the e1071 package with one small edit: each run draws a sample of the dataset instead of using the whole dataset.

Installation

You can install the development version of sbclust from GitHub with:

# devtools is needed for the GitHub install; if it is missing, first run:
# install.packages("devtools")
devtools::install_github(repo = "sdesabbata/sbclust")

Example

This is a basic example of clustering a large dataset using sbclust:

library(sbclust)

# Fix the random number generator seed so that the simulated dataset
# below is reproducible from one run to the next
set.seed(seed = 731)

Create a test dataset containing three million cases with two attributes and five clusters, using the approach suggested by Wang et al. (2008).

# Size: total number of cases in the test dataset
test_dataset_size <- 3000000

# Proportions of the five clusters
centres_prop <- c(0.2, 0.2, 0.3, 0.1, 0.2)

# Number of cases per cluster. round() protects against floating-point
# products that land just below a whole number, which rnorm() and rep()
# would otherwise silently truncate
centres_size <- round(test_dataset_size * centres_prop)

# Clusters' centers: one row per cluster, columns hold the x and y means
centres_mean <- array(
  c(-3, 0, 4, 5, -5, -3, 0, 3, -2, 4),
  c(5, 2)
)

# Clusters' standard deviations, laid out like centres_mean
centres_sd <- array(
  c(1, 1, 0.1, 0.5, 0.2, 0.2, 1, 1, 1, 1),
  c(5, 2)
)

# Simulate one cluster at a time. Within each cluster x is drawn before y,
# so for a given seed the sequence of random draws is cluster 1 x,
# cluster 1 y, cluster 2 x, ... The pieces are collected in a list and
# combined once, rather than growing vectors with c() inside a loop
cluster_values <- lapply(
  seq_along(centres_prop),
  function(i) {
    x <- rnorm(
      n = centres_size[i],
      mean = centres_mean[i, 1],
      sd = centres_sd[i, 1]
    )
    y <- rnorm(
      n = centres_size[i],
      mean = centres_mean[i, 2],
      sd = centres_sd[i, 2]
    )
    list(
      x = x,
      y = y,
      cluster = rep(as.character(i), centres_size[i])
    )
  }
)

# Values dataset: concatenate the per-cluster pieces in cluster order
values_x <- unlist(lapply(cluster_values, `[[`, "x"), use.names = FALSE)
values_y <- unlist(lapply(cluster_values, `[[`, "y"), use.names = FALSE)
values_cluster <- unlist(
  lapply(cluster_values, `[[`, "cluster"),
  use.names = FALSE
)

# Create dataset: coordinates plus the (known) generating cluster label
test_dataset <- data.frame(
  x = values_x,
  y = values_y,
  cluster = values_cluster
)

Plot the example dataset.

# Scatter plot of the simulated cases as small solid dots (pch 19),
# coloured by the cluster label stored in the dataset
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  pch = 19,
  cex = 0.2,
  col = test_dataset$cluster
)

Run the clustering algorithm and save the results as a new column of the dataset.

# Clustering: run sbclust on the two coordinate columns, asking for five
# clusters, and record wall-clock timestamps before and after the call
start_time <- Sys.time()

clustering_result <-
  sbclust(
    test_dataset[, c("x", "y")],
    centers = 5,
    iter.max = 5000
  )

end_time <- Sys.time()

# Check time elapsed (a difftime whose units are chosen automatically)
end_time - start_time

# Save results: store the cluster memberships as a new dataset column
test_dataset["sbclust"] <- clustering_result$cluster

Plot the clustering results.

# Same scatter plot as for the raw data, now coloured by the cluster
# assigned by sbclust
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  pch = 19,
  cex = 0.2,
  col = test_dataset$sbclust
)

Compare the list of test centers with the results of the clustering procedure.

# Comparison: print the centres used to simulate the data, followed by
# the centres found by sbclust
centres_mean
clustering_result[["centers"]]

Comparison with e1071::bclust

Run the e1071::bclust algorithm for comparison.

# Clustering: run e1071::bclust with the same arguments used for sbclust,
# and record wall-clock timestamps before and after the call
e1071_bclust_start_time <- Sys.time()

e1071_bclust_result <-
  e1071::bclust(
    test_dataset[, c("x", "y")],
    centers = 5,
    iter.max = 5000
  )

e1071_bclust_end_time <- Sys.time()

# Check time elapsed (a difftime whose units are chosen automatically)
e1071_bclust_end_time - e1071_bclust_start_time

# Save results: store the cluster memberships as a new dataset column
test_dataset["e1071_bclust"] <- e1071_bclust_result$cluster

Plot the clustering results.

# Same scatter plot again, now coloured by the cluster assigned by
# e1071::bclust
plot(
  x = test_dataset$x,
  y = test_dataset$y,
  pch = 19,
  cex = 0.2,
  col = test_dataset$e1071_bclust
)

Compare the list of test centers with the results of the clustering procedure.

# Comparison: print the centres used to simulate the data, the centres
# found by e1071::bclust, and a cross-tabulation of the cluster labels
# assigned by the two algorithms
centres_mean
e1071_bclust_result[["centers"]]
table(test_dataset[c("sbclust", "e1071_bclust")])

The examples above illustrate how, when working with large datasets, sbclust can achieve results similar to those of e1071::bclust in a fraction (`r round((as.numeric(difftime(end_time, start_time, units = "secs")) / as.numeric(difftime(e1071_bclust_end_time, e1071_bclust_start_time, units = "secs"))) * 100, digits = 2)`%) of the time.



sdesabbata/sbclust documentation built on May 12, 2022, 7:07 a.m.