knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

How to install the kmeans library

library(devtools)
devtools::install_github("zhaodyleo/Kmeans")

One-dimension clustering

kmeans_cluster

The kmeans_cluster function in kmeans library could perform one dimension clustering

library(kmeans)

# randomly generate training data
x1 <- runif(10000,0,1)
x2 <- runif(10000,2,3)
x3 <- runif(10000,4,5)
train_data <- c(x1,x2,x3)
train_label <- c(rep(0,10000),rep(1,10000),rep(2,10000))
training_result <- kmeans_cluster(train_data, 3, 2500, 1e-8)
# there is two output in training result 
# one is centre 
training_result[[1]]
# One is the predict label 
training_result[[2]][1:10]

kmeans_cluster_predict

We could use the kmeans_cluster_predict function to predict the label which new observations belongs.

X2 <- runif(1000,0,5)
kmeans_cluster_predict(X2,training_result[[1]])[1:10]

kmeans_label_rematch

Sometimes, the label we got from the kmeans didn't match the actual label. Thus,we could use kmeans_label_rematch function to match the train label with the actual label. This function will also return the training accuracy. If we input the test data and the actual test label ,it will return the test accuracy and the rematched label lists.

rematch_list <- kmeans_label_rematch(training_result[[2]],train_label)
rematch_list[[1]][1:10]
# training accuracy
rematch_list[[2]]
test_label <- c(1,2,2,1,3,2,1)
test_actual<- c(2,0,0,2,1,0,2)
# including test label, the function will also return the rematch test label
rematch_list_test <- kmeans_label_rematch(training_result[[2]],train_label,test_label)
# rematched training label
rematch_list_test[[1]][1:10]
# rematched test label
rematch_list_test[[2]]
# training accuracy
rematch_list_test[[3]]

# include the actual test label, function will give the test accuracy
rematch_list_test <- kmeans_label_rematch(training_result[[2]],train_label,test_label,test_actual)
rematch_list_test[[4]]

Multi-dimension clustering

kmeans_cluster

The kmeans_cluster function in kmeans library could also perform multi dimension clustering

library(kmeans)
X1 <-  runif(10000,0,1)
X2 <-  runif(10000,2,3)
X3 <-  runif(10000,3,5)
X4 <-  runif(10000,6,9)
index  <-  sample(1:20000)
train_data <-  cbind(c(X1,X3),c(X2,X4))[index,]
train_label <-  c(rep(0,10000),rep(1,10000))[index]
training_result <-  kmeans_cluster(train_data,2,2500,1e-8)
# there is two output in training result 
# one is centre 
training_result[[1]]
# One is the predict label 
training_result[[2]][1:10]

kmeans_cluster_predict

We could also use the kmeans_cluster_predict function to predict the label which new observations belongs.

X1 <-  runif(10000,0,5)
X2 <-  runif(10000,2,9)
index  <-  sample(1:20000)
test_data <-  cbind(X1,X2)
kmeans_cluster_predict(test_data,training_result[[1]])[1:10]

kmeans_label_rematch

Check the same section in One-dimension clustering for specific example.

Comparision with the kmeans function from stats library

Following code will test the accuracy of kmeans_cluster with the kmeans function from stats library

One-dimension Example

x1 <- runif(100000,0,1)
x2 <- runif(100000,2,3)
x3 <- runif(100000,4,5)

index <- sample(1:300000,replace = FALSE)
train_data <- c(x1,x2,x3)[index]
train_label <- c(rep(0,10000),rep(1,10000),rep(2,10000))[index]
training_result <- kmeans_cluster(train_data, 3, 2500, 1e-8)
kmeans_stat_result <- kmeans(train_data,3)
kmeans_cluster_label <- kmeans_label_rematch(training_result[[2]],train_label)
kmeans_label <- kmeans_label_rematch(kmeans_stat_result$cluster,train_label)

all.equal(kmeans_cluster_label,kmeans_label)

Let's compare the efficiency between kmeans_cluster and kmeans from stats.

library(bench)
bench::mark(kmeans_cluster(train_data, 3, 2500, 1e-8),filter_gc = FALSE)[,c(2,3,5)]
bench::mark(kmeans(train_data,3,iter.max = 2500))[,c(2,3,5)]

Multi-dimension Example

library(kmeans)
X1 <-  runif(10000,0,1)
X2 <-  runif(10000,0,1)
X3 <-  runif(10000,3,5)
X4 <-  runif(10000,0,1)
X5 <-  runif(10000,3,5)
X6 <-  runif(10000,3,5)
index  <-  sample(1:30000)
train_data <-  cbind(c(X1,X3,X5),c(X2,X4,X6))[index,]
train_label <-  c(rep(0,10000),rep(1,10000),rep(2,10000))[index]
initial_center <- matrix(c(1.1,2,3.3,2,3.3,4),2,3)
training_result <-  kmeans_cluster(train_data,initial_center ,2500,1e-8)
kmeans_stat_result <- kmeans(train_data,t(initial_center))
kmeans_cluster_label <- kmeans_label_rematch(training_result[[2]],train_label)
kmeans_label <- kmeans_label_rematch(kmeans_stat_result$cluster,train_label)
# Both method will provide same results 
# but it is not consistent since the method include some randomness
# if the cluster is not perfectly separable, same output is not garanteed. 
all.equal(kmeans_cluster_label,kmeans_label)
bench::mark(kmeans_cluster(train_data,3,2500,1e-8),filter_gc = FALSE)[,c(2,3,5)]
bench::mark(kmeans(train_data,3))[,c(2,3,5)]

Unfortunately, kmeans_cluster didn't perform better than the kmeans from the stats library. Potential reason is that kmeans from stats library using Rcpp in the function to speed up some functionality. Also, it used different algorithms which may be more efficient in computation.



zhaodyleo/Kmeans documentation built on Dec. 23, 2021, 9:18 p.m.