knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(devtools) devtools::install_github("zhaodyleo/Kmeans")
The kmeans_cluster
function in kmeans
library could perform one dimension clustering
library(kmeans) # randomly generate training data x1 <- runif(10000,0,1) x2 <- runif(10000,2,3) x3 <- runif(10000,4,5) train_data <- c(x1,x2,x3) train_label <- c(rep(0,10000),rep(1,10000),rep(2,10000)) training_result <- kmeans_cluster(train_data, 3, 2500, 1e-8) # there is two output in training result # one is centre training_result[[1]] # One is the predict label training_result[[2]][1:10]
We could use the kmeans_cluster_predict
function to predict the label which new observations belongs.
X2 <- runif(1000,0,5) kmeans_cluster_predict(X2,training_result[[1]])[1:10]
Sometimes, the label we got from the kmeans didn't match the actual label. Thus,we could use kmeans_label_rematch
function to match the train label with the actual label. This function will also return the training accuracy. If we input the test data and the actual test label ,it will return the test accuracy and the rematched label lists.
rematch_list <- kmeans_label_rematch(training_result[[2]],train_label) rematch_list[[1]][1:10] # training accuracy rematch_list[[2]]
test_label <- c(1,2,2,1,3,2,1) test_actual<- c(2,0,0,2,1,0,2) # including test label, the function will also return the rematch test label rematch_list_test <- kmeans_label_rematch(training_result[[2]],train_label,test_label) # rematched training label rematch_list_test[[1]][1:10] # rematched test label rematch_list_test[[2]] # training accuracy rematch_list_test[[3]] # include the actual test label, function will give the test accuracy rematch_list_test <- kmeans_label_rematch(training_result[[2]],train_label,test_label,test_actual) rematch_list_test[[4]]
The kmeans_cluster
function in kmeans
library could also perform multi dimension clustering
library(kmeans) X1 <- runif(10000,0,1) X2 <- runif(10000,2,3) X3 <- runif(10000,3,5) X4 <- runif(10000,6,9) index <- sample(1:20000) train_data <- cbind(c(X1,X3),c(X2,X4))[index,] train_label <- c(rep(0,10000),rep(1,10000))[index] training_result <- kmeans_cluster(train_data,2,2500,1e-8) # there is two output in training result # one is centre training_result[[1]] # One is the predict label training_result[[2]][1:10]
We could also use the kmeans_cluster_predict
function to predict the label which new observations belongs.
X1 <- runif(10000,0,5) X2 <- runif(10000,2,9) index <- sample(1:20000) test_data <- cbind(X1,X2) kmeans_cluster_predict(test_data,training_result[[1]])[1:10]
Check the same section in One-dimension clustering
for specific example.
stats
libraryFollowing code will test the accuracy of kmeans_cluster with the kmeans function from stats library
x1 <- runif(100000,0,1) x2 <- runif(100000,2,3) x3 <- runif(100000,4,5) index <- sample(1:300000,replace = FALSE) train_data <- c(x1,x2,x3)[index] train_label <- c(rep(0,10000),rep(1,10000),rep(2,10000))[index] training_result <- kmeans_cluster(train_data, 3, 2500, 1e-8) kmeans_stat_result <- kmeans(train_data,3) kmeans_cluster_label <- kmeans_label_rematch(training_result[[2]],train_label) kmeans_label <- kmeans_label_rematch(kmeans_stat_result$cluster,train_label) all.equal(kmeans_cluster_label,kmeans_label)
Let's compare the efficiency between kmeans_cluster and kmeans from stats.
library(bench) bench::mark(kmeans_cluster(train_data, 3, 2500, 1e-8),filter_gc = FALSE)[,c(2,3,5)] bench::mark(kmeans(train_data,3,iter.max = 2500))[,c(2,3,5)]
library(kmeans) X1 <- runif(10000,0,1) X2 <- runif(10000,0,1) X3 <- runif(10000,3,5) X4 <- runif(10000,0,1) X5 <- runif(10000,3,5) X6 <- runif(10000,3,5) index <- sample(1:30000) train_data <- cbind(c(X1,X3,X5),c(X2,X4,X6))[index,] train_label <- c(rep(0,10000),rep(1,10000),rep(2,10000))[index] initial_center <- matrix(c(1.1,2,3.3,2,3.3,4),2,3) training_result <- kmeans_cluster(train_data,initial_center ,2500,1e-8) kmeans_stat_result <- kmeans(train_data,t(initial_center)) kmeans_cluster_label <- kmeans_label_rematch(training_result[[2]],train_label) kmeans_label <- kmeans_label_rematch(kmeans_stat_result$cluster,train_label) # Both method will provide same results # but it is not consistent since the method include some randomness # if the cluster is not perfectly separable, same output is not garanteed. all.equal(kmeans_cluster_label,kmeans_label)
bench::mark(kmeans_cluster(train_data,3,2500,1e-8),filter_gc = FALSE)[,c(2,3,5)] bench::mark(kmeans(train_data,3))[,c(2,3,5)]
Unfortunately, kmeans_cluster didn't perform better than the kmeans from the stats library. Potential reason is that kmeans from stats library using Rcpp in the function to speed up some functionality. Also, it used different algorithms which may be more efficient in computation.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.