###### if you want to do kluster only on one or more methods
##########################################################################################################
##########################################################################################################
###########################kluster main function#################################################################
############################################################kluster main function####################################
##########################################################################################################
#########################kluster main function###########################################################################
##########################################################################################################
###### if you want to do kluster only
kluster_eval <- function(data,
clusters, #number of clusters we know
iter_sim, # number of simulation iterations
iter_klust, #number of iterations for clustering with sample_n size x
smpl, #size of the sample_n to be taken with replacement out of data
algorithm = "Default", #select analysis algorithm from BIC, PAMK, CAL, and AP
cluster = FALSE # if TURE it'll do clustering which will take a lot longer!
) {
size <- nrow(data)
if (algorithm == 'BIC') {
# ##starting to store results from different algorithms
# tic()
# ##now let's compute optimal ks with BIC
# BIC.best <- dim(Mclust(as.matrix(data), G=1:15)$z)[2]
# tBIC <- toc()
# tBIC <- as.numeric(tBIC$toc - tBIC$tic)
###kluster procedure
##bic
kbicssum <- list()
kbics2 <- list()
t2 <- list()
tBIC_kluster <- list()
for (j in 1:iter_sim) {
tic()
for (i in 1:iter_klust) {
dat2 <- as.matrix(data[sample(nrow(data), smpl, replace=TRUE), ])
kbics2[[i]] <- dim(Mclust(dat2, G=1:15)$z)[2]
rm(dat2)
}
t <- toc()
t2[[j]] <- t$toc - t$tic
kbicssum[[j]] <- mean(as.numeric(kbics2))
}
tBIC_kluster <- mean(unlist(t2))
m_BIC_k <- round(mean(as.numeric(kbics2)),0)
f_BIC_k <- as.numeric(names(which.max(table(unlist(kbics2)))))
method <- c("BIC_kluster")
ptime <- c(tBIC_kluster)
k_mean <- c(m_BIC_k)
k_freq <- c(f_BIC_k)
sim <- data.frame(method,k_mean,k_freq,ptime)
sim$k_orig <- clusters
sim$e_mean <- k_mean - clusters
sim$e_freq <- k_freq - clusters
sim$n <- size
####cluster and visualize the performance
# km1 <- hkmeans(data, k = BIC.best,iter.max = 300)
# km5 <- hkmeans(data, k = mean_BIC_kluster,iter.max = 300)
# par(mfrow=c(1,2))
# # plot(data, col = km1$cluster, pch = 19, frame = FALSE,
# # main = paste0("HK-means with k = ",BIC.best,""))
# #
# plot(data, col = km5$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster BIC process, k = ",mean_BIC_kluster,""))
#
#
#
# # par(mfrow=c(1,1))
# boxplot(round(as.numeric(kbicssum),0),
# main = paste0("kluster optimum cluster number from BIC w/ resampling.
# Mean resampling estimate = ",mean_BIC_kluster,"
# ",
# " and ordinary BIC suggested ",BIC.best," clusters."))
return(
list("sim"=sim,
"m_BIC_k"=m_BIC_k,
"f_BIC_k"=f_BIC_k,
"BICsimk"=kbics2)
)
} else
if (algorithm == "PAMK") {
# tic()
# pamk.best <- pamk(data)$nc
# tpamk <- toc()
# tpamk <- as.numeric(tpamk$toc - tpamk$tic)
###pam method
kpamsum <- list()
kpam2 <- list()
t2 <- list()
tpam_kluster <- list()
for (j in 1:iter_sim) {
tic()
for (i in 1:iter_klust) {
dat2 <- data[sample(nrow(data), smpl, replace=TRUE), ]
kpam2[[i]] <- pamk(dat2,krange=1:15)$nc
rm(dat2)
}
t <- toc()
t2[[j]] <- t$toc - t$tic
kpamsum[[j]] <- mean(as.numeric(kpam2))
}
tpam_kluster <- mean(unlist(t2))
m_pam_k <- round(mean(as.numeric(kpam2)),0)
f_pam_k <- as.numeric(names(which.max(table(unlist(kpam2)))))
method <- c("pam_kluster")
ptime <- c(tpam_kluster)
k_mean <- c(m_pam_k)
k_freq <- c(f_pam_k)
sim <- data.frame(method,k_mean,k_freq,ptime)
sim$k_orig <- clusters
sim$e_mean <- k_mean - clusters
sim$e_freq <- k_freq - clusters
sim$n <- size
####cluster and visualize the performance
# km2 <- hkmeans(data, k = pamk.best,iter.max = 300)
# km6 <- hkmeans(data, k = mean_pam_kluster,iter.max = 300)
#
#
#
# par(mfrow=c(1,2))
#
# plot(data, col = km2$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with k = ",pamk.best,""))
# points(km2$centers, col = 1:pamk.best, pch = 8, cex = 3)
# plot(data, col = km6$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster PAM process, k = ",mean_pam_kluster,""))
# points(km6$centers, col = 1:mean_pam_kluster, pch = 8, cex = 3)
#
# par(mfrow=c(1,1))
# boxplot(round(as.numeric(kpamsum),0),
# main = paste0("kluster optimum cluster number from PAM w/ resampling.
# Mean resampling estimate = ",mean_pam_kluster,"
# ",
# " and ordinary PAM suggested ",pamk.best," clusters."))
#
#
#
return(
list("sim"=sim,
"m_pam_k"=m_pam_k,
"f_pam_k"=f_pam_k,
"PAMsimk"=kpam2)
)
} else
if (algorithm == "CAL") {
# tic()
# calinski.best <- as.numeric(which.max(cascadeKM(data, 1, 15, iter = 1000)$results[2,]))
# tcalinski <- toc()
# tcalinski <- as.numeric(tcalinski$toc - tcalinski$tic)
kcalsum <- list()
kcal2 <- list()
t2 <- list()
tcal_kluster <- list()
for (j in 1:iter_sim) {
tic()
for (i in 1:iter_klust) {
dat2 <- data[sample(nrow(data), smpl, replace=TRUE), ]
kcal2[[i]] <- as.numeric(which.max(cascadeKM(dat2, 1, 15, iter = 1000)$results[2,]))
rm(dat2)
}
t <- toc()
t2[[j]] <- t$toc - t$tic
kcalsum[[j]] <- mean(as.numeric(kcal2))
}
tcal_kluster <- mean(unlist(t2))
m_cal_kluster <- round(mean(as.numeric(kcal2)),0)
f_cal_k <- as.numeric(names(which.max(table(unlist(kcal2)))))
method <- c("cal_kluster")
ptime <- c(tcal_kluster)
k_mean <- c(m_cal_k)
k_freq <- c(f_cal_k)
sim <- data.frame(method,k_mean,k_freq,ptime)
sim$k_orig <- clusters
sim$e_mean <- k_mean - clusters
sim$e_freq <- k_freq - clusters
sim$n <- size
####cluster and visualize the performance
# km3 <- hkmeans(data, k = calinski.best,iter.max = 300)
# km7 <- hkmeans(data, k = mean_cal_kluster,iter.max = 300)
# par(mfrow=c(1,2))
#
# plot(data, col = km3$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with k = ",calinski.best,""))
# points(km3$centers, col = 1:calinski.best, pch = 8, cex = 3)
#
# plot(data, col = km7$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster Calinski process, k = ",mean_cal_kluster,""))
# points(km7$centers, col = 1:mean_cal_kluster, pch = 8, cex = 3)
#
# par(mfrow=c(1,1))
# boxplot(round(as.numeric(kcalsum),0),
# main = paste0("kluster optimum cluster number from Calinski w/ resampling.
# Mean resampling estimate = ",mean_cal_kluster,"
# and ordinary Calinski suggested ",calinski.best," clusters."))
return(
list("sim"=sim,
"m_cal_k"=m_cal_k,
"f_cal_k"=f_cal_k,
"CALsimk"=kcal2)
)
} else
if (algorithm == "AP") {
# tic()
# apclus.best <- length(apcluster(negDistMat(r=2), data)@clusters)
# tap <- toc()
# tap <- as.numeric(tap$toc - tap$tic)
###AP
kapsum <- list()
kap2 <- list()
t2 <- list()
tap_kluster <- list()
for (j in 1:iter_sim) {
tic()
for (i in 1:iter_klust) {
dat2 <- data[sample(nrow(data), smpl, replace=TRUE), ]
kap2[[i]] <- length(apcluster(negDistMat(r=2), dat2)@clusters)
rm(dat2)
}
t <- toc()
t2[[j]] <- t$toc - t$tic
kapsum[[j]] <- mean(as.numeric(kap2))
}
tap_kluster <- mean(unlist(t2))
m_ap_k <- round(mean(as.numeric(kap2)),0)
f_ap_k <- as.numeric(names(which.max(table(unlist(kap2)))))
method <- c("ap_kluster")
ptime <- c(tap_kluster)
k_mean <- c(m_ap_k)
k_freq <- c(f_ap_k)
sim <- data.frame(method,k_mean,k_freq,ptime)
sim$k_orig <- clusters
sim$e_mean <- k_mean - clusters
sim$e_freq <- k_freq - clusters
sim$n <- size
####cluster and visualize the performance
# km4 <- hkmeans(data, k = apclus.best,iter.max = 300)
# km8 <- hkmeans(data, k = mean_ap_kluster,iter.max = 300)
#
#
#
# par(mfrow=c(1,2))
#
# plot(data, col = km4$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with k = ",apclus.best,""))
# points(km4$centers, col = 1:apclus.best, pch = 8, cex = 3)
#
# plot(data, col = km8$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster AP process, k = ",mean_ap_kluster,""))
# points(km8$centers, col = 1:mean_ap_kluster, pch = 8, cex = 3)
# par(mfrow=c(1,1))
# boxplot(round(as.numeric(kapsum),0),
# main = paste0("kluster optimum cluster number from AP w/ resampling.
# Mean resampling estimate = ",mean_ap_kluster,"
# and ordinary AP suggested ",apclus.best," clusters."))
return(
list("sim"=sim,
"m_ap_k"=m_ap_k,
"f_ap_k"=f_ap_k,
"APsimk"=kap2)
)
} else
if (algorithm == "Default") {
###kluster procedure
##bic
kbicssum <- list()
kbics2 <- list()
t1 <- list()
tBIC_kluster <- list()
kpamsum <- list()
kpam2 <- list()
t2 <- list()
tpam_kluster <- list()
kcalsum <- list()
kcal2 <- list()
t3 <- list()
tcal_kluster <- list()
kapsum <- list()
kap2 <- list()
t4 <- list()
tap_kluster <- list()
for (j in 1:iter_sim) {
for (i in 1:iter_klust) {
dat2 <- data[sample(nrow(data), smpl, replace=TRUE), ]
tic()
kbics2[[i]] <- dim(Mclust(as.matrix(dat2), G=1:15)$z)[2]
t1i <- toc()
t1[[i]] <- t1i$toc - t1i$tic
tic()
kpam2[[i]] <- pamk(dat2,krange=1:15)$nc
t2i <- toc()
t2[[i]] <- t2i$toc - t2i$tic
tic()
kcal2[[i]] <- as.numeric(which.max(cascadeKM(dat2, 1, 15, iter = 1000)$results[2,]))
t3i <- toc()
t3[[i]] <- t3i$toc - t3i$tic
tic()
kap2[[i]] <- length(apcluster(negDistMat(r=2), dat2)@clusters)
t4i <- toc()
t4[[i]] <- t4i$toc - t4i$tic
rm(dat2)
}
kbicssum[[j]] <- mean(as.numeric(kbics2))
kpamsum[[j]] <- mean(as.numeric(kpam2))
kcalsum[[j]] <- mean(as.numeric(kcal2))
kapsum[[j]] <- mean(as.numeric(kap2))
}
tBIC_kluster <- mean(unlist(t1))
m_BIC_k <- round(mean(as.numeric(kbics2)),0)
tpam_kluster <- mean(unlist(t2))
m_pam_k <- round(mean(as.numeric(kpam2)),0)
tcal_kluster <- mean(unlist(t3))
m_cal_k <- round(mean(as.numeric(kcal2)),0)
tap_kluster <- mean(unlist(t4))
m_ap_k <- round(mean(as.numeric(kap2)),0)
f_BIC_k <- as.numeric(names(which.max(table(unlist(kbics2)))))
f_pam_k <- as.numeric(names(which.max(table(unlist(kpam2)))))
f_cal_k <- as.numeric(names(which.max(table(unlist(kcal2)))))
f_ap_k <- as.numeric(names(which.max(table(unlist(kap2)))))
method <- c("BIC_kluster","pam_kluster","cal_kluster","ap_kluster")
ptime <- c(tBIC_kluster*iter_klust,tpam_kluster*iter_klust,tcal_kluster*iter_klust,tap_kluster*iter_klust)
k_mean <- c(m_BIC_k,m_pam_k,m_cal_k,m_ap_k)
k_freq <- c(f_BIC_k,f_pam_k,f_cal_k,f_ap_k)
sim <- data.frame(method,k_mean,k_freq,ptime)
sim$k_orig <- clusters
sim$e_mean <- k_mean - clusters
sim$e_freq <- k_freq - clusters
sim$n <- size
#
#
#
# ####cluster and visualize the performance
# km5 <- hkmeans(data, k = mean_BIC_kluster,iter.max = 300)
# km6 <- hkmeans(data, k = mean_pam_kluster,iter.max = 300)
# km7 <- hkmeans(data, k = mean_cal_kluster,iter.max = 300)
# km8 <- hkmeans(data, k = mean_ap_kluster,iter.max = 300)
#
#
#
#
#
# par(mfrow=c(2,2))
# # plot(data, col = km1$cluster, pch = 19, frame = FALSE,
# # main = paste0("HK-means with k = ",BIC.best,""))
# #
# plot(data, col = km5$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster BIC process, k = ",mean_BIC_kluster,""))
#
# plot(data, col = km6$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster PAM process, k = ",mean_pam_kluster,""))
# points(km6$centers, col = 1:mean_pam_kluster, pch = 8, cex = 3)
#
# plot(data, col = km7$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster Calinski process, k = ",mean_cal_kluster,""))
# points(km7$centers, col = 1:mean_cal_kluster, pch = 8, cex = 3)
#
# plot(data, col = km8$cluster, pch = 19, frame = FALSE,
# main = paste0("HK-means with kluster AP process, k = ",mean_ap_kluster,""))
# points(km8$centers, col = 1:mean_ap_kluster, pch = 8, cex = 3)
#
#
#
#
# boxplot(round(as.numeric(kbicssum),0),
# main = paste0("kluster optimum cluster number from BIC w/ resampling.
# Mean resampling estimate = ",mean_BIC_kluster,"
# ",
# " and ordinary BIC suggested ",BIC.best," clusters."))
# boxplot(round(as.numeric(kpamsum),0),
# main = paste0("kluster optimum cluster number from PAM w/ resampling.
# Mean resampling estimate = ",mean_pam_kluster,"
# ",
# " and ordinary PAM suggested ",pamk.best," clusters."))
# boxplot(round(as.numeric(kcalsum),0),
# main = paste0("kluster optimum cluster number from Calinski w/ resampling.
# Mean resampling estimate = ",mean_cal_kluster,"
# and ordinary Calinski suggested ",calinski.best," clusters."))
# boxplot(round(as.numeric(kapsum),0),
# main = paste0("kluster optimum cluster number from AP w/ resampling.
# Mean resampling estimate = ",mean_ap_kluster,"
# and ordinary AP suggested ",apclus.best," clusters."))
return(
list("sim"=sim,
"m_BIC_k"=m_BIC_k,
"m_pam_k"=m_pam_k,
"m_ap_k"=m_ap_k,
"f_BIC_k"=f_BIC_k,
"f_cal_k"=f_cal_k,
"m_cal_k"=m_cal_k,
"f_pam_k"=f_pam_k,
"f_ap_k"=f_ap_k,
"BICsimk"=kbics2,
"PAMsimk"=kpam2,
"APsimk"=kap2)
)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.