# $Id: xmeans.prog,v 1.23 2012/04/12 11:21:12 tunenori Exp tunenori $
#
# X-MEANS Clustering
#
# Description:
#
# Perform x-means non-hierarchal clustering on a data matrix.
#
# Usage:
#
# xmeans(x, ik = 2, iter.max = 10, pr.proc = F,
# ignore.covar = T, merge.cls = F)
#
# Arguments:
# x: A numeric matrix of data, or an object that can be coerced to
# such a matrix (such as a numeric vector or a data frame with
# all numeric columns).
#
# ik: The initial number of clusters applied to kmeans().
# As xmeans calls kmeans recursively, 'ik' should be sufficient
# small.
#
# iter.max: The maximum of iterations allowed.
#
# pr.proc: logical: If 'TRUE' the system outputs the processing status.
#
# ignore.covar: logical: If 'TRUE', covariances of cluster data are
# ignored. For saving of the time, 'TRUE' is set as the defalut.
#
# merge.cls: logical: If 'TRUE', some clusters may be merged into another
# clusters after iterative division.
#
# Value:
#
# An object of class 'xmeans' which is a list with components:
#
# cluster: A vector of integers indicating the cluster to which each
# point is allocated.
#
# centers: A matrix of cluster centres.
#
# size: The number of points in each cluster. When 'merge.cls' is TRUE,
# some elements may be zero.
#
# References:
#
# Ishioka, T. (2005): "An Expansion of X-means for Automatically
# Determining the Optimal Number of Clusters," The Fourth IASTED
# International Conference on Computational Intelligence (CI 2005),
# Calgary Canada, July 4-6, pp.91-96.
# http://www.rd.dnc.ac.jp/%7Etunenori/doc/487-053.pdf
#
# Ishioka, T. (2000): ``Extended K-means with an Efficient Estimation
# of the number of Clusters,'' Intelligent Data Engineering and
# Automated Learning --- IDEAL 2000, Second International Conference,
# Shatin, N.T., Hong Kong, China, December 2000, proceedings 17--22.
# (Lecture Notes in Computer Science 1983, Kwong Sak Leung, Lai-Wan
# Chan, Helen Meng (Eds.), Springer, 17--22, 2000)
# http://www.rd.dnc.ac.jp/%7Etunenori/doc/xmeans_ideal2000.pdf
#
# Examples:
#
# xmeans(iris[,-5], merge.cls=T)
# plot(cmdscale(dist(iris[,-5])), cex=2, pch=as.numeric(iris[,5]))
#
xmeans <- function(x, ik = 2, iter.max = 10, pr.proc = F, ignore.covar = T, merge.cls = F, algorithm = c("Hartigan-Wong", "Lloyd", "Forgy",
"MacQueen")){
if (ik < 2)
ik <- 2
x <- as.matrix(x)
p <- ncol(x) # p-dimensional multivariate
if (ignore.covar){
q <- 2 * p # number of parameters; mean and var for each "p"
}else{
q <- p * (p+3) / 2 # integer
}
cl<- kmeans(x,ik,iter.max, algorithm=algorithm)
cl.sub <- list()
for (i in 1:ik){ # for each cluster
y.ok <- (cl$cluster == i) # i-th cluster or not
yi <- matrix(x[y.ok], ncol=p) # extract i-th cluster
zi <- yi # save the data for graphics
yi.centers <- cl$centers[i,]
zi.centers <- yi.centers
yi.cluster <- cl$cluster[(cl$cluster == i)]
yi.cluster <- rep(1, length(yi.cluster))
# sub-cluster number should begin from 1
k1 <- 1 # cluster number
k2 <- k1 + 1
bic.prior <- NULL
stack <- list() # divided and unproceeded data are stacked
lnL0 <- lnL(yi, yi.centers, ignore.covar)
yi.lnL <- lnL0$lnL
yi.detVx <- lnL0$detVx
repeat{
# go through at least 1 time;
# y$subcluster exist...
if (pr.proc) cat (paste("k1 =", k1, ", k2 =", k2,"\n"))
if (nrow(yi) == 1){ # sample size is 1
break
}
y <- split2cls(yi, yi.centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar, algorithm=algorithm)
if (y$continue){ # splitting continue
yi.cluster <-
updtCrusterNum(y$continue, yi.cluster, k1, k2, y$subcluster)
zi.centers <-
updtCenters(y$continue, zi.centers, k1, k2, y$centers)
yi.lnL <-
updtlnL(y$continue, yi.lnL, k1, k2, y$lnL.post)
yi.detVx <-
updtdetVx(y$continue, yi.detVx, k1, k2, y$detVx.post)
}
if (pr.proc) print(y$subcluster)
if (pr.proc){ print(y$bic.prior)
print(y$bic.post)
# print(y$lnL.prior) # for debug
# print(y$lnL.post) # for debug
print(y$continue) }
# cat("zi.centers=\n") # for debug
# print(zi.centers) # for debug
if (!y$continue){ # no-node
if ((nstack <- length(stack))){ # there are stacked data
# extract the stacked data
if (pr.proc)
cat(paste("extract the stacked data (", nstack, ")...\n"))
yi <- stack[[nstack]]$data
yi.centers <- stack[[nstack]]$centers
bic.prior <- stack[[nstack]]$bic
lnL.prior <- stack[[nstack]]$lnL
detVx.prior <- stack[[nstack]]$detVx
k1 <- stack[[nstack]]$cls
k2 <- k2 # unchanged
# delete the data set
if (nstack > 1){
stack <- stack[1:(nstack-1)]
}else{
stack <- list() # no stacked data
}
next;
}
# no node and no stack
if (pr.proc) cat ("no node and no stack...\n")
break;
}
# splitting continues...
y1 <- y$clj1 # data
y2 <- y$clj2
yi.ctr1 <- y$centers[1,] # centers
yi.ctr2 <- y$centers[2,]
bic.prior1 <- y$bic.post[1] # bic
bic.prior2 <- y$bic.post[2]
lnL.prior1 <- y$lnL.post[1] # lnL
lnL.prior2 <- y$lnL.post[2]
detVx.prior1 <- y$detVx.post[1] # detVx
detVx.prior2 <- y$detVx.post[2]
# one-hand repeats recursively...
yi <- y1
yi.centers <- yi.ctr1
bic.prior <- bic.prior1
lnL.prior <- lnL.prior1
detVx.prior <- detVx.prior1
# other-hand is stacked...
if (pr.proc) cat ("stacking ...\n")
stack <- c(stack,
list(list(data=y2, centers=yi.ctr2,
bic=bic.prior2, lnL=lnL.prior2, detVx=detVx.prior2, cls=k2)))
# inclement the cluster number
k2 <- k2 + 1
} # end of repeat
# splitting done ...
if (pr.proc){
cat ("splitting done...\n")
cat (paste("main cluster =",i,"*******\n"))
}
cl.sub <- c(cl.sub, list(list(cluster = yi.cluster,
centers = zi.centers, lnL = yi.lnL, detVx = yi.detVx,
size = tabulate(yi.cluster))))
if (pr.proc){
print(cl.sub[[i]])
plot(zi, col=yi.cluster)
if (is.vector(zi.centers))
points(zi.centers[1], zi.centers[2], pch=8)
else # array
points(zi.centers,col=1:(length(zi.centers)/p),pch=8)
}
}
if (pr.proc) print(cl.sub)
xcl <- mergeResult(cl, cl.sub, ik)
if (merge.cls == F) {
return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
}
# merge after progressive dividing
#
if (pr.proc) cat("merging after progressive dividing ...\n")
k <- length(xcl$size) # final cluster number
if (k <= 2){ # minimum cluster number should be 2
if (pr.proc) cat("merging skipped ...\n")
return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
}
if (pr.proc){
cat("xcl$detVx=")
print(xcl$detVx)
cat("xcl$size=")
print(xcl$size)
}
klist <- sort.list(xcl$size) # "small" to "large" order of xcl$detVx list
if (pr.proc) print(klist)
for (i in 1:(k-1)){
for (j in (i+1):k){
k1 = klist[i]
k2 = klist[j]
if (pr.proc) cat(paste("inspecting the clusters", k1,"and", k2,"\n"))
z <- mergedBIC(x, xcl, k1, k2, q, ignore.covar, pr.proc)
if (z$ret == F){
# k1 or k2 has been merged.
# skip this roop
if (pr.proc) cat("skipping... k1=", k1, "k2=", k2,"\n")
next
}
if (z$bicdiv > z$bicmgd){
# we prefer merged model.
# replace larger cls. number to smaller cls. number
if (pr.proc) cat("replace cls.", k2, "to", k1,"\n")
xcl$cluster <- replace(xcl$cluster, (xcl$cluster == k2), k1)
xcl$size[k1] <- xcl$size[k1] + xcl$size[k2]
xcl$size[k2] <- 0
xcl$lnL[k1] <- z$lnLmgd
xcl$lnL[k2] <- 0
xcl$detVx[k1] <- z$detVxmgd
xcl$detVx[k2] <- 0
xcl$centers[k1,] <- z$ctrmgd
xcl$centers[k2,] <- 0
}
}
}
list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size)
}
# marge the result of sub-clustering;
# cluster numbers by first kmeans should be renumbered;
# the other centers and sizes are simply added.
# cl: the result of first kmeans
# cl.sub: the result of subclustering
# ik: cluster number adopted to kmeans.
mergeResult <- function(cl, cl.sub, ik){
cluster <- cl$cluster # main cluster
centers <- NULL
size <- NULL
lnL <- NULL
detVx <- NULL
k <- 0 # uniq cluster numbers; k should be decremental.
for (i in 1:ik)
k <- k + length(cl.sub[[i]]$size)
kk <- k
for (i in ik:1){ # loop for main clusters obtained by kmeans
xsub <- cl.sub[[i]]$cluster
iki <- ik -i +1
centers <- rbind(centers, cl.sub[[iki]]$centers)
size <- c(size, cl.sub[[iki]]$size)
lnL <- c(lnL, cl.sub[[iki]]$lnL)
detVx <- c(detVx, cl.sub[[iki]]$detVx)
for (j in length(cl.sub[[i]]$size):1){ # loop for subclusters
xsub <- replace(xsub, (xsub == j), k)
k <- k -1
}
cluster <- replace(cluster, (cluster == i), xsub)
}
if (k != 0) stop("mergeResult: assertion failed (k = 0)...")
dimnames(centers) <- list(1:kk, NULL)
list(cluster = cluster, centers = centers, lnL = lnL, detVx = detVx, size = size)
}
# update the cluster number by using the result of "split2cls()"
# continue: no splitting
# v: cluster numbers vector for initial cluster.
# k1: cluster numbers should be updated; "k1" becomes "k1" and "k2"
# xsub: sub-cluster numbers vector of "v" whose value is "k1";
# given "xsub" have 1 or 2.
updtCrusterNum <- function(continue, v, k1, k2, xsub){
if (!is.vector(v))
return(xsub)
if (!continue)
return(v)
if (k1 == k2)
stop("updtCrusterNum() : k1 and k2 should differ.")
# below is same algorithm; explicit array operation is slow in R.
# j <- 1
# for (i in 1:length(v)){
# if (v[i] == k1){
# if (xsub[j] == 2)
# v[i] <- k2
# j <- j + 1
# }
# }
# end of algorithm
xsub <- replace(xsub, (xsub == 2), k2) # changed
xsub <- replace(xsub, (xsub == 1), k1) # unchanged
v <- replace(v, (v == k1), xsub)
}
# update the cluster centers by using the result of "split2cls()"
# continue: no update
# org.centers: original centers matrix
# divided.centers: divided centers matrix; it has 2 rows.
updtCenters <- function(continue, org.centers, k1, k2, divided.centers){
if (!is.matrix(org.centers))
return(divided.centers)
if (!continue)
return(org.centers)
if (k1 == k2)
stop("updtCenters() : k1 and k2 should differ.")
z <- NULL
for (i in 1:max(k2, nrow(org.centers))){
if (i == k1)
z <- rbind(z, divided.centers[1,])
else if (i == k2)
z <- rbind(z, divided.centers[2,])
else
z <- rbind(z, org.centers[i,])
}
z
}
# update the lnL by using the result of "split2cls()"
# continue: no update
# org.lnL: original lnL vector
# divided.lnL: divided lnL vector having 2 elements.
updtlnL <- function(continue, org.lnL, k1, k2, divided.lnL){
if (!is.vector(org.lnL))
return(divided.lnL)
if (!continue)
return(org.lnL)
if (k1 == k2)
stop("updtlnL() : k1 and k2 should differ.")
z <- NULL
for (i in 1:max(k2, length(org.lnL))){
if (i == k1)
z <- c(z, divided.lnL[1])
else if (i == k2)
z <- c(z, divided.lnL[2])
else
z <- c(z, org.lnL[i])
}
z
}
# update the detVx by using the result of "split2cls()"
# continue: no update
# org.detVx: original detVx vector
# divided.detVx: divided detVx vector having 2 elements.
updtdetVx <- function(continue, org.detVx, k1, k2, divided.detVx){
if (!is.vector(org.detVx))
return(divided.detVx)
if (!continue)
return(org.detVx)
if (k1 == k2)
stop("updtdetVx() : k1 and k2 should differ.")
z <- NULL
for (i in 1:max(k2, length(org.detVx))){
if (i == k1)
z <- c(z, divided.detVx[1])
else if (i == k2)
z <- c(z, divided.detVx[2])
else
z <- c(z, org.detVx[i])
}
z
}
# split 2 clusters if we would prefer it based on BIC
# q: a number of parameters
# bic.prior: BIC which x is given; if bic.prior=NULL then we calculate
# lnL.prior: lnL which x is given; if bic.prior=NULL then we calculate
# detVx.prior: detVx which x is given; if bic.prior=NULL then we calculate
split2cls <- function(x, centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar, algorithm){
if (is.null(bic.prior)){
pb <- priorBIC(x, centers, q, ignore.covar)
bic.prior <- pb$bic
lnL.prior <- pb$lnL
detVx.prior <- pb$detVx
}
bic.post <- postBICs(x, centers, q, iter.max, ignore.covar, algorithm=algorithm)
subcluster <- bic.post$clsub$cluster
#
# compare whether if we should split
if (is.na(bic.post$bic[3])){
# BIC may has NA because of few data
continue <- FALSE
}else if (bic.post$bic[3] < bic.prior){
# splitting ...
# replace the cluster number to cl$cluster
continue <- TRUE
}else{
# not splitting...
# return "subcluster" stored k1
continue <- FALSE
}
# note that "subcluster" gives 1 or 2
list(continue = continue, subcluster = subcluster,
bic.prior = bic.prior, bic.post = bic.post$bic,
lnL.prior = lnL.prior, lnL.post = bic.post$lnL,
detVx.prior = detVx.prior, detVx.post = bic.post$detVx,
centers = bic.post$clsub$centers,
clj1 = bic.post$clj1, clj2 = bic.post$clj2)
}
# return BIC (prior BIC)
priorBIC <- function(x, centers, q, ignore.covar){
lnL0 <- lnL(x, centers, ignore.covar)
bic <- -2 * lnL0$lnL + q * log(nrow(x)) # BIC
# bic <- -2 * lnL0$lnL + q # AIC
list(lnL = lnL0$lnL, detVx = lnL0$detVx, bic = bic)
}
# return BICs (two posterior BICs)
postBICs <- function(x, centers, q, iter.max, ignore.covar, algorithm){
#
# split to 2 clusters
clsub <- kmeans(x, 2, iter.max, algorithm=algorithm)
y.ok1 <- lapply(clsub$cluster, "==", 1) # 1st sub-cluster or not
y.ok2 <- lapply(clsub$cluster, "==", 2) # 2nd sub-cluster or not
# extract sub data
p <- ncol(x)
clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
# ratio for pdf.
r1 <- clsub$size[1] / sum(clsub$size) # [0,1]
r2 <- 1 - r1 # [0,1]
# two later BICs
# print(clsub$centers[1,]) # for debug
# print(apply(clj1,2,mean)) # for debug
# print(sqrt(apply(clj1,2,var))) # for debug
# print(r1) # for debug
lnL1 <- lnL(clj1, clsub$centers[1,], ignore.covar)
# print(clsub$centers[2,]) # for debug
# print(apply(clj2,2,mean)) # for debug
# print(sqrt(apply(clj2,2,var))) # for debug
# print(r2) # for debug
lnL2 <- lnL(clj2, clsub$centers[2,], ignore.covar)
n1 <- nrow(clj1)
n2 <- nrow(clj2)
# normalizing factor; dist() is in library(mva)
if (is.na(lnL1$detVx) || is.na(lnL2$detVx))
beta <- 0
else
beta <- dist(clsub$center) / (sqrt(lnL1$detVx + lnL2$detVx))
alpha <- 0.5 / pnorm(beta)
BIC1 <- -2 * lnL1$lnL +q * log(n1)
BIC2 <- -2 * lnL2$lnL +q * log(n2)
# BIC1 <- -2 * lnL1$lnL +q # AIC
# BIC2 <- -2 * lnL2$lnL +q # AIC
# cat (paste("alpha =",alpha,"\n")) # for debug
# cat (paste("beta =",beta,"\n")) # for debug
# BIC is not (BIC1 + BIC2)
BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
# BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC
list(bic = c(BIC1, BIC2, BIC),
lnL = c(lnL1$lnL, lnL2$lnL),
detVx = c(lnL1$detVx, lnL2$detVx),
clsub = clsub, clj1 = clj1, clj2 = clj2)
}
# return BICs for Two-merged clusters model and devided clusters model
# k1/k2: marged cluster ID
mergedBIC <- function(x, xcl, k1, k2, q, ignore.covar, pr.proc){
# sample size
# check for input data
n1 <- xcl$size[k1]
n2 <- xcl$size[k2]
if (n1 == 0 || n2 == 0){
# already had been merged
cat(paste("already had been merged\n"))
ret <- F
return( list (ret = ret))
}
if (is.null(xcl$lnL[k1]) || is.null(xcl$lnL[k2])){
# lnL may be null because of few data
cat(paste("lnL may be null because of few data\n"))
ret <- F
return( list (ret = ret))
}
# divided clusters model
lnL1 = xcl$lnL[k1]
lnL2 = xcl$lnL[k2]
ctrextrt <- rbind(xcl$centers[k1,], xcl$centers[k2,])
beta <- dist(ctrextrt) / (sqrt(xcl$detVx[k1] + xcl$detVx[k2]))
if (pr.proc) cat(paste("beta=", round (beta, digit=2), "\n"))
# if (beta > 10){
# # 2 clusters far apart
# ret <- F
# return( list (ret = ret))
# }
alpha <- 0.5 / as.numeric(pnorm(beta))
bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
# bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC
# extract 2 clusters data
y.ok1 <- lapply(xcl$cluster, "==", k1) # 1st sub-cluster or not
y.ok2 <- lapply(xcl$cluster, "==", k2) # 2nd sub-cluster or not
# extract sub data
p = ncol(x)
clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
xmgd <- rbind(clj1, clj2)
# merged cluster center
ctrmgd <- (n1 * xcl$centers[k1,] + n2 * xcl$centers[k2,]) / (n1 + n2)
lnLmgd <- lnL(xmgd, ctrmgd, ignore.covar)
bicmgd <- -2 * lnLmgd$lnL + q * log(nrow(xmgd)) # BIC
# bicmgd <- -2 * lnLmgd$lnL + q # AIC
ret <- T
list (ret = ret, ctrmgd = ctrmgd, lnLmgd = lnLmgd$lnL, detVxmgd = lnLmgd$detVx, bicmgd = bicmgd, bicdiv = bicdiv)
}
# log-likelihood under the assumption of
# p-dimensional multivariate normal distribution.
# ignore.covar: ignore the covariance
lnL <- function(x, centers, ignore.covar=T){
x <- as.matrix(x)
p <- ncol(x) # p-dimensional multivariate
n <- nrow(x) # sample size
if (missing(centers))
stop("centers must be a number or a matrix")
if (n <= 2) # few data
return(list(lnL=NA, detVx=NA))
vx <- var(x) # var-co.var matrix
# print(x) # for debug
if (p == 1){ # x is vector
invVx <- 1 / as.vector(vx)
detVx <- as.vector(vx)
}else{
if (ignore.covar){
invVx <- diag(1/diag(vx)) # inv. matrix when assuming diag.
detVx <- prod(diag(vx)) # det. when assuming diag.
}else{
invVx <- solve(vx) # inverse matrix of "vx"
y <- chol(vx) # Cholesky decomposition
detVx <- prod(diag(y)) # vx = t(y) %*% y, where y is triangular,
# then, det(vx) = det(t(y)) * det(y)
}
}
t1 <- -p/2 * 1.837877066 # 1.837... = log(2 * 3.1415...)
t2 <- -log(detVx) / 2
xmu <- t(apply(x, 1, "-", centers))
# print(centers) # for debug
# print(xmu) # for debug
# s <- 0
# for (i in 1:n)
# s <- s + t(xmu[i,]) %*% invVx %*% xmu[i,]
if (p == 1){
s <- sum(xmu^2 * invVx)
}else{
s <- sum(apply(xmu, 1, txInvVxX, invVx=invVx))
}
t3 <- -s / 2
ll <- (t1 + t2) * n + as.numeric(t3) # log likelihood
list(lnL=ll, detVx=detVx)
}
# function for calculation of
# t(xmu[i,]) %*% invVx %*% xmu[i,]
txInvVxX <- function(x, invVx){
t(x) %*% invVx %*% x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.