Nothing
## TSS: total sum of squares
## BSS: between cluster sum of squares
## WSS: within cluster sum of squares
.count.observation <-
function(dist.obj)
{
nrow(as.matrix(dist.obj))
}
.PairwiseDistanceMatrixToVariance <-
function(dist.obj)
{
## --
## dist.obj=dist(x)
## --
mean(dist.obj**2)/2
}
.retrieveDistanceMatrix <-
function(dist.obj,indexVector)
{
as.dist(as.matrix(dist.obj)[indexVector,indexVector])
}
.tssByDistanceMatrix <-
function(dist.obj)
{
n <- nrow(as.matrix(dist.obj))
.PairwiseDistanceMatrixToVariance(dist.obj)*(n-1)
}
.wssByDistanceMatrix<-
function(dist.obj,indexVector)
{
n <- length(indexVector)
if (n>1){
i.dist.obj <- .retrieveDistanceMatrix(dist.obj,indexVector)
.PairwiseDistanceMatrixToVariance(i.dist.obj)*(n-1)
} else if (n==1){
return(0)
} else {
return(NA)
}
}
.totwssByDistanceMatrix<-
function(dist.obj,clusters)
{
cluster <- c()
wss <- c()
totwss <- 0
for (i.cluster in unique(clusters)){
indexVector <- which(clusters==i.cluster)
i.wss <- .wssByDistanceMatrix(dist.obj,indexVector)
cluster <- c(cluster,i.cluster)
wss <- c(wss,i.wss)
## print(cluster)
## print(indexVector)
## print(i.wss)
}
totwss <- sum(wss)
return(list(cluster=cluster,wss=wss,totwss=totwss))
}
.bssByDistanceMatrix<-
function(dist.obj,clusters)
{
totwss <- .totwssByDistanceMatrix(dist.obj,clusters)$totwss
tss <- .tssByDistanceMatrix(dist.obj)
totbss <- tss-totwss
## print(tss)
## print(bss)
return(list(totbss=totbss))
}
##' Evaluation on the varaince of a clustering model using squared Euclidean
##' distances, based on distance matrix and cluster membership.
##'
##' Clustering Sum-of-Squares for clustering evaluation.
##' @title Clustering Sum-of-Squares for clustering evaluation
##' @name css
##' @aliases css css.hclust
##' @usage
##'
##' css(dist.obj,clusters)
##'
##' ## Computing Sum-of-Squares given Hierarchical Clustering
##' \method{css}{hclust}(dist.obj, hclust.obj=NULL,hclust.FUN=hclust,
##' hclust.FUN.MoreArgs=list(method="ward"),k=NULL)
##'
##' @param dist.obj a `dist' object as produced by \code{dist} or \code{gdist}.
##' @param clusters a vector with cluster memberships.
##' @param k numeric, the upper bound of the number of clusters to compute.
##' DEFAULT: \emph{20} or the number of observations (if less than \emph{20}).
##' @param hclust.obj a `hclust' object, generated by \emph{\code{hclust}}
##' @param hclust.FUN a function, to generate a hierarchical clustering.
##' Ignored with \code{hclust.obj} specified.
##' DEFAULT: \emph{\code{hclust}}
##' @param hclust.FUN.MoreArgs a list, containing arguments that are passed to \code{hclust.FUN}.
##'
##' @return
##' \code{css} returns a `css' object,
##' which is a list containing the following components
##' \tabular{ll}{
##' k \tab number of clusters\cr
##' wss\tab \emph{\code{k}} within-cluster sum-of-squares \cr
##' totwss\tab total within-cluster sum-of-square\cr
##' totbss\tab total between-cluster sum-of-square\cr
##' tss\tab total sum of squares of the data\cr
##' }
##' , and with an attribute `meta' that contains the input components
##' \tabular{ll}{
##' dist.obj \tab (the input) distance matrix\cr
##' clusters \tab (the input) cluster membership\cr
##' }
##'
##'
##' \code{css.hclust} returns a `css.multi' object,
##' which is a data.frame containing the following columns
##' \tabular{ll}{
##' k \tab number of clusters\cr
##' ev \tab explained variance given \emph{\code{k}}\cr
##' totbss \tab total between-cluster sum-of-square\cr
##' tss \tab total sum of squares of the data\cr
##' }
##' , and with an attribute `meta' that contains
##' \tabular{ll}{
##' cmethod \tab the clustering method\cr
##' dist.obj \tab (the input) distance matrix\cr
##' k \tab (the input) number of clusters\cr
##' clusters \tab the `hclust' object that is either by input or computed by default\cr
##' }
##' @seealso \code{\link{elbow}} for "elbow" plot using `css.multi' object
css <-
function(dist.obj,clusters)
{
if (!(is.integer(clusters) & length(clusters)==.count.observation(dist.obj))){
stop("`clusters' should be a vector of integers of the same length as of the number of observations in `dist.obj'.")
}
ret <- list()
ret$k <- length(unique(clusters))
ret$wss <- .totwssByDistanceMatrix(dist.obj,clusters)$wss
ret$totwss <- .totwssByDistanceMatrix(dist.obj,clusters)$totwss
ret$totbss <- .bssByDistanceMatrix(dist.obj,clusters)$totbss
ret$tss <- .tssByDistanceMatrix(dist.obj)
meta <- list()
meta$dist.obj <- dist.obj
meta$clusters <- clusters
attr(ret,"meta") <- meta
class(ret) <- c("css",class(ret))
return(ret)
}
.css.hclustS <-
function(dist.obj,
hclust.obj=NULL,
hclust.FUN=hclust,
hclust.FUN.MoreArgs=list(method="ward"),
k
)
{
if (.invalid(hclust.obj)){
hclust.obj <- .call.FUN(hclust.FUN,dist.obj,hclust.FUN.MoreArgs)
}
tmp.cutree <- cutree(hclust.obj,k=k)
res <- css(dist.obj,clusters=tmp.cutree)
attr(res,"meta")$hclust.obj <- hclust.obj
res
}
.css.hclustM <-
function(k,...)
{
kVector <- k
ret <- list()
for (i in 1:length(kVector)){
ret[[i]] <- .css.hclustS(k=kVector[i],...)
}
attr(ret,"meta")$hclust.obj <- attr(ret[[1]],"meta")$hclust.obj
ret
}
css.hclust <-
function(dist.obj,
hclust.obj=NULL,
hclust.FUN=hclust,
hclust.FUN.MoreArgs=list(method="ward"),
k=NULL
)
{
if (.invalid(k)) k <- min(.count.observation(dist.obj),20)
res <-
.css.hclustM(dist.obj,
hclust.obj,
hclust.FUN,
hclust.FUN.MoreArgs,
k=1:k
)
tss <- unlist(lapply(res,"[[","tss"))
totbss <- unlist(lapply(res,"[[","totbss"))
hclust.obj <- attr(res,"meta")$hclust.obj
ev <- totbss/tss
ret <- data.frame(k=1:k,ev=ev,totbss=totbss,tss=tss)
meta <- list()
meta$cmethod <- "hclust"
meta$dist.obj <- dist.obj
meta$k <- k
meta$hclust.obj <- hclust.obj
attr(ret,"meta") <- meta
class(ret) <- c("css.multi",class(ret))
ret
}
##' Determining the number of clusters in a data set by the "elbow" rule.
##'
##' Determining the number of clusters in a data set by the "elbow" rule and
##' thresholds in the explained variance (EV) and its increment.
##' @title The "Elbow" Method for Clustering Evaluation
##' @name elbow
##' @aliases elbow elbow.batch plot.elbow
##'
##' @usage
##' ## find a good k given thresholds of EV and its increment.
##' elbow(x,inc.thres,ev.thres,precision=3,print.warning=TRUE)
##'
##' ## a wrapper of `elbow' testing multiple thresholds.
##' elbow.batch(x,inc.thres=c(0.01,0.05,0.1),
##' ev.thres=c(0.95,0.9,0.8,0.75,0.67,0.5,0.33),precision=3)
##'
##' \method{plot}{elbow}(x,elbow.obj=NULL,main,xlab="k",
##' ylab="Explained_Variance",type="b",pch=20,col.abline="red",
##' lty.abline=3,if.plot.new=TRUE,print.info=TRUE,
##' mar=c(4,5,3,3),omi=c(0.75,0,0,0),...)
##'
##' @param x a `css.multi' object, generated by \emph{\code{css.hclust}}
##' @param inc.thres numeric with value(s) from 0 to 1, the threshold of the increment of EV.
##' A single value is used in \code{elbow} while a vector of values in \code{elbow.batch}.
##' @param ev.thres numeric with value(s) from 0 to 1, the threshold of EV.
##' A single value is used in \code{elbow} while a vector of values in \code{elbow.batch}.
##' @param precision integer, the number of digits to round for numerical comparison.
##' @param print.warning logical, whether to print warning messages.
##' @param elbow.obj a `elbow' object, generated by \emph{\code{elbow}} or \emph{\code{elbow.batch}}
##' @param main an overall title for the plot.
##' @param ylab a title for the y axis.
##' @param xlab a title for the x axis.
##' @param type what type of plot should be drawn.\cr
##' See \code{help("plot", package="graphics")}.
##' @param pch Either an integer specifying a symbol or a single character
##' to be used as the default in plotting points (see \code{par}).
##' @param col.abline color for straight lines through the current plot
##' (see option \code{col} in \code{par}).
##' @param lty.abline line type for straight lines through the current plot
##' (see option \code{lty} in \code{par}).
##' @param if.plot.new logical, whether to start a new plot device or not.
##' @param print.info logical, whether to print the information of `elbow.obj'.
##' @param mar A numerical vector of the form 'c(bottom, left, top, right)'
##' which gives the number of lines of margin to be specified on
##' the four sides of the plot (see option \code{mar} in \code{par}).
##' The default is 'c(4, 5, 3, 3) + 0.1'.
##' @param omi A vector of the form 'c(bottom, left, top, right)' giving the
##' size of the outer margins in inches (see option \code{omi} in \code{par}).
##' @param ... arguments to be passed to method \code{plot.elbow},
##' such as graphical parameters (see \code{par}).
##' @return
##' Both \code{elbow} and \code{elbow.btach} return a `elbow' object
##' (if a "good" \emph{\code{k}} exists),
##' which is a list containing the following components
##' \tabular{ll}{
##' k \tab number of clusters\cr
##' ev \tab explained variance given \emph{\code{k}}\cr
##' inc.thres \tab the threshold of the increment in EV\cr
##' ev.thres \tab the threshold of the EV\cr
##' }
##' , and with an attribute `meta' that contains
##' \tabular{ll}{
##' description \tab A description about the "good" \emph{\code{k}}\cr
##' }
##' @seealso \code{\link{css}} and \code{\link{css.hclust}} for computing Clustering Sum-of-Squares.
##' @examples
##' ## load library
##' require("GMD")
##'
##' ## simulate data around 12 points in Euclidean space
##' pointv <- data.frame(x=c(1,2,2,4,4,5,5,6,7,8,9,9),
##' y=c(1,2,8,2,4,4,5,9,9,8,1,9))
##' set.seed(2012)
##' mydata <- c()
##' for (i in 1:nrow(pointv)){
##' mydata <- rbind(mydata,cbind(rnorm(10,pointv[i,1],0.1),
##' rnorm(10,pointv[i,2],0.1)))
##' }
##' mydata <- data.frame(mydata); colnames(mydata) <- c("x","y")
##' plot(mydata,type="p",pch=21, main="Simulated data")
##'
##' ## determine a "good" k using elbow
##' dist.obj <- dist(mydata[,1:2])
##' hclust.obj <- hclust(dist.obj)
##' css.obj <- css.hclust(dist.obj,hclust.obj)
##' elbow.obj <- elbow.batch(css.obj)
##' print(elbow.obj)
##'
##' ## make partition given the "good" k
##' k <- elbow.obj$k; cutree.obj <- cutree(hclust.obj,k=k)
##' mydata$cluster <- cutree.obj
##'
##' ## draw a elbow plot and label the data
##' dev.new(width=12, height=6)
##' par(mfcol=c(1,2),mar=c(4,5,3,3),omi=c(0.75,0,0,0))
##' plot(mydata$x,mydata$y,pch=as.character(mydata$cluster),
##' col=mydata$cluster,cex=0.75,main="Clusters of simulated data")
##' plot(css.obj,elbow.obj,if.plot.new=FALSE)
##'
##' ## clustering with more relaxed thresholds (, resulting a smaller "good" k)
##' elbow.obj2 <- elbow.batch(css.obj,ev.thres=0.90,inc.thres=0.05)
##' mydata$cluster2 <- cutree(hclust.obj,k=elbow.obj2$k)
##'
##' dev.new(width=12, height=6)
##' par(mfcol=c(1,2), mar=c(4,5,3,3),omi=c(0.75,0,0,0))
##' plot(mydata$x,mydata$y,pch=as.character(mydata$cluster2),
##' col=mydata$cluster2,cex=0.75,main="Clusters of simulated data")
##' plot(css.obj,elbow.obj2,if.plot.new=FALSE)
elbow <-
function(x,inc.thres,ev.thres,precision=3,print.warning=TRUE)
{
if (!inherits(x,"css.multi")){
stop("`x' should be an object of class `css.multi' from package GMD.")
}
if(inc.thres>1 | inc.thres<0){
stop("`inc.thres' should be a percentage value, ranging from 0 to 1.")
}
if(ev.thres>1 | ev.thres<0){
stop("`ev.thres' should be a percentage value, ranging from 0 to 1.")
}
if(length(inc.thres)>1){
if (print.warning)
warning("`inc.thres' has a length more than one; the 1st value is used.")
inc.thres <- inc.thres[1]
}
if(length(ev.thres)>1){
if (print.warning)
warning("`ev.thres' has a length more than one; the 1st value is used.")
ev.thres <- ev.thres[1]
}
## increment (the first derivative) of totbss
x$inc <- c(x$ev[-1]-x$ev[-nrow(x)], NA)
## x$dev1 <- x$inc
## x$dev2 <- c(x$dev1[-1]-x$dev1[-nrow(x)], NA)
x$inc <- c(x$ev[-1]-x$ev[-nrow(x)], 0)
##print(x)
##cat(sprintf("x$ev=%s,x$inc=%s\n",x$ev,x$inc))
k1 <- x[x$ev-ev.thres >= -0.1**precision, "k"][1]
k2 <- x[-nrow(x),][inc.thres-x[-nrow(x),]$inc >= -0.1**precision, "k"][1]
if (is.na(k1)){
if (print.warning)
warning(sprintf("The explained variances is smaller than `ev.thres' when k=c(%s:%s); increase `k' or decrease `ev.thres' would help.",1,max(x$k)))
}
if (is.na(k2)){
if (print.warning)
warning(sprintf("The increment of explained variance is larger than `inc.thres' when k=c(%s:%s); increase `k' or increase `inc.thres' would help.",1,max(x$k)))
}
k <- max(k1,k2)
ret <- list(k=k,ev=x$ev[x$k==k],inc.thres=inc.thres,ev.thres=ev.thres)
class(ret) <- c("elbow",class(ret))
return(ret)
}
elbow.batch <-
function(x,inc.thres=c(0.01,0.05,0.1),ev.thres=c(0.95,0.9,0.8,0.75,0.67,0.5,0.33),precision=3)
{
ret <- NULL
inc.thres <- sort(inc.thres, decreasing=FALSE)
ev.thres <- sort(ev.thres, decreasing=TRUE)
for (i.ev.thres in ev.thres){
for (i.inc.thres in inc.thres){
res <- elbow(x,inc.thres=i.inc.thres,ev.thres=i.ev.thres,precision=precision,print.warning=FALSE)
k <- res$k
if(!is.na(k)){
info <- sprintf("A \"good\" k=%s (EV=%.2f) is detected when the EV is no less than %s\nand the increment of EV is no more than %s for a bigger k.\n",k,res$ev,i.ev.thres,i.inc.thres)
ret <- list(k=k,ev=res$ev,inc.thres=i.inc.thres,ev.thres=i.ev.thres)
attr(ret,"description") <- info
class(ret) <- c("elbow",class(ret))
return(ret)
}
}
}
if (is.null(ret)){
stop("A good `k' is not available with provided inc.thres and ev.thres; please make adjustment, e.g. decrease `ev.thres', increase `inc.thres' or increase `k'.")
}
}
##' @export plot.elbow
plot.elbow <-
function(x,
elbow.obj=NULL,
## plot
main,
xlab="k",
ylab="Explained_Variance",
type="b",
pch=20,
col.abline="red",
lty.abline=3,
if.plot.new=TRUE,
print.info=TRUE,
mar=c(4,5,3,3),
omi=c(0.75,0,0,0),
...
)
{
if (!inherits(x,"css.multi") ){
stop("`x' should be an object of class `css.multi'.")
}
if (.invalid(main)){
main <- "Elbow plot"
}
sub <- attr(elbow.obj,"description")
if (if.plot.new){
par(mar=mar,omi=omi)
}
plot(x$k,x$ev,type=type,pch=pch,main=main,xlab=xlab,ylab=ylab,...)
if(!is.null(elbow.obj)){
if (!inherits(elbow.obj,"elbow") ){
stop("`elbow.obj' should be an object of class `elbow'.")
}
k <- elbow.obj$k
abline(v=k,col=col.abline,lty=lty.abline)
mtext(side=3,at=k,text=sprintf("k=%s",k),line=0)
abline(h=x[x$k==k,"ev"],col=col.abline,lty=lty.abline)
##mtext(side=4,at=x[x$k==k,"ev"],text=sprintf("%.2f",x[x$k==k,"ev"]),line=-0.25,col=col.abline,las=2)
text(x=x[x$k==k,"k"],y=x[x$k==k,"ev"],labels=sprintf("%.2f",x[x$k==k,"ev"]),col=col.abline,pos=c(2,3))
print.info <- FALSE
}
if (print.info){
title(main="",sub=sub,font.sub=3,cex=0.5,outer=TRUE,line=2.5)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.