R/css.R

Defines functions count.observation PairwiseDistanceMatrixToVariance retrieveDistanceMatrix tssByDistanceMatrix wssByDistanceMatrix totwssByDistanceMatrix bssByDistanceMatrix css css.hclustS css.hclustM css.hclust elbow elbow.batch plot.elbow

Documented in css css.hclust elbow elbow.batch plot.elbow

## TSS: total sum of squares
## BSS: between cluster sum of squares
## WSS: within cluster sum of squares

.count.observation <-
  function(dist.obj)
{
  nrow(as.matrix(dist.obj))
}


.PairwiseDistanceMatrixToVariance <-
  function(dist.obj)
{
  ## --
  ## dist.obj=dist(x)
  ## --
  mean(dist.obj**2)/2
}


.retrieveDistanceMatrix <-
  function(dist.obj,indexVector)
{
  as.dist(as.matrix(dist.obj)[indexVector,indexVector])
}


.tssByDistanceMatrix <-
  function(dist.obj)
{
  n <- nrow(as.matrix(dist.obj))
  .PairwiseDistanceMatrixToVariance(dist.obj)*(n-1)
}


.wssByDistanceMatrix<-
  function(dist.obj,indexVector)
{
  n <- length(indexVector)
  if (n>1){
    i.dist.obj <- .retrieveDistanceMatrix(dist.obj,indexVector)
    .PairwiseDistanceMatrixToVariance(i.dist.obj)*(n-1)
  } else if (n==1){
    return(0)
  } else {
    return(NA)
  }
}


.totwssByDistanceMatrix<-
  function(dist.obj,clusters)
{
  cluster <- c()
  wss <- c()
  totwss <- 0
  for (i.cluster in unique(clusters)){
    indexVector <- which(clusters==i.cluster)
    i.wss <- .wssByDistanceMatrix(dist.obj,indexVector)
    cluster <- c(cluster,i.cluster)
    wss <- c(wss,i.wss)
##     print(cluster)
##     print(indexVector)
##     print(i.wss)
  }
  totwss <- sum(wss)
  return(list(cluster=cluster,wss=wss,totwss=totwss))
}


.bssByDistanceMatrix<-
  function(dist.obj,clusters)
{
  totwss <- .totwssByDistanceMatrix(dist.obj,clusters)$totwss
  tss <- .tssByDistanceMatrix(dist.obj)
  totbss <- tss-totwss
##   print(tss)
##   print(bss)
  return(list(totbss=totbss))
}



##' Evaluation on the varaince of a clustering model using squared Euclidean
##' distances, based on distance matrix and cluster membership.
##'
##' Clustering Sum-of-Squares for clustering evaluation.
##' @title Clustering Sum-of-Squares for clustering evaluation
##' @name css
##' @aliases css css.hclust
##' @usage
##' 
##' css(dist.obj,clusters)
##'
##' ## Computing Sum-of-Squares given Hierarchical Clustering
##' \method{css}{hclust}(dist.obj, hclust.obj=NULL,hclust.FUN=hclust,
##' hclust.FUN.MoreArgs=list(method="ward"),k=NULL)
##' 
##' @param dist.obj a `dist' object as produced by \code{dist} or \code{gdist}.
##' @param clusters a vector with cluster memberships.
##' @param k numeric, the upper bound of the number of clusters to compute.
##' DEFAULT: \emph{20} or the number of observations (if less than \emph{20}).
##' @param hclust.obj a `hclust' object, generated by \emph{\code{hclust}}
##' @param hclust.FUN a function, to generate a hierarchical clustering.
##' Ignored with \code{hclust.obj} specified.
##' DEFAULT: \emph{\code{hclust}}
##' @param hclust.FUN.MoreArgs a list, containing arguments that are passed to \code{hclust.FUN}.
##' 
##' @return
##' \code{css} returns  a `css' object,
##' which is a list containing the following components 
##' \tabular{ll}{
##'  k \tab number of clusters\cr
##'  wss\tab \emph{\code{k}} within-cluster sum-of-squares \cr
##'  totwss\tab total within-cluster sum-of-square\cr
##'  totbss\tab total between-cluster sum-of-square\cr
##'  tss\tab total sum of squares of the data\cr
##' }
##' , and with an attribute `meta' that contains the input components 
##' \tabular{ll}{
##'   dist.obj \tab (the input) distance matrix\cr
##'   clusters \tab (the input) cluster membership\cr
##' }
##'
##' 
##' \code{css.hclust} returns a `css.multi' object,
##' which is a data.frame containing the following columns 
##' \tabular{ll}{
##'   k 	   \tab number of clusters\cr
##'   ev       \tab explained variance given \emph{\code{k}}\cr
##'   totbss   \tab total between-cluster sum-of-square\cr
##'   tss 	   \tab total sum of squares of the data\cr
##' }
##' , and with an attribute `meta' that contains 
##' \tabular{ll}{
##'   cmethod \tab the clustering method\cr
##'   dist.obj \tab (the input) distance matrix\cr
##'   k \tab (the input) number of clusters\cr
##'   clusters \tab the `hclust' object that is either by input or computed by default\cr
##' }
##' @seealso \code{\link{elbow}} for "elbow" plot using `css.multi' object
css <-
  function(dist.obj,clusters)
{
  if (!(is.integer(clusters) & length(clusters)==.count.observation(dist.obj))){
   stop("`clusters' should be a vector of integers of the same length as of the number of observations in `dist.obj'.") 
  }
  ret <- list()
  ret$k <- length(unique(clusters))
  ret$wss <- .totwssByDistanceMatrix(dist.obj,clusters)$wss
  ret$totwss <- .totwssByDistanceMatrix(dist.obj,clusters)$totwss
  ret$totbss <- .bssByDistanceMatrix(dist.obj,clusters)$totbss
  ret$tss <- .tssByDistanceMatrix(dist.obj)
  meta <- list()
  meta$dist.obj <- dist.obj
  meta$clusters <- clusters
  attr(ret,"meta") <- meta
  class(ret) <- c("css",class(ret))
  return(ret)
}



.css.hclustS <-
  function(dist.obj,
           hclust.obj=NULL,
           hclust.FUN=hclust,
           hclust.FUN.MoreArgs=list(method="ward"),
           k
           )
{
  if (.invalid(hclust.obj)){
    hclust.obj <- .call.FUN(hclust.FUN,dist.obj,hclust.FUN.MoreArgs)
  }
  tmp.cutree <- cutree(hclust.obj,k=k)

  res <- css(dist.obj,clusters=tmp.cutree)
  attr(res,"meta")$hclust.obj <- hclust.obj
  res
}


.css.hclustM <-
  function(k,...)
{
  kVector <- k

  ret <- list()  
  for (i in 1:length(kVector)){
    ret[[i]] <- .css.hclustS(k=kVector[i],...)
  }
  attr(ret,"meta")$hclust.obj <- attr(ret[[1]],"meta")$hclust.obj
  ret
}


css.hclust <-
  function(dist.obj,
           hclust.obj=NULL,
           hclust.FUN=hclust,
           hclust.FUN.MoreArgs=list(method="ward"),
           k=NULL
           )
{
  if (.invalid(k)) k <- min(.count.observation(dist.obj),20)
  res <-
    .css.hclustM(dist.obj,
                 hclust.obj,
                 hclust.FUN,
                 hclust.FUN.MoreArgs,
                 k=1:k
                 )

  tss <- unlist(lapply(res,"[[","tss"))
  totbss <- unlist(lapply(res,"[[","totbss"))
  hclust.obj <-  attr(res,"meta")$hclust.obj
  ev <- totbss/tss
  ret <- data.frame(k=1:k,ev=ev,totbss=totbss,tss=tss)
  
  meta <- list()
  meta$cmethod <- "hclust"
  meta$dist.obj <- dist.obj
  meta$k <- k
  meta$hclust.obj <- hclust.obj

  attr(ret,"meta") <- meta
  class(ret) <- c("css.multi",class(ret))
  
  ret
}




##' Determining the number of clusters in a data set by the "elbow" rule.
##'
##' Determining the number of clusters in a data set by the "elbow" rule and
##' thresholds in the explained variance (EV) and its increment.
##' @title The "Elbow" Method for Clustering Evaluation
##' @name elbow
##' @aliases elbow elbow.batch plot.elbow
##' 
##' @usage
##' ## find a good k given thresholds of EV and its increment.
##' elbow(x,inc.thres,ev.thres,precision=3,print.warning=TRUE)
##'
##' ## a wrapper of `elbow' testing multiple thresholds.
##' elbow.batch(x,inc.thres=c(0.01,0.05,0.1),
##' ev.thres=c(0.95,0.9,0.8,0.75,0.67,0.5,0.33),precision=3)
##'
##' \method{plot}{elbow}(x,elbow.obj=NULL,main,xlab="k",
##' ylab="Explained_Variance",type="b",pch=20,col.abline="red",
##' lty.abline=3,if.plot.new=TRUE,print.info=TRUE,
##' mar=c(4,5,3,3),omi=c(0.75,0,0,0),...)
##' 
##' @param x a `css.multi' object, generated by \emph{\code{css.hclust}}
##' @param inc.thres numeric with value(s) from 0 to 1, the threshold of the increment of EV.
##' A single value is used in \code{elbow} while a vector of values in \code{elbow.batch}.
##' @param ev.thres numeric with value(s) from 0 to 1, the threshold of EV.
##' A single value is used in \code{elbow} while a vector of values in \code{elbow.batch}.
##' @param precision integer, the number of digits to round for numerical comparison.
##' @param print.warning logical, whether to print warning messages.
##' @param elbow.obj a `elbow' object, generated by \emph{\code{elbow}} or \emph{\code{elbow.batch}}
##' @param main an overall title for the plot. 
##' @param ylab a title for the y axis. 
##' @param xlab a title for the x axis. 
##' @param type what type of plot should be drawn.\cr
##' See \code{help("plot", package="graphics")}.
##' @param pch Either an integer specifying a symbol or a single character
##' to be used as the default in plotting points (see \code{par}).
##' @param col.abline color for straight lines through the current plot 
##' (see option \code{col} in \code{par}).
##' @param lty.abline line type for straight lines through the current plot 
##' (see option \code{lty} in \code{par}).
##' @param if.plot.new logical, whether to start a new plot device or not.
##' @param print.info logical, whether to print the information of `elbow.obj'.
##' @param mar A numerical vector of the form 'c(bottom, left, top, right)'
##' which gives the number of lines of margin to be specified on
##' the four sides of the plot (see option \code{mar} in \code{par}).
##' The default is 'c(4, 5, 3, 3) + 0.1'.
##' @param omi A vector of the form 'c(bottom, left, top, right)' giving the
##' size of the outer margins in inches (see option \code{omi} in \code{par}).
##' @param ... arguments to be passed to method \code{plot.elbow},
##' such as graphical parameters (see \code{par}).
##' @return
##' Both \code{elbow} and \code{elbow.btach} return a `elbow' object 
##' (if a "good" \emph{\code{k}} exists), 
##' which is a list containing the following components
##' \tabular{ll}{
##'   k 	    \tab number of clusters\cr
##'   ev        \tab explained variance given \emph{\code{k}}\cr
##'   inc.thres \tab the threshold of the increment in EV\cr
##'   ev.thres 	\tab the threshold of the EV\cr
##' }
##' , and with an attribute `meta' that contains 
##' \tabular{ll}{
##'   description \tab A description about the "good" \emph{\code{k}}\cr
##' }
##' @seealso \code{\link{css}} and \code{\link{css.hclust}} for computing Clustering Sum-of-Squares.
##' @examples
##' ## load library
##' require("GMD")
##' 
##' ## simulate data around 12 points in Euclidean space
##' pointv <- data.frame(x=c(1,2,2,4,4,5,5,6,7,8,9,9),
##' y=c(1,2,8,2,4,4,5,9,9,8,1,9))
##' set.seed(2012)
##' mydata <- c()
##' for (i in 1:nrow(pointv)){
##'   mydata <- rbind(mydata,cbind(rnorm(10,pointv[i,1],0.1),
##' rnorm(10,pointv[i,2],0.1)))
##' }
##' mydata <- data.frame(mydata); colnames(mydata) <- c("x","y")
##' plot(mydata,type="p",pch=21, main="Simulated data")
##' 
##' ## determine a "good" k using elbow
##' dist.obj <- dist(mydata[,1:2])
##' hclust.obj <- hclust(dist.obj)
##' css.obj <- css.hclust(dist.obj,hclust.obj)
##' elbow.obj <- elbow.batch(css.obj)
##' print(elbow.obj)
##' 
##' ## make partition given the "good" k
##' k <- elbow.obj$k; cutree.obj <- cutree(hclust.obj,k=k)
##' mydata$cluster <- cutree.obj
##' 
##' ## draw a elbow plot and label the data
##' dev.new(width=12, height=6)
##' par(mfcol=c(1,2),mar=c(4,5,3,3),omi=c(0.75,0,0,0))
##' plot(mydata$x,mydata$y,pch=as.character(mydata$cluster),
##' col=mydata$cluster,cex=0.75,main="Clusters of simulated data")
##' plot(css.obj,elbow.obj,if.plot.new=FALSE)
##' 
##' ## clustering with more relaxed thresholds (, resulting a smaller "good" k)
##' elbow.obj2 <- elbow.batch(css.obj,ev.thres=0.90,inc.thres=0.05)
##' mydata$cluster2 <- cutree(hclust.obj,k=elbow.obj2$k)
##' 
##' dev.new(width=12, height=6)
##' par(mfcol=c(1,2), mar=c(4,5,3,3),omi=c(0.75,0,0,0))
##' plot(mydata$x,mydata$y,pch=as.character(mydata$cluster2),
##' col=mydata$cluster2,cex=0.75,main="Clusters of simulated data")
##' plot(css.obj,elbow.obj2,if.plot.new=FALSE)
elbow <-
  function(x,inc.thres,ev.thres,precision=3,print.warning=TRUE)
{
  if (!inherits(x,"css.multi")){
    stop("`x' should be an object of class `css.multi' from package GMD.")
  }
  
  if(inc.thres>1 | inc.thres<0){
    stop("`inc.thres' should be a percentage value, ranging from 0 to 1.")
  }
    
  if(ev.thres>1 | ev.thres<0){
    stop("`ev.thres' should be a percentage value, ranging from 0 to 1.")
  }

  if(length(inc.thres)>1){
    if (print.warning)
      warning("`inc.thres' has a length more than one; the 1st value is used.")
    inc.thres <- inc.thres[1]
  }
  
  if(length(ev.thres)>1){
    if (print.warning)
      warning("`ev.thres' has a length more than one; the 1st value is used.")
    ev.thres <- ev.thres[1]
  }
  
  ## increment (the first derivative) of totbss
  x$inc <- c(x$ev[-1]-x$ev[-nrow(x)], NA)
  ## x$dev1 <- x$inc
  ## x$dev2 <- c(x$dev1[-1]-x$dev1[-nrow(x)], NA)

  x$inc <- c(x$ev[-1]-x$ev[-nrow(x)], 0)

  ##print(x)
  ##cat(sprintf("x$ev=%s,x$inc=%s\n",x$ev,x$inc))
  k1 <- x[x$ev-ev.thres >= -0.1**precision, "k"][1]
  k2 <- x[-nrow(x),][inc.thres-x[-nrow(x),]$inc >= -0.1**precision, "k"][1]

  if (is.na(k1)){
    if (print.warning)
      warning(sprintf("The explained variances is smaller than `ev.thres' when k=c(%s:%s); increase `k' or decrease `ev.thres' would help.",1,max(x$k)))
  }
  if (is.na(k2)){
    if (print.warning)
      warning(sprintf("The increment of explained variance is larger than `inc.thres' when k=c(%s:%s); increase `k' or increase `inc.thres' would help.",1,max(x$k)))
  }

  k <- max(k1,k2)
  ret <- list(k=k,ev=x$ev[x$k==k],inc.thres=inc.thres,ev.thres=ev.thres)
  class(ret) <- c("elbow",class(ret))
  return(ret)
}



elbow.batch <-
  function(x,inc.thres=c(0.01,0.05,0.1),ev.thres=c(0.95,0.9,0.8,0.75,0.67,0.5,0.33),precision=3)
{
  ret <- NULL
  inc.thres <- sort(inc.thres, decreasing=FALSE)
  ev.thres <- sort(ev.thres, decreasing=TRUE)
  for (i.ev.thres in ev.thres){
    for (i.inc.thres in inc.thres){
      res <- elbow(x,inc.thres=i.inc.thres,ev.thres=i.ev.thres,precision=precision,print.warning=FALSE)
      k <- res$k
      if(!is.na(k)){
        info <- sprintf("A \"good\" k=%s (EV=%.2f) is detected when the EV is no less than %s\nand the increment of EV is no more than %s for a bigger k.\n",k,res$ev,i.ev.thres,i.inc.thres)
        ret <- list(k=k,ev=res$ev,inc.thres=i.inc.thres,ev.thres=i.ev.thres)
        attr(ret,"description") <- info
        class(ret) <- c("elbow",class(ret))
        return(ret)
      }
    }
  }
  if (is.null(ret)){
    stop("A good `k' is not available with provided inc.thres and ev.thres; please make adjustment, e.g. decrease `ev.thres', increase `inc.thres' or increase `k'.")
  }
}



##' @export plot.elbow
plot.elbow <- 
  function(x,
           elbow.obj=NULL,
           ## plot
           main,
           xlab="k",
           ylab="Explained_Variance",
           type="b",
           pch=20,
           col.abline="red",
           lty.abline=3,
           if.plot.new=TRUE,
           print.info=TRUE,
           mar=c(4,5,3,3),
           omi=c(0.75,0,0,0),
           ...
           )
{
  if (!inherits(x,"css.multi") ){
    stop("`x' should be an object of class `css.multi'.")
  }

  if (.invalid(main)){
    main <- "Elbow plot"
  }
  sub <- attr(elbow.obj,"description")

  if (if.plot.new){
    par(mar=mar,omi=omi)
  }
  plot(x$k,x$ev,type=type,pch=pch,main=main,xlab=xlab,ylab=ylab,...)

  if(!is.null(elbow.obj)){
    if (!inherits(elbow.obj,"elbow") ){
      stop("`elbow.obj' should be an object of class `elbow'.")
    }
    k <- elbow.obj$k
    abline(v=k,col=col.abline,lty=lty.abline)
    mtext(side=3,at=k,text=sprintf("k=%s",k),line=0)
    abline(h=x[x$k==k,"ev"],col=col.abline,lty=lty.abline)
    ##mtext(side=4,at=x[x$k==k,"ev"],text=sprintf("%.2f",x[x$k==k,"ev"]),line=-0.25,col=col.abline,las=2)
    text(x=x[x$k==k,"k"],y=x[x$k==k,"ev"],labels=sprintf("%.2f",x[x$k==k,"ev"]),col=col.abline,pos=c(2,3))
    print.info <- FALSE
  }
  
  if (print.info){
    title(main="",sub=sub,font.sub=3,cex=0.5,outer=TRUE,line=2.5)
  }
  
}

Try the GMD package in your browser

Any scripts or data that you put into this service are public.

GMD documentation built on May 29, 2017, 10:41 a.m.