R/funcs.R
In ClusBoot: Bootstrap a Clustering Solution to Establish the Stability of the Clusters

Documented in boot.proportions boot.silhouette calc.silhouette clusboot fpc.clusterboot plot.clusboot

# Sugnet Lubbe
# MuViSU (Centre for Multi-Dimensional Data Visualisation)
# Department of Statistics and Actuarial Science
# Stellenbosch University, June 2022
# ============================================================

# ---------------------------------------------------------------------------------
# fpc.clusterboot
# function adapted from fpc package to perform different bootstrapping options
# ---------------------------------------------------------------------------------
#' Resampling according to the methods discussed in Hennig (2007)
#'
#' @param data a data matrix or distance object which will be the input to the clustering function
#' @param B number of bootstrap replicates
#' @param distances see `?fpc::clusterboot`
#' @param bootmethod see `?fpc::clusterboot`
#' @param bscompare see `?fpc::clusterboot`
#' @param multipleboot see `?fpc::clusterboot`
#' @param jittertuning see `?fpc::clusterboot`
#' @param noisetuning see `?fpc::clusterboot`
#' @param subtuning see `?fpc::clusterboot`
#' @param clustermethod see `?fpc::clusterboot`
#' @param noisemethod see `?fpc::clusterboot`
#' @param count see `?fpc::clusterboot`
#' @param seed see `?fpc::clusterboot`
#' @param datatomatrix see `?fpc::clusterboot`
#' @param ... additional arguments to be sent to the function specified in clustermethod
#'
#' @export
#' @return a list with two components; boot.out contains the computations for clusboot and out contains the clustering
#'           solution of the original data set
#'
fpc.clusterboot <- function(data, B, distances = (inherits(data, "dist")),
                            bootmethod = "boot", bscompare = TRUE, multipleboot = FALSE,
                            jittertuning = 0.05, noisetuning = c(0.05, 4), subtuning = floor(nrow(data)/2),
                            clustermethod, noisemethod = FALSE, count = TRUE, seed = NULL, datatomatrix = TRUE,
                            ...)
{
      sumlogic <- function(x, y, relation = "eq") switch(relation,
                                                       eq = sum(x == y, na.rm = TRUE), s = sum(x < y, na.rm = TRUE),
                                                       l = sum(x > y, na.rm = TRUE), se = sum(x <= y, na.rm = TRUE),
                                                       le = sum(x >= y, na.rm = TRUE))
   if (!is.null(seed))
      set.seed(seed)
    invisible(distances)
    if (datatomatrix)
      data <- as.matrix(data)
    n <- nrow(data)
    p <- ncol(data)
    if (datatomatrix & !distances) {
      cod <- stats::cov(data)
      md <- colMeans(data)
    }
    lb <- length(bootmethod)

    if (distances)
      c1 <- clustermethod(stats::as.dist(data), ...)
    else c1 <- clustermethod(data, ...)
    if (noisemethod) {
      if (c1$nccl == 0)
        stop("No clusters, only noise estimated!")
    }
    else c1$nccl <- c1$nc

    if (("jitter" %in% bootmethod) | ("bojit" %in% bootmethod)) {
      if (!datatomatrix | distances)
        stop("datatomatrix=FALSE and distances require boot or subset as bootmethod.")
      jsd <- numeric(0)
      ecd <- eigen(cod, symmetric = TRUE)
      ecd$values[ecd$values < 0] <- 0
      ecd$values[is.na(ecd$values)] <- 0
      rotdata <- data %*% solve(t(ecd$vectors))
      for (i in 1:p) {
        sx <- sort(rotdata[, i])
        dx <- sx[2:n] - sx[1:(n - 1)]
        dx <- dx[dx > 0]
        jsd[i] <- stats::quantile(dx, jittertuning)
      }
    }
    if ("noise" %in% bootmethod) {
      if (!datatomatrix | distances)
        stop("datatomatrix=FALSE and distances require boot or subset as bootmethod.")
      ecd <- eigen(cod, symmetric = TRUE)
      ecd$values[ecd$values < 0] <- 0
    }

    boot.out <- matrix (NA, nrow=2*nrow(data)*nrow(data), ncol=B)
# *************************************************
      for (b in 1:B) {
        if (count) cat(bootmethod[1], b, "\n")
        if (bootmethod[1] == "boot")
            {  bsamp <- sample(n, n, replace = TRUE)
               if (!multipleboot) bsamp <- unique(bsamp)
               if (distances) mdata <- data[bsamp, bsamp]
               else mdata <- data[bsamp, ]
            }
        if (bootmethod[1] == "subset")
            {  bsamp <- sample(n, subtuning, replace = FALSE)
               if (distances) mdata <- data[bsamp, bsamp]
               else mdata <- data[bsamp, ]
            }
        if (bootmethod[1] == "jitter")
            {  jnoise <- matrix(0, ncol = p, nrow = n)
               for (j in 1:p) jnoise[, j] <- stats::rnorm(n, sd = jsd[j])
               jnoise <- jnoise %*% t(ecd$vectors)
               mdata <- data + jnoise
               bsamp <- 1:n
            }
        if (bootmethod[1] == "bojit")
            {  bsamp <- sample(n, n, replace = TRUE)
               jnoise <- matrix(0, ncol = p, nrow = n)
               for (j in 1:p) jnoise[, j] <- stats::rnorm(n, sd = jsd[j])
               jnoise <- jnoise %*% t(ecd$vectors)
               mdata <- data[bsamp, ] + jnoise
            }
        if (bootmethod[1] == "noise")
            {  noiseind <- as.logical(stats::rbinom(n, 1, noisetuning[1]))
               nn <- sum(noiseind)
               jnoise <- matrix(0, ncol = p, nrow = nn)
               for (j in 1:p) jnoise[, j] <- stats::runif(nn, min = -noisetuning[2] *
                                                sqrt(ecd$values[j]), max = noisetuning[2] *
                                                sqrt(ecd$values[j]))
               jnoise <- t(t(jnoise %*% t(ecd$vectors)) + md)
               mdata <- data
               mdata[noiseind, ] <- jnoise
               bsamp <- (1:n)[!noiseind]
             }

        if ("diss" %in% names(formals(clustermethod)) & distances)
          bc1 <- clustermethod(mdata, diss = TRUE, ...)
        else bc1 <- clustermethod(mdata, ...)
        if (noisemethod) {
          effnc1 <- c1$nccl
          effnb1 <- bc1$nccl
        }
        else {
          effnc1 <- c1$nc
          effnb1 <- bc1$nc
        }

        if ("diss" %in% names(formals(clustermethod)) & distances)
          bc1 <- clustermethod(mdata, diss = TRUE, ...)
        else bc1 <- clustermethod(mdata, ...)

        out <- rep(0,n)
        for (i in 1:effnb1)
          out[bc1$clusterlist[[i]]] <- i
        clusD <- totD <- matrix (0, nrow=n, ncol=n)

        boot.sample <- table(bsamp)
        boot.names <- as.numeric(names(boot.sample))
        Dmat <- matrix (boot.sample,ncol=1) %*% matrix (boot.sample, nrow=1)
        totD[boot.names,boot.names] <- Dmat

        kk <- nlevels(factor(out))
          for (i in 1:kk)
          {
            clus.count <- table(bsamp[out==i])
            clus.names <- as.numeric(names(clus.count))
            Dmat <- matrix (clus.count,ncol=1) %*% matrix (clus.count,nrow=1)
            clusD[clus.names,clus.names] <- Dmat
          }

        boot.out[,b] <- c(clusD, totD)
      }
    # *************************************************

    out <- rep(0,n)
    for (i in 1:length(c1$clusterlist))
      out[c1$clusterlist[[i]]] <- i
    list(boot.out=boot.out, out=out)
}

# ---------------------------------------------------------------------------------
# clusboot
# ---------------------------------------------------------------------------------

#' Performs bootstrap on a cluster analysis output
#'
#' @param datmat a data matrix or distance object which will be the input to the clustering function
#' @param B number of bootstrap replicates
#' @param clustering.func the function which will perform the clustering and output a vector of cluster memberships
#' @param ... more arguments to be passed to the clustering function
#'
#' @return an object of class `clusboot` which is a list with the following components:
#' \item{proportions}{matrix of size nxn with cell ij containing the proportion of bootstrap replicates
#'                    in which object i and object j clustered together.}
#' \item{clustering}{a vector of length n containing the cluster membership of the n input objects.}
#' \item{sil}{a vector of length the number of clusters containing the bootstrap-silhouette values for the clusters.}
#' \item{indv.sil}{a vector of length n containing the bootstrap-silhouette values for the individual objects.}
#' \item{sil.order}{a vector of length n containing the ordering of the n objects used by the functions
#'                  `boot.silhouette` and `boot.proportions` to order objects in the same cluster adjacent and
#'                  clusters in decreasing order of cluster tightness.}
#' \item{ave.sil.width}{the overall stability of the clustering solution, obtained by averaging over the individual object bootstrap-silhouette values.}
#'
#' @export
#'
#' @details
#' Any R function performing cluster analysis can be specified in `clustering.func` although a wrapper function is
#' typically needed to isolate only the vector output of cluster memberships. See `?complete.linkage` as an example.
#' Should users perfer to use alternative resamling schemes, other than the bootstrap, Hennig (2007) discuss a variety
#' of options which could be accessed by specifying `clustering.func = fpc.clusterboot`. In addition, the sampling
#' method is specified in the argument `bootmethod` and additional arguments for the function `clusterboot` in the
#' package `fpc` must be given. Note that only the resampling facilities of `clusterboot` is utilised while the
#' computation of proportions and silhouette widths remain unchanged. The output object of class `clusboot`
#' will remain unchanged as only the resampling section of `clusterboot` is used.
#'
#' @examples
#' clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
#' library(fpc)
#' clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=fpc.clusterboot,
#'           clustermethod=hclustCBI, method="complete", bootmethod="subset", subtuning=10)
#'
#' @references Hennig, C., 2007. Cluster-wise assessment of cluster stability. Computational Statistics & Data Analysis,
#' 52(1), pp.258-271.

clusboot <- function(datmat, B=1000, clustering.func=complete.linkage, ...)
{
  obj <- datmat
  if (data.class(datmat) != "matrix") datmat <- as.matrix(datmat)
  n <- nrow(datmat)
  if (is.null(rownames(datmat))) sample.names <- 1:n
  else sample.names <- rownames(datmat)
  boot.samples <- matrix(sample (1:n, size=n*B, replace=T), ncol=B)

  if (!identical(clustering.func, fpc.clusterboot))
  {
  boot.out <- apply(boot.samples,2,function(x)
                                   { if (data.class(obj) == "dist")
                                          out <- clustering.func(stats::as.dist(datmat[x,x]), ...)
                                     else out <- clustering.func(datmat[x,], ...)
                                     clusD <- totD <- matrix (0, nrow=n, ncol=n, dimnames=list(sample.names,sample.names))

                                     boot.sample <- table(x)
                                     boot.names <- as.numeric(names(boot.sample))
                                     Dmat <- matrix (boot.sample,ncol=1) %*% matrix (boot.sample, nrow=1)
                                     totD[boot.names,boot.names] <- Dmat

                                     kk <- nlevels(factor(out))
                                     for (i in 1:kk)
                                     {
                                        clus.count <- table(x[out==i])
                                        clus.names <- as.numeric(names(clus.count))
                                        Dmat <- matrix (clus.count,ncol=1) %*% matrix (clus.count,nrow=1)
                                        clusD[clus.names,clus.names] <- Dmat
                                     }
                                     cbind(clusD, totD)
                                   })
  # boot.out is a matrix with B columns
  # the first nrow/2 is the n*n elements of clusD
  # the remaining nrow/2 is the n*n elements of totD
  }
  else
  {
    fpc.boot.out <- fpc.clusterboot(data=obj, B = B, ...)
    boot.out <- fpc.boot.out$boot.out
  }

  boot.out <- apply (boot.out, 1, sum)
  clusD <- matrix (boot.out[1:(n*n)], nrow=n, ncol=n)
  totD <- matrix (boot.out[-(1:(n*n))], nrow=n, ncol=n)

  clusD <- clusD/totD
  dimnames(clusD) <- list(sample.names,sample.names)
  if (!identical(clustering.func, fpc.clusterboot))
    out <- clustering.func(obj, ...)
  else out <- fpc.boot.out$out
  output <- list(proportions=clusD, clustering=out)
  class(output) <- "clusboot"
  output <- calc.silhouette(output)
  output
}

# ---------------------------------------------------------------------------------
# plot.clusboot
# ---------------------------------------------------------------------------------

#' MDS plot of similarities given by the proportion of bootstrap replicates where objects cluster together
#'
#' @param x an object of class clusboot
#' @param col single colour or a vector specifying a colour for each object
#' @param show.silhouette logical indicating whether plotting character size should represent the inidivual silhouette values
#' @param ... more arguments to be passed to `plot()`
#'
#' @return matrix of similarities (proportions)
#' @export
#'
#' @examples
#' out <- clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
#' plot(out)

plot.clusboot <- function(x, col, show.silhouette=TRUE, ...)
{
  Dmat <- 1-x$proportions

  stress.func <- function (y, delta)
  {
    Y <- matrix (y, ncol=2)
    dd <- stats::dist(Y)
    sum((dd-delta)^2)/sum(dd^2)
  }
  Y <- stats::cmdscale(Dmat)
  y <- stats::optim(as.vector(Y), stress.func, delta=stats::as.dist(Dmat))$par
  Y <- matrix(y, ncol=2)

  plot (Y[,1], Y[,2], asp=1, type="n", xaxt="n", yaxt="n", xlab="", ylab="")

  cluster.vec <- x$clustering
  k <- nlevels(factor(cluster.vec))
  if (missing(col)) col <- grDevices::rainbow(k)
  if (length(col)<k) col <- rep(col,k)

  for (i in 1:k)
    if (show.silhouette) graphics::points(Y[cluster.vec==i,1], Y[cluster.vec==i,2], col=col[i], cex=(1-x$indv.sil[cluster.vec==i])*3+0.1, ...)
    else graphics::points(Y[cluster.vec==i,1], Y[cluster.vec==i,2], col=col[i], ...)
  Y
}

# ---------------------------------------------------------------------------------
# boot.proportions
# ---------------------------------------------------------------------------------

#' Heatmap of the proportion of bootstrap replicates where objects cluster together
#'
#' @param x an object of class clusboot
#' @param col vector of colours for shading to indicate proportion values
#' @param show.vals logical value indicating whether proportion values should be added to individual cells
#' @param text.col colour of text for show.vals if `TRUE`
#' @param cluster.col colour of lines demarcating cluster membership
#' @param ... more arguments to be passed to `plot()`
#'
#' @export
#'
#' @examples
#' out <- clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
#' boot.proportions(out)

boot.proportions <- function(x, col=grDevices::heat.colors(101,rev=TRUE), show.vals=F, text.col="black", cluster.col="firebrick", ...)
{
  Pmat <- x$proportions[x$sil.order, x$sil.order]
  clus.vec <- x$clustering[x$sil.order]
  n <- nrow(Pmat)
  m <- length(col)-1
  old.par <- graphics::par(mar=c(2.1, 2.1, 0.1, 0.1), no.readonly=TRUE)
  on.exit(graphics::par(old.par))
  plot (0:n, 0:n, type="n", xaxs="i", yaxs="i", xaxt="n", yaxt="n", xlab="", ylab="", ...)
  graphics::axis (side=1, at=(1:n)-0.5, label=colnames(Pmat), las=2, cex.axis=0.6)
  graphics::axis (side=2, at=(n:1)-0.5, label=rownames(Pmat), las=2, cex.axis=0.6)
  for (x in 1:n)
    for (y in 1:n)
      { graphics::rect(x-1, n-y+1, x, n-y, col=col[floor(Pmat[x,y]*m+1)], border=NA)
        if (show.vals) graphics::text (x-0.5, n-y+0.5, format(round(Pmat[x,y]*100)/100,nsmall=2), cex=0.4, adj=c(0.5,0.5))
    }
  for (i in 2:n)
    if (clus.vec[i-1] != clus.vec[i])
      {
         graphics::lines (rep(i-1,2), c(0,n), col=cluster.col)
         graphics::lines (c(0,n), rep(n-i+1,2), col=cluster.col)
      }
  invisible(NULL)
}

# ---------------------------------------------------------------------------------
# calc.silhouette
# ---------------------------------------------------------------------------------

#' Computes the silhouette vales based on proportion of times items cluster together
#'
#' @param clusboot.out an object of class clusboot
#'
#' @return an object of class clusboot

calc.silhouette <- function(clusboot.out)
{
  cluster.vec <- clusboot.out$clustering
  k <- nlevels(factor(cluster.vec))
  cluster.names <- levels(factor(cluster.vec))
  n <- length(cluster.vec)
  Pmat <- clusboot.out$proportions

  sil <- rep(NA,k)
  indv.sil <- rep(NA, n)
  for (i in 1:k)
  {
    current.clus <- (1:length(cluster.vec))[cluster.vec==cluster.names[i]]
    current.p <- Pmat[current.clus, current.clus,drop=F]
    temp.p <- current.p
    diag(temp.p) <- NA
    indv.sil[current.clus] <- apply(temp.p, 2, mean, na.rm=T)
    own.p <- mean(current.p[lower.tri(current.p)])
    other.p <- 0
    for (j in (1:k)[-i])
      {  other.clus <- (1:length(cluster.vec))[cluster.vec==j]
         other.mat <- Pmat[current.clus,other.clus]
         other.mean <- mean(other.mat)
         if (!is.na(other.mean)) if (other.mean>other.p) other.p <- other.mean
      }
    sil[i] <- own.p-other.p
    indv.sil[current.clus] <- indv.sil[current.clus] - other.p
  }
  names(sil) <- cluster.names
  names(indv.sil) <- paste(cluster.vec,rownames(Pmat),sep="-")

  sil.order <- cluster.names[order(sil)]
  indv.sil.order <- order(match(cluster.vec,sil.order), indv.sil, decreasing=TRUE)
  clusboot.out$sil <- sil
  clusboot.out$indv.sil <- indv.sil
  clusboot.out$sil.order <- indv.sil.order
  clusboot.out$avg.sil.width <- mean(indv.sil,na.rm=TRUE)
  clusboot.out
}

# ---------------------------------------------------------------------------------
# boot.silhouette
# ---------------------------------------------------------------------------------

#' Produces silhouette plots
#'
#' @param clusboot.out an object of class clusboot
#' @param ... more arguments to be passed to `barplot()`
#'
#' @export
#'
#' @return list of silhouette widths
#'
#' @examples
#' out <- clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
#' boot.silhouette(out)

boot.silhouette <- function(clusboot.out, ...)
{
  old.par <- graphics::par(mfrow=c(1,2), no.readonly = TRUE)
  on.exit (graphics::par(old.par))
  sil.val <- sort(clusboot.out$sil, na.last=TRUE)
  indv.val <- clusboot.out$indv.sil
  n <- length(indv.val)
  clus.num <- clusboot.out$clustering[rev(clusboot.out$sil.order)]

  mp <- graphics::barplot(indv.val[rev(clusboot.out$sil.order)], horiz=T, las=2, cex.names=0.5, ...)
  for (i in 2:length(mp))
    if (clus.num[i-1] != clus.num[i]) graphics::lines (graphics::par("usr")[1:2], rep((mp[i-1]+mp[i])/2,2), col="firebrick")
  graphics::barplot(sil.val, horiz=T, xlim=c(0,1), ...)
  list (individual.silhouette.values = clusboot.out$indv.sil, cluster.silhouette.values = clusboot.out$sil)
}

# ---------------------------------------------------------------------------------
# complete.linkage
# ---------------------------------------------------------------------------------

#' Wrapper function for performing complete linkage clustering
#'
#' @param X samples x variables data matrix
#' @param k number of clusters
#'
#' @return vector of cluster memberships
#' @export
#'
#' @examples
#' complete.linkage(scale(case.study.psychiatrist), k=6)

complete.linkage <- function (X, k)
{ stats::cutree(stats::hclust(stats::dist(X)), k) }
Any scripts or data that you put into this service are public.
ClusBoot documentation built on Sept. 11, 2024, 9:21 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ClusBoot
Bootstrap a Clustering Solution to Establish the Stability of the Clusters

R/funcs.R
In ClusBoot: Bootstrap a Clustering Solution to Establish the Stability of the Clusters

Defines functions boot.silhouette calc.silhouette boot.proportions plot.clusboot clusboot fpc.clusterboot

Documented in boot.proportions boot.silhouette calc.silhouette clusboot fpc.clusterboot plot.clusboot

Try the ClusBoot package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ClusBoot Bootstrap a Clustering Solution to Establish the Stability of the Clusters

R/funcs.R In ClusBoot: Bootstrap a Clustering Solution to Establish the Stability of the Clusters

Defines functions boot.silhouette calc.silhouette boot.proportions plot.clusboot clusboot fpc.clusterboot

Documented in boot.proportions boot.silhouette calc.silhouette clusboot fpc.clusterboot plot.clusboot

Try the ClusBoot package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ClusBoot
Bootstrap a Clustering Solution to Establish the Stability of the Clusters

R/funcs.R
In ClusBoot: Bootstrap a Clustering Solution to Establish the Stability of the Clusters