SillyPuttyVignette.R
In SillyPutty: Silly Putty Clustering

## ----opts, echo=FALSE-------------------------------------------------------------------------
knitr::opts_chunk$set(fig.width=8, fig.height=8)
options(width=96)
.format <- knitr::opts_knit$get("rmarkdown.pandoc.to")
.tag <- function(N, cap ) ifelse(.format == "html",
                                 paste("Figure", N, ":",  cap),
                                 cap)

## ----mycss, results="asis", echo=FALSE--------------------------------------------------------
cat('
<style type="text/css">
.figure { text-align: center; }
.caption { font-weight: bold; }
</style>
')

## ----Setup------------------------------------------------------------------------------------
library(SillyPutty)
library(Umpire)
suppressMessages( library(Mercator) )
suppressMessages( library(mclust) ) # for adjusted rand index

## ----genData----------------------------------------------------------------------------------
set.seed(21315)
trueK <- 4
## Set up survival outcome; baseline is exponential
sm <- SurvivalModel(baseHazard=1/5, accrual=5, followUp=1)
## Build a CancerModel with four subtypes
nBlocks <- 20    
cm <- CancerModel(name="cansim",
                  nPossible=nBlocks,
                  nPattern=trueK,
                  OUT = function(n) rnorm(n, 0, 1), 
                  SURV= function(n) rnorm(n, 0, 1),
                  survivalModel=sm)
## Include 100 blocks/pathways that are not hit by cancer
nTotalBlocks <- nBlocks + 100
## Assign values to hyperparameters
## block size
blockSize <- round(rnorm(nTotalBlocks, 100, 30))
## log normal mean hyperparameters
mu0    <- 6
sigma0 <- 1.5
## log normal sigma hyperparameters
rate   <- 28.11
shape  <- 44.25
## block correlation
p <- 0.6
w <- 5
## Set up the baseline Engine
rho <- rbeta(nTotalBlocks, p*w, (1-p)*w)
base <- lapply(1:nTotalBlocks,
               function(i) {
                 bs <- blockSize[i]
                 co <- matrix(rho[i], nrow=bs, ncol=bs)
                 diag(co) <- 1
                 mu <- rnorm(bs, mu0, sigma0)
                 sigma <- matrix(1/rgamma(bs, rate=rate, shape=shape), nrow=1)
                 covo <- co *(t(sigma) %*% sigma)
                 MVN(mu, covo)
               })
eng <- Engine(base)
## Alter the means if there is a hit
altered <- alterMean(eng, normalOffset, delta=0, sigma=1)
## Build the CancerEngine using character strings
object <- CancerEngine(cm, "eng", "altered")
rm(sm, nBlocks, cm, nTotalBlocks, blockSize, mu0, sigma0, rate, shape, p, w, rho, base, eng, altered)

## ----simData----------------------------------------------------------------------------------
trueN <- 144
dset <- rand(object, trueN, keepall = TRUE) # contains two objects
labels <- dset$clinical$CancerSubType # the true clusters/types
d1 <- dset$data # the noise-free simulated data

## ----noiseModel-------------------------------------------------------------------------------
SpecialNoise <- function(nFeat, nu = 0.1, shape = 1.02, scale = 0.05/shape) {
  NoiseModel(nu = nu,
             tau = rgamma(nFeat, shape = shape, scale = scale),
             phi = 0)
}
nm <- SpecialNoise(nrow(d1), nu = 0)
d1 <- blur(nm, d1)
dim(d1)

## ----distancematrix---------------------------------------------------------------------------
tdis <- t(d1)
dimnames(tdis) <- list(paste("Sample", 1:nrow(tdis), sep=''),
                     paste("Feature", 1:ncol(tdis), sep=''))
dis <- dist(tdis)   ## This step is the rate-liomiting factor. Only way to speed up is to use fewerw samples
names(labels) <- rownames(tdis)

## ----eval=FALSE, echo=FALSE, results='hide'---------------------------------------------------
# dataset <- tdis
# eucdist <- dis
# trueGroups <- labels
# save(eucdist, trueGroups, file="../data/eucdist.rda")

## ----mercViews--------------------------------------------------------------------------------
mercViews <- function(object, main, tag = NULL) {
  opar <- par(mfrow = c(2, 2))
  on.exit(par(opar))
  pts <- barplot(object, main = main)
  if (!is.null(tag)) {
    gt <- as.vector(as.matrix(table(getClusters(object))))
    loc <- pts[round((c(0, cumsum(gt))[-(1 + length(gt))] + cumsum(gt))/2)]
    mtext(tag, side =1, line = 0, at = loc, col = object@palette, font = 2)
  }
  plot(object, view = "tsne", main = "t-SNE")
  plot(object, view = "hclust")
  plot(object, view = "mds", main = "MDS")
}

## ----fig01, fig.cap = .tag(1, "Hierachical Clustering, with four clusters.")------------------
set.seed(1987)
vis <- Mercator(dis, "euclid", "hclust", K = trueK)
palette <- vis@palette[c(1:3, 7, 8, 6, 10, 4, 11, 5, 15, 14, 17:18, 9, 12, 16, 19:24)]
vis@palette <- palette
vis <- addVisualization(vis, "mds")
vis <- addVisualization(vis, "tsne")
mercViews(vis, "Hierarchical Clustering, Five Clusters")

## ----ari--------------------------------------------------------------------------------------
ari.hier <- adjustedRandIndex(labels, vis@clusters)
ari.hier

## ----fig02, fig.cap = .tag(2, "Visualization of true cancer clusters.")-----------------------
truebin <- remapColors(vis, setClusters(vis, labels))
mercViews(truebin, main = "True Cluster Types", 
          tag = unique(sort(labels)))

## ----fig03, fig.cap = .tag(3, "PAM Clustering, K = 4.")---------------------------------------
pc <- pam(dis, k = trueK, diss=TRUE)
pamc <- remapColors(vis, setClusters(vis, pc$clustering))
mercViews(pamc, main = "PAM, K = 4", 
          tag = paste("P", 1:trueK, sep = ""))
ari.pam <- adjustedRandIndex(labels, pamc@clusters)
ari.pam

## ----RandomSilly------------------------------------------------------------------------------
set.seed(12)
y2 <- suppressWarnings(RandomSillyPutty(dis, trueK, N = 100)) ## this is also slow
ari.max <- adjustedRandIndex(truebin@clusters, y2@MX)
ari.min <- adjustedRandIndex(truebin@clusters, y2@MN)
ari.max
ari.min

## ----fig04, fig.cap = .tag(4, "Random SillyPutty clustering,  K = 4.")------------------------
randSillyBin <- remapColors(vis, setClusters(vis, y2@MX))
mercViews(randSillyBin, main = "SillyPutty Cluster Types, K = 4", 
          tag = paste("C", 1:trueK, sep = ""))

## ----fig05, fig.cap = .tag(5, "Cluster assignements with best and worst silhouette widths after random starts.")----
plot(y2, randSillyBin@view[["mds"]], distobj = dis, col = randSillyBin@palette)
summary(y2)

## ----fig06, fig.cap = .tag(6, "Hierarchical Clustering + SillyPutty, K = 4.")-----------------
hierSilly <- SillyPutty(vis@clusters, dis)
hierSillyBin <- remapColors(vis, setClusters(vis, hierSilly@cluster))
mercViews(hierSillyBin, main = "HClust + Silly, k = 4",tag = paste("C", 1:trueK, sep = ""))
ari.Sillyhier <- adjustedRandIndex(labels, hierSillyBin@clusters)
ari.Sillyhier

## ----fig07, fig.width=6, fig.height=5, fig.cap=.tag(7, "Best mean siilhouette width, by number of clusters, found by combining huierarchical clustering with Silly Putty.")----
y <- findClusterNumber(dis, start = 2, end = 12, method = "HCSP")
plot(names(y), y, xlab = "K", ylab = "Silhouette Width", type = "b", lwd = 2, pch = 16)

## ---------------------------------------------------------------------------------------------
sessionInfo()