Nothing
#######################################################################################
###############
############### AntMAN Package : Tests and Examples
###############
###############
#######################################################################################
quit() ## Skip this test - too long
library("AntMAN")
library('salso') ### load the salso library
### We load the following libraries that provide nice graphical tools for dendograms representation
library("ggplot2")
library("ggdendro")
suppressPackageStartupMessages(library('dendextend'))
### Load the data
data(carcinoma)
## write the data in a vector y_mvb just to for notational purposes,
## it reminds data for multivariate binomial data
y_mvb <- carcinoma
## Just verify that we have loaded the right data
is(y_mvb)
head(y_mvb)
### Data quantities
n <- dim(y_mvb)[1] ## The number of observation
d <- dim(y_mvb)[2] ## The dimension of the data
######
## Set the Gibbs Parameter
mcmc_params = AM_mcmc_parameters(niter=5000 + 50 * 5000, burnin=5000, thin=50, verbose=0, output = c("CI","K","M","Tau","H","Q","W"))
## We are going to use a Bayesian Latent Class analysis, i.e. a mixture of Multivariate Bernoulli
## for the first analysis I will define independent beta prior with (1,1) parameters (i.e. uniform)
## for each component
mixture_mvb_params <- AM_mix_hyperparams_multiber(a0=rep(1,d),b0= rep(1,d))
### WE USE THE DEFAULT PRIOR FOR NEG_BINIMIAL
components_prior <- AM_mix_components_prior_negbin(R=1, init_P=0.1,a_P=1,b_P =1)
weights_prior <- AM_mix_weights_prior_gamma(init=5, a=1, b=1)
### Let's start with all the data in separate cluster (i.e. n groups each of size 1)
set.seed(321)
fit <- AM_mcmc_fit(
#initial_clustering=init_ci_mvb,
init_K = 20,
y = y_mvb,
mix_kernel_hyperparams = mixture_mvb_params,
mix_components_prior =components_prior,
mix_weight_prior = weights_prior,
mcmc_parameters = mcmc_params)
summary(fit)
#
#
### Save the postarior chains
# save(fit,file="fit_car_long.Rout")
### Load the posterior chains
#load("fit_car_long.Rout")
plot(fit)
#### Obtain original graphical parameters
oldpar <- par(no.readonly = TRUE)
#### Plot the trace plot of the number of cluster as well as the posterior pdf
par(mfrow=c(1,2))
plot(fit$K)
plot(table(fit$K))
par(mfrow=c(1,1))
##
G <- length(fit$K) ## length of the posterior chain
## Posterior estimation of the number of clusters
table(fit$K)/G
#### Posterior Clustering
### Since posterior chain are saved as a list from AntMAN we will
### transform this in a matrix. Moreover we have to add one because
### AntMAN codes the clustering from 0 to K-1.
ci <- t(do.call(cbind,fit$CI))+1
dim(ci)
hatc <- dlso(ci) ### that provides the fuction dlso to find the clustering
### that minimizes the Binder loss function
hatk <- length(unique(hatc)) ### the estimated number of cluster is the number of
### clusters in hatc
table(hatc)/n ## frequencies of the clusters
C <- AM_coclustering(fit) ### Compute the similarity matrix
### Visualize the similarity matrix
AM_plot_similarity_matrix (C[ order(hatc),order(hatc)]) ### does not work at the moment
## Old code!
image(1:n,1:n,C[ order(hatc),order(hatc)],xaxt='n',yaxt="n",xlab="ord. labels",ylab="ord. labels",main="Ordered similarity matrix")
axis(side=1, labels= hatc,at=1:n)
axis(side=2, labels= hatc,at=1:n)
############################################################
########### Cluster specific parameters estimate ###########
############################################################
### define the matrix theta that contain the draw at each iteration
### of the data specific parameter tau_{c_i} for i=1...n
### Then we need an array with the right dimensions=
### "number of posterior sample" x "sample size" x "data dimension"
thetapost <- array(0,dim=c(G,n,d))
#dim(thetapost)
for(g in 1:G){
thetapost[g,,] =(( fit$TAU[[g]]$theta)[ci[g,],])
}
### We can compute an individual specific (slice in this example) estimation of this parameter
thetahat <- apply(thetapost,c(2,3),mean)
### The cluster specific parameter will be saved in a matrix
est <- matrix(ncol=dim(y_mvb)[2],nrow=hatk)
### We set a single loop for bot compute the cluster specific estimate and plot them
par(mfrow=c(1,hatk))
for(j in 1:hatk){ ## for each cluster:
#### a) compute the cluster specific estimate
est[j,] <- apply(thetahat[hatc==j,],2,mean)
### b) plot the estimate
plot(1:7,apply(thetahat[hatc==j,],2,mean),type="h",xaxt="n",xlim=c(0.8,7.2),ylim=c(0,1),col=j,lwd=2,xlab="pathologist",ylab=expression(paste(hat(theta)^"*")),main=paste("group ",j))
#### c) plot the observed frequencies
lines((1:7)+0.1,apply(y_mvb[hatc==j,],2,mean),type="h",xaxt="n",xlim=c(0.8,7.2),ylim=c(0,1),col=j,lwd=2,ylab="",lty=2)
### d) add the abscissa axis wit the corresponding labels
axis(1, at=1:7, labels=colnames(y_mvb))
}
#########################################################################
########### BNP Hierarchical Clustering following Medvedovic ###########
#########################################################################
### Fix the color forrhe three clusters
color <- c("black","red","green")
### Compute the dendogram using the ggplot2 syntax where %>% is the pipe operator
hclu <-dist(1-C) %>% hclust %>% as.dendrogram
hclu <- rotate(hclu,hatc)
hclu <- hclu %>% set("leaves_pch", 19) %>% set("leaves_col", color[sort(hatc)]) %>% set("branches_k_color",value=color,k=3) %>% set("branches_lwd", 1.2) %>% set("labels_colors",value=color,k=3) %>% set("labels_cex", 0.7)
plot(hclu,main="Cluster Dendogram")
##################################################################################
########## Predictive goodnes of clustering index Argiento et al (2014) #########
##################################################################################
### The data matrix as just 20 different row that we store in the vector yub (y unique)
yun <- unique.matrix(y_mvb)
### in the matrix cnew we are going to save the component at which the new identical observations are
### assigned to
cnew <- matrix(nrow=G,ncol=dim(yun)[1])
for(g in 1:G){ ## for each MCMC draw
for(ii in 1:(dim(yun)[1])){## for each of the 20 different value in yun
ploc <- vector(length=fit$M[g])
for(mm in 1:fit$M[g]) { ### compute the component probability
ploc[mm] <- prod(dbinom(yun[ii,],size=rep(1,d),prob= fit$TAU[[g]]$theta[mm,]))*fit$S[[g]][mm]
}
cnew[g,ii] <- sample(1:fit$M[g],size=1,prob= ploc) ### assign the new value to a component according to the probabilities just computed
}
}
pnew <- rep(0,n) ### vector of the predictive indices one for each observation
pchin <- rep(0,n) ### a vector that just indicate to wich of the 20 unique values the i-th observation is equal to
for(ii in 1:(dim(yun)[1])){ ### for each of the 20 unique values
## find the index of these observations that are equal to observation ii
indx <- which(apply(y_mvb, 1, function(x) identical(x[1:d], yun[ii,])))
for(id in indx){ ## and for these value
pnew[id] <- mean((cnew[,ii]-ci[,id])==0) ## compute the predictive indices
pchin[id] <- ii
}
}
### plot the predictive goodness of clustering indices
plot(pnew,col=hatc,pch=pchin,xlab="",ylab=expression(paste(p^new)),main="predictive index")
abline(h=0.7,lty=2,col="red") ## add a threshold
idx <- which(pnew<0.7) ## find which observation express and index below the threshold
y_mvb[idx,] ## give a look at these observation for which clustering is more uncertain
##### PDF plots
pdf(file.path(tempdir(), "trestime.pdf"),paper="special",height=4, width=12)
par(mar=c(3,3.5,1.5,0.5)+0.1)
#'mar' A numerical vector of the form 'c(bottom, left, top, right)'
# which gives the number of lines of margin to be specified on
# the four sides of the plot. The default is 'c(5, 4, 4, 2) +
# 0.1'.
par(mgp=c(2, 1.0, 0))
#'mgp' The margin line (in 'mex' units) for the axis title, axis
# labels and axis line. Note that 'mgp[1]' affects 'title'
# whereas 'mgp[2:3]' affect 'axis'. The default is 'c(3, 1,
# 0)'.
par(mfrow=c(1,3))
## SIMILARITY
image(1:n,1:n,C[ order(hatc),order(hatc)],xaxt='n',yaxt="n",xlab="ord. labels",ylab="ord. labels",main="Ordered similarity matrix")
axis(side=1, labels= hatc,at=1:n)
axis(side=2, labels= hatc,at=1:n)
### DENDOGRAM
hclu <- hclu %>% set("labels_cex", 0.01)
plot(hclu,main="Cluster Dendogram",xlab="ord labels")
### INDICE
plot(pnew,col=hatc,pch=pchin,xlab="1:n",ylab=expression(paste(p^new)),main="predictive index")
abline(h=0.7,lty=2,col="red")
#### reset the graphical parameters
par(oldpar)
dev.off()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.