Description Usage Arguments Value Author(s) Examples
Slice Sampling of the Dirichlet Process Mixture Model with a prior on alpha
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
z |
data matrix |
hyperG0 |
prior mixing distribution. |
a |
shape hyperparameter of the Gamma prior on the concentration parameter of the Dirichlet
Process. Default is |
b |
scale hyperparameter of the Gamma prior on the concentration parameter of the Dirichlet
Process. Default is |
N |
number of MCMC iterations. |
doPlot |
logical flag indicating whether to plot MCMC iteration or not. Default to
|
nbclust_init |
number of clusters at initialization. Default to 30 (or less if there are less than 30 observations). |
plotevery |
an integer indicating the interval between plotted iterations when |
diagVar |
logical flag indicating whether the variance of each cluster is estimated as a
diagonal matrix, or as a full matrix. Default is |
use_variance_hyperprior |
logical flag indicating whether a hyperprior is added
for the variance parameter. Default is |
verbose |
logical flag indicating whether partition info is written in the console at each MCMC iteration. |
... |
additional arguments to be passed to |
a object of class DPMclust
with the following attributes:
mcmc_partitions
: a list of length N
. Each
element mcmc_partitions[n]
is a vector of length
n
giving the partition of the n
observations.
alpha
: a vector of length N
. cost[j]
is the cost
associated to partition c[[j]]
listU_mu
:a list of length N
containing the matrices of
mean vectors for all the mixture components at each MCMC iteration
listU_Sigma
:a list of length N
containing the arrays of
covariances matrices for all the mixture components at each MCMC iteration
U_SS_list
:a list of length N
containing the lists of
sufficient statistics for all the mixture components at each MCMC iteration
weights_list
:a list of length N
containing the logposterior values
at each MCMC iterations
logposterior_list
:a list of length N
containing the logposterior values
at each MCMC iterations
data
: the data matrix d x n
with d
dimensions in rows
and n
observations in columns.
nb_mcmcit
: the number of MCMC iterations
clust_distrib
:the parametric distribution of the mixture component - "gaussian"
hyperG0
:the prior on the cluster location
Boris Hejblum
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | rm(list=ls())
#Number of data
n <- 500
#n <- 2000
set.seed(1234)
#set.seed(123)
#set.seed(4321)
# Sample data
m <- matrix(nrow=2, ncol=4, c(-1, 1, 1.5, 2, 2, -2, -1.5, -2))
p <- c(0.2, 0.1, 0.4, 0.3) # frequence des clusters
sdev <- array(dim=c(2,2,4))
sdev[, ,1] <- matrix(nrow=2, ncol=2, c(0.3, 0, 0, 0.3))
sdev[, ,2] <- matrix(nrow=2, ncol=2, c(0.1, 0, 0, 0.3))
sdev[, ,3] <- matrix(nrow=2, ncol=2, c(0.3, 0.15, 0.15, 0.3))
sdev[, ,4] <- .3*diag(2)
c <- rep(0,n)
z <- matrix(0, nrow=2, ncol=n)
for(k in 1:n){
c[k] = which(rmultinom(n=1, size=1, prob=p)!=0)
z[,k] <- m[, c[k]] + sdev[, , c[k]]%*%matrix(rnorm(2, mean = 0, sd = 1), nrow=2, ncol=1)
#cat(k, "/", n, " observations simulated\n", sep="")
}
d<-2
# Set parameters of G0
hyperG0 <- list()
hyperG0[["mu"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.001
hyperG0[["nu"]] <- d+2
hyperG0[["lambda"]] <- diag(d)/10
# hyperprior on the Scale parameter of DPM
a <- 0.0001
b <- 0.0001
# Number of iterations
N <- 30
# do some plots
doPlot <- TRUE
nbclust_init <- 30
## Data
########
library(ggplot2)
p <- (ggplot(data.frame("X"=z[1,], "Y"=z[2,]), aes(x=X, y=Y))
+ geom_point()
+ ggtitle("Toy example Data"))
p
## alpha priors plots
#####################
prioralpha <- data.frame("alpha"=rgamma(n=5000, shape=a, scale=1/b),
"distribution" =factor(rep("prior",5000),
levels=c("prior", "posterior")))
p <- (ggplot(prioralpha, aes(x=alpha))
+ geom_histogram(aes(y=..density..),
colour="black", fill="white", bins=30)
+ geom_density(alpha=.6, fill="red", color=NA)
+ ggtitle(paste("Prior distribution on alpha: Gamma(", a,
",", b, ")\n", sep=""))
+ theme_bw()
)
p
if(interactive()){
# Gibbs sampler for Dirichlet Process Mixtures
##############################################
MCMCsample <- DPMGibbsN(z, hyperG0, a, b, N=500, doPlot, nbclust_init, plotevery=100,
gg.add=list(theme_bw(),
guides(shape=guide_legend(override.aes = list(fill="grey45")))),
diagVar=FALSE)
plot_ConvDPM(MCMCsample, from=2)
s <- summary(MCMCsample, burnin = 200, thin=2, posterior_approx=FALSE,
lossFn = "MBinderN")
F <- FmeasureC(pred=s$point_estim$c_est, ref=c)
postalpha <- data.frame("alpha"=MCMCsample$alpha[50:500],
"distribution" = factor(rep("posterior",500-49),
levels=c("prior", "posterior")))
p <- (ggplot(postalpha, aes(x=alpha))
+ geom_histogram(aes(y=..density..), binwidth=.1,
colour="black", fill="white")
+ geom_density(alpha=.2, fill="blue")
+ ggtitle("Posterior distribution of alpha\n")
# Ignore NA values for mean
# Overlay with transparent density plot
+ geom_vline(aes(xintercept=mean(alpha, na.rm=TRUE)),
color="red", linetype="dashed", size=1)
)
p
p <- (ggplot(drop=FALSE, alpha=.6)
+ geom_density(aes(x=alpha, fill=distribution),
color=NA, alpha=.6,
data=prioralpha)
#+ geom_density(aes(x=alpha, fill=distribution),
# color=NA, alpha=.6,
# data=postalpha)
+ ggtitle("Prior and posterior distributions of alpha\n")
+ scale_fill_discrete(drop=FALSE)
+ theme_bw()
+xlim(0,10)
+ylim(0, 1.3)
)
p
}
# k-means comparison
####################
plot(x=z[1,], y=z[2,], col=kmeans(t(z), centers=4)$cluster,
xlab = "d = 1", ylab= "d = 2", main="k-means with K=4 clusters")
KM <- kmeans(t(z), centers=4)
dataKM <- data.frame("X"=z[1,], "Y"=z[2,],
"Cluster"=as.character(KM$cluster))
dataCenters <- data.frame("X"=KM$centers[,1],
"Y"=KM$centers[,2],
"Cluster"=rownames(KM$centers))
p <- (ggplot(dataKM)
+ geom_point(aes(x=X, y=Y, col=Cluster))
+ geom_point(aes(x=X, y=Y, fill=Cluster, order=Cluster),
data=dataCenters, shape=22, size=5)
+ scale_colour_discrete(name="Cluster")
+ ggtitle("K-means with K=4 clusters\n"))
p
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.