Description Usage Arguments Author(s) Examples
View source: R/My_Mix_clustering.R
The function builds clusters. It takes as input the data, the number of clusters, the number of iterations of the EM algorithm, and the initialization clustering algorithm. It outputs the clusters, the AIC/BIC values, and the posterior probabilities.
1 | My_Mix_clustering(X, clust, iterations, initialisation)
|
X |
Data frame X to which clustering is applied |
clust |
integer, number of clusters the function should output |
iterations |
integer, number of iterations of EM algorithm |
initialisation |
"kmeans" or "random", the clustering algorithm for initialization |
Nour-Dass HAMMADI & Farida BENCHENLAL, students in the M2 Data Mining program at Lyon 2 Lumière University
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
library(bayess)
library(MASS)
library(mvtnorm)
library(dplyr)
library(FactoMineR)
library(factoextra)
## Demo data: mtcars, with vs, am and gear treated as categorical
## variables (factors) instead of numeric codes.
x <- mtcars
for (categ in c("vs", "am", "gear")) {
  x[[categ]] <- as.factor(x[[categ]])
}
## The function is currently defined as
## Mixed-data model-based clustering fitted by EM.
##
## The numeric columns of X are modelled as a Gaussian mixture and the
## factor columns as cluster-conditional independent multinomials.
##
## X              data frame mixing numeric and factor columns
## clust          integer, number of mixture components
## iterations     integer, number of EM iterations to run
## initialisation "kmeans" or "random" starting strategy
##
## Returns a list with the mixing proportions, Gaussian means and
## covariances per iteration, the log-likelihood trace, the hard
## assignment vector z, and the BIC / ICL criteria.
My_Mix_clustering <- function(X, clust, iterations, initialisation) {
  set.seed(123)  # reproducible random initialization / sampling
  n <- nrow(X)   # number of observations

  ## Split quantitative and qualitative variables (the scraped original
  ## was truncated here: `as.matrix(X` with no column selection).
  is_num <- sapply(X, is.numeric)
  donnees_quati <- as.matrix(X[, is_num, drop = FALSE])
  col <- ncol(donnees_quati)
  donnees_quali <- X[, !is_num, drop = FALSE]
  col_quali <- ncol(donnees_quali)
  ## Number of levels of each qualitative variable.
  mod <- sapply(seq_len(col_quali), function(i) {
    length(levels(donnees_quali[, i]))
  })

  ## Parameter containers, one slice per EM iteration (slice 1 = init).
  prop <- matrix(NA, iterations + 1, clust)
  mu <- array(NA, dim = c(iterations + 1, clust, col))
  sigma <- array(NA, dim = c(iterations + 1, clust, col, col))
  alpha <- array(NA, dim = c(iterations + 1, clust, col_quali))
  mode(alpha) <- "list"  # each cell stores a vector of level probabilities
  log_vrai <- rep(0, iterations + 1)

  ## Initialization of the Gaussian part: random draws or k-means.
  if (initialisation == 'random') {
    prop[1, ] <- rdirichlet(1, par = rep(1, clust))
    mu[1, , ] <- donnees_quati[sample(1:n, clust), ]
    for (k in 1:clust)
      sigma[1, k, , ] <- rWishart(1, 8, var(donnees_quati))
  }
  if (initialisation == 'kmeans') {
    ## was `$clust`: spell out `$cluster` (avoid partial matching)
    z <- kmeans(donnees_quati, clust)$cluster
    for (k in 1:clust) {
      prop[1, k] <- mean(z == k)
      ## drop = FALSE keeps a matrix even for a single-member cluster
      mu[1, k, ] <- colMeans(donnees_quati[which(z == k), , drop = FALSE])
      sigma[1, k, , ] <- var(donnees_quati[which(z == k), , drop = FALSE])
    }
  }

  ## Initialization of the multinomial parameters (flat Dirichlet draws).
  for (k in 1:clust) {
    for (i in 1:col_quali) {
      alpha[1, k, i] <- list(rdirichlet(1, rep(1, mod[i])))
      names(alpha[1, k, i][[1]]) <- levels(donnees_quali[, i])
    }
  }

  ## Initial log-likelihood.
  for (i in 1:n) {
    tmp <- 0
    for (k in 1:clust) {
      fk <- 1
      for (j in 1:col_quali) {
        ## factor subscript selects the probability of the observed level
        fk <- fk * alpha[1, k, j][[1]][donnees_quali[i, j]]
      }
      tmp <- tmp +
        prop[1, k] * fk *
          dmvnorm(donnees_quati[i, ], mu[1, k, ], sigma[1, k, , ])
    }
    log_vrai[1] <- log_vrai[1] + log(tmp)
  }

  ## EM algorithm.
  for (iter in 1:iterations) {
    ## E-step: posterior membership probabilities tik.
    tik <- matrix(NA, n, clust)
    for (k in 1:clust) {
      fk <- 1
      for (i in 1:col_quali) {
        fk <- fk * alpha[iter, k, i][[1]][donnees_quali[, i]]
      }
      ## BUG FIX: the categorical and Gaussian densities multiply; the
      ## original used `fk + dmvnorm(...)` here, inconsistent with the
      ## `fk * dmvnorm(...)` used in both log-likelihood loops.
      tik[, k] <-
        prop[iter, k] * fk *
          dmvnorm(donnees_quati, mu[iter, k, ], sigma[iter, k, , ])
    }
    tik <- tik / rowSums(tik)

    ## M-step: weighted updates of all parameters.
    for (k in 1:clust) {
      nk <- sum(tik[, k])
      prop[iter + 1, k] <- nk / n
      mu[iter + 1, k, ] <- colSums(tik[, k] * donnees_quati) / nk
      ## BUG FIX: the covariance is the weighted mean of OUTER products
      ## of the deviations; the original summed weighted deviation
      ## vectors and never divided by nk, so sigma was not a covariance.
      sigma[iter + 1, k, , ] <- Reduce('+', lapply(1:n, function(m) {
        d <- donnees_quati[m, ] - mu[iter + 1, k, ]
        tik[m, k] * tcrossprod(d)
      })) / nk
      for (i in 1:col_quali) {
        alpha[iter + 1, k, i] <- list(sapply(1:mod[i], function(a) {
          sum(tik[, k] * (donnees_quali[, i] == levels(donnees_quali[, i])[a])) / nk
        }))
        names(alpha[iter + 1, k, i][[1]]) <- levels(donnees_quali[, i])
      }
    }

    ## Log-likelihood after this iteration.
    for (i in 1:n) {
      tmp <- 0
      for (k in 1:clust) {
        fk <- 1
        for (j in 1:col_quali) {
          fk <- fk * alpha[iter + 1, k, j][[1]][donnees_quali[i, j]]
        }
        tmp <- tmp +
          prop[iter + 1, k] * fk *
            dmvnorm(donnees_quati[i, ], mu[iter + 1, k, ], sigma[iter + 1, k, , ])
      }
      log_vrai[iter + 1] <- log_vrai[iter + 1] + log(tmp)
    }
  }

  ## Hard assignment: most probable component for each observation.
  z <- max.col(tik)

  ## BIC with the full free-parameter count nu (the original penalized
  ## only `clust/2 * log(n)`): clust-1 mixing proportions, per cluster
  ## `col` means, col(col+1)/2 covariance terms, and mod[i]-1 free
  ## multinomial probabilities per qualitative variable.
  nu <- (clust - 1) + clust * (col + col * (col + 1) / 2 + sum(mod - 1))
  BIC <- log_vrai[iterations + 1] - nu / 2 * log(n)
  ## ICL = BIC minus the posterior entropy; sum(tik * log(tik)) <= 0,
  ## so ICL <= BIC (the original subtracted it, flipping the sign).
  ICL <- BIC + sum(tik * log(tik), na.rm = TRUE)

  return(
    list(
      prop = prop,
      mu = mu,
      sigma = sigma,
      clust = clust,
      log_vrai = log_vrai,
      z = z,
      BIC = BIC,
      ICL = ICL
    )
  )
}
## Fit the mixed mixture model on mtcars (x, prepared above) with 3
## clusters and 20 EM iterations, once per initialization strategy.
## NOTE(review): as scraped, the function above is printed anonymously
## ("The function is currently defined as ..."); these calls assume it
## has been assigned to My_Mix_clustering — confirm against the package
## source R/My_Mix_clustering.R.
mix_clust_kmeans <- My_Mix_clustering(x, 3, 20, 'kmeans')
mix_clust_random <- My_Mix_clustering(x, 3, 20, 'random')
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.