#' Calculate cluster validation metrics for FANNY method. See NbClust::NbClust() and cluster::fanny() for more details.
#'
#' @param data matrix or dataset
#' @param diss dissimilarity matrix to be used. By default, diss=NULL, but if it is replaced by a dissimilarity matrix, distance should be "NULL".
#' @param distance the distance measure to be used to compute the dissimilarity matrix. This must be one of: "euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski" or "NULL". By default, distance="euclidean". If the distance is "NULL", the dissimilarity matrix (diss) should be given by the user. If distance is not "NULL", the dissimilarity matrix should be "NULL".
#' @param min.nc minimal number of clusters, between 1 and (number of objects - 1)
#' @param max.nc maximal number of clusters, between 2 and (number of objects - 1), greater or equal to min.nc. By default, max.nc=15
#' @param method the cluster analysis method to be used. This should be one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans"
#' @param index the index to be calculated. This should be one of : "kl", "ch", "hartigan", "ccc", "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", "hubert", "sdindex", "dindex", "sdbw", "all" (all indices except GAP, Gamma, Gplus and Tau), "alllong" (all indices with Gap, Gamma, Gplus and Tau included).
#' @param alphaBeale significance value for Beale's index.
#' @param memb.exp value to pass to cluster::fanny()
#'
#' @return list containing index values, critical values, best number of clusters from each metric, and the best partitioning results
#' @export
# data = ord.df
# diss = chord.df
# distance = NULL
# min.nc = 2
# max.nc = 12
# method = "fanny"
# index = "all"
# alphaBeale = 0.1
# memb.exp= 1.2
NbClust_fanny <-function(data = NULL,
diss=NULL,
distance ="euclidean",
min.nc=2,
max.nc=15,
method =NULL,
index = "all",
alphaBeale = 0.1,
memb.exp= 2
){
x<-0
min_nc <- min.nc
max_nc <- max.nc
if(is.null(method))stop("method is NULL")
method <- pmatch(method, c("ward.D2", "single", "complete", "average",
"mcquitty", "median", "centroid", "kmeans","ward.D", "fanny"))
indice <- pmatch(index, c("kl","ch","hartigan","ccc","scott","marriot","trcovw","tracew","friedman",
"rubin","cindex","db","silhouette","duda","pseudot2","beale","ratkowsky","ball",
"ptbiserial","gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn",
"hubert", "sdindex", "dindex", "sdbw", "all","alllong"))
if (is.na(indice))stop("invalid clustering index")
if (indice == -1)stop("ambiguous index")
if ((indice == 3)|| (indice == 5)|| (indice == 6)|| (indice == 7)|| (indice == 8)|| (indice == 9)|| (indice == 10)|| (indice == 11)|| (indice == 18)|| (indice == 27)|| (indice == 29)|| (indice == 31)|| (indice == 32)){
if((max.nc-min.nc)<2)
stop("The difference between the minimum and the maximum number of clusters must be at least equal to 2")
}
if(is.null(data)){
if(method==8){
stop("\n","method = kmeans, data matrix is needed")
}else{
if ((indice == 1 )|| (indice == 2)|| (indice == 3 )|| (indice == 4 )|| (indice == 5)|| (indice == 6)|| (indice == 7)|| (indice == 8)|| (indice == 9)|| (indice == 10)|| (indice == 12)|| (indice == 14)|| (indice == 15)|| (indice == 16)|| (indice == 17)|| (indice == 18)
|| (indice == 19)|| (indice == 20)|| (indice == 23)|| (indice == 24)|| (indice == 25) || (indice == 27)|| (indice == 28)
|| (indice == 29) || (indice == 30)|| (indice == 31) || (indice == 32))
stop("\n","Data matrix is needed. Only frey, mcclain, cindex, sihouette and dunn can be computed.", "\n")
if(is.null(diss)){
stop("data matrix and dissimilarity matrix are both null")
}else{
cat("\n","Only frey, mcclain, cindex, sihouette and dunn can be computed. To compute the other indices, data matrix is needed","\n")
}
}
}else{
jeu1 <- as.matrix(data)
numberObsBefore <- dim(jeu1)[1]
jeu <- na.omit(jeu1) # returns the object with incomplete cases removed
nn <- numberObsAfter <- dim(jeu)[1]
pp <- dim(jeu)[2]
TT <- t(jeu)%*%jeu
sizeEigenTT <- length(eigen(TT)$value)
eigenValues <- eigen(TT/(nn-1))$value
# Only for indices using vv : CCC, Scott, marriot, tracecovw, tracew, friedman, rubin
if (any(indice == 4) || (indice == 5) || (indice == 6) || (indice == 7) || (indice == 8) || (indice == 9) || (indice == 10) || (indice == 31) || (indice == 32)){
for (i in 1:sizeEigenTT) {
if (eigenValues[i] < 0) {
#cat(paste("There are only", numberObsAfter,"nonmissing observations out of a possible", numberObsBefore ,"observations."))
stop("The TSS matrix is indefinite. There must be too many missing values. The index cannot be calculated.")
}
}
s1 <- sqrt(eigenValues)
ss <- rep(1,sizeEigenTT)
for (i in 1:sizeEigenTT)
{
if (s1[i]!=0)
ss[i]=s1[i]
}
vv <- prod(ss)
}
}
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
# #
# Distances #
# #
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
if(is.null(distance)){
distanceM<-7}
if(!is.null(distance)){
distanceM <- pmatch(distance, c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))
}
if (is.na(distanceM)){
stop("invalid distance")
}
if(is.null(diss)){
if (distanceM == 1) {
md <- dist(jeu, method="euclidean") # "dist" function computes and returns the distance matrix computed by using the specified distance measure to compute the distances between the rows of a data matrix.
}
if (distanceM == 2) {
md <- dist(jeu, method="maximum")
}
if (distanceM == 3) {
md <- dist(jeu, method="manhattan")
}
if (distanceM == 4) {
md <- dist(jeu, method="canberra")
}
if (distanceM == 5) {
md <- dist(jeu, method="binary")
}
if (distanceM == 6) {
md <- dist(jeu, method="minkowski")
}
if (distanceM == 7) {
stop("dissimilarity matrix and distance are both NULL")
}
}
if(!is.null(diss)){
if((distanceM==1)||(distanceM==2)|| (distanceM==3)|| (distanceM==4)|| (distanceM==5)|| (distanceM==6)){
stop("dissimilarity matrix and distance are both not null")
}else{
md <- diss
}
}
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
# #
# Methods #
# #
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
res <- array(0, c(max_nc-min_nc+1,30))
x_axis <- min_nc:max_nc
resCritical <- array(0, c(max_nc-min_nc+1,4))
rownames(resCritical) <- min_nc:max_nc
colnames(resCritical) <- c("CritValue_Duda", "CritValue_PseudoT2", "Fvalue_Beale", "CritValue_Gap")
rownames(res) <- min_nc:max_nc
colnames(res) <- c("KL","CH","Hartigan","CCC","Scott","Marriot", "TrCovW", "TraceW","Friedman","Rubin","Cindex","DB",
"Silhouette", "Duda", "Pseudot2", "Beale", "Ratkowsky", "Ball", "Ptbiserial", "Gap", "Frey", "McClain","Gamma", "Gplus", "Tau", "Dunn",
"Hubert", "SDindex", "Dindex", "SDbw")
if (is.na(method))
stop("invalid clustering method")
if (method == -1)
stop("ambiguous method")
if (method == 1) {
hc<-hclust(md,method = "ward.D2")
}
if (method == 2) {
hc<-hclust(md,method = "single")
}
if (method == 3){
hc<-hclust(md,method = "complete")
}
if (method == 4) {
hc<-hclust(md,method = "average")
}
if (method == 5) {
hc<-hclust(md,method = "mcquitty")
}
if (method == 6) {
hc<-hclust(md,method = "median")
}
if (method == 7) {
hc<-hclust(md,method = "centroid")
}
if (method == 9) {
hc<-hclust(md,method = "ward.D")
}
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
# #
# Indices #
# #
#&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&#
##############################
# #
# SD and SDbw #
# #
##############################
centers<-function(cl,x){
x <- as.matrix(x)
n <- length(cl)
k <- max(cl)
centers <- matrix(nrow = k, ncol = ncol(x))
{
for (i in 1:k)
{
for (j in 1:ncol(x))
{
centers[i, j] <- mean(x[cl == i, j])
}
}
}
return(centers)
}
Average.scattering <- function (cl, x){
x <- as.matrix(x)
n <- length(cl)
k <- max(cl)
centers.matrix <- centers(cl,x)
cluster.size <- numeric(0)
variance.clusters <- matrix(0, ncol = ncol(x), nrow = k)
var <- matrix(0, ncol = ncol(x), nrow = k)
for (u in 1:k)
cluster.size[u] <- sum(cl == u)
for (u in 1:k) {
for (j in 1:ncol(x)) {
for(i in 1:n) {
if(cl[i]==u)
variance.clusters[u,j]<- variance.clusters[u,j]+(x[i, j]-centers.matrix[u,j])^2
}
}
}
for (u in 1:k) {
for (j in 1:ncol(x))
variance.clusters[u,j]= variance.clusters[u,j]/ cluster.size[u]
}
variance.matrix <- numeric(0)
for(j in 1:ncol(x))
variance.matrix[j]=var(x[,j])*(n-1)/n
Somme.variance.clusters<-0
for (u in 1:k)
Somme.variance.clusters<-Somme.variance.clusters+sqrt((variance.clusters[u,]%*%(variance.clusters[u,])))
# Standard deviation
stdev<-(1/k)*sqrt(Somme.variance.clusters)
#Average scattering for clusters
scat<- (1/k)* (Somme.variance.clusters /sqrt(variance.matrix %*% variance.matrix))
scat <- list(stdev=stdev, centers=centers.matrix, variance.intraclusters= variance.clusters, scatt=scat)
return(scat)
}
density.clusters<-function(cl, x){
x <- as.matrix(x)
k <- max(cl)
n <- length(cl)
distance <- matrix(0, ncol = 1, nrow = n)
density <- matrix(0, ncol = 1, nrow = k)
centers.matrix<-centers(cl,x)
stdev<-Average.scattering(cl,x)$stdev
for(i in 1:n) {
u=1
while(cl[i] != u )
u<-u+1
for (j in 1:ncol(x)){
distance[i]<- distance[i]+(x[i,j]-centers.matrix[u,j])^2
}
distance[i]<-sqrt(distance[i])
if (distance[i] <= stdev)
density[u]= density[u]+1
}
dens<-list(distance=distance, density=density)
return(dens)
}
density.bw<-function(cl, x){
x <- as.matrix(x)
k <- max(cl)
n <- length(cl)
centers.matrix<-centers(cl,x)
stdev<-Average.scattering(cl,x)$stdev
density.bw<- matrix(0, ncol = k, nrow = k)
u<-1
for(u in 1:k){
for(v in 1:k){
if(v!=u){
distance<- matrix(0, ncol = 1, nrow = n)
moy<-(centers.matrix[u,]+centers.matrix[v,])/2
for(i in 1:n){
if((cl[i]==u)||(cl[i]==v)){
for (j in 1:ncol(x)){
distance[i]<- distance[i]+(x[i,j]-moy[j])^2
}
distance[i]<- sqrt(distance[i])
if(distance[i]<= stdev){
density.bw[u,v]<-density.bw[u,v]+1
}
}
}
}
}
}
density.clust<-density.clusters(cl,x)$density
S<-0
for(u in 1:k)
for(v in 1:k){
if(max(density.clust[u], density.clust[v])!=0)
S=S+ (density.bw[u,v]/max(density.clust[u], density.clust[v]))
}
density.bw<-S/(k*(k-1))
return(density.bw)
}
Dis <- function (cl, x){ # Dis : Total separation between clusters
x <- as.matrix(x)
k <- max(cl)
centers.matrix <- centers(cl,x)
Distance.centers<-dist(centers.matrix)
Dmin<-min(Distance.centers)
Dmax<-max(Distance.centers)
Distance.centers<-as.matrix(Distance.centers)
s2<-0
for (u in 1:k){
s1=0
for(j in 1:ncol(Distance.centers)){
s1<-s1 + Distance.centers[u,j]
}
s2<-s2+1/s1
}
Dis<-(Dmax/Dmin)*s2
return(Dis)
}
##################################
# #
# Hubert index #
# #
##################################
Index.Hubert<-function(x, cl){
k <- max(cl)
n<-dim(x)[1]
y <- matrix(0, ncol = dim(x)[2], nrow = n)
P<- as.matrix(md)
meanP<-mean(P)
variance.matrix <- numeric(0)
md <- dist(x, method="euclidean")
for(j in 1:n)
variance.matrix[j]=var(P[,j])*(n-1)/n
varP<-sqrt(variance.matrix %*% variance.matrix)
centers.clusters<-centers(cl,x)
for(i in 1:n){
for(u in 1:k){
if(cl[i]==u)
y[i,]<-centers.clusters[u,]
}
}
Q<- as.matrix(dist(y, method="euclidean"))
meanQ<-mean(Q)
for(j in 1:n)
variance.matrix[j]=var(Q[,j])*(n-1)/n
varQ<-sqrt(variance.matrix %*% variance.matrix)
M<-n*(n-1)/2
S<-0
n1<-n-1
for(i in 1:n1){
j<-i+1
while(j<=n){
S<-S+(P[i,j]-meanP)*(Q[i,j]-meanQ)
j<-j+1
}
}
gamma<-S/(M*varP*varQ)
return(gamma)
}
##################################
# #
# Gamma, Gplus and Tau #
# #
##################################
Index.sPlussMoins <- function (cl1,md){
cn1 <- max(cl1)
n1 <- length(cl1)
dmat <- as.matrix(md)
average.distance <- median.distance <- separation <- cluster.size <- within.dist1 <- between.dist1 <- numeric(0)
separation.matrix <- matrix(0, ncol = cn1, nrow = cn1)
di <- list()
for (u in 1:cn1) {
cluster.size[u] <- sum(cl1 == u)
du <- as.dist(dmat[cl1 == u, cl1 == u])
within.dist1 <- c(within.dist1, du)
average.distance[u] <- mean(du)
median.distance[u] <- median(du)
bv <- numeric(0)
for (v in 1:cn1) {
if (v != u) {
suv <- dmat[cl1 == u, cl1 == v]
bv <- c(bv, suv)
if (u < v) {
separation.matrix[u, v] <- separation.matrix[v,u] <- min(suv)
between.dist1 <- c(between.dist1, suv)
}
}
}
}
nwithin1 <- length(within.dist1)
nbetween1 <- length(between.dist1)
meanwithin1 <- mean(within.dist1)
meanbetween1 <- mean(between.dist1)
s.plus <- s.moins <- 0
#s.moins<-sum(rank(c(within.dist1,between.dist1),ties="first")[1:nwithin1]-rank(within.dist1,ties="first"))
#s.plus <-sum(rank(c(-within.dist1,-between.dist1),ties="first")[1:nwithin1]-rank(-within.dist1,ties="first"))
for (k in 1: nwithin1){
s.plus <- s.plus+(colSums(outer(between.dist1,within.dist1[k], ">")))
s.moins <- s.moins+(colSums(outer(between.dist1,within.dist1[k], "<")))
}
Index.Gamma <- (s.plus-s.moins)/(s.plus+s.moins)
Index.Gplus <- (2*s.moins)/(n1*(n1-1))
t.tau <- (nwithin1*nbetween1)-(s.plus+s.moins)
Index.Tau <- (s.plus-s.moins)/(((n1*(n1-1)/2-t.tau)*(n1*(n1-1)/2))^(1/2))
results <- list(gamma=Index.Gamma, gplus=Index.Gplus, tau=Index.Tau)
return(results)
}
##################################
# #
# Frey and McClain #
# #
##################################
Index.15and28 <- function (cl1,cl2,md){
cn1 <- max(cl1)
n1 <- length(cl1)
dmat <- as.matrix(md)
average.distance <- median.distance <- separation <- cluster.size <- within.dist1 <- between.dist1 <- numeric(0)
separation.matrix <- matrix(0, ncol = cn1, nrow = cn1)
di <- list()
for (u in 1:cn1){
cluster.size[u] <- sum(cl1 == u)
du <- as.dist(dmat[cl1 == u, cl1 == u])
within.dist1 <- c(within.dist1, du)
#average.distance[u] <- mean(du)
#median.distance[u] <- median(du)
#bv <- numeric(0)
for (v in 1:cn1) {
if (v != u) {
suv <- dmat[cl1 == u, cl1 == v]
#bv <- c(bv, suv)
if (u < v) {
separation.matrix[u, v] <- separation.matrix[v,u] <- min(suv)
between.dist1 <- c(between.dist1, suv)
}
}
}
}
cn2 <- max(cl2)
n2 <- length(cl2)
dmat <- as.matrix(md)
average.distance <- median.distance <- separation <- cluster.size <- within.dist2 <- between.dist2 <- numeric(0)
separation.matrix <- matrix(0, ncol = cn2, nrow = cn2)
di <- list()
for (w in 1:cn2) {
cluster.size[w] <- sum(cl2 == w)
dw <- as.dist(dmat[cl2 == w, cl2 == w])
within.dist2 <- c(within.dist2, dw)
#average.distance[w] <- mean(dw)
#median.distance[w] <- median(dw)
bx <- numeric(0)
for (x in 1:cn2) {
if (x != w) {
swx <- dmat[cl2 == w, cl2 == x]
bx <- c(bx, swx)
if (w < x) {
separation.matrix[w, x] <- separation.matrix[x,w] <- min(swx)
between.dist2 <- c(between.dist2, swx)
}
}
}
}
nwithin1 <- length(within.dist1)
nbetween1 <- length(between.dist1)
meanwithin1 <- mean(within.dist1)
meanbetween1 <- mean(between.dist1)
meanwithin2 <- mean(within.dist2)
meanbetween2 <- mean(between.dist2)
Index.15 <- (meanbetween2-meanbetween1)/(meanwithin2-meanwithin1)
Index.28 <- (meanwithin1/nwithin1)/(meanbetween1/nbetween1)
results <- list(frey=Index.15,mcclain=Index.28)
return(results)
}
##################################
# #
# Point-biserial #
# #
##################################
Indice.ptbiserial <- function (x,md,cl1){
nn <- dim(x)[1]
pp <- dim(x)[2]
md2 <- as.matrix(md)
m01 <- array(NA, c(nn,nn))
nbr <- (nn*(nn-1))/2
pb <- array(0,c(nbr,2))
m3 <- 1
for (m1 in 2:nn)
{
m12 <- m1-1
for (m2 in 1:m12)
{
if (cl1[m1]==cl1[m2]) m01[m1,m2]<-0
if (cl1[m1]!=cl1[m2]) m01[m1,m2]<-1
pb[m3,1] <- m01[m1,m2]
pb[m3,2] <- md2[m1,m2]
m3 <- m3+1
}
}
y <- pb[,1]
x <- pb[,2]
biserial.cor <- function (x, y, use = c("all.obs", "complete.obs"), level = 1){
if (!is.numeric(x))
stop("'x' must be a numeric variable.\n")
y <- as.factor(y)
if (length(levs <- levels(y)) > 2)
stop("'y' must be a dichotomous variable.\n")
if (length(x) != length(y))
stop("'x' and 'y' do not have the same length")
use <- match.arg(use)
if (use == "complete.obs") {
cc.ind <- complete.cases(x, y)
x <- x[cc.ind]
y <- y[cc.ind]
}
ind <- y == levs[level]
diff.mu <- mean(x[ind]) - mean(x[!ind])
prob <- mean(ind)
diff.mu * sqrt(prob * (1 - prob))/sd(x)
}
ptbiserial <- biserial.cor(x=pb[,2], y=pb[,1], level = 2)
return(ptbiserial)
}
##########################################
# #
# Duda, pseudot2 and beale #
# #
##########################################
Indices.WKWL <- function (x,cl1=cl1,cl2=cl2){
dim2 <- dim(x)[2]
wss <- function(x) {
x <- as.matrix(x)
n <- length(x)
centers <- matrix(nrow = 1, ncol = ncol(x))
if (ncol(x) == 1) {
centers[1, ] <- mean(x) }
if (is.null(dim(x))){
bb <- matrix(x,byrow=FALSE,nrow=1,ncol=ncol(x))
centers[1, ] <- apply(bb, 2, mean)
}else{
centers[1, ] <- apply(x, 2, mean)
}
x.2 <- sweep(x,2,centers[1,],"-")
withins <- sum(x.2^2)
wss <- sum(withins)
return(wss)
}
ncg1 <- 1
ncg1max <- max(cl1)
while((sum(cl1==ncg1)==sum(cl2==ncg1)) && ncg1 <=ncg1max){
ncg1 <- ncg1+1 }
g1 <- ncg1
ncg2 <- max(cl2)
nc2g2 <- ncg2-1
while((sum(cl1==nc2g2)==sum(cl2==ncg2)) && nc2g2 >=1){
ncg2 <- ncg2-1
nc2g2 <- nc2g2-1
}
g2 <- ncg2
NK <- sum(cl2==g1)
WK.x <- x[cl2==g1,]
WK <- wss(x=WK.x)
NL <- sum(cl2==g2)
WL.x <- x[cl2==g2,]
WL <- wss(x=WL.x)
NM <- sum(cl1==g1)
WM.x <- x[cl1==g1,]
WM <- wss(x=WM.x)
duda <- (WK+WL)/WM
BKL <- WM-WK-WL
pseudot2 <- BKL/((WK+WL)/(NK+NL-2))
beale <- (BKL/(WK+WL))/(((NM-1)/(NM-2))*(2^(2/dim2)-1))
results <- list(duda=duda,pseudot2=pseudot2,NM=NM,NK=NK,NL=NL,beale=beale)
return(results)
}
########################################################################
# #
# ccc, scott, marriot, trcovw, tracew, friedman and rubin #
# #
########################################################################
Indices.WBT <- function(x,cl,P,s,vv){
n <- dim(x)[1]
pp <- dim(x)[2]
qq <- max(cl)
z <- matrix(0,ncol=qq,nrow=n)
clX <- as.matrix(cl)
for (i in 1:n)
for (j in 1:qq){
z[i,j]==0
if (clX[i,1]==j)
{z[i,j]=1}
}
xbar <- solve(t(z)%*%z)%*%t(z)%*%x
B <- t(xbar)%*%t(z)%*%z%*%xbar
W <- P-B
marriot <- (qq^2)*det(W)
trcovw <- sum(diag(cov(W)))
tracew <- sum(diag(W))
if(det(W)!=0){
scott <- n*log(det(P)/det(W))
}else{
cat("Error: division by zero!")}
friedman <- sum(diag(solve(W)*B))
rubin <- sum(diag(P))/sum(diag(W))
R2 <- 1-sum(diag(W))/sum(diag(P))
v1 <- 1
u <- rep(0,pp)
c <- (vv/(qq))^(1/pp)
u <- s/c
k1 <- sum((u>=1)==TRUE)
p1 <- min(k1,qq-1)
if (all(p1>0,p1<pp)){
for (i in 1:p1)
v1 <- v1*s[i]
c <- (v1/(qq))^(1/p1)
u <- s/c
b1 <- sum(1/(n+u[1:p1]))
b2 <- sum(u[p1+1:pp]^2/(n+u[p1+1:pp]),na.rm=TRUE)
E_R2 <- 1-((b1+b2)/sum(u^2))*((n-qq)^2/n)*(1+4/n)
ccc <- log((1-E_R2)/(1-R2))*(sqrt(n*p1/2)/((0.001+E_R2)^1.2))
}else{
b1 <- sum(1/(n+u))
E_R2 <- 1-(b1/sum(u^2))*((n-qq)^2/n)*(1+4/n)
ccc <- log((1-E_R2)/(1-R2))*(sqrt(n*pp/2)/((0.001+E_R2)^1.2))
}
results <- list(ccc=ccc,scott=scott,marriot=marriot,trcovw=trcovw,tracew=tracew,friedman=friedman,rubin=rubin)
return(results)
}
########################################################################
# #
# Kl, Ch, Hartigan, Ratkowsky and Ball #
# #
########################################################################
Indices.Traces <- function(data, d, clall, index = "all"){
x <- data
cl0 <- clall[,1]
cl1 <- clall[,2]
cl2 <- clall[,3]
clall <- clall
nb.cl0 <- table(cl0)
nb.cl1 <- table(cl1)
nb.cl2 <- table(cl2)
nb1.cl0 <- sum(nb.cl0==1)
nb1.cl1 <- sum(nb.cl1==1)
nb1.cl2 <- sum(nb.cl2==1)
gss <- function(x, cl, d){
n <- length(cl)
k <- max(cl)
centers <- matrix(nrow = k, ncol = ncol(x))
for (i in 1:k){
if (ncol(x) == 1){
centers[i, ] <- mean(x[cl == i, ])
}
if (is.null(dim(x[cl == i, ]))){
bb <- matrix(x[cl == i, ],byrow=FALSE,nrow=1,ncol=ncol(x))
centers[i, ] <- apply(bb, 2, mean)
}else {
centers[i, ] <- apply(x[cl == i, ], 2, mean)
}
}
allmean <- apply(x, 2, mean)
dmean <- sweep(x, 2, allmean, "-")
allmeandist <- sum(dmean^2)
withins <- rep(0, k)
x.2 <- (x - centers[cl, ])^2
for (i in 1:k){
withins[i] <- sum(x.2[cl == i, ])
}
wgss <- sum(withins)
bgss <- allmeandist - wgss
results <- list(wgss=wgss,bgss=bgss, centers=centers)
return(results)
}
vargss <- function(x, clsize, varwithins){
nvar <- dim(x)[2]
n <- sum(clsize)
k <- length(clsize)
varallmean <- rep(0, nvar)
varallmeandist <- rep(0, nvar)
varwgss <- rep(0, nvar)
for (l in 1:nvar) varallmean[l] <- mean(x[, l])
vardmean <- sweep(x, 2, varallmean, "-")
for (l in 1:nvar) {
varallmeandist[l] <- sum((vardmean[, l])^2)
varwgss[l] <- sum(varwithins[, l])
}
varbgss <- varallmeandist - varwgss
vartss <- varbgss + varwgss
zvargss <- list(vartss = vartss, varbgss = varbgss)
return(zvargss)
}
varwithinss <- function(x, centers, cluster) {
nrow <- dim(centers)[1]
nvar <- dim(x)[2]
varwithins <- matrix(0, nrow, nvar)
x <- (x - centers[cluster, ])^2
for (l in 1:nvar) {
for (k in 1:nrow) {
varwithins[k, l] <- sum(x[cluster == k, l])
}
}
return(varwithins)
}
indice.kl <- function (x, clall, d = NULL, centrotypes = "centroids"){
if (nb1.cl1 > 0){
KL <- NA
}
if (sum(c("centroids", "medoids") == centrotypes) == 0)
stop("Wrong centrotypes argument")
if ("medoids" == centrotypes && is.null(d))
stop("For argument centrotypes = 'medoids' d cannot be null")
if (!is.null(d)) {
if (!is.matrix(d)) {
d <- as.matrix(d)
}
row.names(d) <- row.names(x)
}
#if (is.null(dim(x))) {
# dim(x) <- c(length(x), 1)
#}
m <- ncol(x)
g <- k <- max(clall[, 2])
KL <- abs((g - 1)^(2/m) * gss(x, clall[, 1], d)$wgss -
g^(2/m) * gss(x, clall[, 2], d)$wgss)/abs((g)^(2/m) *
gss(x, clall[, 2], d)$wgss - (g + 1)^(2/m) *
gss(x, clall[, 3], d)$wgss)
return(KL)
}
indice.ch <- function (x, cl, d = NULL, centrotypes = "centroids"){
if (nb1.cl1 > 0){
CH <- NA
}
if (sum(c("centroids", "medoids") == centrotypes) == 0)
stop("Wrong centrotypes argument")
if ("medoids" == centrotypes && is.null(d))
stop("For argument centrotypes = 'medoids' d cannot be null")
if (!is.null(d)) {
if (!is.matrix(d)) {
d <- as.matrix(d)
}
row.names(d) <- row.names(x)
}
#if (is.null(dim(x))) {
# dim(x) <- c(length(x), 1)
#}
n <- length(cl)
k <- max(cl)
CH <- (gss(x, cl, d)$bgss/(k-1))/
(gss(x, cl, d)$wgss/(n-k))
return(CH)
}
# hartigan
indice.hart <- function(x, clall, d = NULL, centrotypes = "centroids"){
if (sum(c("centroids", "medoids") == centrotypes) == 0)
stop("Wrong centrotypes argument")
if ("medoids" == centrotypes && is.null(d))
stop("For argument centrotypes = 'medoids' d cannot be null")
if (!is.null(d)) {
if (!is.matrix(d)) {
d <- as.matrix(d)
}
row.names(d) <- row.names(x)
}
#if (is.null(dim(x))) {
# dim(x) <- c(length(x), 1)
#}
n <- nrow(x)
g <- max(clall[, 1])
HART <- (gss(x, clall[, 2], d)$wgss/gss(x, clall[, 3],d)$wgss - 1) * (n - g - 1)
return(HART)
}
indice.ball <- function(x, cl, d = NULL, centrotypes = "centroids"){
wgssB <- gss(x, cl, d)$wgss
qq <- max(cl)
ball <- wgssB/qq
return(ball)
}
indice.ratkowsky <- function(x, cl, d, centrotypes = "centroids"){
qq <- max(cl)
clsize <- table(cl)
centers <- gss(x, cl, d)$centers
varwithins <- varwithinss(x, centers, cl)
zvargss <- vargss(x, clsize, varwithins)
ratio <- mean(sqrt(zvargss$varbgss/zvargss$vartss))
ratkowsky <- ratio/sqrt(qq)
return(ratkowsky)
}
indice <- pmatch(index, c("kl", "ch", "hart", "ratkowsky", "ball", "all"))
if (is.na(indice))
stop("invalid clustering index")
if (indice == -1)
stop("ambiguous index")
vecallindex <- numeric(5)
if (any(indice == 1) || (indice == 6))
vecallindex[1] <- indice.kl(x,clall,d)
if (any(indice == 2) || (indice == 6))
vecallindex[2] <- indice.ch(x,cl=clall[,2],d)
if (any(indice == 3) || (indice == 6))
vecallindex[3] <- indice.hart(x,clall,d)
if (any(indice == 4) || (indice == 6))
vecallindex[4] <- indice.ratkowsky(x,cl=cl1, d)
if (any(indice == 5) || (indice == 6))
vecallindex[5] <- indice.ball(x,cl=cl1,d)
names(vecallindex) <- c("kl", "ch", "hart", "ratkowsky", "ball")
if (indice < 6)
vecallindex <- vecallindex[indice]
return(vecallindex)
}
########################################################################
# #
# C-index #
# #
########################################################################
Indice.cindex <- function (d, cl){
d <- data.matrix(d)
DU <- 0
r <- 0
v_max <- array(1, max(cl))
v_min <- array(1, max(cl))
for (i in 1:max(cl)) {
n <- sum(cl == i)
if (n > 1) {
t <- d[cl == i, cl == i]
DU = DU + sum(t)/2
v_max[i] = max(t)
if (sum(t == 0) == n){
v_min[i] <- min(t[t != 0])
}else{
v_min[i] <- 0}
r <- r + n * (n - 1)/2
}
}
Dmin = min(v_min)
Dmax = max(v_max)
if (Dmin == Dmax) {
result <- NA
}else{
result <- (DU - r * Dmin)/(Dmax * r - Dmin * r)
}
result
}
########################################################################
# #
# DB #
# #
########################################################################
Indice.DB <- function (x, cl, d = NULL, centrotypes = "centroids", p = 2, q = 2){
if (sum(c("centroids") == centrotypes) == 0)
stop("Wrong centrotypes argument")
if (!is.null(d)) {
if (!is.matrix(d)) {
d <- as.matrix(d)
}
row.names(d) <- row.names(x)
}
if (is.null(dim(x))) {
dim(x) <- c(length(x), 1)
}
x <- as.matrix(x)
n <- length(cl)
k <- max(cl)
dAm <- d
centers <- matrix(nrow = k, ncol = ncol(x))
if (centrotypes == "centroids") {
for (i in 1:k) {
for (j in 1:ncol(x)) {
centers[i, j] <- mean(x[cl == i, j])
}
}
}else {
stop("wrong centrotypes argument")
}
S <- rep(0, k)
for (i in 1:k) {
ind <- (cl == i)
if (sum(ind) > 1) {
centerI <- centers[i, ]
centerI <- rep(centerI, sum(ind))
centerI <- matrix(centerI, nrow = sum(ind), ncol = ncol(x),
byrow = TRUE)
S[i] <- mean(sqrt(apply((x[ind, ] - centerI)^2, 1,
sum))^q)^(1/q)
}else{ S[i] <- 0}
}
M <- as.matrix(dist(centers, p = p))
R <- array(Inf, c(k, k))
r = rep(0, k)
for (i in 1:k) {
for (j in 1:k) {
R[i, j] = (S[i] + S[j])/M[i, j]
}
r[i] = max(R[i, ][is.finite(R[i, ])])
}
DB = mean(r[is.finite(r)])
resul <- list(DB = DB, r = r, R = R, d = M, S = S, centers = centers)
resul
}
########################################################################
# #
# Silhouette #
# #
########################################################################
Indice.S <- function (d, cl) {
d <- as.matrix(d)
Si <- 0
for (k in 1:max(cl)) {
if ((sum(cl == k)) <= 1){
Sil <- 1
}else{
Sil <- 0
for (i in 1:length(cl)) {
if (cl[i] == k) {
ai <- sum(d[i, cl == k])/(sum(cl == k) - 1)
dips <- NULL
for (j in 1:max(cl)) if (cl[i] != j)
if (sum(cl == j) != 1)
dips <- cbind(dips, c((sum(d[i, cl == j]))/(sum(cl ==
j))))
else dips <- cbind(dips, c((sum(d[i, cl ==
j]))))
bi <- min(dips)
Sil <- Sil + (bi - ai)/max(c(ai, bi))
}
}
}
Si <- Si + Sil
}
Si/length(cl)
}
########################################################################
# #
# Gap #
# #
########################################################################
Indice.Gap <- function (x, clall, reference.distribution = "unif", B = 10,
method = "ward.D2", d = NULL, centrotypes = "centroids"){
GAP <- function(X, cl, referenceDistribution, B, method, d, centrotypes)
{
set.seed(1)
simgap <- function(Xvec){
ma <- max(Xvec)
mi <- min(Xvec)
set.seed(1)
Xout <- runif(length(Xvec), min = mi, max = ma)
return(Xout)
}
pcsim <- function(X, d, centrotypes){
if (centrotypes == "centroids") {
Xmm <- apply(X, 2, mean)
}
for (k in (1:dim(X)[2])) {
X[, k] <- X[, k] - Xmm[k]
}
ss <- svd(X)
Xs <- X %*% ss$v
Xnew <- apply(Xs, 2, simgap)
Xt <- Xnew %*% t(ss$v)
for (k in (1:dim(X)[2])) {
Xt[, k] <- Xt[, k] + Xmm[k]
}
return(Xt)
}
if (is.null(dim(x))){
dim(x) <- c(length(x), 1)
}
ClassNr <- max(cl)
Wk0 <- 0
WkB <- matrix(0, 1, B)
for (bb in (1:B)) {
if (reference.distribution == "unif")
Xnew <- apply(X, 2, simgap)
else if (reference.distribution == "pc")
Xnew <- pcsim(X, d, centrotypes)
else stop("Wrong reference distribution type")
if (bb == 1) {
pp <- cl
if (ClassNr == length(cl))
pp2 <- 1:ClassNr
else if (method == "k-means")
{ set.seed(1)
pp2 <- kmeans(Xnew, ClassNr, 100)$cluster
}
else if (method == "single" || method == "complete" ||
method == "average" || method == "ward.D2" ||
method == "mcquitty" || method == "median" ||
method == "centroid"|| method=="ward.D")
pp2 <- cutree(hclust(dist(Xnew), method = method),
ClassNr)
else stop("Wrong clustering method")
if (ClassNr > 1) {
for (zz in (1:ClassNr)) {
Xuse <- X[pp == zz, ]
Wk0 <- Wk0 + sum(diag(var(Xuse))) * (length(pp[pp ==
zz]) - 1)/(dim(X)[1] - ClassNr)
Xuse2 <- Xnew[pp2 == zz, ]
WkB[1, bb] <- WkB[1, bb] + sum(diag(var(Xuse2))) *
(length(pp2[pp2 == zz]) - 1)/(dim(X)[1] -
ClassNr)
}
}
if (ClassNr == 1)
{
Wk0 <- sum(diag(var(X)))
WkB[1, bb] <- sum(diag(var(Xnew)))
}
}
if (bb > 1) {
if (ClassNr == length(cl))
pp2 <- 1:ClassNr
else if (method == "k-means")
{
set.seed(1)
pp2 <- kmeans(Xnew, ClassNr, 100)$cluster
}
else if (method == "single" || method == "complete" ||
method == "average" || method == "ward.D2" ||
method == "mcquitty" || method == "median" ||
method == "centroid"||method == "ward.D")
pp2 <- cutree(hclust(dist(Xnew), method = method),
ClassNr)
else stop("Wrong clustering method")
if (ClassNr > 1) {
for (zz in (1:ClassNr)) {
Xuse2 <- Xnew[pp2 == zz, ]
WkB[1, bb] <- WkB[1, bb] + sum(diag(var(Xuse2))) *
length(pp2[pp2 == zz])/(dim(X)[1] - ClassNr)
}
}
if (ClassNr == 1) {
WkB[1, bb] <- sum(diag(var(Xnew)))
}
}
}
Sgap <- mean(log(WkB[1, ])) - log(Wk0)
Sdgap <- sqrt(1 + 1/B) * sqrt(var(log(WkB[1, ]))) * sqrt((B -
1)/B)
resul <- list(Sgap = Sgap, Sdgap = Sdgap)
resul
}
if (sum(c("centroids", "medoids") == centrotypes) == 0)
stop("Wrong centrotypes argument")
if ("medoids" == centrotypes && is.null(d))
stop("For argument centrotypes = 'medoids' d can not be null")
if (!is.null(d)) {
if (!is.matrix(d)) {
d <- as.matrix(d)
}
row.names(d) <- row.names(x)
}
X <- as.matrix(x)
gap1 <- GAP(X, clall[, 1], reference.distribution, B, method,
d, centrotypes)
gap <- gap1$Sgap
gap2 <- GAP(X, clall[, 2], reference.distribution, B, method,
d, centrotypes)
diffu <- gap - (gap2$Sgap - gap2$Sdgap)
resul <- list(gap = gap, diffu = diffu)
resul
}
########################################################################
# #
# SD, sdbw, dunn #
# #
########################################################################
Index.sdindex<-function(x, clmax, cl)
{
x <- as.matrix(x)
Alpha<-Dis(clmax,x)
Scatt<-Average.scattering(cl,x)$scatt
Dis0<-Dis(cl,x)
SD.indice<-Alpha*Scatt + Dis0
return(SD.indice)
}
Index.SDbw<-function(x, cl)
{
x <- as.matrix(x)
Scatt<-Average.scattering(cl,x)$scatt
Dens.bw<-density.bw(cl,x)
SDbw<-Scatt+Dens.bw
return(SDbw)
}
########################################################################
# #
# D index #
# #
########################################################################
Index.Dindex<- function(cl, x)
{
x <- as.matrix(x)
distance<-density.clusters(cl, x)$distance
n<-length(distance)
S<-0
for(i in 1:n)
S<-S+distance[i]
inertieIntra<-S/n
return(inertieIntra)
}
#####################################################################
# #
# Dunn index #
# #
#####################################################################
Index.dunn <- function(md, clusters, Data=NULL, method="euclidean")
{
distance <- as.matrix(md)
nc <- max(clusters)
interClust <- matrix(NA, nc, nc)
intraClust <- rep(NA, nc)
for (i in 1:nc)
{
c1 <- which(clusters==i)
for (j in i:nc) {
if (j==i) intraClust[i] <- max(distance[c1,c1])
if (j>i) {
c2 <- which(clusters==j)
interClust[i,j] <- min(distance[c1,c2])
}
}
}
dunn <- min(interClust,na.rm=TRUE)/max(intraClust)
return(dunn)
}
################
for (nc in min_nc:max_nc)
{
if (any(method == 1) || (method == 2) || (method == 3) || (method == 4) ||
(method == 5) || (method == 6) || (method == 7)||(method == 9))
{
cl1 <- cutree(hc, k=nc)
cl2 <- cutree(hc, k=nc+1)
clall <- cbind(cl1, cl2)
clmax <- cutree(hc, k=max_nc)
if (nc >= 2)
{
cl0 <- cutree(hc, k=nc-1)
clall1 <- cbind(cl0, cl1, cl2)
}
if (nc == 1)
{
cl0 <-rep(NA,nn)
clall1 <- cbind(cl0, cl1, cl2)
}
}
if (method == 8)
{
set.seed(1)
cl2 <- kmeans(jeu,nc+1)$cluster
set.seed(1)
clmax <- kmeans(jeu,max_nc)$cluster
if (nc > 2)
{
set.seed(1)
cl1 <- kmeans(jeu,nc)$cluster
clall <- cbind(cl1, cl2)
set.seed(1)
cl0 <- kmeans(jeu,nc-1)$cluster
clall1 <- cbind(cl0, cl1, cl2)
}
if (nc == 2)
{
set.seed(1)
cl1 <- kmeans(jeu,nc)$cluster
clall <- cbind(cl1, cl2)
cl0 <- rep(1,nn)
clall1 <- cbind(cl0, cl1, cl2)
}
if (nc == 1)
{
stop("Number of clusters must be higher than 2")
}
}
if (method == 10)
{
set.seed(1)
cl2 <- fanny(x = md, k = nc+1, diss = T, memb.exp = memb.exp)$clustering
set.seed(1)
clmax <-fanny(x = md, k = max_nc, diss = T, memb.exp = memb.exp)$clustering
if (nc > 2)
{
set.seed(1)
cl1 <- fanny(x = md, k = nc, diss = T, memb.exp = memb.exp)$clustering
clall <- cbind(cl1, cl2)
set.seed(1)
cl0 <- fanny(x = md, k = nc-1, diss = T, memb.exp = memb.exp)$clustering
clall1 <- cbind(cl0, cl1, cl2)
}
if (nc == 2)
{
set.seed(1)
cl1 <- fanny(x = md, k = nc, diss = T, memb.exp = memb.exp)$clustering
clall <- cbind(cl1, cl2)
cl0 <- rep(1,nn)
clall1 <- cbind(cl0, cl1, cl2)
}
if (nc == 1)
{
stop("Number of clusters must be higher than 2")
}
}
j <- table(cl1) # table uses the cross-classifying factors to build a contingency table of the counts at each combination of factor levels.
s <- sum(j==1)
j2 <- table(cl2)
s2 <- sum(j2==1)
########### Indices.Traces-hartigan - 3e colonne de res ############
if (any(indice == 3) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,3] <- Indices.Traces(jeu, md, clall1, index = "hart")
}
########### Cubic Clustering Criterion-CCC - 4e colonne de res ############
if (any(indice == 4) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,4] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$ccc
}
########### Scott and Symons - 5e colonne de res ############
if (any(indice == 5) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,5] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$scott
}
########### Marriot - 6e colonne de res ############
if (any(indice == 6) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,6] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$marriot
}
########### Trace Cov W - 7e colonne de res ############
if (any(indice == 7) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,7] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$trcovw
}
########### Trace W - 8e colonne de res ############
if (any(indice == 8) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,8] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$tracew
}
########### Friedman - 9e colonne de res ############
if (any(indice == 9) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,9] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$friedman
}
########### Rubin - 10e colonne de res ############
if (any(indice == 10) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,10] <- Indices.WBT(x=jeu, cl=cl1, P=TT,s=ss,vv=vv)$rubin
}
########### Indices.WKWL-duda - 14e colonne de res ############
if (any(indice == 14) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,14] <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$duda
}
########### Indices.WKWL-pseudot2 - 15e colonne de res ############
if (any(indice == 15) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,15] <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$pseudot2
}
########### Indices.WKWL-beale - 16e colonne de res ############
if (any(indice == 16) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,16] <- beale <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$beale
}
########### Indices.WKWL- duda or pseudot2 or beale ############
if (any(indice == 14) || (indice == 15) || (indice == 16) || (indice == 31) || (indice == 32))
{
NM <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$NM
NK <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$NK
NL <- Indices.WKWL(x=jeu,cl1=cl1,cl2=cl2)$NL
zz <- 3.20 # Best standard score in Milligan and Cooper 1985
zzz <- zz*sqrt(2*(1-8/((pi^2)*pp))/(NM*pp))
if (any(indice == 14) || (indice == 31) || (indice == 32))
{
resCritical[nc-min_nc+1,1] <- critValue <- 1-(2/(pi*pp))-zzz
}
if ((indice == 15)|| (indice == 31) || (indice == 32))
{
critValue <- 1-(2/(pi*pp))-zzz
resCritical[nc-min_nc+1,2] <- ((1-critValue)/critValue)*(NK+NL-2)
}
if (any(indice == 16) || (indice == 31) || (indice == 32))
{
df2 <- (NM-2)*pp
resCritical[nc-min_nc+1,3] <- 1-pf(beale,pp,df2)
}
}
########### Indices.TracesL-ball - 18e colonne de res ############
if (any(indice == 18) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,18] <- Indices.Traces(jeu, md, clall1, index = "ball")
}
########### Indice.Point-Biserial - 19e colonne de res ############
if (any(indice == 19) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,19] <- Indice.ptbiserial(x=jeu, md=md, cl1=cl1)
}
########### Gap - 20e colonne de res ############
if (any(indice == 20) || (indice == 32))
{
if (method == 1) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "ward.D2", d = NULL, centrotypes = "centroids")
}
if (method == 2) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "single", d = NULL, centrotypes = "centroids")
}
if (method == 3) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "complete", d = NULL, centrotypes = "centroids")
}
if (method == 4) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "average", d = NULL, centrotypes = "centroids")
}
if (method == 5) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "mcquitty", d = NULL, centrotypes = "centroids")
}
if (method == 6) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "median", d = NULL, centrotypes = "centroids")
}
if (method == 7) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "centroid", d = NULL, centrotypes = "centroids")
}
if (method == 9) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "ward.D", d = NULL, centrotypes = "centroids")
}
if (method == 8) {
resultSGAP <- Indice.Gap(x=jeu, clall=clall, reference.distribution = "unif", B = 10, method = "k-means", d = NULL, centrotypes = "centroids")
}
res[nc-min_nc+1,20] <- resultSGAP$gap
resCritical[nc-min_nc+1,4] <- resultSGAP$diffu
}
if (nc >=2)
{
########### Indices.Traces-kl - 1e colonne de res ############
if (any(indice == 1) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,1] <- Indices.Traces(jeu, md, clall1, index = "kl")
}
########### Indices.Traces-ch - 2e colonne de res ############
if (any(indice == 2) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,2] <- Indices.Traces(jeu, md, clall1, index = "ch")
}
########### Indice.cindex - 11e colonne de res ############
if (any(indice == 11) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,11] <- Indice.cindex(d=md, cl=cl1)
}
########### Indice.DB - 12e colonne de res ############
if (any(indice == 12) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,12] <- Indice.DB(x=jeu, cl=cl1, d = NULL, centrotypes = "centroids", p = 2, q = 2)$DB
}
########### Silhouette - 13e colonne de res ############
if (any(indice == 13) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,13] <- Indice.S(d=md, cl=cl1)
}
########### Indices.Traces-ratkowsky- 17e colonne de res ############
if (any(indice == 17) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,17] <- Indices.Traces(jeu, md, clall1, index = "ratkowsky")
}
########### Indice.Frey - 21e colonne de res ############
if (any(indice == 21) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,21] <- Index.15and28(cl1=cl1,cl2=cl2,md=md)$frey
}
########### Indice.McClain - 22e colonne de res ############
if (any(indice == 22) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,22] <- Index.15and28(cl1=cl1,cl2=cl2,md=md)$mcclain
}
########### Indice.Gamma - 23e colonne de res ############
if (any(indice == 23) || (indice == 32))
{
res[nc-min_nc+1,23] <- Index.sPlussMoins(cl1=cl1,md=md)$gamma
}
########### Indice.Gplus- 24e colonne de res ############
if (any(indice == 24) || (indice == 32))
{
res[nc-min_nc+1,24] <- Index.sPlussMoins(cl1=cl1,md=md)$gplus
}
########### Indice.Tau - 25e colonne de res ############
if (any(indice == 25) || (indice == 32))
{
res[nc-min_nc+1,25] <- Index.sPlussMoins(cl1=cl1,md=md)$tau
}
########### Indices.Dunn - 26e colonne de res ############
if (any(indice == 26 ) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,26] <- Index.dunn(md, cl1, Data=jeu, method=NULL)
}
########### Indices.Hubert - 27e colonne de res ############
if (any(indice == 27 ) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,27] <- Index.Hubert(jeu, cl1)
}
########### Indices.SD - 28e colonne de res ############
if (any(indice == 28 ) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,28] <- Index.sdindex(jeu, clmax, cl1)
}
########### Indices.Dindex - 29e colonne de res ############
if (any(indice == 29 ) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,29] <- Index.Dindex(cl1, jeu)
}
########### Indices.SDbw - 30e colonne de res ############
if (any(indice == 30 ) || (indice == 31) || (indice == 32))
{
res[nc-min_nc+1,30] <- Index.SDbw(jeu, cl1)
}
}else{
res[nc-min_nc+1,1] <- NA
res[nc-min_nc+1,2] <- NA
res[nc-min_nc+1,11] <- NA
res[nc-min_nc+1,12] <- NA
res[nc-min_nc+1,13] <- NA
res[nc-min_nc+1,17] <- NA
res[nc-min_nc+1,21] <- NA
res[nc-min_nc+1,22] <- NA
res[nc-min_nc+1,23] <- NA
res[nc-min_nc+1,24] <- NA
res[nc-min_nc+1,25] <- NA
res[nc-min_nc+1,26] <- NA
res[nc-min_nc+1,27] <- NA
res[nc-min_nc+1,28] <- NA
res[nc-min_nc+1,29] <- NA
res[nc-min_nc+1,30] <- NA
}
}
#########################################################################################################
####################### Best Number of Clusters ###################
#########################################################################################################
nc.KL<-indice.KL<-0
if (any(indice == 1) || (indice == 31) || (indice == 32))
{
# KL - The value of u, which maximizes KL(u), is regarded as specifying the number of clusters [ClusterSim package].
nc.KL <- (min_nc:max_nc)[which.max(res[,1])]
indice.KL <- max(res[,1],na.rm = TRUE)
best.nc<-nc.KL
}
nc.CH<-indice.CH<-0
if (any(indice == 2) || (indice == 31) || (indice == 32))
{
# CH - The value of u, which maximizes CH(u), is regarded as specifying the number of clusters [ClusterSim package].
nc.CH <- (min_nc:max_nc)[which.max(res[,2])]
indice.CH <- max(res[,2],na.rm = TRUE)
best.nc<-nc.CH
}
nc.CCC<-indice.CCC<-0
if (any(indice == 4) || (indice == 31) || (indice == 32))
{
# CCC - The maximum value accross the hierarchy levels is used to indicate the optimal number of clusters in data [29].
nc.CCC <- (min_nc:max_nc)[which.max(res[,4])]
indice.CCC <- max(res[,4],na.rm = TRUE)
best.nc<-nc.CCC
}
nc.DB<-indice.DB<-0
if (any(indice == 12) || (indice == 31) || (indice == 32))
{
# DB - The value of u, which minimizes DB(u), is regarded as specifying the number of clusters [clusterSim package].
nc.DB <- (min_nc:max_nc)[which.min(res[,12])]
indice.DB <- min(res[,12],na.rm = TRUE)
best.nc<-nc.DB
}
nc.Silhouette<-indice.Silhouette<-0
if (any(indice == 13) || (indice == 31) || (indice == 32))
{
# SILHOUETTE - The value of u, which maximizes S(u), is regarded as specifying the number of clusters [ClusterSim package].
nc.Silhouette <- (min_nc:max_nc)[which.max(res[,13])]
indice.Silhouette <- max(res[,13],na.rm = TRUE)
best.nc<-nc.Silhouette
}
nc.Gap<-indice.Gap<-0
# GAP - Choose the number of clusters via finding the smallest q such that: Gap(q)=Gap(q+1)-Sq+1 (q=1,\u{85},n-2) [ClusterSim package].
if (any(indice == 20) || (indice == 32))
{
found <- FALSE
for (ncG in min_nc:max_nc){
if ((resCritical[ncG-min_nc+1,4] >=0) && (!found)){
ncGap <- ncG
indiceGap <- res[ncG-min_nc+1,20]
found <- TRUE
}
}
if (found){
nc.Gap <- ncGap
indice.Gap <- indiceGap
best.nc<-nc.Gap
}else{
nc.Gap <- NA
indice.Gap <- NA
}
}
nc.Duda<-indice.Duda<-0
# DUDA - Choose the number of clusters via finding the smallest q such that: duda >= critical_value [Duda and Hart (1973)].
if (any(indice == 14) || (indice == 31) || (indice == 32))
{
foundDuda <- FALSE
for (ncD in min_nc:max_nc)
{
if ((res[ncD-min_nc+1,14]>=resCritical[ncD-min_nc+1,1]) && (!foundDuda))
{
ncDuda <- ncD
indiceDuda <- res[ncD-min_nc+1,14]
foundDuda <- TRUE
}
}
if (foundDuda)
{
nc.Duda <- ncDuda
indice.Duda <- indiceDuda
best.nc<-nc.Duda
}
else
{
nc.Duda <- NA
indice.Duda <- NA
}
}
nc.Pseudo<-indice.Pseudo<-0
# PSEUDOT2 - Chooses the number of clusters via finding the smallest q such that: pseudot2 <= critical_value [SAS User's guide].
if (any(indice == 15) || (indice == 31) || (indice == 32))
{
foundPseudo <- FALSE
for (ncP in min_nc:max_nc)
{
if ((res[ncP-min_nc+1,15]<=resCritical[ncP-min_nc+1,2]) && (!foundPseudo))
{
ncPseudo <- ncP
indicePseudo <- res[ncP-min_nc+1,15]
foundPseudo <- TRUE
}
}
if (foundPseudo)
{
nc.Pseudo <- ncPseudo
indice.Pseudo <- indicePseudo
best.nc<-nc.Pseudo
}
else
{
nc.Pseudo <- NA
indice.Pseudo <- NA
}
}
nc.Beale<-indice.Beale<-0
if (any(indice == 16) || (indice == 31) || (indice == 32))
{
# BEALE - Chooses the number of clusters via finding the smallest q such that: Fvalue_beale >= 0.1 [Gordon (1999)].
foundBeale <- FALSE
for (ncB in min_nc:max_nc)
{
if ((resCritical[ncB-min_nc+1,3]>=alphaBeale) && (!foundBeale)){
ncBeale <- ncB
indiceBeale <- res[ncB-min_nc+1,16]
foundBeale <- TRUE
}
}
if (foundBeale){
nc.Beale <- ncBeale
indice.Beale <- indiceBeale
best.nc<-nc.Beale
}
else
{
nc.Beale <- NA
indice.Beale <- NA
}
}
nc.ptbiserial<-indice.ptbiserial<-0
if (any(indice == 19) || (indice == 31) || (indice == 32))
{
# POINT-BISERIAL - The maximum value was used to suggest the optimal number of clusters in the data [29].
nc.ptbiserial <- (min_nc:max_nc)[which.max(res[,19])]
indice.ptbiserial <- max(res[,19],na.rm = TRUE)
best.nc<-nc.ptbiserial
}
foundNC<-foundIndice<-numeric(0)
nc.Frey<-indice.Frey<-0
if (any(indice == 21) || (indice == 31) || (indice == 32))
{
# FREY AND VAN GROENEWOUD - The best results occured when clustering was continued until the ratio fell below 1.00 for the last
# series of times. At this point, the cluster level before this series was taken as the optimal partition.
# If the ration never fell below 1.00, a one cluster solution was assumed [29].
foundFrey <- FALSE
i<-1
for (ncF in min_nc:max_nc)
{
if (res[ncF-min_nc+1,21] < 1)
{
ncFrey <- ncF-1
indiceFrey <- res[ncF-1-min_nc+1,21]
foundFrey <- TRUE
foundNC[i]<-ncFrey
foundIndice[i]<-indiceFrey
i<-i+1
}
}
if (foundFrey)
{
nc.Frey <- foundNC[1]
indice.Frey <- foundIndice[1]
best.nc<-nc.Frey
}
else
{
nc.Frey <- NA
indice.Frey <- NA
print(paste("Frey index : No clustering structure in this data set"))
}
}
nc.McClain<-indice.McClain<-0
if (any(indice == 22) || (indice == 31) || (indice == 32))
{
# MCCLAIN AND RAO - The minimum value of the index was found to give the best recovery information [29].
nc.McClain <- (min_nc:max_nc)[which.min(res[,22])]
indice.McClain <- min(res[,22],na.rm = TRUE)
best.nc<-nc.McClain
}
nc.Gamma<-indice.Gamma<-0
if (any(indice == 23) || (indice == 31) || (indice == 32))
{
# GAMMA - Maximum values were taken to represent the correct hierarchy level [29].
nc.Gamma <- (min_nc:max_nc)[which.max(res[,23])]
indice.Gamma <- max(res[,23],na.rm = TRUE)
best.nc<-nc.Gamma
}
nc.Gplus<-indice.Gplus<-0
if (any(indice == 24) || (indice == 31) || (indice == 32))
{
# GPLUS - Minimum values were used to determine the number of clusters in the data [29].
nc.Gplus <- (min_nc:max_nc)[which.min(res[,24])]
indice.Gplus <- min(res[,24],na.rm = TRUE)
best.nc<-nc.Gplus
}
nc.Tau<-indice.Tau<-0
if (any(indice == 25) || (indice == 31) || (indice == 32))
{
# TAU - The maximum value in the hierarchy sequence was taken as indicating the correct number of clusters [29].
nc.Tau <- (min_nc:max_nc)[which.max(res[,25])]
indice.Tau <- max(res[,25],na.rm = TRUE)
best.nc<-nc.Tau
}
#Some indices need to compute difference between hierarchy levels to identify relevant number of clusters
if((indice==3)||(indice == 5)||(indice == 6)||(indice == 7)||(indice == 8)||(indice == 9)||(indice == 10)||(indice == 18)||(indice == 27)||(indice == 11)||(indice == 29)||(indice == 31)||(indice == 32))
{
DiffLev <- array(0, c(max_nc-min_nc+1,12))
DiffLev[,1] <- min_nc:max_nc
for (nc3 in min_nc:max_nc)
{
if (nc3==min_nc)
{
DiffLev[nc3-min_nc+1,2] <- abs(res[nc3-min_nc+1,3]-NA) # Hartigan
DiffLev[nc3-min_nc+1,3] <- abs(res[nc3-min_nc+1,5]-NA) #Scott
DiffLev[nc3-min_nc+1,4] <- abs(res[nc3-min_nc+1,6]-NA) # Marriot
DiffLev[nc3-min_nc+1,5] <- abs(res[nc3-min_nc+1,7]-NA) #Trcovw
DiffLev[nc3-min_nc+1,6] <- abs(res[nc3-min_nc+1,8]-NA) #Tracew
DiffLev[nc3-min_nc+1,7] <- abs(res[nc3-min_nc+1,9]-NA) #Friedman
DiffLev[nc3-min_nc+1,8] <- abs(res[nc3-min_nc+1,10]-NA) #Rubin
DiffLev[nc3-min_nc+1,9] <- abs(res[nc3-min_nc+1,18]-NA) # Ball
DiffLev[nc3-min_nc+1,10] <- abs(res[nc3-min_nc+1,27]-NA) # Hubert
DiffLev[nc3-min_nc+1,12] <- abs(res[nc3-min_nc+1,29]-NA) # D index
}
else
{
if(nc3==max_nc)
{
DiffLev[nc3-min_nc+1,2] <- abs(res[nc3-min_nc+1,3]-res[nc3-min_nc,3])
DiffLev[nc3-min_nc+1,3] <- abs(res[nc3-min_nc+1,5]-res[nc3-min_nc,5])
DiffLev[nc3-min_nc+1,4] <- abs(res[nc3-min_nc+1,6]-NA) # Marriot
DiffLev[nc3-min_nc+1,5] <- abs(res[nc3-min_nc+1,7]-res[nc3-min_nc,7]) # trcovw
DiffLev[nc3-min_nc+1,6] <- abs(res[nc3-min_nc+1,8]-NA) #traceW
DiffLev[nc3-min_nc+1,7] <- abs(res[nc3-min_nc+1,9]-res[nc3-min_nc,9])
DiffLev[nc3-min_nc+1,8] <- abs(res[nc3-min_nc+1,10]-NA) #Rubin
DiffLev[nc3-min_nc+1,9] <- abs(res[nc3-min_nc+1,18]-res[nc3-min_nc,18])
DiffLev[nc3-min_nc+1,10] <- abs(res[nc3-min_nc+1,27]-NA)
DiffLev[nc3-min_nc+1,12] <- abs(res[nc3-min_nc+1,29]-NA) # D index
}
else
{
DiffLev[nc3-min_nc+1,2] <- abs(res[nc3-min_nc+1,3]-res[nc3-min_nc,3]) # Hartigan
DiffLev[nc3-min_nc+1,3] <- abs(res[nc3-min_nc+1,5]-res[nc3-min_nc,5])
DiffLev[nc3-min_nc+1,4] <- ((res[nc3-min_nc+2,6]-res[nc3-min_nc+1,6])-(res[nc3-min_nc+1,6]-res[nc3-min_nc,6]))
DiffLev[nc3-min_nc+1,5] <- abs(res[nc3-min_nc+1,7]-res[nc3-min_nc,7])
DiffLev[nc3-min_nc+1,6] <- ((res[nc3-min_nc+2,8]-res[nc3-min_nc+1,8])-(res[nc3-min_nc+1,8]-res[nc3-min_nc,8]))
DiffLev[nc3-min_nc+1,7] <- abs(res[nc3-min_nc+1,9]-res[nc3-min_nc,9])
DiffLev[nc3-min_nc+1,8] <- ((res[nc3-min_nc+2,10]-res[nc3-min_nc+1,10])-(res[nc3-min_nc+1,10]-res[nc3-min_nc,10]))
DiffLev[nc3-min_nc+1,9] <- abs(res[nc3-min_nc+1,18]-res[nc3-min_nc,18])
DiffLev[nc3-min_nc+1,10] <- abs((res[nc3-min_nc+1,27]-res[nc3-min_nc,27]))
DiffLev[nc3-min_nc+1,12] <-((res[nc3-min_nc+2,29]-res[nc3-min_nc+1,29])-(res[nc3-min_nc+1,29]-res[nc3-min_nc,29])) #Dindex
}
}
}
}
nc.Hartigan<-indice.Hartigan<-0
if (any(indice == 3) || (indice == 31) || (indice == 32))
{
# HARTIGAN - The maximum differences between hierarchy levels were taken as indicating the correct number of clusters in the data [29].
nc.Hartigan <- DiffLev[,1][which.max(DiffLev[,2])]
indice.Hartigan <- max(DiffLev[,2],na.rm = TRUE)
best.nc<-nc.Hartigan
}
nc.Ratkowsky<-indice.Ratkowsky<-0
if (any(indice == 17) || (indice == 31) || (indice == 32))
{
# RATKOWSKY - The optimal number of groups is taken as the level where this criterion exhibits its maximum value [29].
nc.Ratkowsky <- (min_nc:max_nc)[which.max(res[,17])]
indice.Ratkowsky <- max(res[,17],na.rm = TRUE)
best.nc<-nc.Ratkowsky
}
nc.cindex<-indice.cindex<-0
if (any(indice == 11) || (indice == 31) || (indice == 32))
{
#CINDEX - The minimum value across the hierarchy levels was used to indicate the optimal number of clusters [29].
nc.cindex <- (min_nc:max_nc)[which.min(res[,11])]
indice.cindex <- min(res[,11],na.rm = TRUE)
best.nc<-nc.cindex
}
nc.Scott<-indice.Scott<-0
if (any(indice == 5) || (indice == 31) || (indice == 32))
{
# SCOTT - The maximum difference between hierarchy levels was used to suggest the correct number of partitions [29].
nc.Scott <- DiffLev[,1][which.max(DiffLev[,3])]
indice.Scott <- max(DiffLev[,3],na.rm = TRUE)
best.nc<-nc.Scott
}
nc.Marriot<-indice.Marriot<-0
if (any(indice == 6) || (indice == 31) || (indice == 32))
{
# MARRIOT - The maximum difference between successive levels was used to determine the best partition level [29].
nc.Marriot <- DiffLev[,1][which.max(DiffLev[,4])]
round(nc.Marriot, digits=1)
indice.Marriot <- max(DiffLev[,4],na.rm = TRUE)
best.nc<-nc.Marriot
}
nc.TrCovW<-indice.TrCovW<-0
if (any(indice == 7) || (indice == 31) || (indice == 32))
{
nc.TrCovW <- DiffLev[,1][which.max(DiffLev[,5])]
indice.TrCovW <- max(DiffLev[,5],na.rm = TRUE)
best.nc<-nc.TrCovW
}
nc.TraceW<-indice.TraceW<-0
if (any(indice == 8) || (indice == 31) || (indice == 32))
{
# TRACE W - To determine the number of clusters in the data, maximum difference scores were used [29].
nc.TraceW <- DiffLev[,1][which.max(DiffLev[,6])]
indice.TraceW <- max(DiffLev[,6],na.rm = TRUE)
best.nc<-nc.TraceW
}
nc.Friedman<-indice.Friedman<-0
if (any(indice == 9) || (indice == 31) || (indice == 32))
{
# FRIEDMAN - The maximum difference in values of trace W-1B criterion was used to indicate the optimal number of clusters [29].
nc.Friedman <- DiffLev[,1][which.max(DiffLev[,7])]
indice.Friedman <- max(DiffLev[,7],na.rm = TRUE)
best.nc<-nc.Friedman
}
nc.Rubin<-indice.Rubin<-0
if (any(indice == 10) || (indice == 31) || (indice == 32))
{
# RUBIN - The difference between levels was used [29].
nc.Rubin <- DiffLev[,1][which.min(DiffLev[,8])]
indice.Rubin <- min(DiffLev[,8],na.rm = TRUE)
best.nc<-nc.Rubin
}
nc.Ball<-indice.Ball<-0
if (any(indice == 18) || (indice == 31) || (indice == 32))
{
# BALL - The largest difference between levels was used to indicate the optimal solution [29].
nc.Ball <- DiffLev[,1][which.max(DiffLev[,9])]
indice.Ball <- max(DiffLev[,9],na.rm = TRUE)
best.nc<-nc.Ball
}
nc.Dunn<-indice.Dunn<-0
if (any(indice == 26) || (indice == 31) || (indice == 32))
{
# Dunn -
nc.Dunn <- (min_nc:max_nc)[which.max(res[,26])]
indice.Dunn <- max(res[,26],na.rm = TRUE)
best.nc<-nc.Dunn
}
nc.Hubert<-indice.Hubert<-0
if (any(indice == 27) || (indice == 31) || (indice == 32))
{
# Hubert -
nc.Hubert <- 0.00
indice.Hubert <- 0.00
#x11()
par(mfrow = c(1,2))
plot(x_axis,res[,27], tck=0, type="b", col="red", xlab= expression(paste("Number of clusters ")), ylab= expression(paste("Hubert Statistic values")))
plot(DiffLev[,1],DiffLev[,10], tck=0, type="b", col="blue", xlab= expression(paste("Number of clusters ")), ylab= expression(paste("Hubert statistic second differences")))
cat(paste ("*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.", "\n", "\n"))
}
nc.sdindex<-indice.sdindex<-0
if (any(indice == 28) || (indice == 31) || (indice == 32))
{
# SD -
nc.sdindex <- (min_nc:max_nc)[which.min(res[,28])]
indice.sdindex<- min(res[,28],na.rm = TRUE)
best.nc<-nc.sdindex
}
nc.Dindex<-indice.Dindex<-0
if (any(indice == 29) || (indice == 31) || (indice == 32))
{
nc.Dindex <- 0.00
indice.Dindex<- 0.00
#x11()
par(mfrow = c(1,2))
plot(x_axis,res[,29], tck=0, type="b", col="red", xlab= expression(paste("Number of clusters ")), ylab= expression(paste("Dindex Values")))
plot(DiffLev[,1],DiffLev[,12], tck=0, type="b", col="blue", xlab= expression(paste("Number of clusters ")), ylab= expression(paste("Second differences Dindex Values")))
cat(paste ("*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.", "\n", "\n"))
}
nc.SDbw<-indice.SDbw<-0
if (any(indice == 30) || (indice == 31) || (indice == 32))
{
# SDbw -
nc.SDbw <- (min_nc:max_nc)[which.min(res[,30])]
indice.SDbw<- min(res[,30],na.rm = TRUE)
best.nc<-nc.SDbw
}
######################################################################################################################
######################## Displaying results #########################################
######################################################################################################################
if (indice < 31)
{
res <- res[,c(indice)]
if (indice == 14) { resCritical <- resCritical[,1] }
if (indice == 15) { resCritical <- resCritical[,2] }
if (indice == 16) { resCritical <- resCritical[,3] }
if (indice == 20) { resCritical <- resCritical[,4] }
}
if (indice == 31)
{
res <- res[,c(1:19,21:22,26:30)]
resCritical <- resCritical[,c(1:3)]
}
if (any(indice == 20) || (indice == 23) || (indice == 24) || (indice == 25) || (indice == 32)){
results <- c(nc.KL, indice.KL, nc.CH, indice.CH, nc.Hartigan, indice.Hartigan, nc.CCC, indice.CCC, nc.Scott, indice.Scott,
nc.Marriot, indice.Marriot, nc.TrCovW, indice.TrCovW, nc.TraceW, indice.TraceW, nc.Friedman,
indice.Friedman, nc.Rubin, indice.Rubin, nc.cindex, indice.cindex, nc.DB, indice.DB, nc.Silhouette,
indice.Silhouette, nc.Duda, indice.Duda, nc.Pseudo, indice.Pseudo, nc.Beale, indice.Beale, nc.Ratkowsky,
indice.Ratkowsky, nc.Ball, indice.Ball, nc.ptbiserial, indice.ptbiserial, nc.Gap, indice.Gap,
nc.Frey, indice.Frey, nc.McClain, indice.McClain, nc.Gamma, indice.Gamma, nc.Gplus, indice.Gplus,
nc.Tau, indice.Tau, nc.Dunn, indice.Dunn, nc.Hubert, indice.Hubert, nc.sdindex, indice.sdindex, nc.Dindex, indice.Dindex, nc.SDbw, indice.SDbw)
results1<-matrix(c(results),nrow=2,ncol=30)
resultats <- matrix(c(results),nrow=2,ncol=30,dimnames = list(c("Number_clusters","Value_Index"),
c("KL","CH","Hartigan","CCC", "Scott", "Marriot", "TrCovW",
"TraceW", "Friedman", "Rubin", "Cindex", "DB", "Silhouette",
"Duda","PseudoT2", "Beale", "Ratkowsky", "Ball", "PtBiserial",
"Gap", "Frey", "McClain", "Gamma", "Gplus", "Tau", "Dunn",
"Hubert", "SDindex", "Dindex", "SDbw")))
}else{
results <- c(nc.KL, indice.KL, nc.CH, indice.CH, nc.Hartigan, indice.Hartigan, nc.CCC, indice.CCC, nc.Scott, indice.Scott,
nc.Marriot, indice.Marriot, nc.TrCovW, indice.TrCovW, nc.TraceW, indice.TraceW, nc.Friedman, indice.Friedman,
nc.Rubin, indice.Rubin, nc.cindex, indice.cindex, nc.DB, indice.DB, nc.Silhouette, indice.Silhouette,
nc.Duda, indice.Duda, nc.Pseudo, indice.Pseudo, nc.Beale, indice.Beale, nc.Ratkowsky, indice.Ratkowsky,
nc.Ball, indice.Ball, nc.ptbiserial, indice.ptbiserial, nc.Frey, indice.Frey, nc.McClain, indice.McClain,
nc.Dunn, indice.Dunn, nc.Hubert, indice.Hubert, nc.sdindex, indice.sdindex, nc.Dindex, indice.Dindex, nc.SDbw, indice.SDbw
)
results1<-matrix(c(results),nrow=2,ncol=26)
resultats <- matrix(c(results),nrow=2,ncol=26,dimnames = list(c("Number_clusters","Value_Index"),
c("KL","CH","Hartigan","CCC", "Scott", "Marriot", "TrCovW",
"TraceW", "Friedman", "Rubin", "Cindex", "DB", "Silhouette",
"Duda","PseudoT2", "Beale", "Ratkowsky", "Ball", "PtBiserial",
"Frey", "McClain", "Dunn", "Hubert", "SDindex", "Dindex", "SDbw")))
}
if (any(indice <= 20)||(indice == 23)||(indice == 24)||(indice == 25))
{
resultats <- resultats[,c(indice)]
}
if (any(indice == 21)|| (indice == 22))
{
indice3 <-indice-1
resultats <- resultats[,c(indice3)]
}
if (any(indice == 26) || (indice == 27) || (indice == 28) || ( indice == 29)|| ( indice == 30))
{
indice4 <- indice-4
resultats <- resultats[,c(indice4)]
}
resultats<-round(resultats, digits=4)
res<-round(res, digits=4)
resCritical<-round(resCritical, digits=4)
# if (numberObsAfter != numberObsBefore)
# {
# cat(paste(numberObsAfter,"observations were used out of", numberObsBefore ,"possible observations due to missing values."))
# }
# if (numberObsAfter == numberObsBefore)
# {
# cat(paste("All", numberObsAfter,"observations were used.", "\n", "\n"))
# }
######################## Summary results #####################################
if(any(indice == 31) || (indice == 32))
{
cat("*******************************************************************", "\n")
cat("* Among all indices: ", "\n")
BestCluster<-results1[1,]
c=0
for(i in min.nc:max.nc)
{
vect<-which(BestCluster==i)
if(length(vect)>0)
cat("*",length(vect), "proposed", i,"as the best number of clusters", "\n")
if(c<length(vect))
{
j=i
c<-length(vect)
}
}
cat("\n"," ***** Conclusion ***** ", "\n", "\n")
cat("* According to the majority rule, the best number of clusters is ",j , "\n", "\n", "\n")
cat("*******************************************************************", "\n")
########################## The Best partition ###################
if (any(method == 1) || (method == 2) || (method == 3) || (method == 4) ||
(method == 5) || (method == 6) || (method == 7)||(method == 9))
partition<- cutree(hc, k=j)
else
{
set.seed(1)
partition<-kmeans(jeu,j)$cluster
}
}
if (any(indice==1)||(indice==2)||(indice==3)||(indice==4)||(indice==5)||(indice==6)||(indice==7)
||(indice==8)||(indice==9)||(indice==10)||(indice==11)||(indice==12)||(indice==13)||(indice==14)
||(indice==15)||(indice==16)||(indice==17)||(indice==18)||(indice==19)||(indice==20)
||(indice==21)||(indice==22)||(indice==23)||(indice==24)||(indice==25)||(indice==26)
||(indice==28)||(indice==30))
{
if (any(method == 1) || (method == 2) || (method == 3) || (method == 4) ||
(method == 5) || (method == 6) || (method == 7) || (method == 9))
partition<- cutree(hc, k=best.nc)
else
{
set.seed(1)
partition<-kmeans(jeu,best.nc)$cluster
}
}
######################### Summary results ############################
if ((indice == 14)|| (indice == 15)|| (indice == 16)|| (indice == 20)|| (indice == 31)|| (indice == 32))
{
results.final <- list(All.index=res,All.CriticalValues=resCritical,Best.nc=resultats, Best.partition=partition)
}
if ((indice == 27)|| (indice == 29))
results.final <- list(All.index=res)
if (any(indice==1)||(indice==2)||(indice==3)||(indice==4)||(indice==5)||(indice==6)||(indice==7)
||(indice==8)||(indice==9)||(indice==10)||(indice==11)||(indice==12)||(indice==13)
||(indice==17)||(indice==18)||(indice==19)||(indice==21)||(indice==22)||(indice==23)||(indice==24)
||(indice==25)||(indice==26)||(indice==28)||(indice==30))
results.final <- list(All.index=res,Best.nc=resultats, Best.partition=partition)
return(results.final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.