Nothing
##=============================================================================
##' @title Compute the CLUSTATIS partitioning algorithm on different blocks of quantitative variables. Can be performed using a multi-start strategy or initial partition provided by the user.
##'
##'
##' @description
##' Partitioning algorithm for quantitative variables. Each cluster is associated with a compromise computed by the STATIS method. Moreover, a noise cluster can be set up.
##'
##' @usage
##' clustatis_kmeans(Data, Blocks, clust, nstart=100, rho=0, NameBlocks=NULL,
##' Itermax=30,Graph_groups=TRUE, Graph_weights=FALSE,
##' scale=FALSE, print_attempt=FALSE)
##'
##'
##' @param Data data frame or matrix. Correspond to all the blocks of variables merged horizontally
##'
##' @param Blocks numerical vector. The number of variables of each block. The sum must be equal to the number of columns of Data
##'
##' @param clust numerical vector or integer. Initial partition or number of starting partitions if integer. If numerical vector, the numbers must be 1,2,3,...,number of clusters
##'
##' @param nstart integer. Number of starting partitions. Default: 100
##'
##' @param rho numerical between 0 and 1. Threshold for the noise cluster. Default:0
##'
##' @param NameBlocks string vector. Name of each block. Length must be equal to the length of Blocks vector. If NULL, the names are B1,...Bm. Default: NULL
##'
##' @param Itermax numerical. Maximum of iterations by partitionning algorithm. Default: 30
##'
##' @param Graph_groups logical. Should each cluster compromise be plotted? Default: TRUE
##'
##' @param Graph_weights logical. Should the barplot of the weights in each cluster be plotted? Default: FALSE
##'
##' @param scale logical. Should the data variables be scaled? Default: FALSE
##'
##' @param print_attempt logical. Print the number of remaining attempts in the multi-start case? Default: FALSE
##'
##'
##'
##'
##'
##'
##' @return a list with:
##' \itemize{
##' \item group: the clustering partition. If rho>0, some blocks could be in the noise cluster ("K+1")
##' \item rho: the threshold for the noise cluster
##' \item homogeneity: percentage of homogeneity of the blocks in each cluster and the overall homogeneity
##' \item rv_with_compromise: RV coefficient of each block with its cluster compromise
##' \item weights: weight associated with each block in its cluster
##' \item comp_RV: RV coefficient between the compromises associated with the various clusters
##' \item compromise: the W compromise of each cluster
##' \item coord: the coordinates of objects of each cluster
##' \item inertia: percentage of total variance explained by each axis for each cluster
##' \item rv_all_cluster: the RV coefficient between each block and each cluster compromise
##' \item criterion: the CLUSTATIS criterion error
##' \item param: parameters called
##' \item type: parameter passed to other functions
##' }
##'
##'
##'
##'
##' @references
##' Llobell, F., Cariou, V., Vigneau, E., Labenne, A., & Qannari, E. M. (2018). Analysis and clustering of multiblock datasets by means of the STATIS and CLUSTATIS methods. Application to sensometrics. Food Quality and Preference, in Press.\cr
##' Llobell, F., Vigneau, E., Qannari, E. M. (2019). Clustering datasets by means of CLUSTATIS with identification of atypical datasets. Application to sensometrics. Food Quality and Preference, 75, 97-104.
##'
##'
##'
##' @keywords quantitative
##'
##' @examples
##'
##' data(smoo)
##' NameBlocks=paste0("S",1:24)
##' #with multi-start
##' cl_km=clustatis_kmeans(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks, clust=3)
##' #with an initial partition
##' cl=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Graph_dend=FALSE)
##' partition=cl$cutree_k$partition3
##' cl_km2=clustatis_kmeans(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' clust=partition, Graph_weights=FALSE, Graph_groups=FALSE)
##' graphics.off()
##'
##' @seealso \code{\link{plot.clustatis}}, \code{\link{clustatis}}, \code{\link{summary.clustatis}}, \code{\link{statis}}
##'
##' @export
##=============================================================================
clustatis_kmeans=function(Data,Blocks, clust, nstart=100, rho=0, NameBlocks=NULL,Itermax=30, Graph_groups=TRUE,
Graph_weights=FALSE, scale=FALSE, print_attempt=FALSE){
nblo=length(Blocks)
#initialisation or muti-start?
if(length(clust)==1)
{
ngroups=clust
init=FALSE
}else{
ngroups=length(unique(clust))
partition=clust
init=TRUE
if(sum(sort(unique(clust))!=(1:ngroups))>0)
{
stop("The clusters must be 1,2,...,number of clusters")
}
}
n=nrow(Data)
if (is.null(NameBlocks)) NameBlocks=paste("B",1:nblo,sep="-")
if(is.null(rownames(Data))) rownames(Data)=paste0("X", 1:nrow(Data))
if(is.null(colnames(Data))) colnames(Data)=paste0("Y",1:ncol(Data))
#parapet for numerical Data
for (i in 1: ncol(Data))
{
if (is.numeric(Data[,i])==FALSE)
{
stop(paste("The data must be numeric (column",i,")"))
}
}
#parapet for number of objects
if(n<3)
{
stop("At least 3 objects are required")
}
#parapet for Blocks
if(sum(Blocks)!=ncol(Data))
{
stop("Error with Blocks")
}
#Parapet for NameBlocks
if(length(NameBlocks)!=nblo)
{
stop("Error with the length of NameBlocks")
}
#parapet for scale: no constant variable
if(scale==TRUE)
{
for (i in 1:ncol(Data))
{
if (sd(Data[,i])==0)
{
stop(paste("Column", i, "is constant"))
}
}
}
#no NA
if(sum(is.na(Data))>0)
{
print("NA detected:")
tabna=which(is.na(Data), arr.ind = TRUE)
print(tabna)
stop(paste("NA are not accepted"))
}
#parapet for number of groups and number of blocks
if(ngroups>nblo)
{
stop("number of groups > number of blocks")
}
#centering and scaling if necessary
Data=scale(Data,center=TRUE,scale=scale)
J=rep(1:nblo , times = Blocks ) # indicates which block each variable belongs to
#parapet for constant configurations and compute of Wi
Wi=array(0,dim=c(nrow(Data),nrow(Data),nblo))
for (i in 1:nblo)
{
Xi=as.matrix(Data[,J==i])
Wi[,,i]=tcrossprod(Xi)
nor=sqrt(sum(diag(crossprod(Wi[,,i]))))
if(nor==0)
{
stop(paste("Configuration",i,"is constant"))
}
Wi[,,i]=Wi[,,i]/nor#standardization
}
#####RV matrix#####
RV=matrix(0,nblo,nblo)
diag(RV)=rep(1,nblo)
if(nblo>1)
{
for (i in 1:(nblo-1)) {
for (j in (i+1):nblo) {
RV[i,j]=sum(diag(crossprod(Wi[,,i], Wi[,,j])))
RV[j,i]=RV[i,j]
} }
}
if(init==FALSE)
{
sumRV2=0
for (tent in 1: nstart)
{
coupe=sample(1:ngroups,nblo,replace=TRUE)
while(nlevels(as.factor(coupe))<ngroups)
{
coupe=sample(1:ngroups,nblo,replace=TRUE)#avoid empty groups
}
iter=0
sumRV2_act=0
goout=0
oldgroup=coupe
erreur=NULL
while (goout!=1 & iter<=Itermax)
{
#avoid empty group
if (nlevels(as.factor(oldgroup))<ngroups)
{
coupe=sample(1:ngroups,nblo,replace=TRUE)
while(nlevels(as.factor(coupe))<ngroups)
{
coupe=sample(1:ngroups,nblo,replace=TRUE)
}
oldgroup=coupe
}
#compute the compromises
Wk=array(0,dim=c(nrow(Data),nrow(Data),ngroups))
lamb=NULL
alpha=list()
for (i in 1: ngroups)
{
cluster=which(oldgroup==i)
Wj=list()
for (tab in 1:length(cluster)) {
Wj[[tab]]=Wi[,,cluster[tab]]
}
statisgr=.crit_statisWj(Wj, cluster, RV)
Wk[,,i]=statisgr$W
lamb[i]=statisgr$lambda/length(cluster)
alpha[[i]]=statisgr$u
erreur[i]=statisgr$Q
}
#compute the criterion
cr=list()
for ( i in 1:nblo)
{
a=NULL
for (k in 1: ngroups)
{
W_k=as.matrix(Wk[,,k])
normW=sum(diag(crossprod(W_k)))
W_i=Wi[,,i]
a=c(a,sum(diag(crossprod(W_i, W_k)))^2/(normW))
}
cr[[i]]=sqrt(a)
}
#choice of the clusters
newgroup=NULL
for (i in 1:nblo)
{
if (max(cr[[i]])<rho)
{
newgroup[i]=0
} else{
newgroup[i]=which.max(cr[[i]])
}
}
if (sum(newgroup!=oldgroup)>0)
{
oldgroup=newgroup
}else{
goout=1
}
#1 cluster left?
if(0%in% oldgroup & nlevels(factor(oldgroup))!=ngroups+1)
{
warning(paste("One cluster is empty with "), ngroups," clusters, rho is too high")
oldgroup[oldgroup==0]="K+1"
return(oldgroup)
}
iter=iter+1
}
#best partition?
quali_tent=sum(erreur)
for (i in 1:nblo)
{
if (max(cr[[i]])<rho)
{
sumRV2_act=sumRV2_act+rho**2
}else{
sumRV2_act=sumRV2_act+max(cr[[i]]**2)
}
}
if((sumRV2_act)>sumRV2)
{
sumRV2=sumRV2_act
qual=quali_tent
parti=oldgroup
alpha_bon=alpha
lamb_bon=lamb
Wk_bon=Wk
cr_bon=cr
err_bon=sum(erreur)
erreur_bon=erreur
}
if(print_attempt==TRUE)
{
print(nstart-tent)
}
}
}
if(init==TRUE)
{
sumRV2=0
coupe=as.numeric(partition)
iter=0
sumRV2_act=0
goout=0
oldgroup=coupe
erreur=NULL
while (goout!=1 & iter<=Itermax)
{
#compute the compromises
Wk=array(0,dim=c(nrow(Data),nrow(Data),ngroups))
lamb=NULL
alpha=list()
for (i in 1: ngroups)
{
cluster=which(oldgroup==i)
Wj=list()
for (tab in 1:length(cluster)) {
Wj[[tab]]=Wi[,,cluster[tab]]
}
statisgr=.crit_statisWj(Wj, cluster, RV)
Wk[,,i]=statisgr$W
lamb[i]=statisgr$lambda/length(cluster)
alpha[[i]]=statisgr$u
erreur[i]=statisgr$Q
}
#compute the criterion
cr=list()
for ( i in 1:nblo)
{
a=NULL
for (k in 1: ngroups)
{
W_k=as.matrix(Wk[,,k])
normW=sum(diag(crossprod(W_k)))
W_i=Wi[,,i]
a=c(a,sum(diag(crossprod(W_i, W_k)))^2/(normW))
}
cr[[i]]=sqrt(a)
}
#choice of the clusters
newgroup=NULL
for (i in 1:nblo)
{
if (max(cr[[i]])<rho)
{
newgroup[i]=0
}else{
newgroup[i]=which.max(cr[[i]])
}
}
if (sum(newgroup!=oldgroup)>0)
{
oldgroup=newgroup
}else{
goout=1
}
#1 cluster left?
if(0%in% oldgroup & nlevels(factor(oldgroup))!=ngroups+1)
{
warning(paste("One cluster is empty with "), ngroups," clusters, rho is too high")
oldgroup[oldgroup==0]="K+1"
return(oldgroup)
}
iter=iter+1
}
#results of the partition
quali_tent=sum(erreur)
for (i in 1:nblo)
{
if (max(cr[[i]])<rho)
{
sumRV2_act=sumRV2_act+rho**2
}else{
sumRV2_act=sumRV2_act+max(cr[[i]]**2)
}
}
sumRV2=sumRV2_act
qual=quali_tent
parti=oldgroup
alpha_bon=alpha
lamb_bon=lamb
Wk_bon=Wk
cr_bon=cr
err_bon=sum(erreur)
erreur_bon=erreur
}
#to simplify names
oldgroup=parti
Wk=Wk_bon
alpha=alpha_bon
lamb=lamb_bon
cr=cr_bon
#tab1: homogeneity
Wj=list()
for (i in 1:nblo)
{
Wj[[i]]=Wi[,,i]
}
stati=.crit_statisWj(Wj, 1:nblo, RV)
lambda_tot=stati$lambda/nblo
ne=NULL
for (i in 1:ngroups)
{
ne=c(ne,length(which(oldgroup==i)))
}
adios=which(oldgroup==0)
overall=(sum(lamb*ne))/(sum(ne))
if(length(adios)==0)
{
tab1=data.frame(homogeneity=round(c(lamb, overall, lambda_tot),3)*100,nb_blocks=c(ne, nblo, nblo))
rownames(tab1)=c(paste("Cluster",1:ngroups),"Overall","One group")
}else{
cluster=which(oldgroup==0)
Wj=list()
for (tab in 1:length(cluster)) {
Wj[[tab]]=Wi[,,cluster[tab]]
}
statisk1=.crit_statisWj(Wj, cluster, RV)
lambk1=statisk1$lambda/length(cluster)
tab1=data.frame(homogeneity=round(c(lamb,lambk1, overall,lambda_tot),3)*100, nb_blocks=c(ne,length(adios), sum(ne), nblo))
rownames(tab1)=c(paste("Cluster",1:ngroups), "Noise cluster", "Overall", "One group")
}
colnames(tab1)[1]="homogeneity (%)"
#rv with compromise and weights
rvcomp=list()
for (k in 1:ngroups)
{
cluster=which(parti==k)
temp=NULL
for (i in cluster)
{
temp=c(temp, round(max(cr[[i]]),3))
}
rvcomp[[k]]=temp
alpha[[k]]=as.vector(alpha[[k]])
names(rvcomp[[k]])=names(alpha[[k]])=NameBlocks[cluster]
}
#groups
partit=parti
parti[adios]="K+1"
names(parti)=NameBlocks
grou=data.frame(Cluster=parti)
#RV among compromises
tab4=matrix(0,ngroups,ngroups)
for (i in 1: ngroups)
{
for(j in 1:ngroups)
{
W_k=as.matrix(Wk_bon[,,i])
W_l=as.matrix(Wk_bon[,,j])
normWk=sqrt(sum(diag(crossprod(W_k))))
normWl=sqrt(sum(diag(crossprod(W_l))))
tab4[i,j]=sum(diag(crossprod(W_l, W_k)))/(normWl*normWk)
}
}
rownames(tab4)=paste("Cluster",1:ngroups)
colnames(tab4)=paste("Cluster",1:ngroups)
#proximity inter-cluster
prox=eigen(tab4)$values[1]/ngroups
#coordinates
compromise=list()
coord=list()
inertia=list()
for (i in 1:ngroups)
{
e=svd(Wk[,,i])
C=e$u%*%sqrt(diag(abs(e$d)))
C=C[,-ncol(C)]
if(i==1)
{
Cref=C
}
if (i!=1)
{
for (j in 1:ncol(C))
{
if(var(C[,j])>10^(-12) & var(Cref[,j])>10^(-12))
{
if (cor(Cref[,j],C[,j])<0)
{
C[,j]=-C[,j]#axes orientation
}
}
}
}
pouriner=round(e$d/sum(e$d)*100,2)
comprom=Wk[,,i]
rownames(comprom)=colnames(comprom)=rownames(C)=rownames(Data)
compromise[[i]]=comprom
colnames(C)=paste("Dim", 1:(nrow(Data)-1))
coord[[i]]=C
pouriner=pouriner[-length(pouriner)]
names(pouriner)=paste("Dim", 1:(nrow(Data)-1))
inertia[[i]]=pouriner
#graphical representations
if (Graph_groups==TRUE)
{
dev.new()
plot(C[,1],C[,2],type="n",lwd=5,pch=16,asp=1,xlab=paste("Dim 1 (",pouriner[1],"%)"), ylab=paste("Dim 2 (",pouriner[2],"%)"),xlim=c(min(C[,1])-0.05,max(C[,1])+0.05))
text(C[,1],C[,2],rownames(Data),col=rainbow(nrow(Data)))
abline(h=0,v=0)
title(paste("Cluster",i))
}
}
names(compromise)=names(rvcomp)=names(alpha)=paste("Cluster",1:ngroups)
if(length(adios)==0)
{
criterion=err_bon
}else{
criterion=NULL
}
#weights barplot
if(Graph_weights==TRUE)
{
for (i in 1:ngroups)
{
dev.new()
barplot(alpha[[i]])
title(paste("Weights in the cluster",i))
}
}
#rounds
for (i in 1: ngroups)
{
compromise[[i]]=round(compromise[[i]],3)
coord[[i]]=round(coord[[i]],3)
}
for (i in 1:nblo)
{
cr[[i]]=round(cr[[i]],3)
Wi[,,i]=round(Wi[,,i],3)
names(cr[[i]])=paste("Cluster", 1:ngroups)
}
if(nblo>1)
{
dimnames(Wi)[[1]]=dimnames(Wi)[[2]]=rownames(Data)
dimnames(Wi)[[3]]=NameBlocks
}
names(cr)=NameBlocks
names(compromise)=names(coord)=names(inertia)=paste("Cluster",1:ngroups)
#results
res=list(group=grou, rho=rho, homogeneity=tab1, rv_with_compromise=rvcomp, weights=alpha, compRV=round(tab4,2),
compromise=compromise, coord=coord, inertia=inertia,
rv_all_cluster=cr, criterion=criterion, param=list(nblo=nblo, ng=ngroups, n=n), type="K")
class(res)="clustatis"
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.