Nothing
##=============================================================================
##' @title Perform a cluster analysis of blocks of quantitative variables
##'
##' @description
##' Hierarchical clustering of quantitative Blocks followed by a partitioning algorithm (consolidation). Each cluster of blocks is associated with a compromise
##' computed by the STATIS method. Moreover, a noise cluster can be set up.
##'
##' @usage
##' clustatis(Data,Blocks,NameBlocks=NULL,Noise_cluster=FALSE,scale=FALSE,
##' Itermax=30, Graph_dend=TRUE, Graph_bar=TRUE,
##' printlevel=FALSE, gpmax=min(6, length(Blocks)-2), rhoparam=NULL,
##' Testonlyoneclust=FALSE, alpha=0.05, nperm=50)
##'
##'
##' @param Data data frame or matrix. Correspond to all the blocks of variables merged horizontally
##'
##' @param Blocks numerical vector. The number of variables of each block. The sum must be equal to the number of columns of Data
##'
##' @param NameBlocks string vector. Name of each block. Length must be equal to the length of Blocks vector. If NULL, the names are B1,...Bm. Default: NULL
##'
##' @param Noise_cluster logical. Should a noise cluster be computed? Default: FALSE
##'
##' @param scale logical. Should the data variables be scaled? Default: FALSE
##'
##' @param Itermax numerical. Maximum of iteration for the partitioning algorithm. Default: 30
##'
##' @param Graph_dend logical. Should the dendrogram be plotted? Default: TRUE
##'
##' @param Graph_bar logical. Should the barplot of the difference of the criterion and the barplot of the overall homogeneity at each merging step of the hierarchical algorithm be plotted? Default: TRUE
##'
##' @param printlevel logical. Print the number of remaining levels during the hierarchical clustering algorithm? Default: FALSE
##'
##' @param gpmax logical. What is maximum number of clusters to consider? Default: min(6, number of blocks -2)
##'
##' @param rhoparam numerical. What is the threshold for the noise cluster? Between 0 and 1, high value can imply lot of blocks set aside. If NULL, automatic threshold is computed.
##'
##' @param Testonlyoneclust logical. Test if there is more than one cluster? Default: FALSE
##'
##' @param alpha numerical between 0 and 1. What is the threshold to test if there is more than one cluster? Default: 0.05
##'
##' @param nperm numerical. How many permutations are required to test if there is more than one cluster? Default: 50
##'
##'
##'
##' @return Each partitionK contains a list for each number of clusters of the partition, K=1 to gpmax with:
##' \itemize{
##' \item group: the clustering partition of datasets after consolidation. If Noise_cluster=TRUE, some blocks could be in the noise cluster ("K+1")
##' \item rho: the threshold for the noise cluster (computed or input parameter)
##' \item homogeneity: homogeneity index (%) of each cluster and the overall homogeneity index (%) of the partition
##' \item rv_with_compromise: RV coefficient of each block with its cluster compromise
##' \item weights: weight associated with each block in its cluster
##' \item comp_RV: RV coefficient between the compromises associated with the various clusters
##' \item compromise: the W compromise of each cluster
##' \item coord: the coordinates of objects of each cluster
##' \item inertia: percentage of total variance explained by each axis for each cluster
##' \item rv_all_cluster: the RV coefficient between each block and each cluster compromise
##' \item criterion: the CLUSTATIS criterion error
##' \item param: parameters called in the consolidation
##' \item type: parameter passed to other functions
##' }
##' There is also at the end of the list:
##' \itemize{
##' \item dend: The CLUSTATIS dendrogram
##' \item cutree_k: the partition obtained by cutting the dendrogram for K clusters (before consolidation).
##' \item overall_homogeneity_ng: percentage of overall homogeneity by number of clusters before consolidation (and after if there is no noise cluster)
##' \item diff_crit_ng: variation of criterion when a merging is done before consolidation (and after if there is no noise cluster)
##' \item test_one_cluster: decision and pvalue to know if there is more than one cluster
##' \item param: parameters called
##' \item type: parameter passed to other functions
##' }
##'
##'
##'
##' @keywords quantitative
##'
##' @references
##' Llobell, F., Cariou, V., Vigneau, E., Labenne, A., & Qannari, E. M. (2018). Analysis and clustering of multiblock datasets by means of the STATIS and CLUSTATIS methods. Application to sensometrics. Food Quality and Preference, in Press.\cr
##' Llobell, F., Vigneau, E., Qannari, E. M. (2019). Clustering datasets by means of CLUSTATIS with identification of atypical datasets. Application to sensometrics. Food Quality and Preference, 75, 97-104.
##'
##'
##' @importFrom stats as.dendrogram cor cutree runif
##'
##'
##' @examples
##'
##' data(smoo)
##' NameBlocks=paste0("S",1:24)
##' cl=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks)
##' #plot(cl, ngroups=3, Graph_dend=FALSE)
##' summary(cl)
##' #with noise cluster
##' cl2=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Noise_cluster=TRUE, Graph_dend=FALSE, Graph_bar=FALSE)
##' #with noise cluster and defined rho threshold
##' cl3=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Noise_cluster=TRUE, Graph_dend=FALSE, Graph_bar=FALSE, rhoparam=0.5)
##'
##' @seealso \code{\link{plot.clustatis}}, \code{\link{summary.clustatis}} , \code{\link{clustatis_kmeans}}, \code{\link{statis}}
##'
##' @export
##=============================================================================
clustatis=function(Data,Blocks,NameBlocks=NULL,Noise_cluster=FALSE, scale=FALSE,
Itermax=30, Graph_dend=TRUE, Graph_bar=TRUE,
printlevel=FALSE, gpmax=min(6, length(Blocks)-2), rhoparam=NULL,
Testonlyoneclust=FALSE, alpha=0.05, nperm=50){
nblo=length(Blocks)
n=nrow(Data)
if (is.null(NameBlocks)) NameBlocks=paste("B",1:nblo,sep="-")
if(is.null(rownames(Data))) rownames(Data)=paste0("X", 1:nrow(Data))
if(is.null(colnames(Data))) colnames(Data)=paste0("Y",1:ncol(Data))
#parapet for numerical Data
for (i in 1: ncol(Data))
{
if (is.numeric(Data[,i])==FALSE)
{
stop(paste("The data must be numeric (column",i,")"))
}
}
#parapet for number of objects
if(n<3)
{
stop("At least 3 objects are required")
}
#parapet for number of blocks
if(nblo<4)
{
stop("At least 4 blocks are required")
}
#parapet for Blocks
if(sum(Blocks)!=ncol(Data))
{
stop("Error with Blocks")
}
#Parapet for NameBlocks
if(length(NameBlocks)!=nblo)
{
stop("Error with the length of NameBlocks")
}
#parapet for scale: no constant variable
if(scale==TRUE)
{
for (i in 1:ncol(Data))
{
if (sd(Data[,i])==0)
{
stop(paste("Column", i, "is constant"))
}
}
}
#parapet for gpmax
if (gpmax>(nblo-1))
{
stop(paste("gpmax > number of blocks-1"))
}
#no NA
if(sum(is.na(Data))>0)
{
print("NA detected:")
tabna=which(is.na(Data), arr.ind = TRUE)
print(tabna)
stop(paste("NA are not accepted"))
}
#centering and scaling if necessary
Data=scale(Data,center=TRUE,scale=scale)
J=rep(1:nblo , times = Blocks ) # indicates which block each variable belongs to
#parapet for constant configurations and compute of Wi
Wi=array(0,dim=c(nrow(Data),nrow(Data),nblo))
for (i in 1:nblo)
{
Xi=as.matrix(Data[,J==i])
Wi[,,i]=tcrossprod(Xi)
nor=sqrt(sum(diag(tcrossprod(Wi[,,i]))))
if(nor==0)
{
stop(paste("Configuration",i,"is constant"))
}
Wi[,,i]=Wi[,,i]/nor #standardization
}
#only one cluster?
if (Testonlyoneclust==TRUE)
{
testonecluster=.one_cluster_or_more(Data, Blocks, nperm = nperm, scale=scale, alpha=alpha)
if(testonecluster$decision==TRUE)
{
testonecluster$decision="Only one cluster can be considered"
}else{
testonecluster$decision="Clustering is necessary"
}
}else{
testonecluster=list(decision="Untested", pvalue="Untested")
}
#RV matrix:
RV=matrix(0,nblo,nblo)
diag(RV)=rep(1,nblo)
if(nblo>1)
{
for (i in 1:(nblo-1)) {
for (j in (i+1):nblo) {
RV[i,j]=sum(diag(crossprod(Wi[,,i],Wi[,,j])))
RV[j,i]=RV[i,j]
} }
}
# criterion Q when each data table forms a group itself: 0
crit=rep(0,nblo)
Q_current=0
# critS=rep(1,nblo)
S_current=nblo
cc=1:nblo
# the names of the clusters are 1 to 'number of clusters'
code=1:nblo;
# the names of the clusters are taken from the HCA
cvar=1:nblo
# when a new cluster is formed from clusters i and j, it is named min(i,j))
mergelist=matrix(0,nblo-1,2)
results=matrix(0,nblo-1,5)
colnames(results)=c("merg1", "merg2", "new.clust", "agg.crit.hac","clust.crit.hac")
idgr=matrix(0,nblo,nblo)
idgr[nblo,]=1:nblo;
ordr = c()
fusions = -(1:nblo)
hmerge =matrix(0,nblo-1,2)
ncluster=nblo
#for homogeneity
quality=NULL
for (level in 1:(nblo-1)) {
################
# loops i et j
################
# We search the two clusters that will be merged.
deltamin=10^100
for (i in 1:(ncluster-1)) {
for (j in (i+1):(ncluster)) {
# merging the i'th and j'th cluster:
newcluster=c(which(cc==i), which(cc==j))
Wj=list()
for (tab in 1:length(newcluster)) {
Wj[[tab]]=Wi[,,newcluster[tab]]
}
newstatis=.crit_statisWj_fast(Wj, newcluster, RV)
deltacurrent=newstatis-sum(crit[c(i,j)])
# if deltacurrent is smaller than deltamin, the current information is saved:
if (deltacurrent<deltamin) {
deltamin=deltacurrent
c1=cvar[cc==i]
c2=cvar[cc==j]
merge1=c(c1[1], c2[1])
c1=code[cc==i]
c2=code[cc==j]
merge=c(c1[1], c2[1])
cl_1=i
cl_2=j
statismerge=newstatis
}
} # end of loop j
} # end of loop i
ncluster=ncluster-1
###############################
# renewal of the parameters
###############################
Q_current=Q_current+deltamin
mergelist[level,]=merge1
results[level,1:5]=c(merge, nblo+level, deltamin, Q_current)
crit[cl_1]=statismerge
crit=crit[-cl_2]
cc[which(cc==cl_2)]=cl_1
cc[which(cc>cl_2)]=cc[which(cc>cl_2)]-1
cvar[cc==cl_1]=min(merge1)
code[cc==cl_1]=nblo+level
idgr[ncluster,]=cc
indic= c(fusions[merge1[1]],fusions[merge1[2]])
hmerge[level,]<-indic
fusions[which(idgr[ncluster,]==cl_1)]<- level
ordr <- .order_var(ordr,which(idgr[ncluster,]==cl_1))
# #overall homogeneity
quality=c(quality, (nblo-Q_current)/nblo)
#show the level
if(printlevel==TRUE)
{
print(nblo-1-level)
}
} # end of level
# Dendogram:
resultscah=list(labels=NameBlocks, height=results[,4], merge=hmerge,order=ordr)
mytot<-resultscah
class(mytot)="hclust"
mydendC=as.dendrogram(mytot)
#show the dendrogram
if (Graph_dend==TRUE)
{
dev.new()
cex=0.6
par(cex=cex)
par(mar=c(7, 4, 4, 2) + 0.1)
plot(mydendC, type ="rectangle", main="CLUSTATIS Dendrogram", axes=TRUE, cex=cex,ylab="Height")
par(cex=1)
par(mar=c(5, 4, 4, 2) + 0.1)
}
#show the criterion and homogeneity evolutions
if (Graph_bar==TRUE)
{
dev.new()
barplot(results[,4][(nblo-gpmax):(nblo-1)],xlab="Nb clusters",ylab="delta", main="Variation of criterion",
axisnames = TRUE,names.arg = paste((gpmax+1):2,"->",(gpmax):1),las=2,cex.names = 0.6,cex.main=1.2,col = "blue")
dev.new()
barplot(quality[(nblo-gpmax):(nblo-1)]*100,xlab="Nb clusters",ylab="Overall homogeneity (%)", main="Overall homogeneity (%)",
axisnames = TRUE,names.arg = paste((gpmax):1),las=2,cex.names = 0.6,cex.main=1.2,col = "blue")
}
#number of clusters advised
criter=sort(results[,5],decreasing = TRUE)
H=NULL
for (k in 1:min(gpmax,nblo-2))
{
H[k]=(criter[k]/criter[k+1]-1)*(nblo-k-1)
}
nbgroup_hart=which.max(H[-(min(gpmax,nblo-2))]-H[-1])+1
cat(paste("Recommended number of clusters =", nbgroup_hart),"\n")
######find the threshold rho for the noise cluster and consolidation
res.consol=list()
cutree_k=list()
rho=NULL
for (K in 1:gpmax)
{
coupe=cutree(mytot,K)
cutree_k[[K]]=coupe
if (Noise_cluster==TRUE)
{
if (is.null(rhoparam)==TRUE)
{
oldgroup=coupe
#compute the compromises
Wk=array(0,dim=c(nrow(Data),nrow(Data),K))
for (i in 1: K)
{
cluster=which(oldgroup==i)
Wj=list()
for (tab in 1:length(cluster)) {
Wj[[tab]]=Wi[,,cluster[tab]]
}
statisk=.crit_statisWj(Wj, cluster, RV)
Wk[,,i]=statisk$W
}
#compute the criterions
cr=list()
cr2=list()
for ( i in 1:nblo)
{
a=NULL
for (k in 1:K)
{
W_k=as.matrix(Wk[,,k])
normW=sum(diag(crossprod(W_k)))
W_i=Wi[,,i]
a=c(a,sum(diag(crossprod(W_i, W_k)))^2/(normW))
}
cr[[i]]=sqrt(a)
if (K>1)
{
cr2[[i]]=sort(sqrt(a), decreasing = TRUE)[1:2]
}else{
cr2[[i]]=sqrt(a)
}
}
rho[K]=mean(unlist(cr2))
} else{
rho[K]=rhoparam
}
} else{
rho[K]=0
}
#consolidation
res.consol[[K]]=clustatis_kmeans(Data, Blocks, coupe, rho=rho[K], NameBlocks = NameBlocks, Itermax=Itermax,
Graph_groups=FALSE, Graph_weights = FALSE, scale = scale)
}
names(cutree_k)=names(rho)=names(res.consol)=paste0("partition", 1:gpmax)
#after consolidation
diff_crit_bef=results[,4][(nblo-gpmax+1):(nblo-1)]
if (Noise_cluster==FALSE)
{
#after consolidation
overall_after=NULL
crit_after=NULL
for (i in 1: gpmax)
{
overall_after[i]=res.consol[[i]]$homogeneity[i+1,1]
crit_after[i]=res.consol[[i]]$criterion
}
diff_crit_after=crit_after[-gpmax]-crit_after[-1]
diff_crit_after=sort(diff_crit_after)
overall_after=sort(overall_after, decreasing = TRUE)
overall_homogeneity_ng=rbind(quality[(nblo-gpmax):(nblo-1)]*100, overall_after)
diff_crit_ng=rbind(diff_crit_bef, diff_crit_after)
rownames(diff_crit_ng)=rownames(overall_homogeneity_ng)=c("before consolidation", "after consolidation")
colnames(diff_crit_ng)=paste((gpmax):2,"->",(gpmax-1):1)
colnames(overall_homogeneity_ng)=paste (gpmax:1, "cluster(s)")
}else{
overall_homogeneity_ng=quality[(nblo-gpmax):(nblo-1)]*100
diff_crit_ng=diff_crit_bef
names(diff_crit_ng)=paste((gpmax):2,"->",(gpmax-1):1)
names(overall_homogeneity_ng)=paste (gpmax:1, "cluster(s)")
}
#results
res=c(res.consol, list(dend=mydendC, cutree_k=cutree_k,
overall_homogeneity_ng=round(overall_homogeneity_ng,1),
diff_crit_ng=round(diff_crit_ng,2), test_one_cluster=testonecluster, param=list(nblo=nblo, ng=nbgroup_hart,
Noise_cluster=Noise_cluster, gpmax=gpmax, n=n), type="H+C"))
class(res)="clustatis"
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.