Nothing
##=============================================================================
##' @title Perform a cluster analysis of blocks from a CATA experiment
##'
##' @description
##' Hierarchical clustering of blocks from a CATA experiment. Each cluster of blocks is associated with a compromise
##' computed by the CATATIS method. The hierarchical clustering is followed by a partitioning algorithm (consolidation).
##' Non-binary data are accepted.
##'
##' @usage
##'cluscata(Data, nblo, NameBlocks=NULL, NameVar=NULL, Noise_cluster=FALSE,
##' Itermax=30, Graph_dend=TRUE, Graph_bar=TRUE, printlevel=FALSE,
##' gpmax=min(6, nblo-2), Testonlyoneclust=FALSE, alpha=0.05,
##' nperm=50, Warnings=FALSE)
##'
##' @param Data data frame or matrix where the blocks of binary variables are merged horizontally. If you have a different format, see \code{\link{change_cata_format}}
##'
##' @param nblo numerical. Number of blocks (subjects).
##'
##' @param NameBlocks string vector. Name of each block (subject). Length must be equal to the number of blocks. If NULL, the names are S1,...Sm. Default: NULL
##'
##' @param NameVar string vector. Name of each variable (attribute, the same names for each subject). Length must be equal to the number of attributes. If NULL, the colnames of the first block are taken. Default: NULL
##'
##' @param Noise_cluster logical. Should a noise cluster be computed? Default: FALSE
##'
##' @param Itermax numerical. Maximum of iteration for the partitioning algorithm. Default:30
##'
##' @param Graph_dend logical. Should the dendrogram be plotted? Default: TRUE
##'
##' @param Graph_bar logical. Should the barplot of the difference of the criterion and the barplot of the overall homogeneity at each merging step of the hierarchical algorithm be plotted? Default: TRUE
##'
##' @param printlevel logical. Print the number of remaining levels during the hierarchical clustering algorithm? Default: FALSE
##'
##' @param gpmax logical. What is maximum number of clusters to consider? Default: min(6, nblo-2)
##'
##' @param Testonlyoneclust logical. Test if there is more than one cluster? Default: FALSE
##'
##' @param alpha numerical between 0 and 1. What is the threshold to test if there is more than one cluster? Default: 0.05
##'
##' @param nperm numerical. How many permutations are required to test if there is more than one cluster? Default: 50
##'
##' @param Warnings logical. Display warnings about the fact that none of the subjects in some clusters checked an attribute or product? Default: FALSE
##'
##'
##' @return Each partitionK contains a list for each number of clusters of the partition, K=1 to gpmax with:
##' \itemize{
##' \item group: the clustering partition after consolidation. If Noise_cluster=TRUE, some subjects could be in the noise cluster ("K+1")
##' \item rho: the threshold for the noise cluster
##' \item homogeneity: homogeneity index (%) of each cluster and the overall homogeneity index (%) of the partition
##' \item s_with_compromise: similarity coefficient of each subject with its cluster compromise
##' \item weights: weight associated with each subject in its cluster
##' \item compromise: the compromise of each cluster
##' \item CA: list. the correspondance analysis results on each cluster compromise (coordinates, contributions...)
##' \item inertia: percentage of total variance explained by each axis of the CA for each cluster
##' \item s_all_cluster: the similarity coefficient between each subject and each cluster compromise
##' \item criterion: the CLUSCATA criterion error
##' \item param: parameters called
##' \item type: parameter passed to other functions
##' }
##' There is also at the end of the list:
##' \itemize{
##' \item dend: The CLUSCATA dendrogram
##' \item cutree_k: the partition obtained by cutting the dendrogram in K clusters (before consolidation).
##' \item overall_homogeneity_ng: percentage of overall homogeneity by number of clusters before consolidation (and after if there is no noise cluster)
##' \item diff_crit_ng: variation of criterion when a merging is done before consolidation (and after if there is no noise cluster)
##' \item test_one_cluster: decision and pvalue to know if there is more than one cluster
##' \item param: parameters called
##' \item type: parameter passed to other functions
##' }
##'
##'
##' @keywords CATA
##'
##' @references
##' Llobell, F., Cariou, V., Vigneau, E., Labenne, A., & Qannari, E. M. (2019). A new approach for the analysis of data and the clustering of subjects in a CATA experiment. Food Quality and Preference, 72, 31-39.\cr
##' Llobell, F., Giacalone, D., Labenne, A., Qannari, E.M. (2019). Assessment of the agreement and cluster analysis of the respondents in a CATA experiment. Food Quality and Preference, 77, 184-190.
##'
##' @examples
##' \donttest{
##' data(straw)
##' #with 40 subjects
##' res=cluscata(Data=straw[,1:(16*40)], nblo=40)
##' #plot(res, ngroups=3, Graph_dend=FALSE)
##' summary(res, ngroups=3)
##' #With noise cluster
##' res2=cluscata(Data=straw[,1:(16*40)], nblo=40, Noise_cluster=TRUE,
##' Graph_dend=FALSE, Graph_bar=FALSE)
##' #with all subjects
##' res=cluscata(Data=straw, nblo=114, printlevel=TRUE)
##'
##'
##' #Vertical format
##' data("fish")
##' Data=fish[1:66,2:30]
##' chang2=change_cata_format2(Data, nprod= 6, nattr= 27, nsub = 11, nsess= 1)
##' res3=cluscata(Data= chang2$Datafinal, nblo = 11, NameBlocks = chang2$NameSub)
##' }
##'
##' @seealso \code{\link{plot.cluscata}}, \code{\link{summary.cluscata}} , \code{\link{catatis}}, \code{\link{cluscata_kmeans}}, \code{\link{change_cata_format}}, \code{\link{change_cata_format2}}
##'
##' @export
##=============================================================================
cluscata=function(Data, nblo, NameBlocks=NULL, NameVar=NULL, Noise_cluster=FALSE, Itermax=30,
Graph_dend=TRUE, Graph_bar=TRUE, printlevel=FALSE,gpmax=min(6, nblo-2),
Testonlyoneclust=FALSE, alpha=0.05, nperm=50, Warnings=FALSE){
#initialisation
n=nrow(Data)
p=ncol(Data)
nvar=p/nblo
#parapet for nblo
if (as.integer(nvar)!=nvar)
{
stop("number of columns modulo nblo != 0")
}
Blocks=rep(nvar,nblo)
J=rep(1:nblo , times = Blocks )# indicates which block each variable belongs to
#rownames, colnames, NameBlocks
if (is.null(NameBlocks)) NameBlocks=paste("S",1:nblo,sep="-")
if(is.null(rownames(Data))) rownames(Data)=paste0("X", 1:nrow(Data))
if(is.null(colnames(Data))) colnames(Data)=rep(paste0("Y",1:nvar), nblo)
X=Data
#Parapet for NameBlocks
if(length(NameBlocks)!=nblo)
{
stop("Error with the length of NameBlocks")
}
#parapet for numerical Data
for (i in 1: ncol(Data))
{
if (is.numeric(Data[,i])==FALSE)
{
stop(paste("The data must be numeric (column",i,")"))
}
}
#parapet for number of objects
if(n<3)
{
stop("At least 3 products are required")
}
#parapet for number of blocks
if(nblo<4)
{
stop("At least 4 subjects are required")
}
#parapet for number of attributes
if(nvar<3)
{
stop("At least 3 attributes are required")
}
#parapet for gpmax
if (gpmax>(nblo-1))
{
stop(paste("gpmax > number of blocks-1"))
}
#no NA
if(sum(is.na(Data))>0)
{
print("NA detected:")
tabna=which(is.na(Data), arr.ind = TRUE)
print(tabna)
stop(paste("NA are not accepted"))
}
#prepare data
Xi=array(0,dim=c(n,nvar,nblo)) # array with all subjects matrices
muk=NULL
for(j in 1:nblo)
{
Aj=as.matrix(X[,J==j])
normXi=sqrt(sum(diag(tcrossprod(Aj,Aj))))
muk[j]=normXi
if(normXi==0)
{
stop(paste("error: the subject",NameBlocks[j], "has only 0"))
}
Xi[,,j]=Aj/normXi #standardization
}
#only one cluster?
if (Testonlyoneclust==TRUE)
{
testonecluster=.one_cluster_or_more_cata(Data, nblo, nperm = nperm, alpha=alpha)
if(testonecluster$decision==TRUE)
{
testonecluster$decision="Only one cluster can be considered"
}else{
testonecluster$decision="Clustering is necessary"
}
}else{
testonecluster=list(decision="Untested", pvalue="Untested")
}
# S matrix:
S=matrix(0,nblo,nblo)
diag(S)=rep(1,nblo)
for (i in 1:(nblo-1)) {
for (j in (i+1):nblo) {
S[i,j]=sum(diag(tcrossprod(Xi[,,i],Xi[,,j])))
S[j,i]=S[i,j]
} }
# criterion when each data table forms a group itself: 0
crit=rep(0,nblo)
Q_current=0
critS=rep(1,nblo)
S_current=nblo
cc=1:nblo
# the names of the clusters are 1 to 'number of clusters'
code=1:nblo;
# the names of the clusters are taken from the HCA
cvar=1:nblo
# when a new cluster is formed from clusters i and j, it is named min(i,j))
mergelist=matrix(0,nblo-1,2)
results=matrix(0,nblo-1,6)
colnames(results)=c("merg1", "merg2", "new.clust", "agg.crit.hac","clust.crit.hac","S")
idgr=matrix(0,nblo,nblo)
idgr[nblo,]=1:nblo;
ordr = c()
fusions = -(1:nblo)
hmerge =matrix(0,nblo-1,2)
ncluster=nblo
#for homogeneity
quality=NULL
for (level in 1:(nblo-1)) {
################
# loops i et j
################
# We search the two clusters that will be merged.
deltamin=10^100
for (i in 1:(ncluster-1)) {
for (j in (i+1):(ncluster)) {
# merging the i'th and j'th cluster:
newcluster=c(which(cc==i), which(cc==j))
Xj=list()
for (tab in 1:length(newcluster)) {
Xj[[tab]]=Xi[,,newcluster[tab]]
}
newcatatis=.crit_cataXj_fast(Xj, newcluster, S)
deltacurrent=newcatatis-sum(crit[c(i,j)])
# if deltacurrent is smaller than deltamin, the current information is saved:
if (deltacurrent<deltamin) {
deltamin=deltacurrent
c1=cvar[cc==i]
c2=cvar[cc==j]
merge1=c(c1[1], c2[1])
c1=code[cc==i]
c2=code[cc==j]
merge=c(c1[1], c2[1])
cl_1=i
cl_2=j
catatismerge=newcatatis
}
} # end of loop j
} # end of loop i
ncluster=ncluster-1
###############################
# renewal of the parameters
###############################
Q_current=Q_current+deltamin
mergelist[level,]=merge1
results[level,1:5]=c(merge, nblo+level, deltamin, Q_current)
crit[cl_1]=catatismerge
crit=crit[-cl_2]
cc[which(cc==cl_2)]=cl_1
cc[which(cc>cl_2)]=cc[which(cc>cl_2)]-1
cvar[cc==cl_1]=min(merge1)
code[cc==cl_1]=nblo+level
idgr[ncluster,]=cc
indic= c(fusions[merge1[1]],fusions[merge1[2]])
hmerge[level,]<-indic
fusions[which(idgr[ncluster,]==cl_1)]<- level
ordr <- .order_var(ordr,which(idgr[ncluster,]==cl_1))
#overall homogeneity
quality=c(quality, (nblo-Q_current)/nblo)
if(printlevel==TRUE)
{
print(nblo-1-level)
}
} # end of level
# Dendogram:
resultscah=list(labels=NameBlocks, height=results[,4], merge=hmerge,order=ordr)
mytot<-resultscah
class(mytot)="hclust"
mydendC=as.dendrogram(mytot)
#show the dendrogram
if (Graph_dend==TRUE)
{
dev.new()
cex=0.6
par(cex=cex)
par(mar=c(7, 4, 4, 2) + 0.1)
plot(mydendC, type ="rectangle", main="CLUSCATA Dendrogram", axes=TRUE, cex=cex,ylab="Height")
par(cex=1)
par(mar=c(5, 4, 4, 2) + 0.1)
}
#show the criterion and homogeneity evolutions
if (Graph_bar==TRUE)
{
dev.new()
barplot(results[,4][(nblo-gpmax):(nblo-1)],xlab="Nb clusters",ylab="delta", main="Variation of criterion",
axisnames = TRUE,names.arg = paste((gpmax+1):2,"->",(gpmax):1),las=2,cex.names = 0.6,cex.main=1.2,col = "blue")
dev.new()
barplot(quality[(nblo-gpmax):(nblo-1)]*100,xlab="Nb clusters",ylab="Overall homogeneity (%)", main="Overall homogeneity (%)",
axisnames = TRUE,names.arg = paste((gpmax):1),las=2,cex.names = 0.6,cex.main=1.2,col = "blue")
}
#number of clusters advised
criter=sort(results[,5],decreasing = TRUE)
H=NULL
for (k in 1:min(gpmax,nblo-2))
{
H[k]=(criter[k]/criter[k+1]-1)*(nblo-k-1)
}
nbgroup_hart=which.max(H[-(min(gpmax,nblo-2))]-H[-1])+1
cat(paste("Recommended number of clusters =", nbgroup_hart),"\n")
######find the threshold rho for the noise cluster and consolidation
res.consol=list()
cutree_k=list()
rho=NULL
for (K in 1:gpmax)
{
coupe=cutree(mytot,K)
cutree_k[[K]]=coupe
if (Noise_cluster==TRUE)
{
oldgroup=coupe
#compute the compromises
Ck=array(0,dim=c(nrow(Data),nvar,K))
for (i in 1: K)
{
cluster=which(oldgroup==i)
Xj=list()
for (tab in 1:length(cluster)) {
Xj[[tab]]=Xi[,,cluster[tab]]
}
catatisk=.crit_cataXj(Xj, cluster, S)
Ck[,,i]=catatisk$C
}
#compute the criterions
cr=list()
cr2=list()
for ( i in 1:nblo)
{
a=NULL
for (k in 1:K)
{
C_k=as.matrix(Ck[,,k])
normC=sum(diag(tcrossprod(C_k)))
a=c(a,sum(diag(tcrossprod(Xi[,,i], C_k)))^2/(normC))
}
cr[[i]]=sqrt(a)
if (K>1)
{
cr2[[i]]=sort(sqrt(a), decreasing = TRUE)[1:2]
}else{
cr2[[i]]=sqrt(a)
}
}
rho[K]=mean(unlist(cr2))
} else{
rho[K]=0
}
#consolidation
res.consol[[K]]=cluscata_kmeans(Data, nblo, coupe, rho=rho[K], NameBlocks = NameBlocks, NameVar=NameVar, Itermax=Itermax,
Graph_groups=FALSE, Warnings=Warnings)
}
names(cutree_k)=names(rho)=names(res.consol)=paste0("partition", 1:gpmax)
#after consolidation
diff_crit_bef=results[,4][(nblo-gpmax+1):(nblo-1)]
if (Noise_cluster==FALSE)
{
#after consolidation
overall_after=NULL
crit_after=NULL
for (i in 1: gpmax)
{
overall_after[i]=res.consol[[i]]$homogeneity[i+1,1]
crit_after[i]=res.consol[[i]]$criterion
}
diff_crit_after=crit_after[-gpmax]-crit_after[-1]
diff_crit_after=sort(diff_crit_after)
overall_after=sort(overall_after, decreasing = TRUE)
overall_homogeneity_ng=rbind(quality[(nblo-gpmax):(nblo-1)]*100, overall_after)
diff_crit_ng=rbind(diff_crit_bef, diff_crit_after)
rownames(diff_crit_ng)=rownames(overall_homogeneity_ng)=c("before consolidation", "after consolidation")
colnames(diff_crit_ng)=paste((gpmax):2,"->",(gpmax-1):1)
colnames(overall_homogeneity_ng)=paste (gpmax:1, "cluster(s)")
}else{
overall_homogeneity_ng=quality[(nblo-gpmax):(nblo-1)]*100
diff_crit_ng=diff_crit_bef
names(diff_crit_ng)=paste((gpmax):2,"->",(gpmax-1):1)
names(overall_homogeneity_ng)=paste (gpmax:1, "cluster(s)")
}
#results
res=c(res.consol, list(dend=mydendC, cutree_k=cutree_k,
overall_homogeneity_ng=round(overall_homogeneity_ng,1),
diff_crit_ng=round(diff_crit_ng,2),test_one_cluster=testonecluster, param=list(nblo=nblo, ng=nbgroup_hart,
Noise_cluster=Noise_cluster, gpmax=gpmax, n=n, nvar=nvar), type="H+C"))
class(res)="cluscata"
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.