R/cluscata_liking.R

Defines functions cluscata_liking

Documented in cluscata_liking

## =============================================================================
##' @title Perform a cluster analysis of subjects on CATA/liking combination
##'
##' @description
##' Clustering of subjects (blocks) from combination of CATA and liking experiments.
##'
##' @usage
##' cluscata_liking(Data, nblo, NameBlocks=NULL, NameVar=NULL,  Itermax=30,
##'                        Graph_dend=TRUE, Graph_bar=TRUE,
##'                         printlevel=FALSE,gpmax=min(6, nblo-1))
##'
##' @param Data data frame or matrix where the blocks of variables (attributes) are merged horizontally thanks to \code{\link{combinCATALiking}} function
##'
##' @param nblo  numerical. Number of blocks (subjects).
##'
##' @param NameBlocks string vector. Name of each block (subject). Length must be equal to the number of blocks. If NULL, the names are S-1,...,S-m. Default: NULL
##'
##' @param NameVar string vector. Name of each variable (attribute, the same names for each subject). Length must be equal to the number of attributes. If NULL, the colnames of the first block are taken. Default: NULL
##'
##' @param Itermax numerical. Maximum of iteration for the partitioning algorithm. Default:30
##'
##' @param Graph_dend logical. Should the dendrogram be plotted? Default: TRUE
##'
##' @param Graph_bar logical. Should the barplot of the difference of the criterion and the barplot of the overall homogeneity at each merging step of the hierarchical algorithm be plotted? Default: TRUE
##'
##' @param printlevel logical. Print the number of remaining levels during the hierarchical clustering algorithm? Default: FALSE
##'
##' @param gpmax numerical. Maximum number of clusters to consider. Default: min(6, nblo-1)
##'
##' @return Each partitionK contains a list for each number of clusters of the partition, K=1 to gpmax with:
##'         \itemize{
##'          \item group: the clustering partition after consolidation.
##'          \item compromise: the compromise of each cluster
##'          \item dist_all_cluster: the distance between each subject and each cluster compromise
##'          \item criterion: the CLUSCATA-liking criterion error
##'          \item param: parameters called
##'          \item type: parameter passed to other functions
##'          }
##'          There is also at the end of the list:
##'          \itemize{
##'          \item dend: The CLUSCATA dendrogram
##'          \item cutree_k: the partition obtained by cutting the dendrogram in K clusters (before consolidation).
##'          \item diff_crit_ng: variation of criterion when a merging is done before consolidation (and after)
##'          \item param: parameters called
##'          \item type: parameter passed to other functions
##'          }
##'
##' @keywords CATA liking
##'
##' @references
##' Vigneau, E., Cariou, V., Giacalone, D., Berget, I., & Llobell, F. (2022). Combining hedonic information and CATA description for consumer segmentation. Food Quality and Preference, 95, 104358.
##'
##' @importFrom  FactoMineR tab.disjonctif
##'
##' @examples
##' data(cata_ryebread)
##' data(liking_ryebread)
##' cataliking=combinCATALiking(cata_ryebread, liking_ryebread)
##'
##'#with only 40 subjects
##' resclustcatal=cluscata_liking(Data=cataliking[,1:(40*14)], nblo=40, gpmax=5)
##' plot(resclustcatal, cata_ryebread[,1:(40*14)], liking_ryebread[,1:40])
##'
##'
##' @seealso   \code{\link{plot.cluscata_liking}}, \code{\link{summary.cluscata_liking}} , \code{\link{combinCATALiking}}
##'
##' @export
## =============================================================================



cluscata_liking=function(Data, nblo, NameBlocks=NULL, NameVar=NULL,  Itermax=30,
                         Graph_dend=TRUE, Graph_bar=TRUE, printlevel=FALSE,gpmax=min(6, nblo-1)){

  # Hierarchical clustering of the nblo subject blocks on the CLUSCATA-liking
  # criterion, followed by a K-means-style consolidation for each partition size
  # K = 1..gpmax. Returns an object of class "cluscata_liking" (see roxygen
  # header above for the full structure of the returned list).

  #initialisation: dimensions of the merged data table
  n=nrow(Data)   # number of products (rows)
  p=ncol(Data)   # total number of columns across all subject blocks
  nvar=p/nblo    # number of variables (attributes) per subject
  #parapet for nblo: ncol(Data) must be an exact multiple of nblo
  if (as.integer(nvar)!=nvar)
  {
    stop("number of columns modulo nblo != 0")
  }
  Blocks=rep(nvar,nblo)
  J=rep(1:nblo , times =  Blocks )# indicates which block each variable belongs to

  # default names: subjects "S-1".."S-m", rows "X1"..,
  # columns "Y1".."Ynvar" recycled across blocks
  if (is.null(NameBlocks)) NameBlocks=paste("S",1:nblo,sep="-")
  if(is.null(rownames(Data))) rownames(Data)=paste0("X", 1:nrow(Data))
  if(is.null(colnames(Data))) colnames(Data)=rep(paste0("Y",1:Blocks[1]), nblo)

  X=Data


  #Parapet for NameBlocks: must supply one name per block
  if(length(NameBlocks)!=nblo)
  {
    stop("Error with the length of NameBlocks")
  }


  #parapet for number of objects (products)
  if(n<3)
  {
    stop("At least 3 objects are required")
  }

  #parapet for number of blocks (subjects)
  if(nblo<4)
  {
    stop("At least 4 blocks are required")
  }


  #parapet for gpmax: cannot ask for more partitions than merge levels available
  if (gpmax>(nblo-1))
  {
    stop(paste("gpmax > number of blocks-1"))
  }

  #no NA allowed: report their positions before stopping
  if(sum(is.na(Data))>0)
  {
    print("NA detected:")
    tabna=which(is.na(Data), arr.ind = TRUE)
    print(tabna)
    stop(paste("NA are not accepted"))
  }


  #prepare data: split X into one n x nvar slice per subject, each scaled
  #to unit Frobenius norm so every subject carries the same total weight
  Xi=array(0,dim=c(n,Blocks[1],nblo))  # array with all subjects matrices
  muk=NULL                             # per-subject norms (kept for scaling)
  for(j in 1:nblo)
  {
    Aj=as.matrix(X[,J==j])
    normXi=sqrt(sum(diag(Aj%*%t(Aj)))) # Frobenius norm of subject j's block
    muk[j]=normXi
    if(normXi==0)
    {
      stop(paste("error: the subject",NameBlocks[j], "has only 0"))
    }
    Xi[,,j]=Aj/normXi #standardization
  }


  # criterion when each data table forms a group itself: 0
  crit=rep(0,nblo)      # current criterion value of each cluster
  Q_current=0           # running total of the clustering criterion
  critS=rep(1,nblo)
  S_current=nblo

  cc=1:nblo             # current cluster membership of each block
  # the names of the clusters are 1 to 'number of clusters'
  code=1:nblo;
  # the names of the clusters are taken from the HCA
  cvar=1:nblo
  # when a new cluster is formed from clusters i and j, it is named min(i,j))
  mergelist=matrix(0,nblo-1,2)
  results=matrix(0,nblo-1,6)
  colnames(results)=c("merg1", "merg2", "new.clust", "agg.crit.hac","clust.crit.hac","S")
  idgr=matrix(0,nblo,nblo)   # membership history, one row per level
  idgr[nblo,]=1:nblo;
  ordr = c()                 # leaf order for the dendrogram
  fusions = -(1:nblo)        # hclust coding: negative = singleton leaf
  hmerge =matrix(0,nblo-1,2) # hclust-style merge matrix

  ncluster=nblo

  # agglomerative loop: at each level merge the pair of clusters whose fusion
  # increases the criterion the least
  for (level in 1:(nblo-1)) {

    ################
    # loops i and j
    ################

    # We search the two clusters that will be merged.

    deltamin=10^100   # sentinel: larger than any achievable delta
    for (i in 1:(ncluster-1)) {
      for (j in (i+1):(ncluster)) {
        # merging the i'th and j'th cluster:
        newcluster=c(which(cc==i), which(cc==j))
        Xj=list()
        for (tab in 1:length(newcluster)) {
          Xj[[tab]]=Xi[,,newcluster[tab]]
        }
        # internal helper, not visible here: presumably returns (in $Q) the
        # CLUSCATA-liking criterion of the candidate merged cluster -- TODO confirm
        newcatatislik=.crit_cataMean(Xj)
        deltacurrent=newcatatislik$Q-sum(crit[c(i,j)])
        # if deltacurrent is smaller than deltamin, the current information is saved:
        if (deltacurrent<deltamin) {
          deltamin=deltacurrent
          c1=cvar[cc==i]
          c2=cvar[cc==j]
          merge1=c(c1[1], c2[1])   # HCA-style names of the two merged clusters
          c1=code[cc==i]
          c2=code[cc==j]
          merge=c(c1[1], c2[1])    # sequential codes of the two merged clusters
          cl_1=i
          cl_2=j
          catatislikmerge=newcatatislik
        }
      }      # end of loop j
    }      # end of loop i
    ncluster=ncluster-1

    ###############################
    # renewal of the parameters
    ###############################

    Q_current=Q_current+deltamin
    mergelist[level,]=merge1
    # merge is length 2, so this fills columns 1:5 of the results row
    results[level,1:5]=c(merge, nblo+level, deltamin, Q_current)
    crit[cl_1]=catatislikmerge$Q
    crit=crit[-cl_2]
    # relabel memberships: cl_2 absorbed into cl_1, higher indices shifted down
    cc[which(cc==cl_2)]=cl_1
    cc[which(cc>cl_2)]=cc[which(cc>cl_2)]-1
    cvar[cc==cl_1]=min(merge1)
    code[cc==cl_1]=nblo+level
    idgr[ncluster,]=cc
    # hclust coding: negative entries are original leaves, positive are
    # previously-formed clusters (by level number)
    indic= c(fusions[merge1[1]],fusions[merge1[2]])
    hmerge[level,]<-indic
    fusions[which(idgr[ncluster,]==cl_1)]<- level
    # internal helper: maintains the leaf ordering for plotting -- TODO confirm
    ordr <- .order_var(ordr,which(idgr[ncluster,]==cl_1))

    if(printlevel==TRUE)
    {
      print(nblo-1-level)   # number of remaining merge levels
    }
  }  # end of level


  # Dendrogram: assemble an hclust-compatible object from the merge history
  resultscah=list(labels=NameBlocks, height=results[,4], merge=hmerge,order=ordr)
  mytot<-resultscah
  class(mytot)="hclust"
  mydendC=as.dendrogram(mytot)

  #show the dendrogram
  if (Graph_dend==TRUE)
  {
    dev.new()
    cex=0.6
    par(cex=cex)
    par(mar=c(7, 4, 4, 2) + 0.1)
    plot(mydendC, type ="rectangle",  main="CLUSCATA liking Dendrogram", axes=TRUE, cex=cex,ylab="Height")
    # restore graphical parameters changed above (reset manually, not via on.exit)
    par(cex=1)
    par(mar=c(5, 4, 4, 2) + 0.1)
  }


  #show the criterion and homogeneity evolutions
  if (Graph_bar==TRUE)
  {
    dev.new()
    barplot(results[,4][(nblo-gpmax):(nblo-1)],xlab="Nb clusters",ylab="delta", main="Variation of criterion",
            axisnames = TRUE,names.arg = paste((gpmax+1):2,"->",(gpmax):1),las=2,cex.names = 0.6,cex.main=1.2,col = "blue")
  }


  #number of clusters advised: Hartigan-like index on the sorted criterion values
  criter=sort(results[,5],decreasing = TRUE)
  H=NULL
  for (k in 1:(gpmax-1))
  {
    H[k]=((criter[k]/criter[k+1]-1))*(nblo-k-1)
  }
  #barplot(H[-(gpmax-1)]-H[-1], names.arg=2:gpmax)
  nbgroup_hart=which.max(H[-(gpmax-1)]-H[-1])+1
  cat(paste("Recommended number of clusters =", nbgroup_hart),"\n")




  ###### consolidation: for each K, cut the tree and refine by partitioning
  res.consol=list()
  cutree_k=list()
  for (K in 1:gpmax)
  {
    coupe=cutree(mytot,K)     # partition in K clusters before consolidation
    cutree_k[[K]]=coupe

    #consolidation
    # internal helper: K-means-style partitioning seeded with the HAC cut -- TODO confirm
    res.consol[[K]]=.cluscata_kmeans_liking(Data, nblo, coupe,
                                           NameBlocks = NameBlocks, NameVar=NameVar, Itermax=Itermax)

  }
  names(cutree_k)=names(res.consol)=paste0("partition", 1:gpmax)

  #before consolidation: criterion variation taken from the HAC merge history
  diff_crit_bef=results[,4][(nblo-gpmax+1):(nblo-1)]
  #after consolidation
  overall_after=NULL
  crit_after=NULL
  for (i in 1: gpmax)
  {
    crit_after[i]=res.consol[[i]]$criterion
  }
  diff_crit_after=crit_after[-gpmax]-crit_after[-1]
  diff_crit_after=sort(diff_crit_after)   # sorted so variations are increasing with merges
  diff_crit_ng=rbind(diff_crit_bef, diff_crit_after)
  rownames(diff_crit_ng)=c("before consolidation", "after consolidation")
  colnames(diff_crit_ng)=paste((gpmax):2,"->",(gpmax-1):1)



  #results: consolidated partitions first, then dendrogram and summaries
  res=c(res.consol, list(dend=mydendC, cutree_k=cutree_k,
                         diff_crit_ng=round(diff_crit_ng,2), param=list(nblo=nblo, ng=nbgroup_hart,
                                                                        gpmax=gpmax, n=n, nvar=nvar), type="H+C"))

  class(res)="cluscata_liking"

  return(res)

}

Try the ClustBlock package in your browser

Any scripts or data that you put into this service are public.

ClustBlock documentation built on March 24, 2026, 9:08 a.m.