Nothing
## =============================================================================
##' @title Perform a cluster analysis of blocks of quantitative variables
##'
##' @description
##' Hierarchical clustering of quantitative Blocks followed by a partitioning algorithm (consolidation). Each cluster of blocks is associated with a compromise
##' computed by the STATIS method. Moreover, a noise cluster can be set up.
##'
##' @usage
##' clustatis(Data,Blocks,NameBlocks=NULL,Noise_cluster=FALSE,
##' Unique_threshold=TRUE,scale=FALSE,
##' Itermax=30, Graph_dend=TRUE, Graph_bar=TRUE,
##' printlevel=FALSE, gpmax=min(6, length(Blocks)-2), rhoparam=NULL,
##' Testonlyoneclust=FALSE, alpha=0.05, nperm=50)
##'
##'
##' @param Data data frame or matrix. Correspond to all the blocks of variables merged horizontally
##'
##' @param Blocks numerical vector. The number of variables of each block. The sum must be equal to the number of columns of Data
##'
##' @param NameBlocks string vector. Name of each block. Length must be equal to the length of Blocks vector. If NULL, the names are B1,...Bm. Default: NULL
##'
##' @param Noise_cluster logical. Should a noise cluster be computed? Default: FALSE
##'
##' @param Unique_threshold logical. Use same rho for every cluster? Default: TRUE
##'
##' @param scale logical. Should the data variables be scaled? Default: FALSE
##'
##' @param Itermax numerical. Maximum of iteration for the partitioning algorithm. Default: 30
##'
##' @param Graph_dend logical. Should the dendrogram be plotted? Default: TRUE
##'
##' @param Graph_bar logical. Should the barplot of the difference of the criterion and the barplot of the overall homogeneity at each merging step of the hierarchical algorithm be plotted? Default: TRUE
##'
##' @param printlevel logical. Print the number of remaining levels during the hierarchical clustering algorithm? Default: FALSE
##'
##' @param gpmax logical. What is maximum number of clusters to consider? Default: min(6, number of blocks -2)
##'
##' @param rhoparam numerical or vector. What is the threshold for the noise cluster? Between 0 and 1, high value can imply lot of blocks set aside. If NULL, automatic threshold is computed. Can be different for each group (in this case, provide a vector)
##'
##' @param Testonlyoneclust logical. Test if there is more than one cluster? Default: FALSE
##'
##' @param alpha numerical between 0 and 1. What is the threshold to test if there is more than one cluster? Default: 0.05
##'
##' @param nperm numerical. How many permutations are required to test if there is more than one cluster? Default: 50
##'
##'
##'
##' @return Each partitionK contains a list for each number of clusters of the partition, K=1 to gpmax with:
##' \itemize{
##' \item group: the clustering partition of datasets after consolidation. If Noise_cluster=TRUE, some blocks could be in the noise cluster ("K+1")
##' \item rho: the threshold(s) for the noise cluster (computed or input parameter)
##' \item homogeneity: homogeneity index (%) of each cluster and the overall homogeneity index (%) of the partition
##' \item rv_with_compromise: RV coefficient of each block with its cluster compromise
##' \item weights: weight associated with each block in its cluster
##' \item comp_RV: RV coefficient between the compromises associated with the various clusters
##' \item compromise: the W compromise of each cluster
##' \item coord: the coordinates of objects of each cluster
##' \item inertia: percentage of total variance explained by each axis for each cluster
##' \item rv_all_cluster: the RV coefficient between each block and each cluster compromise
##' \item criterion: the CLUSTATIS criterion error
##' \item param: parameters called in the consolidation
##' \item type: parameter passed to other functions
##' }
##' There is also at the end of the list:
##' \itemize{
##' \item dend: The CLUSTATIS dendrogram
##' \item cutree_k: the partition obtained by cutting the dendrogram for K clusters (before consolidation).
##' \item overall_homogeneity_ng: percentage of overall homogeneity by number of clusters before consolidation (and after if there is no noise cluster)
##' \item diff_crit_ng: variation of criterion when a merging is done before consolidation (and after if there is no noise cluster)
##' \item test_one_cluster: decision and pvalue to know if there is more than one cluster
##' \item param: parameters called
##' \item type: parameter passed to other functions
##' }
##'
##'
##'
##' @keywords quantitative
##'
##' @references
##' Llobell, F., Cariou, V., Vigneau, E., Labenne, A., & Qannari, E. M. (2018). Analysis and clustering of multiblock datasets by means of the STATIS and CLUSTATIS methods. Application to sensometrics. Food Quality and Preference, in Press.\cr
##' Llobell, F., Vigneau, E., Qannari, E. M. (2019). Clustering datasets by means of CLUSTATIS with identification of atypical datasets. Application to sensometrics. Food Quality and Preference, 75, 97-104.
##' Llobell, F., & Qannari, E. M. (2020). CLUSTATIS: Cluster analysis of blocks of variables. Electronic Journal of Applied Statistical Analysis, 13(2).
##'
##' @importFrom stats as.dendrogram cor cutree runif
##'
##'
##' @examples
##'
##' data(smoo)
##' NameBlocks=paste0("S",1:24)
##' cl=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks)
##' #plot(cl, ngroups=3, Graph_dend=FALSE)
##' summary(cl)
##' #with noise cluster
##' cl2=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Noise_cluster=TRUE, Graph_dend=FALSE, Graph_bar=FALSE)
##' #with noise cluster and defined rho threshold
##' cl3=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Noise_cluster=TRUE, Graph_dend=FALSE, Graph_bar=FALSE, rhoparam=0.5)
##' #different Noise cluster thresholds
##' cl4=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##' Noise_cluster=TRUE, Graph_dend=FALSE, Graph_bar=FALSE, Unique_threshold= FALSE,
##' rhoparam=c(0.6, 0.5,0.4))
##'
##' @seealso \code{\link{plot.clustatis}}, \code{\link{summary.clustatis}} , \code{\link{clustatis_kmeans}}, \code{\link{statis}}
##'
##' @export
## =============================================================================
clustatis <- function(Data, Blocks, NameBlocks = NULL, Noise_cluster = FALSE,
Unique_threshold = TRUE, scale = FALSE,
Itermax = 30, Graph_dend = TRUE, Graph_bar = TRUE,
printlevel = FALSE, gpmax = min(6, length(Blocks) - 2), rhoparam = NULL,
Testonlyoneclust = FALSE, alpha = 0.05, nperm = 50) {
nblo <- length(Blocks)
n <- nrow(Data)
if (is.null(NameBlocks)) NameBlocks <- paste("B", 1:nblo, sep = "-")
if (is.null(rownames(Data))) rownames(Data) <- paste0("X", 1:nrow(Data))
if (is.null(colnames(Data))) colnames(Data) <- paste0("Y", 1:ncol(Data))
# parapet for numerical Data
for (i in 1:ncol(Data))
{
if (is.numeric(Data[, i]) == FALSE) {
stop(paste("The data must be numeric (column", i, ")"))
}
}
# parapet for number of objects
if (n < 3) {
stop("At least 3 objects are required")
}
# parapet for number of blocks
if (nblo < 4) {
stop("At least 4 blocks are required")
}
# parapet for Blocks
if (sum(Blocks) != ncol(Data)) {
stop("Error with Blocks")
}
# Parapet for NameBlocks
if (length(NameBlocks) != nblo) {
stop("Error with the length of NameBlocks")
}
# parapet for scale: no constant variable
if (scale == TRUE) {
for (i in 1:ncol(Data))
{
if (sd(Data[, i]) == 0) {
stop(paste("Column", i, "is constant"))
}
}
}
# parapet for gpmax
if (gpmax > (nblo - 1)) {
stop(paste("gpmax > number of blocks-1"))
}
# parapet for rho
if (Noise_cluster == TRUE && is.null(rhoparam) == FALSE) {
if (Unique_threshold == TRUE) {
if (length(rhoparam) > 1) {
stop(paste("If Unique_threshold is TRUE, rhoparam should be NULL or a number between 0 and 1"))
}
if (rhoparam < 0 | rhoparam > 1) {
stop(paste("If Unique_threshold is TRUE, rhoparam should be NULL or a number between 0 and 1"))
}
} else {
if (length(rhoparam) > 1) {
gpmax <- length(rhoparam)
} else {
stop(paste("If Unique_threshold is FALSE, rhoparam should be NULL or a vector of numbers between 0 and 1"))
}
for (i in 1:gpmax)
{
if (rhoparam[i] < 0 | rhoparam[i] > 1) {
stop(paste("If Unique_threshold is FALSE, rhoparam should be NULL or a vector of numbers between 0 and 1"))
}
}
}
}
# no NA
if (sum(is.na(Data)) > 0) {
print("NA detected:")
tabna <- which(is.na(Data), arr.ind = TRUE)
print(tabna)
stop(paste("NA are not accepted"))
}
# centering and scaling if necessary
Data <- scale(Data, center = TRUE, scale = scale)
J <- rep(1:nblo, times = Blocks) # indicates which block each variable belongs to
# parapet for constant configurations and compute of Wi
Wi <- array(0, dim = c(nrow(Data), nrow(Data), nblo))
for (i in 1:nblo)
{
Xi <- as.matrix(Data[, J == i])
Wi[, , i] <- tcrossprod(Xi)
nor <- sqrt(sum(diag(tcrossprod(Wi[, , i]))))
if (nor == 0) {
stop(paste("Configuration", i, "is constant"))
}
Wi[, , i] <- Wi[, , i] / nor # standardization
}
# only one cluster?
if (Testonlyoneclust == TRUE) {
testonecluster <- .one_cluster_or_more(Data, Blocks, nperm = nperm, scale = scale, alpha = alpha)
if (testonecluster$decision == TRUE) {
testonecluster$decision <- "Only one cluster can be considered"
} else {
testonecluster$decision <- "Clustering is necessary"
}
} else {
testonecluster <- list(decision = "Untested", pvalue = "Untested")
}
# RV matrix:
RV <- matrix(0, nblo, nblo)
diag(RV) <- rep(1, nblo)
if (nblo > 1) {
for (i in 1:(nblo - 1)) {
for (j in (i + 1):nblo) {
RV[i, j] <- sum(diag(crossprod(Wi[, , i], Wi[, , j])))
RV[j, i] <- RV[i, j]
}
}
}
# criterion Q when each data table forms a group itself: 0
crit <- rep(0, nblo)
Q_current <- 0
# critS=rep(1,nblo)
S_current <- nblo
cc <- 1:nblo
# the names of the clusters are 1 to 'number of clusters'
code <- 1:nblo
# the names of the clusters are taken from the HCA
cvar <- 1:nblo
# when a new cluster is formed from clusters i and j, it is named min(i,j))
mergelist <- matrix(0, nblo - 1, 2)
results <- matrix(0, nblo - 1, 5)
colnames(results) <- c("merg1", "merg2", "new.clust", "agg.crit.hac", "clust.crit.hac")
idgr <- matrix(0, nblo, nblo)
idgr[nblo, ] <- 1:nblo
ordr <- c()
fusions <- -(1:nblo)
hmerge <- matrix(0, nblo - 1, 2)
ncluster <- nblo
# for homogeneity
quality <- NULL
for (level in 1:(nblo - 1)) {
################
# loops i et j
################
# We search the two clusters that will be merged.
deltamin <- 10^100
for (i in 1:(ncluster - 1)) {
for (j in (i + 1):(ncluster)) {
# merging the i'th and j'th cluster:
newcluster <- c(which(cc == i), which(cc == j))
Wj <- list()
for (tab in 1:length(newcluster)) {
Wj[[tab]] <- Wi[, , newcluster[tab]]
}
newstatis <- .crit_statisWj_fast(Wj, newcluster, RV)
deltacurrent <- newstatis - sum(crit[c(i, j)])
# if deltacurrent is smaller than deltamin, the current information is saved:
if (deltacurrent < deltamin) {
deltamin <- deltacurrent
c1 <- cvar[cc == i]
c2 <- cvar[cc == j]
merge1 <- c(c1[1], c2[1])
c1 <- code[cc == i]
c2 <- code[cc == j]
merge <- c(c1[1], c2[1])
cl_1 <- i
cl_2 <- j
statismerge <- newstatis
}
} # end of loop j
} # end of loop i
ncluster <- ncluster - 1
###############################
# renewal of the parameters
###############################
Q_current <- Q_current + deltamin
mergelist[level, ] <- merge1
results[level, 1:5] <- c(merge, nblo + level, deltamin, Q_current)
crit[cl_1] <- statismerge
crit <- crit[-cl_2]
cc[which(cc == cl_2)] <- cl_1
cc[which(cc > cl_2)] <- cc[which(cc > cl_2)] - 1
cvar[cc == cl_1] <- min(merge1)
code[cc == cl_1] <- nblo + level
idgr[ncluster, ] <- cc
indic <- c(fusions[merge1[1]], fusions[merge1[2]])
hmerge[level, ] <- indic
fusions[which(idgr[ncluster, ] == cl_1)] <- level
ordr <- .order_var(ordr, which(idgr[ncluster, ] == cl_1))
# #overall homogeneity
quality <- c(quality, (nblo - Q_current) / nblo)
# show the level
if (printlevel == TRUE) {
print(nblo - 1 - level)
}
} # end of level
# Dendogram:
resultscah <- list(labels = NameBlocks, height = results[, 4], merge = hmerge, order = ordr)
mytot <- resultscah
class(mytot) <- "hclust"
mydendC <- as.dendrogram(mytot)
# show the dendrogram
if (Graph_dend == TRUE) {
dev.new()
cex <- 0.6
par(cex = cex)
par(mar = c(7, 4, 4, 2) + 0.1)
plot(mydendC, type = "rectangle", main = "CLUSTATIS Dendrogram", axes = TRUE, cex = cex, ylab = "Height")
par(cex = 1)
par(mar = c(5, 4, 4, 2) + 0.1)
}
# show the criterion and homogeneity evolutions
if (Graph_bar == TRUE) {
dev.new()
barplot(results[, 4][(nblo - gpmax):(nblo - 1)],
xlab = "Nb clusters", ylab = "delta", main = "Variation of criterion",
axisnames = TRUE, names.arg = paste((gpmax + 1):2, "->", (gpmax):1), las = 2, cex.names = 0.6, cex.main = 1.2, col = "blue"
)
dev.new()
barplot(quality[(nblo - gpmax):(nblo - 1)] * 100,
xlab = "Nb clusters", ylab = "Overall homogeneity (%)", main = "Overall homogeneity (%)",
axisnames = TRUE, names.arg = paste((gpmax):1), las = 2, cex.names = 0.6, cex.main = 1.2, col = "blue"
)
}
# number of clusters advised
criter <- sort(results[, 5], decreasing = TRUE)
H <- NULL
for (k in 1:min(gpmax, nblo - 2))
{
H[k] <- (criter[k] / criter[k + 1] - 1) * (nblo - k - 1)
}
nbgroup_hart <- which.max(H[-(min(gpmax, nblo - 2))] - H[-1]) + 1
cat(paste("Recommended number of clusters =", nbgroup_hart), "\n")
###### find the threshold(s) rho for the noise cluster and consolidation
res.consol <- list()
cutree_k <- list()
rho <- NULL
rho_k <- list()
for (K in 1:gpmax)
{
coupe <- cutree(mytot, K)
cutree_k[[K]] <- coupe
if (Noise_cluster == TRUE) {
if (is.null(rhoparam) == TRUE) {
oldgroup <- coupe
# compute the compromises
Wk <- array(0, dim = c(nrow(Data), nrow(Data), K))
for (i in 1:K)
{
cluster <- which(oldgroup == i)
Wj <- list()
for (tab in 1:length(cluster)) {
Wj[[tab]] <- Wi[, , cluster[tab]]
}
statisk <- .crit_statisWj(Wj, cluster, RV)
Wk[, , i] <- statisk$W
}
# compute the criterions
cr <- list()
cr2 <- list()
for (i in 1:nblo)
{
a <- NULL
for (k in 1:K)
{
W_k <- as.matrix(Wk[, , k])
normW <- sum(diag(crossprod(W_k)))
W_i <- Wi[, , i]
a <- c(a, sum(diag(crossprod(W_i, W_k)))^2 / (normW))
}
cr[[i]] <- sqrt(a)
if (K > 1) {
cr2[[i]] <- sort(sqrt(a), decreasing = TRUE)[1:2]
} else {
cr2[[i]] <- sqrt(a)
}
}
rho[K] <- mean(unlist(cr2))
# multiple threshold
if (Unique_threshold == FALSE) {
grou <- NULL
for (i in 1:nblo)
{
grou[i] <- which.max(cr[[i]])
}
mat_utile <- matrix(unlist(cr2), nrow = nblo, ncol = 2, byrow = TRUE)
rho_k[[K]] <- 0
for (k in 1:K)
{
rho_k[[K]] <- c(rho_k[[K]], (mean(mat_utile[which(grou == k), 1]) + mean(mat_utile[which(grou == k), 2])) / 2)
}
rho_k[[K]] <- rho_k[[K]][-1]
}
} else {
if (Unique_threshold == TRUE) {
rho[K] <- rhoparam
} else {
rho_k[[K]] <- rhoparam[1:K]
}
}
} else {
rho[K] <- 0
}
if (Unique_threshold == FALSE) {
rho2consider <- unlist(rho_k[[K]])
} else {
rho2consider <- rho[K]
}
# consolidation
res.consol[[K]] <- clustatis_kmeans(Data, Blocks, coupe,
rho = rho2consider, NameBlocks = NameBlocks, Itermax = Itermax,
Graph_groups = FALSE, Graph_weights = FALSE, scale = scale
)
}
names(cutree_k) <- names(res.consol) <- paste0("partition", 1:gpmax)
# after consolidation
diff_crit_bef <- results[, 4][(nblo - gpmax + 1):(nblo - 1)]
if (Noise_cluster == FALSE) {
# after consolidation
overall_after <- NULL
crit_after <- NULL
for (i in 1:gpmax)
{
overall_after[i] <- res.consol[[i]]$homogeneity[i + 1, 1]
crit_after[i] <- res.consol[[i]]$criterion
}
diff_crit_after <- crit_after[-gpmax] - crit_after[-1]
diff_crit_after <- sort(diff_crit_after)
overall_after <- sort(overall_after, decreasing = TRUE)
overall_homogeneity_ng <- rbind(quality[(nblo - gpmax):(nblo - 1)] * 100, overall_after)
diff_crit_ng <- rbind(diff_crit_bef, diff_crit_after)
rownames(diff_crit_ng) <- rownames(overall_homogeneity_ng) <- c("before consolidation", "after consolidation")
colnames(diff_crit_ng) <- paste((gpmax):2, "->", (gpmax - 1):1)
colnames(overall_homogeneity_ng) <- paste(gpmax:1, "cluster(s)")
} else {
overall_homogeneity_ng <- quality[(nblo - gpmax):(nblo - 1)] * 100
diff_crit_ng <- diff_crit_bef
names(diff_crit_ng) <- paste((gpmax):2, "->", (gpmax - 1):1)
names(overall_homogeneity_ng) <- paste(gpmax:1, "cluster(s)")
}
# results
res <- c(res.consol, list(
dend = mydendC, cutree_k = cutree_k,
overall_homogeneity_ng = round(overall_homogeneity_ng, 1),
diff_crit_ng = round(diff_crit_ng, 2), test_one_cluster = testonecluster, param = list(
nblo = nblo, ng = nbgroup_hart,
Noise_cluster = Noise_cluster, gpmax = gpmax, n = n
), type = "H+C"
))
class(res) <- "clustatis"
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.