R/clustatis_kmeans.R

Defines functions clustatis_kmeans

Documented in clustatis_kmeans

## =============================================================================

##' @title Compute the CLUSTATIS partitioning algorithm on different blocks of quantitative variables
##' @description
##' Partitioning algorithm for quantitative variables. Each cluster is associated with a compromise computed by the STATIS method. Can be performed using a multi-start strategy or initial partition provided by the user. Moreover, a noise cluster can be set up.
##'
##' @usage
##' clustatis_kmeans(Data, Blocks, clust, nstart=100, rho=0, NameBlocks=NULL,
##' Itermax=30,Graph_groups=TRUE, Graph_weights=FALSE,
##'  scale=FALSE, print_attempt=FALSE)
##'
##'
##' @param Data data frame or matrix. Correspond to all the blocks of variables merged horizontally
##'
##' @param Blocks  numerical vector. The number of variables of each block. The sum must be equal to the number of columns of Data
##'
##' @param clust numerical vector or integer. Initial partition or number of starting partitions if integer. If numerical vector, the numbers must be 1,2,3,...,number of clusters
##'
##' @param nstart integer. Number of starting partitions. Default: 100
##'
##' @param rho numerical or vector between 0 and 1. Threshold for the noise cluster. Default:0. If you want a different threshold for each cluster, you can provide a vector.
##'
##' @param NameBlocks string vector. Name of each block. Length must be equal to the length of Blocks vector. If NULL, the names are B1,...Bm. Default: NULL
##'
##' @param Itermax numerical. Maximum of iterations by partitioning algorithm. Default: 30
##'
##' @param Graph_groups logical. Should each cluster compromise be plotted? Default: TRUE
##'
##' @param Graph_weights logical. Should the barplot of the weights in each cluster be plotted? Default: FALSE
##'
##' @param scale logical. Should the data variables be scaled? Default: FALSE
##'
##' @param print_attempt logical. Print the number of remaining attempts in the multi-start case? Default: FALSE
##'
##'
##'
##'
##'
##'
##' @return a list with:
##'         \itemize{
##'          \item group: the clustering partition. If rho>0, some blocks could be in the noise cluster ("K+1")
##'          \item rho: the threshold(s) for the noise cluster
##'          \item homogeneity: percentage of homogeneity of the blocks in each cluster and the overall homogeneity
##'          \item rv_with_compromise: RV coefficient of each block with its cluster compromise
##'          \item weights: weight associated with each block in its cluster
##'          \item comp_RV: RV coefficient between the compromises associated with the various clusters
##'          \item compromise: the W compromise of each cluster
##'          \item coord: the coordinates of objects of each cluster
##'          \item inertia: percentage of total variance explained by each axis for each cluster
##'          \item rv_all_cluster: the RV coefficient between each block and each cluster compromise
##'          \item criterion: the CLUSTATIS criterion error
##'          \item param: parameters called
##'          \item type: parameter passed to other functions
##'          }
##'
##'
##'
##'
##' @references
##' Llobell, F., Cariou, V., Vigneau, E., Labenne, A., & Qannari, E. M. (2018). Analysis and clustering of multiblock datasets by means of the STATIS and CLUSTATIS methods. Application to sensometrics. Food Quality and Preference, in Press.\cr
##' Llobell, F., Vigneau, E., Qannari, E. M. (2019). Clustering datasets by means of CLUSTATIS with identification of atypical datasets. Application to sensometrics. Food Quality and Preference, 75, 97-104.
##'
##'
##'
##' @keywords quantitative
##'
##' @examples
##'
##'  data(smoo)
##'  NameBlocks=paste0("S",1:24)
##'  #with multi-start
##'  cl_km=clustatis_kmeans(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks, clust=3)
##'  #with an initial partition
##'  cl=clustatis(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##'  Graph_dend=FALSE)
##'  partition=cl$cutree_k$partition3
##'  cl_km2=clustatis_kmeans(Data=smoo,Blocks=rep(2,24),NameBlocks = NameBlocks,
##'  clust=partition, Graph_weights=FALSE, Graph_groups=FALSE)
##'  graphics.off()
##'
##' @seealso   \code{\link{plot.clustatis}}, \code{\link{clustatis}}, \code{\link{summary.clustatis}}, \code{\link{statis}}
##'
##' @export


## =============================================================================




clustatis_kmeans <- function(Data, Blocks, clust, nstart = 100, rho = 0, NameBlocks = NULL,
                             Itermax = 30, Graph_groups = TRUE,
                             Graph_weights = FALSE, scale = FALSE, print_attempt = FALSE) {
  nblo <- length(Blocks)

  # initialisation or muti-start?
  if (length(clust) == 1) {
    ngroups <- clust
    init <- FALSE
  } else {
    ngroups <- length(unique(clust))
    partition <- clust
    init <- TRUE
    if (sum(sort(unique(clust)) != (1:ngroups)) > 0) {
      stop("The clusters must be 1,2,...,number of clusters")
    }
  }

  n <- nrow(Data)
  if (is.null(NameBlocks)) NameBlocks <- paste("B", 1:nblo, sep = "-")
  if (is.null(rownames(Data))) rownames(Data) <- paste0("X", 1:nrow(Data))
  if (is.null(colnames(Data))) colnames(Data) <- paste0("Y", 1:ncol(Data))

  # parapet for numerical Data
  for (i in 1:ncol(Data))
  {
    if (is.numeric(Data[, i]) == FALSE) {
      stop(paste("The data must be numeric (column", i, ")"))
    }
  }

  # parapet for number of objects
  if (n < 3) {
    stop("At least 3 objects are required")
  }

  # parapet for Blocks
  if (sum(Blocks) != ncol(Data)) {
    stop("Error with Blocks")
  }

  # Parapet for NameBlocks
  if (length(NameBlocks) != nblo) {
    stop("Error with the length of NameBlocks")
  }


  # parapet for scale: no constant variable
  if (scale == TRUE) {
    for (i in 1:ncol(Data))
    {
      if (sd(Data[, i]) == 0) {
        stop(paste("Column", i, "is constant"))
      }
    }
  }

  # parapet for rho
  if (length(rho) == 1) {
    rho <- rep(rho, ngroups)
  }
  if (length(rho) != ngroups) {
    stop(paste("rho should be a a vector of numbers between 0 and 1 and of lenght 1 or equal to the number of clusters"))
  }
  for (i in 1:ngroups)
  {
    if (rho[i] < 0 | rho[i] > 1) {
      stop(paste("rho should be a number or a vector of numbers between 0 and 1"))
    }
  }

  # no NA
  if (sum(is.na(Data)) > 0) {
    print("NA detected:")
    tabna <- which(is.na(Data), arr.ind = TRUE)
    print(tabna)
    stop(paste("NA are not accepted"))
  }

  # parapet for number of groups and number of blocks
  if (ngroups > nblo) {
    stop("number of groups > number of blocks")
  }

  # centering and scaling if necessary
  Data <- scale(Data, center = TRUE, scale = scale)

  J <- rep(1:nblo, times = Blocks) # indicates which block each variable belongs to

  # parapet for constant configurations and compute of Wi

  Wi <- array(0, dim = c(nrow(Data), nrow(Data), nblo))
  for (i in 1:nblo)
  {
    Xi <- as.matrix(Data[, J == i])
    Wi[, , i] <- tcrossprod(Xi)
    nor <- sqrt(sum(diag(crossprod(Wi[, , i]))))
    if (nor == 0) {
      stop(paste("Configuration", i, "is constant"))
    }
    Wi[, , i] <- Wi[, , i] / nor # standardization
  }

  ##### RV matrix#####

  RV <- matrix(0, nblo, nblo)
  diag(RV) <- rep(1, nblo)
  if (nblo > 1) {
    for (i in 1:(nblo - 1)) {
      for (j in (i + 1):nblo) {
        RV[i, j] <- sum(diag(crossprod(Wi[, , i], Wi[, , j])))
        RV[j, i] <- RV[i, j]
      }
    }
  }



  if (init == FALSE) {
    sumRV2 <- 0
    for (tent in 1:nstart)
    {
      coupe <- sample(1:ngroups, nblo, replace = TRUE)
      while (nlevels(as.factor(coupe)) < ngroups) {
        coupe <- sample(1:ngroups, nblo, replace = TRUE) # avoid empty groups
      }


      iter <- 0
      sumRV2_act <- 0
      goout <- 0
      oldgroup <- coupe
      erreur <- NULL
      while (goout != 1 & iter <= Itermax) {
        # avoid empty group
        if (nlevels(as.factor(oldgroup)) < ngroups) {
          coupe <- sample(1:ngroups, nblo, replace = TRUE)
          while (nlevels(as.factor(coupe)) < ngroups) {
            coupe <- sample(1:ngroups, nblo, replace = TRUE)
          }
          oldgroup <- coupe
        }
        # compute the compromises
        Wk <- array(0, dim = c(nrow(Data), nrow(Data), ngroups))
        lamb <- NULL
        alpha <- list()
        for (i in 1:ngroups)
        {
          cluster <- which(oldgroup == i)
          Wj <- list()
          for (tab in 1:length(cluster)) {
            Wj[[tab]] <- Wi[, , cluster[tab]]
          }
          statisgr <- .crit_statisWj(Wj, cluster, RV)
          Wk[, , i] <- statisgr$W
          lamb[i] <- statisgr$lambda / length(cluster)
          alpha[[i]] <- statisgr$u
          erreur[i] <- statisgr$Q
        }


        # compute the criterion
        cr <- list()
        for (i in 1:nblo)
        {
          a <- NULL
          for (k in 1:ngroups)
          {
            W_k <- as.matrix(Wk[, , k])
            normW <- sum(diag(crossprod(W_k)))
            W_i <- Wi[, , i]
            a <- c(a, sum(diag(crossprod(W_i, W_k)))^2 / (normW))
          }
          cr[[i]] <- sqrt(a)
        }

        # choice of the clusters
        newgroup <- NULL
        for (i in 1:nblo)
        {
          newgroup[i] <- which.max(cr[[i]])
          if (max(cr[[i]]) < rho[newgroup[i]]) {
            newgroup[i] <- 0
          }
        }

        if (sum(newgroup != oldgroup) > 0) {
          oldgroup <- newgroup
        } else {
          goout <- 1
        }

        # 1 cluster left?
        if (0 %in% oldgroup & nlevels(factor(oldgroup)) != ngroups + 1) {
          warning(paste("One cluster is empty with "), ngroups, " clusters, rho is too high")
          oldgroup[oldgroup == 0] <- "K+1"
          return(oldgroup)
        }

        iter <- iter + 1
      }

      # best partition?
      quali_tent <- sum(erreur)
      for (i in 1:nblo)
      {
        rhoglob <- mean(rho)
        if (oldgroup[i] == 0) {
          sumRV2_act <- sumRV2_act + rhoglob**2
        } else {
          sumRV2_act <- sumRV2_act + max(cr[[i]]**2)
        }
      }
      if ((sumRV2_act) > sumRV2) {
        sumRV2 <- sumRV2_act
        qual <- quali_tent
        parti <- oldgroup
        alpha_bon <- alpha
        lamb_bon <- lamb
        Wk_bon <- Wk
        cr_bon <- cr
        err_bon <- sum(erreur)
        erreur_bon <- erreur
      }
      if (print_attempt == TRUE) {
        print(nstart - tent)
      }
    }
  }

  if (init == TRUE) {
    sumRV2 <- 0
    coupe <- as.numeric(partition)
    iter <- 0
    sumRV2_act <- 0
    goout <- 0
    oldgroup <- coupe
    erreur <- NULL
    while (goout != 1 & iter <= Itermax) {
      # compute the compromises
      Wk <- array(0, dim = c(nrow(Data), nrow(Data), ngroups))
      lamb <- NULL
      alpha <- list()
      for (i in 1:ngroups)
      {
        cluster <- which(oldgroup == i)
        Wj <- list()
        for (tab in 1:length(cluster)) {
          Wj[[tab]] <- Wi[, , cluster[tab]]
        }
        statisgr <- .crit_statisWj(Wj, cluster, RV)
        Wk[, , i] <- statisgr$W
        lamb[i] <- statisgr$lambda / length(cluster)
        alpha[[i]] <- statisgr$u
        erreur[i] <- statisgr$Q
      }


      # compute the criterion
      cr <- list()

      for (i in 1:nblo)
      {
        a <- NULL
        for (k in 1:ngroups)
        {
          W_k <- as.matrix(Wk[, , k])
          normW <- sum(diag(crossprod(W_k)))
          W_i <- Wi[, , i]
          a <- c(a, sum(diag(crossprod(W_i, W_k)))^2 / (normW))
        }
        cr[[i]] <- sqrt(a)
      }

      # choice of the clusters
      newgroup <- NULL
      for (i in 1:nblo)
      {
        newgroup[i] <- which.max(cr[[i]])
        if (max(cr[[i]]) < rho[newgroup[i]]) {
          newgroup[i] <- 0
        }
      }

      if (sum(newgroup != oldgroup) > 0) {
        oldgroup <- newgroup
      } else {
        goout <- 1
      }

      # 1 cluster left?
      if (0 %in% oldgroup & nlevels(factor(oldgroup)) != ngroups + 1) {
        warning(paste("One cluster is empty with "), ngroups, " clusters, rho is too high")
        oldgroup[oldgroup == 0] <- "K+1"
        return(oldgroup)
      }
      iter <- iter + 1
    }

    # results of the partition
    quali_tent <- sum(erreur)
    rhoglob <- mean(rho)
    for (i in 1:nblo)
    {
      if (oldgroup[i] == 0) {
        sumRV2_act <- sumRV2_act + rhoglob**2
      } else {
        sumRV2_act <- sumRV2_act + max(cr[[i]]**2)
      }
    }

    sumRV2 <- sumRV2_act
    qual <- quali_tent
    parti <- oldgroup
    alpha_bon <- alpha
    lamb_bon <- lamb
    Wk_bon <- Wk
    cr_bon <- cr
    err_bon <- sum(erreur)
    erreur_bon <- erreur
  }

  # to simplify names
  oldgroup <- parti
  Wk <- Wk_bon
  alpha <- alpha_bon
  lamb <- lamb_bon
  cr <- cr_bon

  # tab1: homogeneity
  Wj <- list()
  for (i in 1:nblo)
  {
    Wj[[i]] <- Wi[, , i]
  }
  stati <- .crit_statisWj(Wj, 1:nblo, RV)
  lambda_tot <- stati$lambda / nblo
  ne <- NULL
  for (i in 1:ngroups)
  {
    ne <- c(ne, length(which(oldgroup == i)))
  }
  adios <- which(oldgroup == 0)
  overall <- (sum(lamb * ne)) / (sum(ne))
  if (length(adios) == 0) {
    tab1 <- data.frame(homogeneity = round(c(lamb, overall, lambda_tot), 3) * 100, nb_blocks = c(ne, nblo, nblo))
    rownames(tab1) <- c(paste("Cluster", 1:ngroups), "Overall", "One group")
  } else {
    cluster <- which(oldgroup == 0)
    Wj <- list()
    for (tab in 1:length(cluster)) {
      Wj[[tab]] <- Wi[, , cluster[tab]]
    }
    statisk1 <- .crit_statisWj(Wj, cluster, RV)
    lambk1 <- statisk1$lambda / length(cluster)
    tab1 <- data.frame(homogeneity = round(c(lamb, lambk1, overall, lambda_tot), 3) * 100, nb_blocks = c(ne, length(adios), sum(ne), nblo))
    rownames(tab1) <- c(paste("Cluster", 1:ngroups), "Noise cluster", "Overall", "One group")
  }
  colnames(tab1)[1] <- "homogeneity (%)"

  # rv with compromise and weights
  rvcomp <- list()
  for (k in 1:ngroups)
  {
    cluster <- which(parti == k)
    temp <- NULL
    for (i in cluster)
    {
      temp <- c(temp, round(max(cr[[i]]), 3))
    }
    rvcomp[[k]] <- temp
    alpha[[k]] <- as.vector(alpha[[k]])
    names(rvcomp[[k]]) <- names(alpha[[k]]) <- NameBlocks[cluster]
  }

  # groups
  partit <- parti
  parti[adios] <- "K+1"
  names(parti) <- NameBlocks
  grou <- data.frame(Cluster = parti)

  # RV among compromises
  tab4 <- matrix(0, ngroups, ngroups)
  for (i in 1:ngroups)
  {
    for (j in 1:ngroups)
    {
      W_k <- as.matrix(Wk_bon[, , i])
      W_l <- as.matrix(Wk_bon[, , j])
      normWk <- sqrt(sum(diag(crossprod(W_k))))
      normWl <- sqrt(sum(diag(crossprod(W_l))))
      tab4[i, j] <- sum(diag(crossprod(W_l, W_k))) / (normWl * normWk)
    }
  }

  rownames(tab4) <- paste("Cluster", 1:ngroups)
  colnames(tab4) <- paste("Cluster", 1:ngroups)

  # proximity inter-cluster
  prox <- eigen(tab4)$values[1] / ngroups

  # coordinates
  compromise <- list()
  coord <- list()
  inertia <- list()
  for (i in 1:ngroups)
  {
    e <- svd(Wk[, , i])
    C <- e$u %*% sqrt(diag(abs(e$d)))
    C <- C[, -ncol(C)]
    if (i == 1) {
      Cref <- C
    }
    if (i != 1) {
      for (j in 1:ncol(C))
      {
        if (var(C[, j]) > 10^(-12) & var(Cref[, j]) > 10^(-12)) {
          if (cor(Cref[, j], C[, j]) < 0) {
            C[, j] <- -C[, j] # axes orientation
          }
        }
      }
    }
    pouriner <- round(e$d / sum(e$d) * 100, 2)
    comprom <- Wk[, , i]
    rownames(comprom) <- colnames(comprom) <- rownames(C) <- rownames(Data)
    compromise[[i]] <- comprom
    colnames(C) <- paste("Dim", 1:(nrow(Data) - 1))
    coord[[i]] <- C
    pouriner <- pouriner[-length(pouriner)]
    names(pouriner) <- paste("Dim", 1:(nrow(Data) - 1))
    inertia[[i]] <- pouriner

    # graphical representations
    if (Graph_groups == TRUE) {
      dev.new()
      plot(C[, 1], C[, 2], type = "n", lwd = 5, pch = 16, asp = 1, xlab = paste("Dim 1 (", pouriner[1], "%)"), ylab = paste("Dim 2 (", pouriner[2], "%)"), xlim = c(min(C[, 1]) - 0.05, max(C[, 1]) + 0.05))
      text(C[, 1], C[, 2], rownames(Data), col = rainbow(nrow(Data)))
      abline(h = 0, v = 0)
      title(paste("Cluster", i))
    }
  }
  names(compromise) <- names(rvcomp) <- names(alpha) <- names(rho) <- paste("Cluster", 1:ngroups)

  if (length(adios) == 0) {
    criterion <- err_bon
  } else {
    criterion <- NULL
  }

  # weights barplot
  if (Graph_weights == TRUE) {
    for (i in 1:ngroups)
    {
      dev.new()
      barplot(alpha[[i]])
      title(paste("Weights in the cluster", i))
    }
  }

  # rounds
  for (i in 1:ngroups)
  {
    compromise[[i]] <- round(compromise[[i]], 3)
    coord[[i]] <- round(coord[[i]], 3)
  }
  for (i in 1:nblo)
  {
    cr[[i]] <- round(cr[[i]], 3)
    Wi[, , i] <- round(Wi[, , i], 3)
    names(cr[[i]]) <- paste("Cluster", 1:ngroups)
  }
  if (nblo > 1) {
    dimnames(Wi)[[1]] <- dimnames(Wi)[[2]] <- rownames(Data)
    dimnames(Wi)[[3]] <- NameBlocks
  }




  names(cr) <- NameBlocks
  names(compromise) <- names(coord) <- names(inertia) <- paste("Cluster", 1:ngroups)



  # results
  res <- list(
    group = grou, rho = rho, homogeneity = tab1, rv_with_compromise = rvcomp, weights = alpha, compRV = round(tab4, 2),
    compromise = compromise, coord = coord, inertia = inertia,
    rv_all_cluster = cr, criterion = criterion, param = list(nblo = nblo, ng = ngroups, n = n), type = "K"
  )
  class(res) <- "clustatis"

  return(res)
}

Try the ClustBlock package in your browser

Any scripts or data that you put into this service are public.

ClustBlock documentation built on June 8, 2025, 10:32 a.m.