R/nhclu_kmeans.R
In bioregion: Comparison of Bioregionalisation Methods

Documented in nhclu_kmeans

#' Non-hierarchical clustering: K-means analysis
#'
#' This function performs non-hierarchical clustering based on dissimilarity 
#' using a k-means analysis.
#'
#' @param dissimilarity The output object from [dissimilarity()] or 
#' [similarity_to_dissimilarity()], or a `dist` object. If a `data.frame` is 
#' used, the first two columns should represent pairs of sites (or any pair of 
#' nodes), and the subsequent column(s) should contain the dissimilarity indices.
#'
#' @param index The name or number of the dissimilarity column to use. By 
#' default, the third column name of `dissimilarity` is used.
#'
#' @param seed A value for the random number generator (`NULL` for random by 
#' default).
#'
#' @param n_clust An `integer` vector or a single `integer` value specifying 
#' the requested number(s) of clusters.
#'
#' @param iter_max An `integer` specifying the maximum number of iterations for 
#' the k-means method (see [kmeans][stats::kmeans]).
#'
#' @param nstart An `integer` specifying how many random sets of `n_clust` 
#' should be selected as starting points for the k-means analysis 
#' (see [kmeans][stats::kmeans]).
#'
#' @param algorithm A `character` specifying the algorithm to use for k-means 
#' (see [kmeans][stats::kmeans]). Available options are Hartigan-Wong, Lloyd, 
#' Forgy, and MacQueen.
#'
#' @param algorithm_in_output A `boolean` indicating whether the original 
#' output of [kmeans][stats::kmeans] should be included in the output. Defaults 
#' to `TRUE` (see Value).
#'
#' @return
#' A `list` of class `bioregion.clusters` with five components:
#' \enumerate{
#' \item{**name**: A `character` string containing the name of the algorithm.}
#' \item{**args**: A `list` of input arguments as provided by the user.}
#' \item{**inputs**: A `list` of characteristics of the clustering process.}
#' \item{**algorithm**: A `list` of all objects associated with the clustering 
#' procedure, such as original cluster objects (only if 
#' `algorithm_in_output = TRUE`).}
#' \item{**clusters**: A `data.frame` containing the clustering results.}}
#'
#' If `algorithm_in_output = TRUE`, the `algorithm` slot includes the output of 
#' [kmeans][stats::kmeans].
#'
#' @details
#' This method partitions data into k groups such that the sum of squares of 
#' Euclidean distances from points to the assigned cluster centers is minimized. 
#' K-means cannot be applied directly to dissimilarity or beta-diversity metrics 
#' because these distances are not Euclidean. Therefore, it first requires 
#' transforming the dissimilarity matrix using Principal Coordinate Analysis 
#' (PCoA) with [pcoa][ape::pcoa], and then applying k-means to the coordinates 
#' of points in the PCoA.
#'
#' Because this additional transformation alters the initial dissimilarity 
#' matrix, the partitioning around medoids method ([nhclu_pam]) is preferred.
#'
#' @seealso 
#' For more details illustrated with a practical example, 
#' see the vignette: 
#' \url{https://biorgeo.github.io/bioregion/articles/a4_2_non_hierarchical_clustering.html}.
#' 
#' Associated functions: 
#' [nhclu_clara] [nhclu_clarans] [nhclu_dbscan] [nhclu_pam] [nhclu_affprop] 
#'
#' @author
#' Boris Leroy (\email{leroy.boris@gmail.com}) \cr
#' Pierre Denelle (\email{pierre.denelle@gmail.com}) \cr
#' Maxime Lenormand (\email{maxime.lenormand@inrae.fr}) 
#' 
#' @examples
#' comat <- matrix(sample(0:1000, size = 500, replace = TRUE, prob = 1/1:1001),
#' 20, 25)
#' rownames(comat) <- paste0("Site",1:20)
#' colnames(comat) <- paste0("Species",1:25)
#'
#' comnet <- mat_to_net(comat)
#'
#' dissim <- dissimilarity(comat, metric = "all")
#' 
#' clust <- nhclu_kmeans(dissim, n_clust = 2:10, index = "Simpson")
#'
#' @importFrom stats as.dist kmeans
#' @importFrom ape pcoa
#'         
#' @export

nhclu_kmeans <- function(dissimilarity,
                         index = names(dissimilarity)[3],
                         seed = NULL,
                         n_clust = c(1,2,3),
                         iter_max = 10,
                         nstart = 10,
                         algorithm = "Hartigan-Wong",
                         algorithm_in_output = TRUE){
  
  # 1. Controls ---------------------------------------------------------------
  controls(args = NULL, data = dissimilarity, type = "input_nhandhclu")
  if(!inherits(dissimilarity, "dist")){
    controls(args = NULL, data = dissimilarity, type = "input_dissimilarity")
    controls(args = NULL, data = dissimilarity, 
             type = "input_data_frame_nhandhclu")
    controls(args = index, data = dissimilarity, type = "input_net_index")
    net <- dissimilarity
    # Convert tibble into dataframe
    if(inherits(net, "tbl_df")){
      net <- as.data.frame(net)
    }
    net[, 3] <- net[, index]
    net <- net[, 1:3]
    controls(args = NULL, data = net, type = "input_net_index_value")
    dist.obj <- stats::as.dist(
      net_to_mat(net,
                 weight = TRUE, squared = TRUE, symmetrical = TRUE))
  } else {
    controls(args = NULL, data = dissimilarity, type = "input_dist")
    dist.obj <- dissimilarity
    if(is.null(labels(dist.obj))){
      attr(dist.obj, "Labels") <- paste0(1:attr(dist.obj, "Size"))
      message("No labels detected, they have been assigned automatically.")
    }
  }
  
  if(!is.null(seed)){
    controls(args = seed, data = NULL, type = "strict_positive_integer")
  }
  controls(args = n_clust, data = NULL, 
           type = "strict_positive_integer_vector")
  controls(args = iter_max, data = NULL, type = "positive_integer")
  controls(args = nstart, data = NULL, type = "positive_integer")
  controls(args = algorithm, data = NULL, type = "character")
  if(!(algorithm %in% c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))){
    stop(paste0("Please choose algorithm from the following:\n",
                "Hartigan-Wong, Lloyd, Forgy or MacQueen."),
         call. = FALSE)
  }
  controls(args = algorithm_in_output, data = NULL, type = "boolean")
  
  # 2. Function ---------------------------------------------------------------
  outputs <- list(name = "nhclu_kmeans")
  
  outputs$args <- list(index = index,
                       seed = seed,
                       n_clust = n_clust,
                       iter_max = iter_max,
                       nstart = nstart,
                       algorithm = algorithm,
                       algorithm_in_output = algorithm_in_output)
  
  outputs$inputs <- list(bipartite = FALSE,
                         weight = TRUE,
                         pairwise = TRUE,
                         pairwise_metric = ifelse(!inherits(dissimilarity, 
                                                            "dist"), 
                                                  ifelse(is.numeric(index), 
                                                         names(net)[3], index), 
                                                  NA),
                         dissimilarity = TRUE,
                         nb_sites = attr(dist.obj, "Size"),
                         hierarchical = FALSE)
  
  outputs$algorithm <- list()
  
  outputs$clusters <- data.frame(matrix(ncol = 1,
                                        nrow = length(labels(dist.obj)),
                                        dimnames = list(labels(dist.obj),
                                                        "name")))
  
  outputs$clusters$name <- labels(dist.obj)
  
  # kmeans only works on Euclidean distances, so the dissimilarity matrix needs
  # to be transformed into a multivariate space with euclidean distances
  # with a Principal Coordinate Analysis
  if(length(unique(as.numeric(dist.obj))) == 1 &&
     unique(as.numeric(dist.obj)) == 0){
    stop("All sites are completely similar.")
  } else{
    if(length(unique(as.numeric(dist.obj))) == 1 &&
       unique(as.numeric(dist.obj)) == 1){
      warning("All sites are completely dissimilar.")
    }
    outputs$clustering_algorithms$pcoa <- ape::pcoa(dist.obj)
  }
  
  # Performing the kmeans on the PCoA with all axes
  if(is.null(seed)){
    outputs$algorithm <- lapply(n_clust,
                                function(x)
                                  stats::kmeans(dist.obj,
                                                centers = x,
                                                iter.max = iter_max,
                                                nstart = nstart,
                                                algorithm = algorithm))
  }else{
    set.seed(seed)
    outputs$algorithm <- lapply(n_clust,
                                function(x)
                                  stats::kmeans(dist.obj,
                                                centers = x,
                                                iter.max = iter_max,
                                                nstart = nstart,
                                                algorithm = algorithm))
    rm(.Random.seed, envir=globalenv())
  }
  names(outputs$algorithm) <- paste0("K_", n_clust)
  
  outputs$clusters <- data.frame(
    outputs$clusters,
    data.frame(lapply(names(outputs$algorithm),
                      function(x) outputs$algorithm[[x]]$cluster)))
  
  outputs$clusters <- knbclu(outputs$clusters, reorder = TRUE)
  
  outputs$cluster_info <- data.frame(
    partition_name = names(outputs$clusters)[2:length(outputs$clusters),
                                             drop = FALSE],
    n_clust = apply(outputs$clusters[, 2:length(outputs$clusters),
                                     drop = FALSE],
                    2, function(x) length(unique(x))))
  
  # Set algorithm in output
  if (!algorithm_in_output) {
    outputs$algorithm <- NA
  }
  
  class(outputs) <-  append("bioregion.clusters", class(outputs))
  
  return(outputs)
}