R/09_EFTs_custering.R
In LPDynR: Land Productivity Dynamics Indicator

Documented in EFT_clust

#### Ecosystem Functional Types: Clustering

#'
#' @author Xavier Rotllan-Puig
#' @title EFT_clust
#' @description EFT_clust derives the Ecosystem Functional Types using K-means to perform a clustering on the
#' pixels of the SpatRaster object
#' @details \code{\link[stats]{kmeans}} does not optimize the final number of clusters. It needs to be set by means of 'n_clust'
#' (default = 20). There are several methods and statistics to determine the optimal number. \code{\link{clust_optim}}
#' produces a scree plot to help the user to decide the optimal number of clusters.
#'
#' EFT_clust passes as default to \code{\link[stats]{kmeans}} iter.max = 500 and algorithm = "MacQueen", but these can be
#' modified passing these arguments through '...'
#'
#' Please note that the variables are standardised (mean = 0; sd = 1) before running the clustering
#'
#' An evaluation of the clustering is provided together with the SpatRaster object.
#' It is calculated as model$betweenss / model$totss * 100;
#' where 'betweenss' and 'totss' are generated by \code{\link[stats]{kmeans}}
#'
#' @rawNamespace import(data.table, except = shift)
#' @importFrom stats complete.cases kmeans na.omit
#' @importFrom dplyr bind_rows
#' @rawNamespace import(terra, except = na.omit)
#' @param obj2clust SpatRaster object (or its file name). Each layer is one variable
#' @param n_clust Numeric. Number of total clusters. Optional. Default = 20
#' @param standardise_vars Logical. Optional. If TRUE (default), variables are standardised (mean = 0; sd = 1)
#' @param filename Character. Output filename. Optional
#' @param ... Arguments for \code{\link[stats]{kmeans}}. Optional
#' @return A list with two components: (1) a SpatRaster object with the clusters and (2) a vector with the clustering evaluation in percentage
#' @seealso \code{\link{PCAs4clust}}; \code{\link{clust_optim}}; \code{\link[stats]{kmeans}}
#' @name EFT_clust
#' @export
#' @examples
#' \donttest{
#' dirctry <- paste0(system.file(package='LPDynR'), "/extdata")
#' variables_noCor <- rm_multicol(dir2process = dirctry,
#'                                multicol_cutoff = 0.7)
#' EFT_clust(obj2clust = variables_noCor,
#'          n_clust = 10)
#'}



EFT_clust <- function(obj2clust = NULL,
                      n_clust = 20,
                      standardise_vars = TRUE,
                      filename = "",
                      ...){

  ## Reading in data

  if(is.null(obj2clust)) stop("Please provide objects of classe SpatRaster (or file names to read in some)")

  if(is.character(obj2clust)){
    obj2clust <- rast(obj2clust)
  }else if(!class(obj2clust) %in% c("SpatRaster")){
    stop("Please provide objects of classe SpatRaster (or a file name to read in from)")
  }

  if(!is.numeric(n_clust) | is.na(n_clust) | is.null(n_clust))
    stop("Please provide a number of clusters")


  obj2clust_ini <- as.data.frame(obj2clust)
  obj2clust_ini$rn <- 1:nrow(obj2clust_ini)

  setDT(obj2clust_ini)

  clstr <- NULL
  obj2clust_ini_NA <- obj2clust_ini[!complete.cases(obj2clust_ini), ]   #to be used at the end to fill the raster
  obj2clust_ini_NA[, clstr := NA]
  obj2clust_ini_NA[, clstr := as.integer(clstr)]
  obj2clust_ini_NA <- obj2clust_ini_NA[, .SD, .SDcols = c("clstr", "rn")]

  obj2clust_ini <- stats::na.omit(obj2clust_ini)


  ## Standardising
  if(standardise_vars == TRUE){
    cols2scale <- colnames(obj2clust_ini)
    cols2scale <- cols2scale[!cols2scale %in% c("rn")]
    cols2keep <- paste0(cols2scale, "_scld")
    obj2clust_ini[, (cols2keep) := lapply(.SD, function(x) as.vector(scale(x))), .SDcols = cols2scale]
    obj2clust_ini <- obj2clust_ini[, .SD, .SDcols = c(cols2keep, "rn")]
    }


  ## Clustering using optimal number of clusters
  dts <- list(...)
  if(is.null(dts$nstart)) dts$nstart <- 1                   # kmeans makes 'nstart' initial configurations and reports the best one
  if(is.null(dts$iter.max)) dts$iter.max <- 500
  if(is.null(dts$algorithm)) dts$algorithm <- "MacQueen"
  # Hartigan-Wong algorithm generally does a better job than either of those, but trying several random
  # starts (‘nstart’> 1) is often recommended. In rare cases, when some of the points (rows of ‘x’) are
  # extremely close, the algorithm may not converge in the “Quick-Transfer” stage, signalling a warning
  #(and returning ‘ifault = 4’). Slight rounding of the data may be advisable in that case. (from ?kmeans)

  kmeans_clustring <- kmeans(obj2clust_ini[, - c("rn")],
                             centers = n_clust,
                             nstart = dts$nstart,
                             iter.max = dts$iter.max,
                             algorithm = dts$algorithm
                             )
  clust_eval <- kmeans_clustring$betweenss / kmeans_clustring$totss * 100


  ## Binding NA data and adding back spatial information ####
  obj2clust_ini[, clstr := kmeans_clustring$cluster]
  obj2clust_ini <- obj2clust_ini[, .SD, .SDcols = c("clstr", "rn")]

  all_data <- bind_rows(obj2clust_ini, obj2clust_ini_NA)
  #rm(obj2clust_ini, obj2clust_ini_NA)
  #gc()

  setorder(all_data, rn)

  #obj2clust1 <- obj2clust[[1]]
  EFTs_raster <- suppressWarnings(rast(nrows = nrow(obj2clust), ncols = ncol(obj2clust),
                                  crs = crs(obj2clust),
                                  ext = ext(obj2clust),
                                  #resolution,
                                  vals = all_data$clstr))

  names(EFTs_raster) <- "clusterNum"

  ## Saving results
  if (filename != "") writeRaster(EFTs_raster, filename = filename, overwrite = TRUE)
  return(list(EFTs_raster, clust_eval))

}