R/ps_regions.R
In phylospatial: Spatial Phylogenetic Analysis

Documented in ps_regions ps_regions_eval

#' Cluster analysis to identify phylogenetic regions
#'
#' Perform a clustering analysis that categorizes sites into biogeographic regions based on phylogenetic community
#' compositional similarity.
#'
#' @param ps A `phylospatial` object. If \code{method} is anything other than \code{"kmeans"}, it must contain a
#'    `dissim` component generated by \link{ps_add_dissim}.
#' @param k Number of spatial clusters to divide the region into (positive integer). See \link{ps_regions_eval} to
#'    help choose a value of k by comparing the variance explained by different numbers of regions.
#' @param method Clustering method. Options include all methods listed under \link[stats]{hclust}, and \code{"kmeans"}.
#'    If `"kmeans"` is selected, the `dissim` component of `ps` is ignored.
#' @param endemism Logical indicating whether community values should be divided by column totals (taxon range sizes)
#'    to derive endemism. Only used if \code{method = "kmeans"}; in other cases this information should instead be
#'    supplied to \link{ps_add_dissim}.
#' @param normalize Logical indicating whether community values should be divided by row totals (community sums). If `TRUE`,
#'    dissimilarity is based on proportional community composition. This happens after endemism is derived. Only used if
#'    \code{method = "kmeans"}; in other cases this information should instead be supplied to \link{ps_add_dissim}.
#'
#' @return A raster or matrix with an integer indicating which of the \code{k} regions each site belongs to.
#' @references Daru, B. H., Elliott, T. L., Park, D. S., & Davies, T. J. (2017). Understanding the processes underpinning
#' patterns of phylogenetic regionalization. Trends in Ecology & Evolution, 32(11), 845-860.
#' @examples
#' ps <- ps_simulate()
#'
#' # using kmeans clustering algorithm
#' terra::plot(ps_regions(ps, method = "kmeans"))
#'
#' # to use a hierarchical clustering method, first we have to `ps_add_dissim()`
#' terra::plot(ps_regions(ps_add_dissim(ps), k = 7, method = "average"))
#'
#' @export
ps_regions <- function(ps, k = 5, method = "average", endemism = FALSE, normalize = TRUE){

      enforce_ps(ps)

      # sites with taxa
      r <- a <- occupied(ps)

      if(method == "kmeans"){
            comm <- ps$comm[a,]
            if(endemism) comm <- apply(comm, 2, function(x) x / sum(x, na.rm = TRUE))
            if(normalize) comm <- t(apply(comm, 1, function(x) x / sum(x, na.rm = TRUE)))
            comm[!is.finite(comm)] <- 0
            comm <- t(apply(comm, 1, function(x) x * ps$tree$edge.length)) # scale by branch length
            regions <- stats::kmeans(comm, k)$cluster
      }else{
            stopifnot("Input data set contains no `dissim` data, which is required for methods other than `kmeans`; add it first using `ps_add_dissim()`." = !is.null(ps$dissim))
            d <- as.matrix(ps$dissim)
            rownames(d) <- colnames(d) <- paste("cell", 1:ncol(d))
            da <- d[a, a]

            # sites fully segregated by the 2 basal clades have Inf distance; replace with twice the max observed distance
            da[is.infinite(da)] <- max(da[!is.infinite(da)]) * 2
            da <- stats::as.dist(da)

            clust <- stats::hclust(da, method = method)
            regions <- stats::cutree(clust, k)
      }

      r[a] <- regions
      r[!a] <- NA

      if(!is.null(ps$spatial)){
            r <- matrix(r, ncol = 1)
            colnames(r) <- "phyloregion"
            return(to_spatial(r, ps$spatial))
      }else{
            return(r)
      }

}


#' Evaluate region numbers
#'
#' This function compares multiple potential values for `k`, the number of clusters in to use in `ps_regions()`,
#' to help you decide how well different numbers of regions fit your data set. For each value of `k`, it performs
#' a cluster analysis and calculates the proportion of total variance explained (SSE, the sum of squared pairwise
#' distances explained). It also calculates second-order metrics of the relationship between k and SSE. While
#' many data sets have no optimal value of `k` and the choice is often highly subjective, these evaluation metrics
#' can help you identify potential points where the variance explained stops increasing quickly as `k` increases.
#'
#' @param ps A `phylospatial` object. Must contain a `dissim` component generated by \link{ps_add_dissim}.
#' @param k Vector of positive integers giving possible values for `k`. Values greater than the number of
#'    sites in the data set will be ignored.
#' @param plot Logical indicating whether to print a plot of the results (`TRUE`, the default) or return a data
#'    frame of the results (`FALSE`).
#' @param ... Further arguments passed to \link{ps_regions}.
#'
#' @return The function generates a data frame with the following columns. If `plot = FALSE` the data frame is
#' returned, otherwise the function prints a plot of the latter variables as a function of `k`:
#' \itemize{
#'  \item "k": The number of clusters.
#'  \item "sse": The proportion of total variance explained, with variance defined as squared pairwise community
#'    phylogenetic dissimilarity between sites.
#'  \item "curvature": The local second derivative. Lower (more negative) values indicate more attractive
#'    break-point values of k.
#'  \item "dist11": The distance from the point to the 1:1 line on a plot of `k` vs `sse` in which k values over the
#'    interval from 1 to the number of sites are rescaled to the unit interval. Higher values indicate more
#'    attractive values for k.
#' }
#'
#' @examples
#' ps <- ps_add_dissim(ps_simulate())
#' ps_regions_eval(ps, k = 1:15, plot = TRUE)
#'
#' @export
ps_regions_eval <- function(ps, k = 1:20, plot = TRUE, ...){

      dots <- list(...)
      if("method" %in% names(dots)) if(dots$method == "kmeans")
            stop("This function does not currently support `method = 'kmeans'`.")
      stopifnot("Input data set contains no `dissim` data, which is required; add it first using `ps_add_dissim()`." = !is.null(ps$dissim))

      d <- as.matrix(ps$dissim)^2
      diag(d) <- NA
      ss_tot <- sum(d, na.rm = TRUE)
      n <- sum(occupied(ps))
      k <- k[k <= n]

      # sum of squared variance explained
      sse <- rep(NA, length(k))
      for(i in 1:length(k)){
            kk <- k[i]
            r <- ps_regions(ps, k = kk, ...)$phyloregion[]
            sse[i] <- sum(sapply(1:kk, function(i) sum(d[r == i, r != i], na.rm = TRUE))) / ss_tot
      }

      # curvature of sse ~ k
      lag <- function(v, n = 1) {
            if(n > 0){
                  c(rep(NA, n), v[1:length(v) - n])
            }else{
                  c(v[(abs(n)+1):length(v)], rep(NA, abs(n)))
            }
      }
      curvature <- (lag(sse, -1) - sse) / (lag(k, -1) - k) - (sse - lag(sse)) / (k - lag(k))

      # distance from 1:1 line
      dist11 <- abs((k-1) / (n-1) - sse) / sqrt(2)

      eval <- data.frame(k = k, sse = sse, curvature = curvature, dist11 = dist11)
      if(plot){
            graphics::matplot(eval$k, eval[, 2:ncol(eval)], type = c("b"),
                              pch = 1, col = 1:(ncol(eval) - 1),
                              xlab = "k", ylab = NA)
            graphics::abline(h = 1, col = 1)
            graphics::abline(h = 0, col = 2)
            graphics::abline(h = max(eval$dist11, na.rm = TRUE), col = 3)
            graphics::legend("bottomright", legend = colnames(eval)[2:ncol(eval)],
                             col = 1:(ncol(eval) - 1), pch = 1)
      }else{
            return(eval)
      }
}