Nothing
#' Cluster analysis to identify phylogenetic regions
#'
#' Perform a clustering analysis that categorizes sites into biogeographic regions based on phylogenetic community
#' compositional similarity.
#'
#' @param ps A `phylospatial` object. If \code{method} is anything other than \code{"kmeans"}, it must contain a
#' `dissim` component generated by \link{ps_add_dissim}.
#' @param k Number of spatial clusters to divide the region into (positive integer). See \link{ps_regions_eval} to
#' help choose a value of k by comparing the variance explained by different numbers of regions.
#' @param method Clustering method. Options include all methods listed under \link[stats]{hclust}, and \code{"kmeans"}.
#' If `"kmeans"` is selected, the `dissim` component of `ps` is ignored.
#' @param endemism Logical indicating whether community values should be divided by column totals (taxon range sizes)
#' to derive endemism. Only used if \code{method = "kmeans"}; in other cases this information should instead be
#' supplied to \link{ps_add_dissim}.
#' @param normalize Logical indicating whether community values should be divided by row totals (community sums). If `TRUE`,
#' dissimilarity is based on proportional community composition. This happens after endemism is derived. Only used if
#' \code{method = "kmeans"}; in other cases this information should instead be supplied to \link{ps_add_dissim}.
#'
#' @return A raster or matrix with an integer indicating which of the \code{k} regions each site belongs to.
#' @references Daru, B. H., Elliott, T. L., Park, D. S., & Davies, T. J. (2017). Understanding the processes underpinning
#' patterns of phylogenetic regionalization. Trends in Ecology & Evolution, 32(11), 845-860.
#' @examples
#' ps <- ps_simulate()
#'
#' # using kmeans clustering algorithm
#' terra::plot(ps_regions(ps, method = "kmeans"))
#'
#' # to use a hierarchical clustering method, first we have to `ps_add_dissim()`
#' terra::plot(ps_regions(ps_add_dissim(ps), k = 7, method = "average"))
#'
#' @export
ps_regions <- function(ps, k = 5, method = "average", endemism = FALSE, normalize = TRUE){
enforce_ps(ps)
# sites with taxa
r <- a <- occupied(ps)
if(method == "kmeans"){
comm <- ps$comm[a,]
if(endemism) comm <- apply(comm, 2, function(x) x / sum(x, na.rm = TRUE))
if(normalize) comm <- t(apply(comm, 1, function(x) x / sum(x, na.rm = TRUE)))
comm[!is.finite(comm)] <- 0
comm <- t(apply(comm, 1, function(x) x * ps$tree$edge.length)) # scale by branch length
regions <- stats::kmeans(comm, k)$cluster
}else{
stopifnot("Input data set contains no `dissim` data, which is required for methods other than `kmeans`; add it first using `ps_add_dissim()`." = !is.null(ps$dissim))
d <- as.matrix(ps$dissim)
rownames(d) <- colnames(d) <- paste("cell", 1:ncol(d))
da <- d[a, a]
# sites fully segregated by the 2 basal clades have Inf distance; replace with twice the max observed distance
da[is.infinite(da)] <- max(da[!is.infinite(da)]) * 2
da <- stats::as.dist(da)
clust <- stats::hclust(da, method = method)
regions <- stats::cutree(clust, k)
}
r[a] <- regions
r[!a] <- NA
if(!is.null(ps$spatial)){
r <- matrix(r, ncol = 1)
colnames(r) <- "phyloregion"
return(to_spatial(r, ps$spatial))
}else{
return(r)
}
}
#' Evaluate region numbers
#'
#' This function compares multiple potential values for `k`, the number of clusters in to use in `ps_regions()`,
#' to help you decide how well different numbers of regions fit your data set. For each value of `k`, it performs
#' a cluster analysis and calculates the proportion of total variance explained (SSE, the sum of squared pairwise
#' distances explained). It also calculates second-order metrics of the relationship between k and SSE. While
#' many data sets have no optimal value of `k` and the choice is often highly subjective, these evaluation metrics
#' can help you identify potential points where the variance explained stops increasing quickly as `k` increases.
#'
#' @param ps A `phylospatial` object. Must contain a `dissim` component generated by \link{ps_add_dissim}.
#' @param k Vector of positive integers giving possible values for `k`. Values greater than the number of
#' sites in the data set will be ignored.
#' @param plot Logical indicating whether to print a plot of the results (`TRUE`, the default) or return a data
#' frame of the results (`FALSE`).
#' @param ... Further arguments passed to \link{ps_regions}.
#'
#' @return The function generates a data frame with the following columns. If `plot = FALSE` the data frame is
#' returned, otherwise the function prints a plot of the latter variables as a function of `k`:
#' \itemize{
#' \item "k": The number of clusters.
#' \item "sse": The proportion of total variance explained, with variance defined as squared pairwise community
#' phylogenetic dissimilarity between sites.
#' \item "curvature": The local second derivative. Lower (more negative) values indicate more attractive
#' break-point values of k.
#' \item "dist11": The distance from the point to the 1:1 line on a plot of `k` vs `sse` in which k values over the
#' interval from 1 to the number of sites are rescaled to the unit interval. Higher values indicate more
#' attractive values for k.
#' }
#'
#' @examples
#' ps <- ps_add_dissim(ps_simulate())
#' ps_regions_eval(ps, k = 1:15, plot = TRUE)
#'
#' @export
ps_regions_eval <- function(ps, k = 1:20, plot = TRUE, ...){
dots <- list(...)
if("method" %in% names(dots)) if(dots$method == "kmeans")
stop("This function does not currently support `method = 'kmeans'`.")
stopifnot("Input data set contains no `dissim` data, which is required; add it first using `ps_add_dissim()`." = !is.null(ps$dissim))
d <- as.matrix(ps$dissim)^2
diag(d) <- NA
ss_tot <- sum(d, na.rm = TRUE)
n <- sum(occupied(ps))
k <- k[k <= n]
# sum of squared variance explained
sse <- rep(NA, length(k))
for(i in 1:length(k)){
kk <- k[i]
r <- ps_regions(ps, k = kk, ...)$phyloregion[]
sse[i] <- sum(sapply(1:kk, function(i) sum(d[r == i, r != i], na.rm = TRUE))) / ss_tot
}
# curvature of sse ~ k
lag <- function(v, n = 1) {
if(n > 0){
c(rep(NA, n), v[1:length(v) - n])
}else{
c(v[(abs(n)+1):length(v)], rep(NA, abs(n)))
}
}
curvature <- (lag(sse, -1) - sse) / (lag(k, -1) - k) - (sse - lag(sse)) / (k - lag(k))
# distance from 1:1 line
dist11 <- abs((k-1) / (n-1) - sse) / sqrt(2)
eval <- data.frame(k = k, sse = sse, curvature = curvature, dist11 = dist11)
if(plot){
graphics::matplot(eval$k, eval[, 2:ncol(eval)], type = c("b"),
pch = 1, col = 1:(ncol(eval) - 1),
xlab = "k", ylab = NA)
graphics::abline(h = 1, col = 1)
graphics::abline(h = 0, col = 2)
graphics::abline(h = max(eval$dist11, na.rm = TRUE), col = 3)
graphics::legend("bottomright", legend = colnames(eval)[2:ncol(eval)],
col = 1:(ncol(eval) - 1), pch = 1)
}else{
return(eval)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.