Nothing
#' Test significance of split-lines in spatial subdivision
#'
#' @description
#' Assess the statistical significance of each split-line (bioregion boundary) identified by `hespdiv` by comparing its observed performance to a null distribution. The null is generated by permuting the data `n` times and recomputing the split-line performance after each permutation. Multiple shuffling strategies are supported to probe the influence of spatial structure on delineation.
#'
#' @details
#' Two shuffling scopes are available:
#' \describe{
#' \item{\strong{"all"}:}{Shuffle across the entire study area, ignoring polygonal subdivisions.}
#' \item{\strong{"within"}:}{Shuffle only within each parent polygon (the region in which the split-line is nested), preserving local spatial structure.}
#' }
#'
#' Two shuffling types are available:
#' \describe{
#' \item{\strong{"localities"}:}{Shuffle whole localities, preserving each site's assemblage (recommended, since occurrences within a locality are not independent).}
#' \item{\strong{"occurrences"}:}{Shuffle individual occurrences across sites (use with caution; may violate within-locality independence).}
#' }
#'
#' For each split-line, the function reports the observed performance, the mean and standard deviation of the permuted (null) performances, an empirical one-sided p-value (proportion of permuted values \emph{as or more extreme} than observed; ties included), and a z-score quantifying departure from the null.
#'
#' @param obj An object of class \code{hespdiv}.
#' @param n Integer. Number of permutations used to form the null distribution.
#' @param shuffle.scope Character. Either \code{"all"} (shuffle across the full study area) or \code{"within"} (shuffle only within each parent polygon).
#' @param shuffle.type Character. Either \code{"localities"} (shuffle whole localities, preserving assemblages) or \code{"occurrences"} (shuffle individual occurrences). \code{"localities"} is generally preferred.
#' @param maintain.n Logical. Only honored when \code{shuffle.scope = "within"} and \code{shuffle.type = "localities"}. If \code{TRUE}, attempts to keep the randomized child polygon occurrence counts close to the observed (maximum discrepancy up to \code{max_n/2}, where \code{max_n} is the largest locality size). Ignored otherwise.
#'
#' @return Invisibly returns an object of class \code{nullhespdiv}, a list with:
#' \itemize{
#' \item \code{$stats}, a data frame summarizing each split-line with:
#' \itemize{
#' \item \code{performance}: observed performance.
#' \item \code{mean.random}: mean of null performances.
#' \item \code{sd.random}: standard deviation of null performances.
#' \item \code{p.val}: empirical one-sided p-value (ties included).
#' \item \code{z.score.random}: standardized effect size.
#' }
#' \item \code{$null}, a matrix or data frame containing all null performance
#' values for every split-line across permutations.
#' }
#'
#' @family functions for hespdiv results post-processing
#' @examples
#' # if split-line is strongly significant, the choice of parameters should not
#' # matter. For example (look at p-value, z.score.random, sd.random and
#' # mean.random):
#' (nulltest(example_hespdiv, maintain.n = FALSE, shuffle.type = "occurrences"))
#' (nulltest(example_hespdiv, maintain.n = FALSE, shuffle.type = "localities"))
#' (nulltest(example_hespdiv, maintain.n = TRUE, shuffle.type = "localities"))
#'
#' @export
nulltest <- function(obj, n = 999, maintain.n = TRUE, shuffle.scope = "within",
shuffle.type = "localities"){
# Validate shuffle.scope and shuffle.type arguments
shuffle.scope <- .arg_check(name = "shuffle.scope", given = shuffle.scope,
NAMES = c("all", "within"))
shuffle.type <- .arg_check(name = "shuffle.type", given = shuffle.type,
NAMES = c("localities", "occurrences"))
if (isTRUE(maintain.n) && !(identical(shuffle.scope, "within") &&
identical(shuffle.type, "localities"))) {
warning(
"`maintain.n` is ignored unless `shuffle.scope = \"within\"` and ",
"`shuffle.type = \"localities\"`. Disabling it.",
call. = FALSE
)
maintain.n <- FALSE
}
# Ensure correct object type
if (!inherits(obj, "hespdiv"))
stop("`obj` must be a `hespdiv` object.", call. = FALSE)
# Extract coordinates and data from object
coords <- obj$call.info$Call_ARGS$xy.dat
data <- obj$call.info$Call_ARGS$data
N <- nrow(coords) # Number of occurrences
l <- length(obj$split.lines) # Number of split-lines
# Prepare to store comparison and p-value results for each permutation
comp.vals <- vector(mode = "list", length = n)
p.vals <- vector(mode = "list", length = n)
for (i in 1:n){
comp.vals[[i]] <- numeric(l)
p.vals[[i]] <- numeric(l)
}
# Define comparison operator depending on whether maximizing or minimizing
# comparison value. Checks if random is as-or-more-extreme than observed.
if (obj$call.info$Call_ARGS$maximize){
pal <- function(x, criteria){ x >= criteria}
} else {
pal <- function(x, criteria){ x <= criteria}
}
# Determine the correct data slicing function based on data type
if (is.data.frame(data) | is.matrix(data)){
.slicer <- .slicer.table
} else {
if (is.list(data)) {
.slicer <- .slicer.list
} else {
.slicer <- .slicer.vect
}
}
# Main permutation loop
if (shuffle.scope == "all" && shuffle.type == "occurrences") {
# in each run mix all occurrence labels and re-sample coordinates
# with them, then filter mixed coordinates with child polygons:
for ( i in 1:n) {
# mix all occurrence ids (labels)
ids <- sample(1:N,N,replace = FALSE)
# sample coordinates with mixed ids
co <- coords[ids,]
for (split.id in 1 : l){
# identify child pol ids in poly.stats
pol_ids <- which(obj$poly.stats$root.id ==
obj$split.stats$plot.id[split.id])
# get ids of coordinates in each child polygon
split.ids1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
split.ids2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
# slice data with these ids
dat_pol1 <- .slicer(data, split.ids1)
dat_pol2 <- .slicer(data, split.ids2)
# compare data
comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
obj$call.info$Call_ARGS$generalize.f(dat_pol1),
obj$call.info$Call_ARGS$generalize.f(dat_pol2))
# compare null comp.val with original comp.val
p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
obj$split.stats$performance[split.id])
}
}
}
if (shuffle.scope == "all" && shuffle.type == "localities") {
# in each run mix locality coordinates, then filter mixed coordinates with
# child polygons:
for ( i in 1:n) {
co <- .shuffle_localities(coords)
for (split.id in 1 : l){
# identify child pol ids in poly.stats
pol_ids <- which(obj$poly.stats$root.id ==
obj$split.stats$plot.id[split.id])
# get ids of coordinates in each child polygon
split.ids1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
split.ids2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
# slice data with these ids
dat_pol1 <- .slicer(data, split.ids1)
dat_pol2 <- .slicer(data, split.ids2)
# compare data
comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
obj$call.info$Call_ARGS$generalize.f(dat_pol1),
obj$call.info$Call_ARGS$generalize.f(dat_pol2))
# compare null comp.val with original comp.val
p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
obj$split.stats$performance[split.id])
}
}
}
if (shuffle.scope == "within" && shuffle.type == "occurrences") {
# in each run mix occurrences within parent polygon, then filter mixed
# coordinates with child polygons:
for ( i in 1:n) {
for (split.id in 1 : l){
# get labels of observations inside parent polygon
ids <- .get_ids(obj$polygons.xy[[as.character(obj$split.stats$plot.id[split.id])]],
coords)
# extract points inside parent polygon & mix their labels (rows)
co <- coords[sample(ids),]
# identify child pol ids in poly.stats
pol_ids <- which(obj$poly.stats$root.id ==
obj$split.stats$plot.id[split.id])
# get (mixed) coordinate labels in each child polygon
local_idx1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
local_idx2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
# sample unpermuted order of points with permuted point ideces
split.ids1 <- ids[local_idx1]
split.ids2 <- ids[local_idx2]
# slice data with these ids
dat_pol1 <- .slicer(data, split.ids1)
dat_pol2 <- .slicer(data, split.ids2)
# compare data
comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
obj$call.info$Call_ARGS$generalize.f(dat_pol1),
obj$call.info$Call_ARGS$generalize.f(dat_pol2))
# compare null comp.val with original comp.val
p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
obj$split.stats$performance[split.id])
}
}
}
if (shuffle.scope == "within" && shuffle.type == "localities") {
# in each run mix localities within parent polygon keeping similar or
# different number of occurrences per polygon, then filter mixed
# coordinates with child polygons:
for ( i in 1:n) {
for (split.id in 1 : l){
# get labels of observations inside parent polygon
ids <- .get_ids(obj$polygons.xy[[as.character(obj$split.stats$plot.id[split.id])]],
coords)
# identify child pol ids in poly.stats
pol_ids <- which(obj$poly.stats$root.id ==
obj$split.stats$plot.id[split.id])
if (maintain.n){ # maintain similar number of occurrences in each
# polygon while shuffling localities
# get number of observation in polygon 1
quota1 <- obj$poly.stats$n.obs[pol_ids[1]]
# shuffle localities and obtain ids to sample data
assignments <- .shuffle_localities_by_quota(ids, coords, quota1)
# slice data
dat_pol1 <- .slicer(data, assignments[[1]])
dat_pol2 <- .slicer(data, assignments[[2]])
} else {
# shuffle localities irrespective of occurrence number variations.
co <- .shuffle_localities(coords[ids,])
# get ids of coordinates in each child polygon
local_idx1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
local_idx2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
# map back to original row numbers of coords
split.ids1 <- ids[local_idx1]
split.ids2 <- ids[local_idx2]
# slice data with these ids
dat_pol1 <- .slicer(data, split.ids1)
dat_pol2 <- .slicer(data, split.ids2)
}
# compare data
comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
obj$call.info$Call_ARGS$generalize.f(dat_pol1),
obj$call.info$Call_ARGS$generalize.f(dat_pol2))
# compare null comp.val with original comp.val
p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
obj$split.stats$performance[split.id])
}
}
}
p.vals <- (Reduce("+", p.vals)+1)/(n+1) # convert logical to p.val
comp.vals <- t(as.data.frame(comp.vals))
rownames(comp.vals) <- 1:n
stats <- data.frame(ID = 1:l, performance = obj$split.stats$performance,
mean.random = apply(comp.vals,2,mean),
sd.random = apply(comp.vals,2,sd),
p.val = p.vals)
stats$z.score.random <- (stats$performance - stats$mean.random)/stats$sd.random
out <- structure(list(stats = stats, null = comp.vals), class = "nullhespdiv")
return(invisible(out))
}
#' Shuffle locality positions but preserve within-locality assemblages
#'
#' @param coords A data.frame or matrix of coordinates (rows = occurrences, columns = coordinates).
#' All rows with identical values are treated as the same locality.
#' @return A matrix/data.frame with the same dimensions as coords, where each group of
#' rows belonging to the same locality has been reassigned to a new, randomly chosen locality's position.
#' @noRd
.shuffle_localities <- function(coords) {
# Combine coordinate columns into a unique string per row (locality key)
loc_keys <- apply(coords, 1, function(x) paste(x, collapse = "_"))
# Encode each locality as a factor (grouping variable)
loc_factor <- factor(loc_keys)
# Unique locality keys (sorted alphabetically, as factor levels)
loc_levels <- levels(loc_factor)
# Unique coordinates, ordered to match loc_levels
loc_coords <- coords[match(loc_levels, loc_keys), , drop = FALSE]
# Shuffle localities (rows of loc_coords)
shuffled_idx <- sample(seq_along(loc_levels))
shuffled_coords <- loc_coords[shuffled_idx, , drop = FALSE]
# For each occurrence, assign new (shuffled) locality coordinates based on their group
co <- shuffled_coords[as.integer(loc_factor), , drop = FALSE]
return(co)
}
##' Randomly assign whole localities to two polygons to fill a quota as closely as possible
#'
#' @param ids Integer vector: occurrence indices in the parent polygon
#' @param coords Data.frame: all coords (so `coords[ids,]` is valid)
#' @param quota Integer: desired number of occurrences in polygon 1
#' @return List: indices for polygon 1 and polygon 2 (by occurrence)
#' @noRd
.shuffle_localities_by_quota <- function(ids, coords, quota) {
# Subset the coordinates for the relevant occurrences (in parent polygon)
sub_coords <- coords[ids, , drop = FALSE]
# Generate a unique key for each locality by concatenating all coordinate columns
loc_keys <- apply(sub_coords, 1, function(x) paste(x, collapse = "_"))
# Create a factor where each level represents a unique locality
loc_factor <- factor(loc_keys)
# Get all unique localities as factor levels, sorted
locality_levels <- levels(loc_factor)
# Count how many occurrences are in each locality (locality "size")
locality_sizes <- table(loc_factor)
# Shuffle the order of localities to randomise sampling
shuffled_locs <- sample(locality_levels)
cum_sum <- 0 # Current number of occurrences assigned
# Add localities until quota is reached or just exceeded
cumsums <- cumsum(locality_sizes[shuffled_locs ])
ids_with <- 1 : which(cumsums >= quota)[1]
ids_without <- which(cumsums < quota)
if (length(ids_without) == 0){ # edge case: no localities assigned to polygon 1
assigned_locs <- shuffled_locs[ids_with]
} else {
if (length(ids_with) == length(cumsums)){ # edge case: all localities assigned to polygon 1
assigned_locs <- shuffled_locs[ids_without]
} else {
# Measure how close we are to the quota including and excluding the last locality
discrepancy_with <- abs(cumsums[cumsums >= quota][1] - quota)
discrepancy_without <- abs(cumsums[cumsums < quota][length(cumsums[cumsums < quota])] - quota)
# If it's closer to quota without the last locality, remove it
if (discrepancy_without < discrepancy_with) {
assigned_locs <- shuffled_locs[ids_without]
} else {
assigned_locs <- shuffled_locs[ids_with]
}
}
}
# Get the indices of occurrences that belong to assigned localities
in_poly1 <- which(loc_factor %in% assigned_locs)
in_poly2 <- setdiff(seq_along(ids), in_poly1) # The rest go to polygon 2
# Return the assignment as a list of indices for the two polygons
return(list(
poly1 = ids[in_poly1],
poly2 = ids[in_poly2]
))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.