R/hespdiv_nullltest.R

Defines functions .shuffle_localities_by_quota .shuffle_localities nulltest

Documented in nulltest

#' Test significance of split-lines in spatial subdivision
#'
#' @description
#' Assess the statistical significance of each split-line (bioregion boundary) identified by `hespdiv` by comparing its observed performance to a null distribution. The null is generated by permuting the data `n` times and recomputing the split-line performance after each permutation. Multiple shuffling strategies are supported to probe the influence of spatial structure on delineation.
#'
#' @details
#' Two shuffling scopes are available:
#' \describe{
#'   \item{\strong{"all"}:}{Shuffle across the entire study area, ignoring polygonal subdivisions.}
#'   \item{\strong{"within"}:}{Shuffle only within each parent polygon (the region in which the split-line is nested), preserving local spatial structure.}
#' }
#'
#' Two shuffling types are available:
#' \describe{
#'   \item{\strong{"localities"}:}{Shuffle whole localities, preserving each site's assemblage (recommended, since occurrences within a locality are not independent).}
#'   \item{\strong{"occurrences"}:}{Shuffle individual occurrences across sites (use with caution; may violate within-locality independence).}
#' }
#'
#' For each split-line, the function reports the observed performance, the mean and standard deviation of the permuted (null) performances, an empirical one-sided p-value (proportion of permuted values \emph{as or more extreme} than observed; ties included), and a z-score quantifying departure from the null.
#'
#' @param obj An object of class \code{hespdiv}.
#' @param n Integer. Number of permutations used to form the null distribution.
#' @param shuffle.scope Character. Either \code{"all"} (shuffle across the full study area) or \code{"within"} (shuffle only within each parent polygon).
#' @param shuffle.type Character. Either \code{"localities"} (shuffle whole localities, preserving assemblages) or \code{"occurrences"} (shuffle individual occurrences). \code{"localities"} is generally preferred.
#' @param maintain.n Logical. Only honored when \code{shuffle.scope = "within"} and \code{shuffle.type = "localities"}. If \code{TRUE}, attempts to keep the randomized child polygon occurrence counts close to the observed (maximum discrepancy up to \code{max_n/2}, where \code{max_n} is the largest locality size). Ignored otherwise.
#'
#' @return Invisibly returns an object of class \code{nullhespdiv}, a list with:
#' \itemize{
#'   \item \code{$stats}, a data frame summarizing each split-line with:
#'   \itemize{
#'     \item \code{performance}: observed performance.
#'     \item \code{mean.random}: mean of null performances.
#'     \item \code{sd.random}: standard deviation of null performances.
#'     \item \code{p.val}: empirical one-sided p-value (ties included).
#'     \item \code{z.score.random}: standardized effect size.
#'   }
#'   \item \code{$null}, a matrix or data frame containing all null performance
#'   values for every split-line across permutations.
#' }
#'
#' @family functions for hespdiv results post-processing
#' @examples
#' # if split-line is strongly significant, the choice of parameters should not
#' # matter. For example (look at p-value, z.score.random, sd.random and
#' # mean.random):
#' (nulltest(example_hespdiv, maintain.n = FALSE, shuffle.type = "occurrences"))
#' (nulltest(example_hespdiv, maintain.n = FALSE, shuffle.type = "localities"))
#' (nulltest(example_hespdiv, maintain.n = TRUE, shuffle.type = "localities"))
#'
#' @export

nulltest <- function(obj, n = 999, maintain.n  = TRUE, shuffle.scope = "within",
                     shuffle.type = "localities"){

  # Validate shuffle.scope and shuffle.type arguments
  shuffle.scope <- .arg_check(name = "shuffle.scope", given = shuffle.scope,
                             NAMES = c("all", "within"))
  shuffle.type <- .arg_check(name = "shuffle.type", given = shuffle.type,
                              NAMES = c("localities", "occurrences"))

  if (isTRUE(maintain.n) && !(identical(shuffle.scope, "within") &&
                              identical(shuffle.type, "localities"))) {
    warning(
      "`maintain.n` is ignored unless `shuffle.scope = \"within\"` and ",
      "`shuffle.type = \"localities\"`. Disabling it.",
      call. = FALSE
    )
    maintain.n <- FALSE
  }

  # Ensure correct object type
  if (!inherits(obj, "hespdiv"))
    stop("`obj` must be a `hespdiv` object.", call. = FALSE)

  # Extract coordinates and data from object
  coords <- obj$call.info$Call_ARGS$xy.dat
  data <- obj$call.info$Call_ARGS$data
  N <- nrow(coords)  # Number of occurrences
  l <- length(obj$split.lines) # Number of split-lines

  # Prepare to store comparison and p-value results for each permutation
  comp.vals <- vector(mode = "list", length = n)
  p.vals <- vector(mode = "list", length = n)
  for (i in 1:n){
    comp.vals[[i]] <- numeric(l)
    p.vals[[i]] <- numeric(l)
  }

  # Define comparison operator depending on whether maximizing or minimizing
  # comparison value. Checks if random is as-or-more-extreme than observed.
  if (obj$call.info$Call_ARGS$maximize){
    pal <- function(x, criteria){ x >= criteria}
  } else {
    pal <- function(x, criteria){ x <= criteria}
  }

  # Determine the correct data slicing function based on data type
  if (is.data.frame(data) | is.matrix(data)){
    .slicer <- .slicer.table
  } else {
    if (is.list(data)) {
      .slicer <- .slicer.list
    } else {
      .slicer <- .slicer.vect
    }
  }

  # Main permutation loop
  if (shuffle.scope == "all" && shuffle.type == "occurrences") {
    # in each run mix all occurrence labels and re-sample coordinates
    # with them, then filter mixed coordinates with child polygons:
    for ( i in 1:n) {
      # mix all occurrence ids (labels)
      ids <- sample(1:N,N,replace = FALSE)
      # sample coordinates with mixed ids
      co <- coords[ids,]

      for (split.id in 1 : l){
        # identify child pol ids in poly.stats
        pol_ids <- which(obj$poly.stats$root.id ==
                           obj$split.stats$plot.id[split.id])
        # get ids of coordinates in each child polygon
        split.ids1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
        split.ids2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
        # slice data with these ids
        dat_pol1 <- .slicer(data, split.ids1)
        dat_pol2 <- .slicer(data, split.ids2)
        # compare data
        comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
          obj$call.info$Call_ARGS$generalize.f(dat_pol1),
          obj$call.info$Call_ARGS$generalize.f(dat_pol2))
        # compare null comp.val with original comp.val
        p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
                                     obj$split.stats$performance[split.id])
      }
    }
  }
  if (shuffle.scope == "all" && shuffle.type == "localities") {
    # in each run mix locality coordinates, then filter mixed coordinates with
    # child polygons:
    for ( i in 1:n) {
      co <- .shuffle_localities(coords)

      for (split.id in 1 : l){
        # identify child pol ids in poly.stats
        pol_ids <- which(obj$poly.stats$root.id ==
                           obj$split.stats$plot.id[split.id])
        # get ids of coordinates in each child polygon
        split.ids1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
        split.ids2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
        # slice data with these ids
        dat_pol1 <- .slicer(data, split.ids1)
        dat_pol2 <- .slicer(data, split.ids2)
        # compare data
        comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
          obj$call.info$Call_ARGS$generalize.f(dat_pol1),
          obj$call.info$Call_ARGS$generalize.f(dat_pol2))
        # compare null comp.val with original comp.val
        p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
                                     obj$split.stats$performance[split.id])
      }
    }
  }
  if (shuffle.scope == "within" && shuffle.type == "occurrences") {
    # in each run mix occurrences within parent polygon, then filter mixed
    # coordinates with child polygons:
    for ( i in 1:n) {
      for (split.id in 1 : l){
        # get labels of observations inside parent polygon
        ids <- .get_ids(obj$polygons.xy[[as.character(obj$split.stats$plot.id[split.id])]],
                        coords)
        # extract points inside parent polygon & mix their labels (rows)
        co <- coords[sample(ids),]
        # identify child pol ids in poly.stats
        pol_ids <- which(obj$poly.stats$root.id ==
                           obj$split.stats$plot.id[split.id])
        # get (mixed) coordinate labels in each child polygon
        local_idx1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
        local_idx2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
        # sample unpermuted order of points with permuted point ideces
        split.ids1 <- ids[local_idx1]
        split.ids2 <- ids[local_idx2]

        # slice data with these ids
        dat_pol1 <- .slicer(data, split.ids1)
        dat_pol2 <- .slicer(data, split.ids2)

        # compare data
        comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
          obj$call.info$Call_ARGS$generalize.f(dat_pol1),
          obj$call.info$Call_ARGS$generalize.f(dat_pol2))
        # compare null comp.val with original comp.val
        p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
                                     obj$split.stats$performance[split.id])
      }
    }
  }
  if (shuffle.scope == "within" && shuffle.type == "localities") {
    # in each run mix localities within parent polygon keeping similar or
    # different number of occurrences per polygon, then filter mixed
    # coordinates with child polygons:
    for ( i in 1:n) {
      for (split.id in 1 : l){
        # get labels of observations inside parent polygon
        ids <- .get_ids(obj$polygons.xy[[as.character(obj$split.stats$plot.id[split.id])]],
                        coords)
        # identify child pol ids in poly.stats
        pol_ids <- which(obj$poly.stats$root.id ==
                           obj$split.stats$plot.id[split.id])

        if (maintain.n){ # maintain similar number of occurrences in each
          # polygon while shuffling localities

          # get number of observation in polygon 1
          quota1 <- obj$poly.stats$n.obs[pol_ids[1]]

          # shuffle localities and obtain ids to sample data
          assignments <- .shuffle_localities_by_quota(ids, coords, quota1)
          # slice data
          dat_pol1 <- .slicer(data, assignments[[1]])
          dat_pol2 <- .slicer(data, assignments[[2]])
        } else {
          # shuffle localities irrespective of occurrence number variations.
          co <- .shuffle_localities(coords[ids,])
          # get ids of coordinates in each child polygon
          local_idx1 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[1]]]], co)
          local_idx2 <- .get_ids(obj$polygons.xy[[as.character(obj$poly.stats$plot.id)[pol_ids[2]]]], co)
          # map back to original row numbers of coords
          split.ids1 <- ids[local_idx1]
          split.ids2 <- ids[local_idx2]
          # slice data with these ids
          dat_pol1 <- .slicer(data, split.ids1)
          dat_pol2 <- .slicer(data, split.ids2)
        }
        # compare data
        comp.vals[[i]][split.id] <- obj$call.info$Call_ARGS$compare.f(
          obj$call.info$Call_ARGS$generalize.f(dat_pol1),
          obj$call.info$Call_ARGS$generalize.f(dat_pol2))
        # compare null comp.val with original comp.val
        p.vals[[i]][split.id] <- pal(comp.vals[[i]][split.id],
                                     obj$split.stats$performance[split.id])
      }
    }
  }


  p.vals <- (Reduce("+", p.vals)+1)/(n+1) # convert logical to p.val
  comp.vals <- t(as.data.frame(comp.vals))
  rownames(comp.vals) <- 1:n
  stats <- data.frame(ID = 1:l, performance = obj$split.stats$performance,
                      mean.random = apply(comp.vals,2,mean),
                      sd.random = apply(comp.vals,2,sd),
                      p.val = p.vals)
  stats$z.score.random <- (stats$performance - stats$mean.random)/stats$sd.random
  out <- structure(list(stats = stats, null = comp.vals), class = "nullhespdiv")
  return(invisible(out))
}

#' Shuffle locality positions but preserve within-locality assemblages
#'
#' @param coords A data.frame or matrix of coordinates (rows = occurrences, columns = coordinates).
#'               All rows with identical values are treated as the same locality.
#' @return A matrix/data.frame with the same dimensions as coords, where each group of
#'         rows belonging to the same locality has been reassigned to a new, randomly chosen locality's position.
#' @noRd
.shuffle_localities <- function(coords) {
  # Combine coordinate columns into a unique string per row (locality key)
  loc_keys <- apply(coords, 1, function(x) paste(x, collapse = "_"))

  # Encode each locality as a factor (grouping variable)
  loc_factor <- factor(loc_keys)

  # Unique locality keys (sorted alphabetically, as factor levels)
  loc_levels <- levels(loc_factor)

  # Unique coordinates, ordered to match loc_levels
  loc_coords <- coords[match(loc_levels, loc_keys), , drop = FALSE]

  # Shuffle localities (rows of loc_coords)
  shuffled_idx <- sample(seq_along(loc_levels))
  shuffled_coords <- loc_coords[shuffled_idx, , drop = FALSE]

  # For each occurrence, assign new (shuffled) locality coordinates based on their group
  co <- shuffled_coords[as.integer(loc_factor), , drop = FALSE]

  return(co)
}

##' Randomly assign whole localities to two polygons to fill a quota as closely as possible
#'
#' @param ids Integer vector: occurrence indices in the parent polygon
#' @param coords Data.frame: all coords (so `coords[ids,]` is valid)
#' @param quota Integer: desired number of occurrences in polygon 1
#' @return List: indices for polygon 1 and polygon 2 (by occurrence)
#' @noRd
.shuffle_localities_by_quota <- function(ids, coords, quota) {
  # Subset the coordinates for the relevant occurrences (in parent polygon)
  sub_coords <- coords[ids, , drop = FALSE]

  # Generate a unique key for each locality by concatenating all coordinate columns
  loc_keys <- apply(sub_coords, 1, function(x) paste(x, collapse = "_"))

  # Create a factor where each level represents a unique locality
  loc_factor <- factor(loc_keys)

  # Get all unique localities as factor levels, sorted
  locality_levels <- levels(loc_factor)

  # Count how many occurrences are in each locality (locality "size")
  locality_sizes <- table(loc_factor)

  # Shuffle the order of localities to randomise sampling
  shuffled_locs <- sample(locality_levels)
  cum_sum <- 0         # Current number of occurrences assigned

  # Add localities until quota is reached or just exceeded
  cumsums <- cumsum(locality_sizes[shuffled_locs ])
  ids_with <- 1 : which(cumsums >= quota)[1]
  ids_without <- which(cumsums < quota)

  if (length(ids_without) == 0){ # edge case: no localities assigned to polygon 1
    assigned_locs <- shuffled_locs[ids_with]
  } else {
    if (length(ids_with) == length(cumsums)){ # edge case: all localities assigned to polygon 1
      assigned_locs <- shuffled_locs[ids_without]
    } else {

      # Measure how close we are to the quota including and excluding the last locality
      discrepancy_with <- abs(cumsums[cumsums >= quota][1] - quota)
      discrepancy_without <- abs(cumsums[cumsums < quota][length(cumsums[cumsums < quota])] - quota)


      # If it's closer to quota without the last locality, remove it
      if (discrepancy_without < discrepancy_with) {
        assigned_locs <- shuffled_locs[ids_without]
      } else {
        assigned_locs <- shuffled_locs[ids_with]
      }
    }
  }
  # Get the indices of occurrences that belong to assigned localities
  in_poly1 <- which(loc_factor %in% assigned_locs)
  in_poly2 <- setdiff(seq_along(ids), in_poly1) # The rest go to polygon 2

  # Return the assignment as a list of indices for the two polygons
  return(list(
    poly1 = ids[in_poly1],
    poly2 = ids[in_poly2]
  ))
}

Try the hespdiv package in your browser

Any scripts or data that you put into this service are public.

hespdiv documentation built on May 21, 2026, 5:09 p.m.