R/buffer.R
In spatialsample: Spatial Resampling Infrastructure

Documented in buffer_indices

#' Apply an inclusion radius and exclusion buffer to indices
#'
#' @param data An object of class `sf` or `sfc`.
#' @param indices List of indices in each fold generated by `split_unnamed()`.
#' @param radius Numeric: points within this distance of the initially-selected
#' test points will be assigned to the assessment set. If `NULL`, no radius is
#' applied.
#' @param buffer Numeric: points within this distance of any point in the
#' test set (after `radius` is applied) will be assigned to neither the analysis
#' or assessment set. If `NULL`, no buffer is applied.
#'
#' @keywords internal
buffer_indices <- function(data, indices, radius, buffer, call = rlang::caller_env()) {
  standard_checks(data, "Buffering", call)

  n <- nrow(data)
  # calling st_distance is a _huge_ performance hit, especially for big data,
  # so we make a point of only doing it once.
  #
  # This winds up requiring all sorts of weird handler code,
  # namely `row_ids_within_dist` and `which_within_dist`, in order to
  # only calculate this matrix
  #
  # Using st_is_within_dist is not faster. Using st_intersects is not faster.
  # I keep trying both of these, and have left this comment in vain hope it
  # convinces me to stop.
  distmat <- sf::st_distance(data)

  # only run radius checks if radius is not NULL (to prevent NAs from >)
  run_radius <- !is.null(radius)
  if (run_radius && units::set_units(radius, NULL) > 0) {
    # In case `radius` has no units, assume it's in the same units as `data`
    if (!identical(sf::st_crs(data), sf::NA_crs_)) units(radius) <- units(distmat)
    indices <- row_ids_within_dist(distmat, indices, radius)
  }

  # `buffer_indices` are _always_ needed
  # so re-code a NULL buffer as a 0, which will buffer nothing
  if (is.null(buffer)) buffer <- 0L
  # In case `buffer` has no units, assume it's in the same units as `data`
  if (!identical(sf::st_crs(data), sf::NA_crs_)) units(buffer) <- units(distmat)
  buffer_indices <- row_ids_within_dist(distmat, indices, buffer)

  purrr::map2(indices, buffer_indices, buffered_complement, n = n)
}

buffered_complement <- function(ind, buff_ind, n) {
  list(
    analysis = setdiff(1:n, c(ind, buff_ind)),
    assessment = unique(ind)
  )
}

row_ids_within_dist <- function(distmat, indices, dist) {
  if (units::set_units(dist, NULL) > 0) {
    # c++ won't implicitly cast, so do it in R
    mode(distmat) <- "numeric"
    dist <- as.numeric(dist)
    purrr::map(
      # indices is the output of split_unnamed
      indices,
      # which_within_dist returns a vector of row IDs in sequential order
      #
      # In order to visualize (eventually) which observations were originally
      # chosen for the test set and which were inside `radius`,
      # we want the new indices to be appended to the end of the original indices,
      # not sorted in
      #
      # So here we append the new indices to the old and de-duplicate them
      ~ unique(c(.x, which_within_dist(distmat, as.numeric(.x), dist)))
    )
  } else {
    # initialize to integer(0) in case buffer is <= 0:
    lapply(seq_along(indices), function(x) integer(0))
  }
}