Nothing
#' Heterogeneity-constrained random resampling (HCR)
#' @description
#' Performs heterogeneity-constrained random (HCR) resampling (Lengyel, Chytrý & Tichý, 2011) of community data.
#' Within each stratum (e.g., grid cell), many random subsets of plots are evaluated and the subset with
#' the highest mean dissimilarity and the lowest variance of dissimilarities is retained. Optionally, the
#' number of plots per stratum is adapted from the stratum’s mean pairwise dissimilarity (\eqn{\beta}-diversity).
#'
#'
#' @param data_wide a data-frame like object with the following column contents:
#' \itemize{
#' \item column 1: sample ids
#' \item column 2: strata
#' \item columns 3...n: species.
#' }
#' @param transform One of \code{c("none","sqrt","log1p","binary")}. If "binary", values become 0/1 and
#' \code{vegan::vegdist(binary = TRUE)} is used.
#' @param score_dist Dissimilarity method for trial scoring; any method accepted by
#' \code{vegan::vegdist} (e.g., "bray","jaccard", "hellinger", "euclidean", "canberra",
#' "gower", "kulczynski","morisita","horn","mountford","raup","binomial",
#' "chao","cao", …).
#' @param beta_dist One of \code{c("bray","jaccard")} for per-stratum mean dissimilarity used to calculate the
#' adaptive number of plots.
#' With \code{transform="binary"}, "bray" equals Sørensen.
#' @param adaptive_n Logical. If TRUE, adapt the number of plots per stratum from
#' \code{beta_mean * max_plots} bounded to \code{[min_plots, max_plots]}; if FALSE, use fixed \code{n_plots}.
#' @param n_plots Fixed number of plots per stratum when \code{adaptive_n=FALSE}. If \code{NA},
#' defaults to \code{max_plots} (capped at stratum size).
#' @param min_plots,max_plots Global default min/max number of plots per stratum
#' @param min_stratum_n Minimum stratum size under which the whole stratum is selected (no resampling).
#' @param trials Number of random trials per stratum (default 1000).
#' @param group_vec Optional vector (length \code{nrow(data_wide)}) assigning each sample to a higher-level
#' group (e.g., country, region). Used only if \code{adaptive_n=TRUE}.
#' @param group_limits Optional \code{data.frame} with group-specific limits. The first column
#' must contain group names; it must also contain numeric columns named \code{"min_plots"} and \code{"max_plots"}.
#' Other columns are ignored.
#' @param write_csv Optional file path to write a CSV with columns \code{sample_id, selected}. If \code{NULL}, no file.
#' @param progress Show a text progress bar (default: \code{interactive()}).
#' @param seed Optional integer seed for reproducibility of random subset trials.
#'
#' @section Details:
#' The algorithm follows Lengyel, Chytrý & Tichý (2011) and was based upon the JUICE implementation (Tichý, 2002).
#' For speed, it precomputes per-stratum distance matrices (once) and reuses them across trials, which
#' enables large numbers of trials (default \code{trials = 1000}).
#'
#' Within each stratum candidate subsets are scored using \code{score_dist} by high mean dissimilarity and low variance of dissimilarities.
#'
#' If \code{adaptive_n = TRUE} (default), the target number of plots is computed as a linear function of the mean pairwise
#' dissimilarity (\eqn{\beta}-diversity; \code{beta_dist}) and the maximum number of plots (\code{beta_mean * max_plots}; Wiser & de Cáceres, 2013) and then
#' bounded to \code{[min_plots, max_plots]} and the stratum size.
#'
#' Additionally group-specific limits for minimum and maximum numbers of plots per stratum can be supplied via
#' \code{group_vec} and \code{group_limits}. Each sample is assigned to a higher-level group
#' (e.g., country or region), and the minimum and maximum number of plots are defined per group.
#' This allows, for example, larger plot limits to be set for larger countries or regions.
#'
#' @return A \code{data.frame} with \code{sample_id} and \code{selected} (0/1).
#' Attributes: \code{selected_rows} (logical) and \code{params}.
#'
#' @author Friedemann von Lampe
#'
#' @references
#' Lengyel, A., Chytrý, M., & Tichý, L. (2011). Heterogeneity-constrained random resampling of phytosociological databases.
#' \emph{Journal of Vegetation Science}, \strong{22(1)}, 175–183. \doi{10.1111/j.1654-1103.2010.01225.x}
#'
#' Tichý, L. (2002). JUICE, software for vegetation classification. \emph{Journal of Vegetation Science}, \strong{13(3)}, 451.
#' \doi{10.1658/1100-9233(2002)013[0451:JSFVC]2.0.CO;2}
#'
#' Wiser, S. K., & de Cáceres, M. (2013). Updating vegetation classifications: an example with New Zealand's woody vegetation.
#' \emph{Journal of Vegetation Science}, \strong{24(1)}, 80–93. \doi{10.1111/j.1654-1103.2012.01450.x}
#'
#' @export
#' @importFrom stats var
hcr_resampling <- function(
data_wide,
transform = c("none","sqrt","log1p","binary"),
score_dist = "bray",
beta_dist = c("bray","jaccard"),
adaptive_n = TRUE,
n_plots = NA,
min_plots = 10,
max_plots = 100,
min_stratum_n = 10,
trials = 1000,
group_vec = NULL,
group_limits = NULL,
write_csv = NULL,
progress = interactive(),
seed = NULL
) {
if (!requireNamespace("vegan", quietly = TRUE)) {
stop("Package 'vegan' is required (for vegdist). Please install it.")
}
transform <- match.arg(transform)
beta_dist <- match.arg(beta_dist)
## validate inputs
if (ncol(data_wide) < 3L)
stop("data_wide must have at least 3 columns: 1=id, 2=strata, 3..=species.")
if (!is.numeric(trials) || length(trials) != 1L || is.na(trials) || trials < 1)
stop("'trials' must be a single number >= 1.")
trials <- as.integer(trials)
if (!is.numeric(min_plots) || !is.numeric(max_plots))
stop("'min_plots' and 'max_plots' must be numeric.")
if (length(min_plots) != 1L || length(max_plots) != 1L ||
is.na(min_plots) || is.na(max_plots) || min_plots < 1 || max_plots < min_plots)
stop("Provide sensible 'min_plots' and 'max_plots' (min >= 1, max >= min).")
if (!is.null(n_plots) && !is.na(n_plots) &&
(!is.numeric(n_plots) || length(n_plots) != 1L || n_plots < 1))
stop("'n_plots' must be a single number >= 1 (or NA).")
if (!is.numeric(min_stratum_n) || length(min_stratum_n) != 1L || is.na(min_stratum_n) || min_stratum_n < 1)
stop("'min_stratum_n' must be a single number >= 1.")
if (!is.null(group_vec) && length(group_vec) != nrow(data_wide))
stop("If provided, 'group_vec' must have length equal to nrow(data_wide).")
if (!is.null(group_limits) && !all(c("min_plots","max_plots") %in% names(group_limits)))
stop("'group_limits' must contain columns named 'min_plots' and 'max_plots' (first column = group names).")
if (!is.null(seed)) set.seed(seed)
# fix positions: 1 = id, 2 = strata
dataset <- data_wide[, c(1, 2, setdiff(seq_len(ncol(data_wide)), 1:2)), drop = FALSE]
dataset1 <- dataset[, -c(1, 2), drop = FALSE]
# coerce species to numeric
non_num <- !vapply(dataset1, is.numeric, logical(1))
if (any(non_num)) {
suppressWarnings({
dataset1[non_num] <- lapply(dataset1[non_num], function(z) as.numeric(as.character(z)))
})
}
dataset1 <- as.matrix(dataset1)
if (!is.numeric(dataset1)) stop("Species columns could not be coerced to numeric.")
## NA handling
if (anyNA(dataset1)) {
dataset1[is.na(dataset1)] <- 0
message("NAs replaced by 0")
}
# transformations
bin_flag <- FALSE
if (transform == "binary") {
dataset1 <- ifelse(dataset1 > 0, 1, 0)
bin_flag <- TRUE
} else if (transform == "sqrt") {
dataset1 <- sqrt(pmax(dataset1, 0))
} else if (transform == "log1p") {
dataset1 <- log1p(pmax(dataset1, 0))
}
n <- nrow(dataset)
strata <- as.factor(dataset[[2L]])
levs <- levels(strata)
selected <- rep(FALSE, n)
# prepare group_limits lookup (first column = group names)
have_group_limits <- !is.null(group_limits) &&
ncol(group_limits) >= 3L &&
all(c("min_plots","max_plots") %in% names(group_limits))
if (have_group_limits) {
gl_key <- as.character(group_limits[[1L]])
gl_min_plots <- suppressWarnings(as.numeric(group_limits[["min_plots"]]))
gl_max_plots <- suppressWarnings(as.numeric(group_limits[["max_plots"]]))
}
# check if group limit definitions are provided
if (!is.null(group_vec)) {
if (!have_group_limits) {
message("group_limits not provided; using global min_plots/max_plots for all groups.")
} else {
used_groups <- unique(as.character(group_vec[!is.na(group_vec)]))
defined_groups <- unique(gl_key) # first column of group_limits
missing_groups <- setdiff(used_groups, defined_groups)
if (length(missing_groups) > 0) {
message("Groups not defined in 'group_limits' (using global min/max): ",
paste(sort(missing_groups), collapse = ", "))
}
if (any(is.na(group_vec))) {
message("Some samples have NA in 'group_vec'; they will use global min/max.")
}
}
}
# initialise progress bar
pb <- NULL
if (isTRUE(progress)) {
pb <- utils::txtProgressBar(min = 0, max = length(levs), style = 3)
on.exit({ if (!is.null(pb)) close(pb) }, add = TRUE)
}
# helper: pick min/max for a stratum from group_limits
pick_limits_for_stratum <- function(idx) {
stratum_min_plots <- min_plots
stratum_max_plots <- max_plots
if (is.null(group_vec) || !have_group_limits) {
return(list(min_plots = stratum_min_plots, max_plots = stratum_max_plots))
}
grp <- group_vec[idx][1]
if (is.na(grp)) return(list(min_plots = stratum_min_plots, max_plots = stratum_max_plots))
hit <- which(gl_key == as.character(grp))[1]
if (length(hit) == 1 && !is.na(hit)) {
if (!is.na(gl_min_plots[hit])) stratum_min_plots <- as.integer(gl_min_plots[hit])
if (!is.na(gl_max_plots[hit])) stratum_max_plots <- as.integer(gl_max_plots[hit])
}
list(min_plots = stratum_min_plots, max_plots = stratum_max_plots)
}
## loop BEGIN: iterating over all strata
for (k in seq_along(levs)) {
lev <- levs[k]
idx <- which(strata == lev)
m <- length(idx)
# group-specific min/max (fallback to globals)
lims <- pick_limits_for_stratum(idx)
stratum_min_plots <- lims$min_plots
stratum_max_plots <- lims$max_plots
## precompute per-stratum distance matrices (much faster)
# for beta_mean (can differ from score_dist)
beta_mean <- 0
if (m > 1L) {
if (identical(beta_dist, score_dist)) {
# compute D_score below and reuse it, set flag
reuse_beta_from_score <- TRUE
} else {
reuse_beta_from_score <- FALSE
D_beta <- vegan::vegdist(dataset1[idx, , drop = FALSE],
method = beta_dist, binary = bin_flag, na.rm = TRUE)
beta_mean <- mean(as.numeric(D_beta))
}
} else {
reuse_beta_from_score <- FALSE
}
# precompute score distance matrix for this stratum
if (m > 1L) {
D_score <- vegan::vegdist(dataset1[idx, , drop = FALSE],
method = score_dist, binary = bin_flag, na.rm = TRUE)
Dm <- as.matrix(D_score)
if (reuse_beta_from_score) {
beta_mean <- mean(D_score) # mean of 'dist' object equals mean of upper triangle
}
}
# decide plots for this stratum
if (isTRUE(adaptive_n)) {
# Compute the desired number of plots using a linear function
# linear function where the slope is set to stratum_max_plots and the intercept is zero.
# when beta_mean is zero, the result n_plots_est is 0
n_plots_est <- beta_mean * stratum_max_plots
# ensure minimum and maximum thresholds
target_n_plots <- as.integer(max(stratum_min_plots, min(stratum_max_plots, round(n_plots_est))))
target_n_plots <- max( min(target_n_plots, m), min(2L, m) )
min_stratum_size <- target_n_plots
} else {
target_n_plots <- if (is.na(n_plots)) min(max_plots, m) else as.integer(n_plots)
target_n_plots <- max( min(target_n_plots, m), min(2L, m) )
min_stratum_size <- as.integer(min_stratum_n)
}
# stratum smaller than minimum value: keep all; else run trials and score
# trial_subsets: matrix that stores all random candidate subsets
if (m < min_stratum_size || target_n_plots < 2L) {
selected[idx] <- TRUE
} else {
subset_mean_diss <- numeric(trials)
subset_var_diss <- numeric(trials)
trial_subsets <- matrix(0L, nrow = trials, ncol = target_n_plots)
# for each trial, it randomly samples target_n_plots relevés from those in the stratum
# and stores their indices in trial_subsets
for (j in seq_len(trials)) {
cand_local <- sample.int(m, size = target_n_plots, replace = FALSE)
trial_subsets[j, ] <- cand_local
# extract upper-triangle distances for the candidate subset
dsub <- Dm[cand_local, cand_local, drop = FALSE]
diss <- dsub[upper.tri(dsub, diag = FALSE)]
subset_mean_diss[j] <- mean(diss)
subset_var_diss[j] <- if (length(diss) > 1L) stats::var(diss) else 0
if (is.na(subset_var_diss[j])) subset_var_diss[j] <- 0
}
rank_mean_diss <- rank(subset_mean_diss, ties.method = "average") # higher mean dissimilarity (heterogeneity) is better
rank_var_diss <- rank(-subset_var_diss, ties.method = "average") # lower variance is better
subset_score <- rank_mean_diss + rank_var_diss
best_trial <- which.max(subset_score)
selected[idx[trial_subsets[best_trial, ]]] <- TRUE
}
if (!is.null(pb)) utils::setTxtProgressBar(pb, k)
}
## Loop END
# create a matrix marker with two columns.
# first column is taken from the first column of dataset (identifier)
# second column is set to 1 for all rows that were marked as selected.
out <- data.frame(
sample_id = dataset[[1L]],
selected = as.integer(selected),
stringsAsFactors = FALSE
)
# optional file output
if (!is.null(write_csv)) {
dir.create(dirname(write_csv), showWarnings = FALSE, recursive = TRUE)
utils::write.csv(out, file = write_csv, row.names = FALSE)
}
attr(out, "selected_rows") <- selected
attr(out, "params") <- list(
transform = transform, score_dist = score_dist, beta_dist = beta_dist,
adaptive_n = adaptive_n, trials = trials,
min_plots = min_plots, max_plots = max_plots, min_stratum_n = min_stratum_n,
n_plots = n_plots, seed = seed
)
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.