Nothing
#' Prepare data for model calibration
#'
#' @description
#' This function prepares data for model calibration, including optional PCA,
#' background point generation, training/testing partitioning, and the creation
#' of a grid of parameter combinations, including regularization multiplier values,
#' feature classes, and sets of environmental variables.
#'
#' @usage
#' prepare_data(algorithm, occ, x, y, raster_variables, species = NULL,
#' n_background = 1000, features = c("lq", "lqp"),
#' r_multiplier = c(0.1, 0.5, 1, 2, 3),
#' user_formulas = NULL,
#' partition_method = "kfolds",
#' n_partitions = 4, train_proportion = 0.7,
#' categorical_variables = NULL,
#' do_pca = FALSE, center = TRUE, scale = TRUE,
#' exclude_from_pca = NULL, variance_explained = 95,
#' min_explained = 5, min_number = 2, min_continuous = NULL,
#' bias_file = NULL, bias_effect = NULL, weights = NULL,
#' include_xy = TRUE, write_pca = FALSE, pca_directory = NULL,
#' write_file = FALSE, file_name = NULL, seed = 1)
#'
#' @param algorithm (character) modeling algorithm, either "glm" or "maxnet".
#' @param occ (data frame) a data.frame containing the coordinates (longitude
#' and latitude) of the occurrence records.
#' @param x (character) a string specifying the name of the column in `occ` that
#' contains the longitude values.
#' @param y (character) a string specifying the name of the column in `occ` that
#' contains the latitude values.
#' @param raster_variables (SpatRaster) predictor variables from which
#' environmental values will be extracted using `occ` and a background will be
#' sampled. Must correspond geographically with the area where model is
#' calibrated.
#' @param species (character) string specifying the species name (optional).
#' Default is NULL.
#' @param categorical_variables (character) names of the variables that are
#' categorical. Default is NULL.
#' @param do_pca (logical) whether to perform a principal component analysis
#' (PCA) with the set of variables. Default is FALSE.
#' @param exclude_from_pca (character) variable names within raster_variables
#' that should not be included in the PCA transformation. Instead, these
#' variables will be added directly to the final set of output variables
#' without being modified. The default is NULL, meaning all variables
#' will be used unless specified otherwise.
#' @param variance_explained (numeric) the cumulative percentage of total
#' variance that must be explained by the selected principal components.
#' Default is 95.
#' @param min_explained (numeric) the minimum percentage of total variance that
#' a principal component must explain to be retained. Default is 5.
#' @param center (logical) whether the variables should be zero-centered. Default
#' is TRUE.
#' @param scale (logical) whether the variables should be scaled to have unit
#' variance before the analysis takes place. Default is FALSE.
#' @param write_pca (logical) whether to save the PCA-derived raster layers
#' (principal components) to disk. Default is FALSE.
#' @param pca_directory (character) the path or name of the folder where the PC
#' raster layers will be saved. This is only applicable if `write_pca = TRUE`.
#' Default is NULL.
#' @param n_background (numeric) number of points to represent the background
#' for the model. Default is 1000.
#' @param bias_file (SpatRaster) a raster containing bias values (probability
#' weights) that influence the selection of background points. It must have the
#' same extent, resolution, and number of cells as the raster variables.
#' Default is NULL.
#' @param bias_effect (character) a string specifying how the values in the
#' `bias_file` should be interpreted. Options are "direct" or "inverse". If
#' "direct", higher values in bias file increase the likelihood of selecting
#' background points. If "inverse", higher values decrease the likelihood.
#' Default = NULL. Must be defined if `bias_file` is provided.
#' @param partition_method (character) method used for data partitioning.
#' Available options are `"kfolds"`, `"subsample"`, and `"bootstrap"`.
#' See **Details** for more information. Default = "kfolds".
#' @param n_partitions (numeric) number of partitions to generate. If
#' `partition_method` is `"subsample"` or `"bootstrap"`, this defines the number
#' of training testing replicates. If `"kfolds"`, it specifies the number of
#' folds. Must be > 1; default = 4.
#' @param train_proportion (numeric) proportion of occurrence and background
#' points to be used for model training in each partition. Only applicable when
#' `partition_method` is `"subsample"` or `"bootstrap"`. Default is 0.7 (i.e.,
#' 70% for training and 30% for testing).
#' @param weights (numeric) a numeric vector specifying weights for the
#' occurrence records. The default, NULL, uses 1 for presence and 100 for
#' background.
#' @param min_number (numeric) the minimum number of variables to be included in
#' model formulas to be generated. Default = 2.
#' @param min_continuous (numeric) the minimum number of continuous variables
#' required in a combination. Default is NULL.
#' @param features (character) a vector of feature classes. Default is c("q",
#' "lq", "lp", "qp", "lqp").
#' @param r_multiplier (numeric) a vector of regularization parameters for maxnet.
#' Default is c(0.1, 1, 2, 3, 5).
#' @param user_formulas (character) Optional character vector with custom
#' formulas provided by the user. See Details. Default is NULL.
#' @param include_xy (logical) whether to include the coordinates (longitude and
#' latitude) in the results from preparing data. Columns containing coordinates
#' will be renamed as "x" and "y". Default is TRUE.
#' @param write_file (logical) whether to write the resulting prepared_data list
#' in a local directory. Default is FALSE.
#' @param file_name (character) name of file (no extension needed) to write
#' resulting object in a local directory. Only needed if `write_file = TRUE`.
#' Default is NULL.
#' @param seed (numeric) integer value to specify an initial seed to split the
#' data and extract background. Default is 1.
#'
#' @details
#' Training and testing are performed multiple times (i.e., the number set in
#' `n_partitions`), and model selection is based on the average performance of
#' models after running this routine. A description of the available data
#' partitioning methods is below:
#'
#' - **"kfolds"**: Splits the dataset into *K* subsets (folds) of approximately
#' equal size, keeping proportion of 0 and 1 stable compared to
#' the full set. In each training/test run, one fold is used as
#' the test set, while the remaining folds are combined to form
#' the training set.
#' - **"bootstrap"**: Creates the training dataset by sampling observations
#' from the original dataset *with replacement* (i.e., the
#' same observation can be selected multiple times). The test
#' set consists of the observations that were not selected in
#' that specific sampling.
#' - **"subsample"**: Similar to bootstrap, but the training set is created by
#' sampling *without replacement* (i.e., each observation is
#' selected at most once).
#'
#' `user_formulas` must be a character vector of model formulas. Supported terms
#' include linear effects, quadratic terms (e.g., `I(bio_7^2)`), products
#' (e.g., `bio_1:bio_7`), hinge (e.g., `hinge(bio_1)`), threshold (e.g.,
#' `thresholds(bio_2)`), and categorical predictors (e.g., `categorical(SoilType)`).
#' Example of a valid formula:
#' `~ bio_1 + bio_7 + I(bio_7^2) + bio_1:bio_7 + hinge(bio_1) + thresholds(bio_2) + categorical(SoilType)`.
#' All variables appearing in the formulas must exist in the raster supplied
#' as `raster_variables`.
#'
#' @return
#' An object of class `prepared_data` containing all elements necessary to
#' perform further explorations of data and run a model calibration routine.
#'
#' @importFrom enmpa aux_var_comb
#' @importFrom terra crop prcomp extract as.data.frame nlyr ncell ext res
#' @importFrom utils combn
#'
#' @export
#'
#' @seealso
#' [calibration()], [explore_calibration_hist()], [explore_partition_env()],
#' [explore_partition_geo()], [explore_partition_extrapolation()],
#' [plot_calibration_hist()], [plot_explore_partition()]
#'
#' @examples
#' # Import occurrences
#' data(occ_data, package = "kuenm2")
#'
#' # Import raster layers
#' var <- terra::rast(system.file("extdata", "Current_variables.tif",
#' package = "kuenm2"))
#'
#' # Import a bias file
#' bias <- terra::rast(system.file("extdata", "bias_file.tif",
#' package = "kuenm2"))
#'
#' # Prepare data for maxnet model
#' sp_swd <- prepare_data(algorithm = "maxnet", occ = occ_data,
#' x = "x", y = "y",
#' raster_variables = var,
#' species = occ_data[1, 1],
#' categorical_variables = "SoilType",
#' n_background = 500, bias_file = bias,
#' bias_effect = "direct",
#' features = c("l", "q", "p", "lq", "lqp"),
#' r_multiplier = c(0.1, 1, 2, 3, 5))
#' print(sp_swd)
#'
#' # Prepare data for glm model
#' sp_swd_glm <- prepare_data(algorithm = "glm", occ = occ_data,
#' x = "x", y = "y",
#' raster_variables = var,
#' species = occ_data[1, 1],
#' categorical_variables = "SoilType",
#' n_background = 500, bias_file = bias,
#' bias_effect = "direct",
#' features = c("l", "q", "p", "lq", "lqp"))
#' print(sp_swd_glm)
prepare_data <- function(algorithm,
occ,
x,
y,
raster_variables,
species = NULL,
n_background = 1000,
features = c("lq", "lqp"),
r_multiplier = c(0.1, 0.5, 1, 2, 3),
user_formulas = NULL,
partition_method = "kfolds",
n_partitions = 4,
train_proportion = 0.7,
categorical_variables = NULL,
do_pca = FALSE,
center = TRUE,
scale = TRUE,
exclude_from_pca = NULL,
variance_explained = 95,
min_explained = 5,
min_number = 2,
min_continuous = NULL,
bias_file = NULL,
bias_effect = NULL,
weights = NULL,
include_xy = TRUE,
write_pca = FALSE,
pca_directory = NULL,
write_file = FALSE,
file_name = NULL,
seed = 1) {
#Check data
if (missing(algorithm)) {
stop("Argument 'algorithm' must be defined.")
}
if (missing(occ)) {
stop("Argument 'occ' must be defined.")
}
if (missing(x)) {
stop("Argument 'x' must be defined.")
}
if (missing(y)) {
stop("Argument 'y' must be defined.")
}
if (missing(raster_variables)) {
stop("Argument 'raster_variables' must be defined.")
}
if (length(algorithm) != 1) {
stop("'algorithm' must be a single value: either 'glm' or 'maxnet'.")
} else {
if (!algorithm %in% c("glm", "maxnet")) {
stop("'algorithm' must be 'glm' or 'maxnet'.")
}
}
if (!inherits(occ, "data.frame")) {
stop("Argument 'data' must be a 'data.frame'.")
}
if (!is.null(species) & !inherits(species, "character")) {
stop("Argument 'species' must be a 'character'.")
}
if (!inherits(x, "character") | !inherits(y, "character")) {
stop("Arguments 'x' and 'y' must be of class 'character'.")
}
if (!x %in% colnames(occ) | !y %in% colnames(occ)) {
stop("'x' and/or 'y' are not column names in 'occ'.")
}
if (!inherits(raster_variables, "SpatRaster")) {
stop("Argument 'raster_variables' must be a 'SpatRaster'.")
}
if (!is.null(categorical_variables) &
!inherits(categorical_variables, "character")) {
stop("Argument 'categorical_variables' must be a 'character' vector.")
}
if (!is.null(exclude_from_pca) & !inherits(exclude_from_pca, "character")) {
stop("Argument 'exclude_from_pca' must be a 'character'.")
}
if (write_pca & is.null(pca_directory)) {
stop("If 'write_pca' = TRUE, 'pca_directory' must be specified.")
}
if (write_pca & !inherits(pca_directory, "character")) {
stop("Argument 'pca_directory' must be a 'character'.")
}
if (!is.null(weights) && nrow(occ) != length(weights)) {
stop("Length of 'weights' must match the number of occurrences in 'occ'.")
}
if (min_number > terra::nlyr(raster_variables)) {
stop("'min_number' can't be greater than the number of layers in 'raster_variables'.")
}
if (!is.null(min_continuous)) {
if (!is.numeric(min_continuous)) {
stop("Argument 'min_continuous' must be 'numeric'.")
} else {
if (min_continuous > terra::nlyr(raster_variables)) {
stop("'min_continuous' can't be greater than the number of layers in 'raster_variables'.")
}
}
}
check_features <- unique(unlist(strsplit(features, "")))
check_features <- setdiff(check_features, c("l", "q", "p", "h", "t"))
if (length(check_features) > 0) {
stop("'features' defined are not valid:\n",
"'features' can be l, q, p, t, h, or combinations of these.")
}
if (write_file & is.null(file_name)) {
stop("If 'write_file' = TRUE, 'file_name' must be specified.")
}
if (write_file & !is.character(file_name)) {
stop("Argument 'file_name' must be a 'character'.")
}
if (!is.null(categorical_variables)) {
if (any(!categorical_variables %in% names(raster_variables))) {
stop("Defined 'categorical_variables' must be layers in 'raster_variables'.")
}
continuous_variable_names <- setdiff(names(raster_variables),
categorical_variables)
} else {
continuous_variable_names <- names(raster_variables)
}
if (!is.null(bias_file)) {
if (!inherits(bias_file, "SpatRaster")) {
stop("Argument 'bias_file' must be a 'SpatRaster'.")
}
if (is.null(bias_effect)) {
stop("Argument 'bias_effect' must be defined if 'bias_file' is provided.")
}
if (!inherits(bias_effect, "character")) {
stop("Argument 'bias_effect' must be a 'character'.")
}
if (!(bias_effect %in% c("direct", "inverse")) | length(bias_effect) > 1) {
stop("Argument 'bias_effect' must be 'direct' or 'inverse'.")
}
}
if(!(partition_method %in% c("kfolds", "subsample", "bootstrap"))){
stop("Invalid 'partition_method'. Available options include 'kfolds', 'subsample', and 'bootstrap'")
}
if(!(n_partitions %% 1 == 0) || n_partitions <= 0){
stop("'n_partitions' must be a positive numeric integer (e.g., 1, 2, 3...)")
}
if(partition_method %in% c("bootstrap", "subsample")){
if(train_proportion > 1 || train_proportion <= 0 ||
is.na(train_proportion) || is.null(train_proportion)){
stop("'train_proportion' must be a positive numeric between 0 and 1")
}
}
if(do_pca && !is.null(user_formulas)){
stop("Internal PCA cannot be run when 'user_formulas' is provided. To use PCA with custom formulas, run PCA externally with perform_pca(), supply the resulting PCA rasters via 'raster_variables' and set 'do_pca = FALSE'.")
}
sp_name <- species
if (do_pca) {
if (!is.null(categorical_variables)) {
exclude_from_pca <- c(categorical_variables, exclude_from_pca)
} else {
exclude_from_pca <- exclude_from_pca
}
pca <- perform_pca(raster_variables = raster_variables,
exclude_from_pca = exclude_from_pca,
project = FALSE, projection_data = NULL,
out_dir = pca_directory,
overwrite = FALSE, progress_bar = FALSE,
center = center, scale = scale,
variance_explained = variance_explained,
min_explained = min_explained)
pca$projection_directory <- NULL #Remove projection directory
raster_variables <- pca$env
} else {
pca <- NULL
}
occ_var <- extract_occurrence_variables(occ, x, y, raster_variables)
bg_var <- generate_background_variables(raster_variables, n_background,
bias_file, bias_effect, seed = seed)
bg_var <- bg_var[,colnames(occ_var)]
# combine occurrence and background data
occ_bg <- rbind(occ_var, bg_var)
# occ_bg <- occ_bg[, c(1, 2, which(names(occ_bg) == "pr_bg"),
# (3:(ncol(occ_bg)-1))[-which(names(occ_bg) == "pr_bg")])]
occ_bg <- handle_missing_data(occ_bg, weights)
occ_bg_xy <- if (include_xy) {occ_bg[, c("x", "y")]} else {NULL}
occ_bg <- subset(occ_bg, select = -c(x, y))
if (!is.null(categorical_variables)) {
occ_bg[categorical_variables] <- lapply(occ_bg[categorical_variables],
factor)
}
#Fix row.names
row.names(occ_bg) <- NULL
#Partitione data
pd <- part_data(data = occ_bg, pr_bg = "pr_bg",
train_proportion = train_proportion,
n_partitions = n_partitions,
partition_method = partition_method,
seed = seed)
#Formula grid
if(is.null(user_formulas)){
formula_grid <- calibration_grid(occ_bg = occ_bg, min_number = min_number,
min_continuous = min_continuous,
categorical_var = categorical_variables,
features = features, algorithm = algorithm,
r_multiplier = r_multiplier)
} else {
# If user formulas, check data
user_variables <- unique(unlist(extract_var_from_formulas(user_formulas)))
v_out <- setdiff(user_variables, names(raster_variables))
if(length(v_out) > 0){
stop("The following variables present in 'user_formulas' are absent from 'raster_variables': ", paste(v_out, collapse = "; "))
}
# Check categorical
categorical_variables <- extract_categorical(user_formulas)
#Build formula grid
if(algorithm == "maxnet"){
formula_grid <- expand.grid("Formulas" = user_formulas,
"R_multiplier" = r_multiplier)
# Add -1 to formulas
formula_grid$Formulas <- paste(formula_grid$Formulas, "-1")
} else if (algorithm == "glm"){
formula_grid <- data.frame("Formulas" = user_formulas)
}
#Identify quadratic
formula_grid$Features <- ifelse(grepl("I\\(|\\^2\\)",
formula_grid$Formulas),
"User_q", "User")
# Create ID
formula_grid <- cbind("ID" = 1:nrow(formula_grid), formula_grid)
# Update continuous and categorical variables
continuous_variable_names <- setdiff(user_variables, categorical_variables)
}
#Prepare final data
if(partition_method == "kfolds"){
train_proportion <- NULL
}
data <- new_prepared_data(species = species, calibration_data = occ_bg,
formula_grid = formula_grid,
part_data = pd, partition_method = partition_method,
n_partitions = n_partitions,
train_proportion = train_proportion,
data_xy = occ_bg_xy,
continuous_variables = continuous_variable_names,
categorical_variables = categorical_variables,
weights = weights, pca = pca$pca,
algorithm = algorithm)
if (write_file) {
saveRDS(data, paste0(file_name, ".RDS"))
}
return(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.