Nothing
#' GLM calibration with presence-absence data
#'
#' @description
#' Creates candidate models based on distinct parameter settings, evaluates
#' models, and selects the ones that perform the best.
#'
#' @usage
#' calibration_glm(data, dependent, independent, weights = NULL,
#' response_type = "l", formula_mode = "moderate",
#' minvar = 1, maxvar = NULL, user_formulas = NULL,
#' cv_kfolds = 5, partition_index = NULL, seed = 1,
#' n_threshold = 100, selection_criterion = "TSS",
#' exclude_bimodal = FALSE, tolerance = 0.01,
#' out_dir = NULL, parallel = FALSE,
#' n_cores = NULL, verbose = TRUE)
#'
#' @param data data.frame or matrix of data to be used in model calibration.
#' Columns represent dependent and independent variables.
#' @param dependent (character) name of dependent variable.
#' @param independent (character) vector of name(s) of independent variable(s).
#' @param weights (numeric) a vector with the weights for observations.
#' @param response_type (character) a character string that must contain "l",
#' "p", "q" or a combination of them. l = lineal, q = quadratic,
#' p = interaction between two variables. Default = "l".
#' @param formula_mode (character) a character string to indicate the strategy to
#' create the formulas for candidate models. Options are: "light", "moderate",
#' "intensive", or "complex". Default = "moderate". "complex" returns only the
#' most complex formula defined in `response_type`.
#' @param minvar (numeric) minimum number of independent variables in formulas.
#' @param maxvar (numeric) maximum number of independent variables in formulas.
#' @param user_formulas (character) vector with formula(s) to test.
#' Default = NULL.
#' @param partition_index list of indices for cross-validation in k-fold. The
#' default, NULL, uses the function \code{\link{kfold_partition}}.
#' @param cv_kfolds (numeric) number of folds to use for k-fold
#' cross-validation exercises. Default = 5. Ignored if `partition_index`
#' is defined.
#' @param seed (numeric) a seed for k-fold partitioning.
#' @param n_threshold (logical) number of threshold values to produce
#' evaluation metrics. Default = 100.
#' @param selection_criterion (character) criterion used to select best models,
#' options are "TSS" and "ESS". Default = "TSS".
#' @param exclude_bimodal (logical) whether to filter out models with one or
#' more variables presenting concave responses. Default = FALSE.
#' @param tolerance (numeric) value to modify the limit value of the metric
#' used to filter models during model selection if none of the models meet
#' initial considerations. Default = 0.01
#' @param out_dir (character) output directory name to save the main calibration
#' results. Default = NULL.
#' @param parallel (logical) whether to run on parallel or sequential.
#' Default = FALSE.
#' @param n_cores (numeric) number of cores to use. Default = number of free
#' processors - 1.
#' @param verbose (logical) whether to print messages and show progress bar.
#' Default = TRUE
#'
#' @return
#' An object of the class enmpa_calibration containing: selected models,
#' a summary of statistics for all models, results obtained in cross-validation
#' for all models, original data used, weights, and data-partition indices used.
#'
#' @details
#' Model evaluation is done considering the ability to predict presences and
#' absences,as well as model fitting and complexity. Model selection consists
#' of three steps: 1) a first filter to keep the models with ROC AUC >= 0.5
#' (statistically significant models), 2) a second filter to maintain only
#' models that meet the `selection_criterion` ("TSS": TSS >= 0.4; or "ESS":
#' maximum Accuracy - `tolerance`), and 3) from those, pick the ones with
#' delta AIC <= 2.
#'
#'
#' `formula_mode` options determine what strategy to iterate the predictors
#' defined in \code{type} for creating models:
#' - **light**.-- returns simple iterations of complex formulas.
#' - **moderate**.-- returns a comprehensive number of iterations.
#' - **intensive**.-- returns all possible combination. Very time-consuming for
#' 6 or more independent variables.
#' - **complex**.-- returns only the most complex formula.
#'
#' @examples
#' # Load species occurrences and environmental data.
#' data("enm_data", package = "enmpa")
#' head(enm_data)
#'
#' # Calibration using linear (l), quadratic (q), products(p) responses.
#' cal_res <- calibration_glm(data = enm_data, dependent = "Sp",
#' independent = c("bio_1", "bio_12"),
#' response_type = "lpq", formula_mode = "moderate",
#' selection_criterion = "TSS", cv_kfolds = 3,
#' exclude_bimodal = TRUE, verbose = FALSE)
#' print(cal_res)
#' summary(cal_res)
#'
#' head(cal_res$calibration_results)
#' head(cal_res$summary)
#' head(cal_res$selected)
#' head(cal_res$data)
#'
#' @export
#'
#' @importFrom utils txtProgressBar write.table
#' @importFrom stats aggregate sd
#' @importFrom snow makeSOCKcluster stopCluster
#' @importFrom doSNOW registerDoSNOW
#' @importFrom parallel clusterExport
#' @importFrom foreach foreach %dopar%
calibration_glm <- function(data, dependent, independent, weights = NULL,
response_type = "l", formula_mode = "moderate",
minvar=1, maxvar = NULL, user_formulas = NULL,
cv_kfolds = 5, partition_index = NULL, seed = 1,
n_threshold = 100, selection_criterion = "TSS",
exclude_bimodal = FALSE, tolerance = 0.01,
out_dir = NULL, parallel = FALSE,
n_cores = NULL, verbose = TRUE) {
# initial tests
if (missing(data) | missing(dependent) | missing(dependent)) {
stop("Arguments 'data', 'dependent', and 'independent' must be defined.")
}
if (is.null(user_formulas) & missing(dependent)) {
stop("Argument 'respuesta' must be defined if 'user_formulas' in NULL.")
}
## 1. Data partitioning: k-Fold Cross-Validation
if (is.null(partition_index)){
k <- cv_kfolds
data_partition <- kfold_partition(data, dependent = dependent, k = k,
seed = seed)
} else {
k <- length(partition_index)
data_partition <- partition_index
}
## 2. Formula combination
if (is.null(user_formulas)) {
if (verbose == TRUE) {
message("\nEstimating formulas combinations for evaluation.")
}
user_formulas <- get_formulas(dependent = dependent,
independent = independent,
type = response_type, mode = formula_mode,
minvar=minvar, maxvar = maxvar)
} else {
if (verbose == TRUE) {
message("\nUsing user-defined formulas.")
}
}
if (verbose == TRUE) {
message("Evaluating a total of ", length(user_formulas), " models.\n")
}
# 3. MAIN FUNCTION ___________________________________________________________
# number of models to test
iterations <- length(user_formulas)
if (parallel == FALSE) {
## SEQUENTIAL RUNNING
if (verbose == TRUE) {
message("Running in Sequential.")
}
# time start
start <- Sys.time()
## progress bar
if (verbose == TRUE) {
pb <- utils::txtProgressBar(min = 1, max = iterations, style = 3)
}
glm_res <- data.frame() # empty df to store results for each iteration.
for (i in 1:iterations) {
if (verbose == TRUE) {
utils::setTxtProgressBar(pb, i)
}
res <- model_validation(formula = user_formulas[i],
data = data, weights = weights, cv = TRUE,
partition_index = data_partition,
n_threshold = n_threshold,
keep_coefficients = exclude_bimodal)
# Concatenate by rows each loop iteration
glm_res <- rbind(glm_res, res)
}
# Final time
if (verbose == TRUE) {
format_time(Sys.time() - start)
}
} else {
## PARALLEL RUNNING
if (is.null(n_cores)) {
n_cores <- parallel::detectCores() - 1
}
if (verbose == TRUE) {
message(paste0("Running in Parallel using ", n_cores, " threads."))
}
# time start
start <- Sys.time()
## progress bar
if (verbose == TRUE) {
pb <- utils::txtProgressBar(min = 1, max = iterations, style = 3)
progress <- function(n) {
utils::setTxtProgressBar(pb, n)
}
opts <- list(progress = progress)
} else {
opts <- NULL
}
## Make cluster
cl <- snow::makeSOCKcluster(n_cores)
# export local function
#parallel::clusterExport(cl, c("optimize_metrics", "model_validation"))
doSNOW::registerDoSNOW(cl)
glm_res <- foreach::foreach(
i = 1:iterations, .combine = "rbind", .inorder = FALSE,
.options.snow = opts
) %dopar% {
res <- model_validation(formula = user_formulas[i],
data = data, weights = weights, cv = TRUE,
partition_index = data_partition,
n_threshold = n_threshold,
keep_coefficients = exclude_bimodal)
return(res)
}
# Closing cluster
snow::stopCluster(cl)
# Final time
if (verbose == TRUE) {
format_time(Sys.time() - start)
}
}
# 4. data output rearrangement
if (verbose == TRUE) {
message("\nPreparing results...\n") ### Final steps
}
# Summary stats: mean and SD calculation for each model based on folds
stats <- evaluation_stats(evaluation_results = glm_res,
bimodal_toexclude = exclude_bimodal)
# selected models
sel <- model_selection(evaluation_stats = stats,
criterion = selection_criterion,
exclude_bimodal = exclude_bimodal,
tolerance = 0.01)
# add data_partition in the input data.
aux_f <- lapply(
names(data_partition),
function(x){data.frame(obs = data_partition[[x]],kfold_ID = x)}
)
folds <- do.call(rbind, aux_f)
folds <- folds[order(folds$obs),]
data_final <- data.frame(data, kfold_ID = folds$kfold_ID)
# Add ModelID tag
fx <- user_formulas
mid <- data.frame(ModelID = paste0("ModelID_", 1:length(fx)), Formulas = fx)
glm_res <- merge(glm_res, mid, by = "Formulas", sort = F)
glm_res <- glm_res[, c("ModelID", names(glm_res)[names(glm_res) != "ModelID"])]
stats <- merge(stats, mid, by = "Formulas", sort = F)
stats <- stats[, c("ModelID", names(stats)[names(stats) != "ModelID"])]
sel <- merge(sel, mid, by = "Formulas", sort = F)
sel <- sel[, c("ModelID", names(sel)[names(sel) != "ModelID"])]
# Final output
output <- new_enmpa_calibration(
selected = sel,
summary = stats,
calibration_results = glm_res,
data = data_final,
weights = weights,
partitioned_data = data_partition)
# Save calibration tables
if (!is.null(out_dir) && is.character(out_dir)){
save_cal(x = output, out_dir = out_dir)
}
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.