#' Select models that perform the best among candidates
#'
#' @description
#' This function selects the best models according to user-defined criteria,
#' evaluating statistical significance (partial ROC), predictive ability
#' (omission rates), and model complexity (AIC).
#'
#' @usage
#' select_models(calibration_results = NULL, candidate_models = NULL, data = NULL,
#' algorithm = NULL, compute_proc = FALSE,
#' addsamplestobackground = TRUE, weights = NULL,
#' remove_concave = FALSE, omission_rate = NULL,
#' allow_tolerance = TRUE, tolerance = 0.01,
#' significance = 0.05, delta_aic = 2, parallel = FALSE,
#' ncores = NULL, progress_bar = FALSE, verbose = TRUE)
#'
#' @param calibration_results an object of class `calibration_results` returned
#' by the [calibration()] function. Default is NULL.
#' @param candidate_models (data.frame) a summary of the evaluation metrics for each
#' candidate model. Required only if `calibration_results` is NULL. In the
#' output of the [calibration()], this data.frame is located in
#' `$calibration_results$Summary`. Default is NULL.
#' @param data an object of class `prepared_data` returned by the
#' [prepare_data()] function. Required only if `calibration_results`
#' is NULL and `compute_proc` is TRUE.
#' @param algorithm (character) model algorithm, either "glm" or "maxnet". The
#' default, NULL, uses the one defined as part of `calibration_results`, or
#' `data`. If those arguments are not used, `algorithm` must be defined.
#' @param compute_proc (logical) whether to compute partial ROC tests for the
#' selected models. This is required when partial ROC is not calculated for all
#' candidate models during calibration. Default is FALSE.
#' @param addsamplestobackground (logical) whether to add to the background any
#' presence sample that is not already there. Required only if `compute_proc` is
#' TRUE and `calibration_results` is NULL. Default is TRUE.
#' @param weights (numeric) a numeric vector specifying weights for the
#' occurrence records. Required only if `compute_proc` is TRUE and
#' `calibration_results` is NULL. Default is NULL.
#' @param remove_concave (logical) whether to remove candidate models presenting
#' concave curves. Default is FALSE.
#' @param omission_rate (numeric) the maximum omission rate a candidate model
#' can have to be considered as a potentially selected model. The default, NULL,
#' uses the value provided as part of `calibration_results`. For purposes of
#' selection in existing results of evaluation, this value must match one of
#' the values used in omission tests, and must be manually defined.
#' @param allow_tolerance (logical) whether to allow selection of models with
#' minimum values of omission rates even if their omission rate surpasses the
#' `omission_rate`. This is only applicable if all candidate models have
#' omission rates higher than the `omission_rate`. Default is TRUE.
#' @param tolerance (numeric) The value added to the minimum omission rate if it
#' exceeds the `omission_rate`. If `allow_tolerance = TRUE`, selected models
#' will have an omission rate equal to or less than the minimum rate plus this
#' tolerance. Default is 0.01.
#' @param significance (numeric) the significance level to select models
#' based on the partial ROC (pROC). Default is 0.05. See Details.
#' @param delta_aic (numeric) the value of delta AIC used as a threshold to
#' select models. Default is 2.
#' @param parallel (logical) whether to calculate the PROC of the candidate
#' models in parallel. Default is FALSE.
#' @param ncores (numeric) number of cores to use for parallel processing.
#' Default is NULL and uses available cores - 1. This is only applicable if
#' `parallel = TRUE`.
#' @param progress_bar (logical) whether to display a progress bar during
#' processing. Default is FALSE.
#' @param verbose (logical) whether to display messages during processing.
#' Default is TRUE.
#'
#' @details
#' Partial ROC is calculated following Peterson et al.
#' (2008).
#'
#' @return
#' If calibration_results is provided, it returns a new calibration_results with
#' the new selected models and summary. If calibration_results is NULL, it
#' returns a list containing the following elements:
#' - selected_models: data frame with the ID and the summary of evaluation
#' metrics for the selected models.
#' - summary: A list containing the delta AIC values for model selection, and
#' the ID values of models that failed to fit, had concave curves,
#' non-significant pROC values, omission rates above the threshold, delta AIC
#' values above the threshold, and the selected models.
#'
#' @export
#'
#' @examples
#' # Import example of calibration results (output of calibration function)
#' ## GLM
#' data(calib_results_glm, package = "kuenm2")
#'
#' #Select new best models based on another value of omission rate
#' new_best_model <- select_models(calibration_results = calib_results_glm,
#' algorithm = "glm", compute_proc = TRUE,
#' omission_rate = 10) # Omission error of 10
#'
#' # Compare with best models selected previously
#' calib_results_glm$summary$Selected # Model 86 selected
#' new_best_model$summary$Selected # Models 64, 73 and 86 selected
select_models <- function(calibration_results = NULL,
                          candidate_models = NULL,
                          data = NULL,
                          algorithm = NULL,
                          compute_proc = FALSE,
                          addsamplestobackground = TRUE,
                          weights = NULL,
                          remove_concave = FALSE,
                          omission_rate = NULL,
                          allow_tolerance = TRUE,
                          tolerance = 0.01,
                          significance = 0.05,
                          delta_aic = 2,
                          parallel = FALSE,
                          ncores = NULL,
                          progress_bar = FALSE,
                          verbose = TRUE) {
  # Check input data ----------------------------------------------------------
  if (is.null(calibration_results) && is.null(candidate_models)) {
    stop("Either 'calibration_results' or 'candidate_models' must be defined.")
  }
  if (is.null(calibration_results) && is.null(data) && compute_proc) {
    stop("If 'compute_proc' = TRUE, 'calibration_results' or 'data' must be defined.")
  }
  if (!is.null(calibration_results)) {
    # Take candidates, algorithm, and (if not given) the omission rate from the
    # calibration object so they stay consistent with how models were fitted
    candidate_models <- calibration_results$calibration_results$Summary
    algorithm <- calibration_results$algorithm
    if (is.null(omission_rate)) {
      omission_rate <- calibration_results$omission_rate
    }
  } else {
    if (is.null(omission_rate)) {
      stop("Argument 'omission_rate' must be defined.")
    }
    if (!is.null(data)) {
      algorithm <- data$algorithm
    }
    if (is.null(algorithm)) {
      stop("Argument 'algorithm' must be defined.")
    }
  }
  # Update weights and addsamplestobackground from calibration_results, if necessary
  if (!is.null(calibration_results)) {
    weights <- calibration_results$weights
    addsamplestobackground <- calibration_results$addsamplestobackground
  }
  # Adjust AIC column based on model type
  if (algorithm == "maxnet") {
    AIC_option <- "AICc"
    candidate_models$AIC_nk <- NULL  # remove the unused AIC column for maxnet
  } else if (algorithm == "glm") {
    AIC_option <- "AIC"  # glm models use a single AIC column
  } else {
    stop("Unsupported algorithm. Please use 'maxnet' or 'glm'.")
  }
  # Column names depend on the omission-rate threshold used during evaluation
  om_thr <- paste0("Omission_rate_at_", omission_rate, ".mean")
  proc_pval <- paste0("pval_pROC_at_", omission_rate, ".mean")
  # pROC values must either be present in the input or computed here
  # (a missing column yields NULL, and all(is.na(NULL)) is TRUE, so an absent
  # pROC column is treated the same as an all-NA one)
  if (all(is.na(candidate_models[[proc_pval]]))) {
    if (!compute_proc) {
      stop("pROC values were not provided as part of the input. Set 'compute_proc' to TRUE.")
    }
  }
  # Log the number of models being filtered
  if (verbose) {
    message("Selecting best among ", nrow(candidate_models), " models.")
  }

  #### Filtering when pROC must be computed for selected models ####
  # Iterative scheme: filter -> select -> compute pROC for selected models ->
  # if any selected model has a non-significant pROC, drop it and repeat.
  if (compute_proc) {
    if (verbose) {
      message("Calculating pROC...")
    }
    any_bad <- TRUE     # initiates looping
    id_to_remove <- 0   # dummy ID; removes nothing on the first pass
    insig_proc <- NULL  # accumulates IDs removed for non-significant pROC
    while (any_bad) {
      # Remove models flagged as bad in the previous iteration
      candidate_models <- candidate_models[!(candidate_models$ID %in% id_to_remove), ]
      if (verbose) {
        message("\nFiltering ", nrow(candidate_models), " models.")
      }
      # Remove models that failed to fit (Is_concave is NA)
      na_models <- candidate_models[is.na(candidate_models$Is_concave), "ID"]
      if (verbose) {
        message("Removing ", length(na_models), " model(s) because they failed to fit.")
      }
      candidate_models <- candidate_models[!is.na(candidate_models$Is_concave), ]
      # Remove models with concave curves if remove_concave is TRUE
      if (remove_concave) {
        concave_models <- candidate_models[candidate_models$Is_concave, "ID"]
        if (verbose) {
          message("Removing ", length(concave_models),
                  " model(s) with concave curves.")
        }
        if (length(concave_models) > 0) {
          candidate_models <- candidate_models[!candidate_models$Is_concave, ]
        }
      } else {
        concave_models <- integer(0)
      }
      # Subset models by omission rate
      ona <- !is.na(candidate_models[, om_thr])
      # FIX: was `length(ona) == 0`, which only fired with zero rows; the
      # intended check is "no model has a usable omission rate"
      if (!any(ona)) {
        stop("All models failed to fit!")
      }
      thigh <- candidate_models[, om_thr] > (omission_rate / 100)
      high_omr <- candidate_models[thigh & ona, "ID"]
      if (length(high_omr) > 0) {
        cand_om <- candidate_models[!thigh & ona, ]
      } else {
        cand_om <- candidate_models[ona, ]
      }
      if (verbose) {
        message(nrow(cand_om), " model(s) were selected with omission rate below ",
                omission_rate, "%.")
      }
      # Stop if no model meets the omission threshold and tolerance is not allowed
      if (nrow(cand_om) == 0 && !allow_tolerance) {
        stop("There are no models with values of omission rate below ",
             omission_rate, "%. Try with 'allow_tolerance' = TRUE.")
      }
      # Apply tolerance if no model meets the omission threshold
      if (nrow(cand_om) == 0 && allow_tolerance) {
        min_thr <- min(candidate_models[, om_thr], na.rm = TRUE)
        aom <- candidate_models[, om_thr] <= (min_thr + tolerance)
        cand_om <- candidate_models[aom & ona, ]
        high_omr <- candidate_models[!aom & ona, "ID"]
        if (verbose) {
          # FIX: the reported threshold is (min_thr + tolerance) * 100; the old
          # message added tolerance (a proportion) to a percentage
          message("Minimum value of omission rate (", round(min_thr * 100, 1),
                  "%) is above the selected threshold (", omission_rate,
                  "%).\nApplying tolerance and selecting ", nrow(cand_om),
                  " models with omission rate <",
                  round((min_thr + tolerance) * 100, 2), "%.")
        }
      }
      # Calculate delta AIC and select models based on delta AIC
      cand_om$dAIC <- cand_om[, AIC_option] - min(cand_om[, AIC_option],
                                                  na.rm = TRUE)
      high_aic <- cand_om[cand_om$dAIC > delta_aic | is.na(cand_om$dAIC), "ID"]
      cand_final <- cand_om[cand_om$dAIC <= delta_aic & !is.na(cand_om$dAIC), ]
      if (verbose) {
        message("Selecting ", nrow(cand_final),
                " final model(s) with delta AIC <", delta_aic, ".")
      }
      # Identify selected models whose pROC still needs to be computed
      to_calculate <- cand_final$ID[is.na(cand_final[, proc_pval])]
      if (length(to_calculate) > 0) {
        # Split models that already have pROC from those that need it
        cand_final_with_proc <- cand_final[!(cand_final$ID %in% to_calculate), ]
        if (nrow(cand_final_with_proc) > 0) {
          cand_final <- cand_final[cand_final$ID %in% to_calculate, ]
        }
        if (verbose) {
          message("Validating pROC of selected models...")
        }
        if (is.null(data)) {
          # Rebuild a minimal prepared_data object from the calibration results
          # (first 10 elements hold the prepared-data slots)
          data <- calibration_results[1:10]
          class(data) <- "prepared_data"
        }
        proc_values <- partial_roc(formula_grid = cand_final, data = data,
                                   omission_rate = omission_rate,
                                   addsamplestobackground = addsamplestobackground,
                                   weights = weights,
                                   algorithm = algorithm,
                                   parallel = parallel,
                                   ncores = ncores,
                                   progress_bar = progress_bar)
        # Copy newly computed pROC columns into the selected models, matching
        # rows by ID (last column of proc_values is not a result column)
        cand_final_updated <- cand_final
        ncpr <- ncol(proc_values)
        rrep <- match(proc_values$ID, cand_final_updated$ID)
        cand_final_updated[rrep, colnames(proc_values[, -ncpr])] <-
          proc_values[, -ncpr]
        # Re-append the models that already had pROC values
        if (nrow(cand_final_with_proc) > 0) {
          cand_final_updated <- rbind(cand_final_with_proc, cand_final_updated)
        }
        # Flag models with non-significant (or missing) pROC p-values
        p_value_omr <- cand_final_updated[, proc_pval]
        bad_proc <- p_value_omr > significance | is.na(p_value_omr)
        any_bad <- any(bad_proc)
        id_to_remove <- cand_final_updated$ID[bad_proc]
      } else {
        # All selected models already have pROC values: selection is final
        # FIX: cand_final_updated was previously left unassigned on this path,
        # causing an "object not found" error at the final checks
        cand_final_updated <- cand_final
        any_bad <- FALSE
      }
      # If there are non-significant models, record the computed pROC values so
      # they are not recomputed, and loop again
      if (any_bad) {
        if (verbose) {
          message("\nSome of the selected models have non-significant pROC values. Re-selecting models...\n")
        }
        insig_proc <- c(insig_proc, id_to_remove)
        # FIX: rows must be located with match(); raw IDs are not positions
        # once candidate_models has been filtered
        rcand <- match(proc_values$ID, candidate_models$ID)
        candidate_models[rcand, colnames(proc_values[, -ncpr])] <-
          proc_values[, -ncpr]
      }
    } # End of while (any_bad)
  } # End of compute_proc

  #### Filtering when pROC values are already available ####
  if (!compute_proc) {
    # Remove models that failed to fit (Is_concave is NA)
    na_models <- candidate_models[is.na(candidate_models$Is_concave), "ID"]
    if (verbose) {
      message("\nRemoving ", length(na_models), " model(s) that failed to fit.")
    }
    candidate_models <- candidate_models[!is.na(candidate_models$Is_concave), ]
    # Remove models with concave curves if remove_concave is TRUE
    if (remove_concave) {
      concave_models <- candidate_models[candidate_models$Is_concave, "ID"]
      if (verbose) {
        message("Removing ", length(concave_models),
                " model(s) with concave responses.")
      }
      candidate_models <- candidate_models[!candidate_models$Is_concave, ]
    } else {
      # FIX: was 0; integer(0) is consistent with the compute_proc branch
      concave_models <- integer(0)
    }
    # Keep only models with significant pROC
    insig_proc <- candidate_models[candidate_models[[proc_pval]] >= significance |
                                     is.na(candidate_models[[proc_pval]]), "ID"]
    if (verbose) {
      message("Removing ", length(insig_proc),
              " model(s) with non-significant pROC values.")
    }
    candidate_models <- candidate_models[candidate_models[[proc_pval]] < significance &
                                           !is.na(candidate_models[[proc_pval]]), ]
    # Subset models by omission rate (threshold given as percentage)
    high_omr <- candidate_models[candidate_models[, om_thr] > omission_rate / 100, "ID"]
    cand_om <- candidate_models[candidate_models[, om_thr] <= omission_rate / 100, ]
    if (verbose) {
      message(nrow(cand_om), " models passed the ", omission_rate,
              "% omission criterion.")
    }
    # Stop if no model meets the omission threshold and tolerance is not allowed
    if (nrow(cand_om) == 0 && !allow_tolerance) {
      stop("There were no models with omissions below ", omission_rate, "%.",
           "Try using the arguments 'allow_tolerance' and 'tolerance'.")
    }
    # Apply tolerance if no model meets the omission threshold
    if (nrow(cand_om) == 0 && allow_tolerance) {
      min_thr <- min(candidate_models[, om_thr])
      cand_om <- subset(candidate_models, candidate_models[, om_thr] <= min_thr + tolerance)
      high_omr <- candidate_models[candidate_models[, om_thr] > min_thr + tolerance, "ID"]
      if (verbose) {
        message("Minimum value of omission in models (", round(min_thr * 100, 1),
                "%) > omission criterion (", omission_rate, "%).\n",
                "Applying tolerance: ", nrow(cand_om), " models with omission <",
                round((min_thr + tolerance) * 100, 1), "% were found.")
      }
    }
    # Calculate delta AIC and select models based on delta AIC
    cand_om$dAIC <- cand_om[, AIC_option] - min(cand_om[, AIC_option])
    high_aic <- cand_om[cand_om$dAIC > delta_aic, "ID"]
    cand_final_updated <- cand_om[cand_om$dAIC <= delta_aic, ]
    if (verbose) {
      message("Selecting ", nrow(cand_final_updated), " final model(s) with delta AIC <",
              delta_aic, ".")
    }
  }

  # Final message and sanity check
  if (nrow(cand_final_updated) > 0 && verbose) {
    message("\nAll selected models have significant pROC values.")
  }
  if (nrow(cand_final_updated) == 0) {
    stop("It is impossible to select models with significant pROC values.")
  }
  # Assemble final results: selected models plus a summary of what was removed
  # at each filtering step
  sel_res <- list(selected_models = cand_final_updated,
                  summary = list(delta_AIC = delta_aic,
                                 omission_rate_thr = omission_rate,
                                 Errors = na_models,
                                 Remove_concave = remove_concave,
                                 Concave_removed = concave_models,
                                 Non_sig_pROC = insig_proc,
                                 High_omission_rate = high_omr,
                                 High_AIC = high_aic,
                                 Selected = cand_final_updated$ID))
  # Update and return calibration_results if it was provided
  if (!is.null(calibration_results)) {
    calibration_results$selected_models <- sel_res$selected_models
    calibration_results$summary <- sel_res$summary
    calibration_results$omission_rate <- omission_rate
    return(calibration_results)
  } else {
    return(sel_res)
  }
}