Nothing
## #######################################################################################
##
## RUN REGRESSION SUB-MODELS
##
## PURPOSE: Run regression models using the `caret` package for use in stacking
##
## #######################################################################################
#' Run regression sub-models
#'
#' @description Wrapper to run many regression sub-models using the caret package
#'
#' @param input_data A data.frame with at least the following columns:
#' - 'indicator': number of "hits' per site, e.g. tested positive for malaria
#' - 'samplesize': total population sampled at the site
#' - 'x': x position, often longitude
#' - 'y': y position, often latitude
#' @param id_raster [terra::SpatRaster] with non-NA pixels delineating the extent of the
#' study area
#' @param covariates (list) Named list of all covariate effects included in the model,
#' typically generated by [load_covariates()].
#' @param cv_settings Named list of cross-validation settings, passed to
#' [caret::trainControl].
#' @param model_settings Named list where the name of each header corresponds to a model
#' run in [caret::train], and the arguments correspond to the model-specific settings
#' for that model type.
#' @param family (`character(1)`, default 'binomial') Statistical model family being
#' evaluated. For Gaussian models, this function trains against the 'mean' field; for
#' all other families, this function trains against the ratio of
#' 'indicator':'samplesize'.
#' @param clamping (`logical(1)`, default TRUE) Should the predictions of individual ML
#' models be limited to the range observed in the data?
#' @param use_admin_bounds (`logical(1)`, default FALSE) Use one-hot encoding of
#' administrative boundaries as a candidate feature?
#' @param admin_bounds ([sf][sf::sf], default NULL) Administrative boundaries to use.
#' Only considered if `use_admin_bounds` is TRUE.
#' @param admin_bounds_id (`character`, default 'polygon_id') Field to use for
#' administrative boundary one-hot encoding. Only considered if `use_admin_bounds` is
#' TRUE.
#' @param prediction_range (`numeric(2)`, default c(-Inf, Inf)) Prediction limits for the
#' outcome range. Used when the predictions are in a limited range, for example, 0 to 1
#' or -1 to 1.
#' @param verbose (`logical(1)`, default TRUE) Log progress for ML model fitting?
#'
#' @return List with two items:
#' - "models": A list containing summary objects for each regression model
#' - "predictions": Model predictions covering the entire id_raster
#'
#' @concept model_fit
#'
#' @importFrom caret trainControl train
#' @importFrom glue glue
#' @importFrom stats model.matrix predict
#' @importFrom terra extract values rasterize
#' @import data.table
#' @export
run_regression_submodels <- function(
input_data, id_raster, covariates, cv_settings, model_settings, family = 'binomial',
clamping = TRUE, use_admin_bounds = FALSE, admin_bounds = NULL,
admin_bounds_id = 'polygon_id', prediction_range = c(-Inf, Inf), verbose = TRUE
){
if(verbose) logging_start_timer(glue::glue(
"Fitting {length(model_settings)} regression models"
))
# Prepare training data and eventual prediction space
xy_fields <- c('x', 'y')
id_raster_table <- data.table::as.data.table(id_raster, xy = TRUE) |> na.omit()
colnames(id_raster_table)[3] <- 'pixel_id'
cov_names <- names(covariates)
xy_train <- as.matrix(input_data[, xy_fields, with = F])
xy_pred <- as.matrix(id_raster_table[, xy_fields, with = F])
for(cov_name in setdiff(cov_names, 'intercept')){
input_data[[cov_name]] <- terra::extract(
x = covariates[[cov_name]],
y = xy_train
)[, 1]
id_raster_table[[cov_name]] <- terra::extract(
x = covariates[[cov_name]],
y = xy_pred
)[, 1]
}
# Optionally add administrative identifiers as features
if(use_admin_bounds){
if(is.null(admin_bounds)) stop("Administrative polygons not included")
admin_bounds$ADM_ <- factor(make.names(admin_bounds[[admin_bounds_id]]))
admin_raster <- terra::vect(admin_bounds)[, c('ADM_')] |>
terra::rasterize(y = id_raster, field = 'ADM_')
# Temporarily set na.action to 'na.pass' to avoid dropping NAs in model.matrix
old_na_action <- getOption('na.action')
on.exit(options(na.action = old_na_action))
options(na.action = 'na.pass')
bounds_training <- stats::model.matrix(
~ 0 + ADM_,
data = terra::extract(x = admin_raster, y = xy_train)
) |> as.data.frame()
bounds_prediction <- stats::model.matrix(
~ 0 + ADM_,
data = terra::extract(x = admin_raster, y = xy_pred)
) |> as.data.frame()
# Reset na.action
options(na.action = old_na_action)
for(adm_unit in colnames(bounds_training)){
if(sum(bounds_training[[adm_unit]], na.rm = TRUE) < 3){
bounds_training[[adm_unit]] <- NULL
bounds_prediction[[adm_unit]] <- NULL
}
}
# Add to the training and prediction data.frames
input_data <- cbind(input_data, bounds_training)
id_raster_table <- cbind(id_raster_table, bounds_prediction)
cov_names <- c(cov_names, colnames(bounds_training))
}
# Subset only to data outcome (indicator / samplesize), covariates, and x/y
cov_cols <- c(setdiff(cov_names, 'intercept'), 'x', 'y')
if((family == 'gaussian') & ('mean' %in% colnames(input_data))){
input_data$data_rate <- input_data$mean
} else {
input_data$data_rate <- input_data$indicator / input_data$samplesize
}
training_data <- copy(na.omit(input_data[, c('data_rate', cov_cols), with = F ]))
prediction_grid <- copy(na.omit(id_raster_table))
# Set internal out-of-sample tuning
oos_tune <- do.call(caret::trainControl, args = cv_settings)
# Make a fully NA raster to add predictions to
template_raster <- id_raster
terra::values(template_raster) <- NA_real_
min_observed <- min(input_data$data_rate, na.rm = TRUE)
max_observed <- max(input_data$data_rate, na.rm = TRUE)
# Run each stacker, then predict to a raster
model_names <- names(model_settings)
models_list <- preds_list <- vector('list', length = length(model_names))
names(models_list) <- names(preds_list) <- model_names
for(model_name in model_names){
if(verbose) logging_start_timer(paste("Candidate model: ", model_name))
# Fit caret model
model_args <- c(
list(data_rate ~ ., data = training_data, method = model_name, trControl = oos_tune),
model_settings[[model_name]]
)
models_list[[model_name]] <- do.call(caret::train, args = model_args)
# Predict out to raster
prediction_grid$new_vals <- predict(
models_list[[model_name]],
newdata = prediction_grid
)
pred_raster <- template_raster
terra::values(pred_raster)[prediction_grid$pixel_id] <- prediction_grid$new_vals
# Restrict to to plausible range
pred_raster[pred_raster < min(prediction_range)] <- min(prediction_range)
pred_raster[pred_raster > max(prediction_range)] <- max(prediction_range)
# If clamping, restrict to observed range
if(clamping){
pred_raster[pred_raster < min_observed] <- min_observed
pred_raster[pred_raster > max_observed] <- max_observed
}
preds_list[[model_name]] <- pred_raster
if(verbose) logging_stop_timer()
}
# End logging for candidate model fitting
if(verbose) logging_stop_timer()
return(list(models = models_list, predictions = preds_list))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.