R/run_ml_models.R
In mbg: Model-Based Geostatistics

Documented in run_regression_submodels

## #######################################################################################
##
## RUN REGRESSION SUB-MODELS
##
## PURPOSE: Run regression models using the `caret` package for use in stacking
##
## #######################################################################################

#' Run regression sub-models
#'
#' @description Wrapper to run many regression sub-models using the caret package
#'
#' @param input_data A data.frame with at least the following columns:
#'   - 'indicator': number of "hits' per site, e.g. tested positive for malaria
#'   - 'samplesize': total population sampled at the site
#'   - 'x': x position, often longitude
#'   - 'y': y position, often latitude
#' @param id_raster [terra::SpatRaster] with non-NA pixels delineating the extent of the
#'   study area
#' @param covariates (list) Named list of all covariate effects included in the model,
#'   typically generated by [load_covariates()].
#' @param cv_settings Named list of cross-validation settings, passed to
#'   [caret::trainControl].
#' @param model_settings Named list where the name of each header corresponds to a model
#'   run in [caret::train], and the arguments correspond to the model-specific settings
#'   for that model type.
#' @param family (`character(1)`, default 'binomial') Statistical model family being
#'   evaluated. For Gaussian models, this function trains against the 'mean' field; for
#'   all other families, this function trains against the ratio of
#'  'indicator':'samplesize'.
#' @param clamping (`logical(1)`, default TRUE) Should the predictions of individual ML
#'   models be limited to the range observed in the data?
#' @param use_admin_bounds (`logical(1)`, default FALSE) Use one-hot encoding of
#'   administrative boundaries as a candidate feature?
#' @param admin_bounds ([sf][sf::sf], default NULL) Administrative boundaries to use.
#'   Only considered if `use_admin_bounds` is TRUE.
#' @param admin_bounds_id (`character`, default 'polygon_id') Field to use for
#'   administrative boundary one-hot encoding. Only considered if `use_admin_bounds` is
#'   TRUE.
#' @param prediction_range (`numeric(2)`, default c(-Inf, Inf)) Prediction limits for the
#'   outcome range. Used when the predictions are in a limited range, for example, 0 to 1
#'   or -1 to 1.
#' @param verbose (`logical(1)`, default TRUE) Log progress for ML model fitting?
#'
#' @return List with two items:
#'   - "models": A list containing summary objects for each regression model
#'   - "predictions": Model predictions covering the entire id_raster
#'
#' @concept model_fit
#'
#' @importFrom caret trainControl train
#' @importFrom glue glue
#' @importFrom stats model.matrix predict
#' @importFrom terra extract values rasterize
#' @import data.table
#' @export
run_regression_submodels <- function(
  input_data, id_raster, covariates, cv_settings, model_settings, family = 'binomial',
  clamping = TRUE, use_admin_bounds = FALSE, admin_bounds = NULL,
  admin_bounds_id = 'polygon_id', prediction_range = c(-Inf, Inf), verbose = TRUE
){
  if(verbose) logging_start_timer(glue::glue(
    "Fitting {length(model_settings)} regression models"
  ))

  # Prepare training data and eventual prediction space
  xy_fields <- c('x', 'y')
  id_raster_table <- data.table::as.data.table(id_raster, xy = TRUE) |> na.omit()
  colnames(id_raster_table)[3] <- 'pixel_id'
  cov_names <- names(covariates)
  xy_train <- as.matrix(input_data[, xy_fields, with = F])
  xy_pred <- as.matrix(id_raster_table[, xy_fields, with = F])
  for(cov_name in setdiff(cov_names, 'intercept')){
    input_data[[cov_name]] <- terra::extract(
      x = covariates[[cov_name]],
      y = xy_train
    )[, 1]
    id_raster_table[[cov_name]] <- terra::extract(
      x = covariates[[cov_name]],
      y = xy_pred
    )[, 1]
  }

  # Optionally add administrative identifiers as features
  if(use_admin_bounds){
    if(is.null(admin_bounds)) stop("Administrative polygons not included")
    admin_bounds$ADM_ <- factor(make.names(admin_bounds[[admin_bounds_id]]))
    admin_raster <- terra::vect(admin_bounds)[, c('ADM_')] |>
      terra::rasterize(y = id_raster, field = 'ADM_')
    # Temporarily set na.action to 'na.pass' to avoid dropping NAs in model.matrix
    old_na_action <- getOption('na.action')
    on.exit(options(na.action = old_na_action))
    options(na.action = 'na.pass')
    bounds_training <- stats::model.matrix(
      ~ 0 + ADM_,
      data = terra::extract(x = admin_raster, y = xy_train)
    ) |> as.data.frame()
    bounds_prediction <- stats::model.matrix(
      ~ 0 + ADM_,
      data = terra::extract(x = admin_raster, y = xy_pred)
    ) |> as.data.frame()
    # Reset na.action
    options(na.action = old_na_action)
    for(adm_unit in colnames(bounds_training)){
      if(sum(bounds_training[[adm_unit]], na.rm = TRUE) < 3){
        bounds_training[[adm_unit]] <- NULL
        bounds_prediction[[adm_unit]] <- NULL
      }
    }
    # Add to the training and prediction data.frames
    input_data <- cbind(input_data, bounds_training)
    id_raster_table <- cbind(id_raster_table, bounds_prediction)
    cov_names <- c(cov_names, colnames(bounds_training))
  }

  # Subset only to data outcome (indicator / samplesize), covariates, and x/y
  cov_cols <- c(setdiff(cov_names, 'intercept'), 'x', 'y')
  if((family == 'gaussian') & ('mean' %in% colnames(input_data))){
    input_data$data_rate <- input_data$mean
  } else {
    input_data$data_rate <- input_data$indicator / input_data$samplesize
  }
  training_data <- copy(na.omit(input_data[, c('data_rate', cov_cols), with = F ]))
  prediction_grid <- copy(na.omit(id_raster_table))

  # Set internal out-of-sample tuning
  oos_tune <- do.call(caret::trainControl, args = cv_settings)

  # Make a fully NA raster to add predictions to
  template_raster <- id_raster
  terra::values(template_raster) <- NA_real_

  min_observed <- min(input_data$data_rate, na.rm = TRUE)
  max_observed <- max(input_data$data_rate, na.rm = TRUE)

  # Run each stacker, then predict to a raster
  model_names <- names(model_settings)
  models_list <- preds_list <- vector('list', length = length(model_names))
  names(models_list) <- names(preds_list) <- model_names
  for(model_name in model_names){
    if(verbose) logging_start_timer(paste("Candidate model: ", model_name))
    # Fit caret model
    model_args <- c(
      list(data_rate ~ ., data = training_data, method = model_name, trControl = oos_tune),
      model_settings[[model_name]]
    )
    models_list[[model_name]] <- do.call(caret::train, args = model_args)
    # Predict out to raster
    prediction_grid$new_vals <- predict(
      models_list[[model_name]],
      newdata = prediction_grid
    )
    pred_raster <- template_raster
    terra::values(pred_raster)[prediction_grid$pixel_id] <- prediction_grid$new_vals
    # Restrict to to plausible range
    pred_raster[pred_raster < min(prediction_range)] <- min(prediction_range)
    pred_raster[pred_raster > max(prediction_range)] <- max(prediction_range)
    # If clamping, restrict to observed range
    if(clamping){
      pred_raster[pred_raster < min_observed] <- min_observed
      pred_raster[pred_raster > max_observed] <- max_observed
    }
    preds_list[[model_name]] <- pred_raster
    if(verbose) logging_stop_timer()
  }
  # End logging for candidate model fitting
  if(verbose) logging_stop_timer()
  return(list(models = models_list, predictions = preds_list))
}