R/prepare_inla_data_stack.R
In mbg: Model-Based Geostatistics

Documented in prepare_inla_data_stack

#' Prepare data stack for INLA
#'
#' @details Creates the formatted input data to be used by the INLA model. For more
#'   information about penalized complexity priors, see Daniel Simpson's
#'   paper on the subject: \doi{doi:10.1214/16-STS576}
#'
#' @seealso [mbg::fit_inla_model()] [mbg::MbgModelRunner]
#'
#' @param input_data A data.frame with at least the following columns:
#'   - 'indicator': number of "hits' per site, e.g. tested positive for malaria
#'   - 'samplesize': total population sampled at the site
#'   - 'x': x position, often longitude
#'   - 'y': y position, often latitude
#' @param id_raster [terra::SpatRaster] with non-NA pixels delineating the extent of the
#'   study area
#' @param covariates (list) Named list of all covariate effects included in the model,
#'   typically generated by [load_covariates()]. Only used if `use_covariates` is TRUE.
#' @param use_covariates (`boolean(1)`, default TRUE) Should covariate fixed effects be
#'   included in the model? If TRUE, includes fixed effects for all covariates in the
#'   `covariates` argument. If FALSE, includes only an intercept.
#' @param covariates_sum_to_one (logical, default FALSE) Should the input covariates be
#'   constrained to sum to one? Usually FALSE when raw covariates are passed to the model,
#'   and TRUE if running an ensemble (stacking) model.
#' @param family (`character(1)`, default 'binomial') Statistical family; used to
#'   link stacked covariates with outcomes
#' @param use_spde (`boolean(1)`, default TRUE) Should an SPDE approximation of a Gaussian
#'   process be included in the model?
#' @param spde_range_pc_prior (list) A named list specifying the penalized complexity
#'   prior for the SPDE range. The two named items are "threshold", the test threshold
#'   (set as a proportion of the overall mesh extent), and "prob_below", the prior
#'   probability that the value is BELOW that range threshold. The function automatically
#'   converts "threshold" from a proportion of the overall mesh extent into a distance.
#' @param spde_sigma_pc_prior (list) A named list specifying the penalized complexity
#'   prior for sigma (standard deviation) of the SPDE object. The two named items are
#'   "threshold", the test threshold for the standard deviation, and "prob_above",
#'   the prior probability that sigma will EXCEED that threshold.
#' @param spde_integrate_to_zero (`boolean(1)`, default TRUE) Should the 'volume' under
#'   the SPDE mesh integrate to zero?
#' @param mesh_max_edge (`numeric(2)`, default c(0.2, 5)) Maximum size of the INLA SPDE
#'   mesh inside (1) and outside (2) of the modeled region.
#' @param mesh_cutoff (`numeric(1)`, default 0.04) Minimum size of the INLA mesh, usually
#'   reached in data-dense areas.
#' @param use_nugget (`boolean(1)`, default TRUE) Should a nugget (IID observation-level
#'   error or noise) term be included?
#' @param nugget_pc_prior A named list specifying the penalized complexity prior for the
#'   nugget term. The two named items are "threshold", the test threshold for the nugget
#'   standard deviation, and "prob_above", the prior probability that the standard
#'   deviation will EXCEED that threshold. Only used if `use_nugget` is TRUE
#' @param use_admin_effect (`boolean(1)`, default FALSE) Should an IID random effect by
#'   administrative unit be included?
#' @param admin_boundaries ([sf][sf::sf] object, default NULL) admin boundaries spatial
#'   object. Only used if `use_admin_effect` is TRUE
#' @param admin_pc_prior (list) A named list specifying the penalized complexity prior
#'   for the admin-level IID term. The two named items are "threshold", the test threshold
#'   for the standard deviation of admin-level effects, and "prob_above", the prior
#'   probability that the standard deviation will EXCEED that threshold. Only used if
#'   `use_admin_effect` is TRUE.
#'
#' @return List containing the following items:
#'   - "mesh": The mesh used to approximate the latent Gaussian process
#'   - "spde": The SPDE object that will be used to fit the INLA model
#'   - "inla_data_stack": The data stack to be passed to `INLA::inla()`
#'   - "formula_string": The formula specification to be passed to `INLA::inla()`
#'
#' @concept model_fit
#'
#' @importFrom data.table as.data.table
#' @importFrom sf st_sf st_join st_nearest_feature
#' @importFrom stats qlogis
#' @importFrom terra extract
#' @importFrom glue glue
#' @export
prepare_inla_data_stack <- function(
  input_data, id_raster, covariates,
  use_covariates = TRUE,
  covariates_sum_to_one = FALSE,
  family = 'binomial',
  use_spde = TRUE,
  spde_range_pc_prior = list(threshold = 0.1, prob_below = 0.05),
  spde_sigma_pc_prior = list(threshold = 3, prob_above = 0.05),
  spde_integrate_to_zero = TRUE,
  mesh_max_edge = c(0.2, 5),
  mesh_cutoff = 0.04,
  use_nugget = TRUE,
  nugget_pc_prior = list(threshold = 3, prob_above = 0.05),
  use_admin_effect = FALSE,
  admin_boundaries = NULL,
  admin_pc_prior = list(threshold = 3, prob_above = 0.05)
){
  # Helper variable for extracting XY data
  xy_fields <- c('x','y')
  # Prepare the effects list and observations map list
  effects_list <- list()
  obs_list <- list()

  ## Optionally add covariate fixed effects
  if(use_covariates){
  # Extract all covariates
  cov_names <- names(covariates)
    for(cov_name in cov_names){
      input_data[[cov_name]] <- terra::extract(
        x = covariates[[cov_name]],
        y = as.matrix(input_data[, xy_fields, with = F])
      )[, 1]
    }
    # If sum to one, add a model constraint and convert to logit space
    if(covariates_sum_to_one){
      constraint_suffix <- glue::glue(
        ", extraconstr = list(A = matrix(1, ncol = {length(cov_names)}), e = 1)"
      )
      if(family == 'binomial'){
        for(cov_name in cov_names) input_data[[cov_name]] <- qlogis(input_data[[cov_name]])
      }
    } else {
      constraint_suffix <- ""
    }
    # Append to effects list, observations map list, and formula string
    effects_list$covariates <- seq_along(cov_names)
    obs_list$covariates <- as.matrix(input_data[, cov_names, with = F])
    formula_string <- glue::glue(
      "y ~ 0 + f(covariates, model = 'iid', fixed = TRUE{constraint_suffix})"
    )
  } else {
    formula_string <- "y ~ 0 + Intercept"
    effects_list$Intercept <- 1
    obs_list$Intercept <- rep(1, nrow(input_data))
  }

  ## Optionally add the SPDE approximation to a Gaussian process
  if(use_spde){
    # Build prediction mesh
    id_raster_table <- data.table::as.data.table(id_raster, xy = TRUE) |> na.omit()
    mesh <- INLA::inla.mesh.2d(
      loc = input_data[, xy_fields, with = F],
      loc.domain = id_raster_table[, xy_fields, with = F],
      max.edge = mesh_max_edge,
      cutoff = mesh_cutoff
    )
    # The maximum mesh dimension will be used to determine the SPDE range prior
    max_d <- apply(X = mesh$loc, MARGIN = 2, FUN = function(x) diff(range(x))) |>
      max(na.rm = TRUE)
    # SPDE object
    spde <- INLA::inla.spde2.pcmatern(
      mesh = mesh,
      alpha = 2,
      constr = !spde_integrate_to_zero,
      prior.range = c(max_d * spde_range_pc_prior$threshold, spde_range_pc_prior$prob_below),
      prior.sigma = c(spde_sigma_pc_prior$threshold, spde_sigma_pc_prior$prob_above)
    )
    # Append to effects list, observations map list, and formula string
    effects_list$s <- INLA::inla.spde.make.index(name = "space", n.spde = spde$n.spde)
    obs_list$s <- INLA::inla.spde.make.A(
      mesh = mesh,
      loc = as.matrix(input_data[, xy_fields, with = F])
    )
    formula_string <- glue::glue("{formula_string} + f(space, model = spde)")
  } else {
    mesh <- NULL
    spde <- NULL
  }

  ## Optionally add nugget
  if(use_nugget){
    obs_list$nugget <- 1
    effects_list$nugget <- as.matrix(input_data[, 'cluster_id', with = F])
    n_pcp <- nugget_pc_prior
    formula_string <- glue::glue(
      "{formula_string} +
       f(nugget, model = 'iid', hyper = list(
         prec = list(prior = 'pc.prec', param = c({n_pcp$threshold}, {n_pcp$prob_above}))
       ))
      "
    )
  }

  ## Optionally add admin-level effects
  if(use_admin_effect){
    # Merge on unique admin ID for each observation
    admin_boundaries$admin_id <- seq_len(nrow(admin_boundaries))
    input_data_merged <- sf::st_as_sf(input_data, coords = xy_fields, crs = 'EPSG:4326') |>
      sf::st_join(y = admin_boundaries[, c('admin_id')], join = sf::st_nearest_feature) |>
      data.table::as.data.table()
    # Add the effect to the observations list, effects list, and model formula
    obs_list$adm_effect <- 1
    effects_list$adm_effect <- as.matrix(input_data_merged[, 'admin_id', with = F])
    a_pcp <- admin_pc_prior
    formula_string <- glue::glue(
      "{formula_string} +
       f(adm_effect, model = 'iid', hyper = list(
         prec = list(prior = 'pc.prec', param = c({a_pcp$threshold}, {a_pcp$prob_above}))
       ))
      "
    )
  }

  ## Combine observation and effects lists into an INLA estimation data stack
  inla_data_stack <- INLA::inla.stack(
    tag = 'est',
    data = list(y = input_data$indicator),
    A = obs_list,
    effects = effects_list
  )

  # Return a list containing the mesh, SPDE object, INLA data stack, and INLA formula
  data_list <- list(
    mesh = mesh,
    spde = spde,
    inla_data_stack = inla_data_stack,
    formula_string = formula_string
  )
  return(data_list)
}