R/model_functions.R

Defines functions setup_run_tmb run_sparsity_algorithm normalize_adfun find_glm_map_parameter_estimates assign_seasonality_ids icar_precision_from_adjacency

Documented in assign_seasonality_ids find_glm_map_parameter_estimates icar_precision_from_adjacency normalize_adfun run_sparsity_algorithm setup_run_tmb

#' Generate an ICAR precision matrix based on an adjacency matrix
#'
#' @description Generate a precision matrix for the intrinsic correlated autoregressive
#'  (ICAR) model specification, a special case of the correlated autoregressive (CAR)
#'  class of Markov random field models. This precision matrix is usually denoted as "Q".
#'
#' @details The precision matrix is fully specified by the adjacency weights, matrix W,
#'   defined as W = {w_ij} where w_ij is 1 if i and j are neighbors, and 0 otherwise. The
#'   precision matrix Q is defined as Q = D_w - W, where D_w is a diagonal matrix with
#'   each diagonal term d_ii equal to the sum of row i in W.
#'
#'   Note that the ICAR model is improper, in that the conditional distributions
#'   specified by the precision matrix do not determine a full joint distribution that
#'   integrates to 1; in other words, the precision matrix Q is not invertible. The ICAR
#'   precision matrix can still be used as a prior in a hierarchical model.
#'
#'   This function includes optional argument `scale_variance`. If set to `TRUE` (the
#'   default), the function will rescale the precision matrix to have a generalized
#'   variance of 1, which may aid in prior specifications that are comparable across
#'   areal spatial models with different geometries.
#'
#'   For more details, see:
#'   Banerjee, Carlin, and Gelfand (2015). Hierarchical Modeling and Analysis for Spatial
#'     Data, 2nd Edition. Section 6.4.3.3: CAR models and their difficulties.
#'   Riebler et al. (2016). An intuitive Bayesian sptial model for disease mapping that
#'     accounts for scaling. Statistical methods in medical research, 25(4):1145-65.
#'
#' @param W Adjacency matrix, with w_ij = w_ji = 1 if areal units i and j are neighbors,
#'   and zero otherwise. See function details for more information
#' @param scale_variance [default TRUE] Should the precision matrix be rescaled so that
#'  the generalized variance is equal to 1? Setting to TRUE may help with prior
#'  specification.
#'
#' @return Sparse ICAR precision matrix Q. See function details for more information.
#'
#' @import Matrix INLA
#' @export
icar_precision_from_adjacency <- function(W, scale_variance = TRUE){
  # Generate and return sparse precision matrix
  Q <- Matrix::Diagonal(n = nrow(W), x = Matrix::rowSums(W)) - W
  if(scale_variance){
    # Scale model to have generalized variance of 1
    constraint_matrix <- matrix(1, nrow = 1, ncol = ncol(Q))
    Q <- INLA::inla.scale.model(Q, constr = list(A = constraint_matrix, e = 0))
  }
  return(Q)
}


#' Assign seasonality grouping IDs
#'
#' @description Create a vector that assigns a (zero-indexed) seasonality
#'   grouping ID based on other categories in the data. If no categories are
#'   assigned, all rows get the same seasonality grouping ID (0). If categories
#'   are based on other ID fields, then a group is assigned to every unique
#'   combination of values in those fields. For example, if grouings were
#'   assigned based on the "age" and "location" fields, and the data contained
#'   four age groupings and 10 unique locations, then 20 unique seasonality
#'   groupings would be assigned.
#'
#' @param input_data Data.table of the full input dataset. Must contain fields
#'   for any grouping categories that are assigned
#' @param grouping_fields [optional, default NULL] character vector listing any
#'   fields in the dataset that should be used to create unique seasonality
#'   groups
#'
#' @return integer vector with the same number of rows as `input_data` assigning
#'   holdout IDs.
#'
#' @import data.table
#' @export
assign_seasonality_ids <- function(input_data, grouping_fields = NULL){

  # Check for missing fields
  missing <- setdiff(grouping_fields, names(input_data))
  if(length(missing) > 0) stop("Missing fields: ", paste(missing, collapse=', '))

  # Case of no groupings: Assign all zeroes
  if(length(grouping_fields) == 0){
    return(rep(0, nrow(input_data)))
  }

  # Case of groupings: create a grouping data.table and merge it into the data
  to_cj <- lapply(grouping_fields, function(fld) sort(unique(input_data[[fld]])))
  names(to_cj) <- grouping_fields
  holdout_id_dt <- do.call('CJ', to_cj)
  # Add holdout ID, zero-indexed
  holdout_id_dt[, idx_season := .I - 1 ]
  # Merge back onto dataset
  templ_dt <- copy(input_data)[, ..grouping_fields]
  templ_dt[holdout_id_dt, idx_season := idx_season, on = grouping_fields]

  # Check that the field is the correct length and return
  if(length(templ_dt$idx_season) != nrow(input_data)) stop("Holdout merge issue")
  return(templ_dt$idx_season)
}


#' Get Maximum a Priori (MAP) parameter estimates
#'
#' @description Find the Maximum a Priori (MAP) estimates for fixed effect
#'   parameter values, including age-specific fixed effects, in a simplified
#'   version of a GLM. Starting the full TMB model with fixed effect starting
#'   values set at the MAP has been shown to improve performance and run time.
#'
#' @param in_data Input data.table, including only the data used to train the
#'    model. Fields must include a numerator, a denominator, fields named for
#'    all covariates, and (optionally) an age ID field
#' @param events_field Field name for the number of events (eg deaths)
#' @param exposure_field Field name for the relative exposure (eg. pop-time)
#' @param covar_names Names of all covariates, including 'intercept' if desired
#' @param distribution_family Name of the distribution to to fit the GLM with,
#'   which also determines the link function to be used. Should be a valid
#'   argument to pass to `stats::glm(family = distribution_family)`
#' @param grouping_field [optional] Field that groups observations by ID
#'
#' @return Named list with two items:
#'     - "glm_fit": Full model fit for the GLM
#'     - "fixed_effects_map": Maximum a priori estimates for covariate fixed
#'          effects, organized as a named numeric vector
#'     - "fixed_effects_grouping": Maximum a priori estimates for grouped fixed
#'          effects, organized as a vector of length(num groups). This list item
#'          is NULL if the argument `grouping_field` was not specified
#'
#' @import data.table glue stats
#' @export
find_glm_map_parameter_estimates <- function(
  in_data, events_field, exposure_field, covar_names, distribution_family,
  grouping_field = NULL
){
  # Ensure that all columns are available in input data
  reqd <- c(events_field, exposure_field, covar_names, grouping_field)
  missing_fields <- setdiff(reqd, colnames(in_data))
  if(length(missing_fields) > 0){
    stop("MAP input data missing fields: ", paste(missing_fields, collapse=', '))
  }
  in_data <- na.omit(in_data, cols = reqd)

  # Add group-based fixed effects, if specified
  if(!is.null(grouping_field) & in_data[, uniqueN(get(grouping_field)) ] > 1){
    grp_vals <- sort(unique(in_data[[grouping_field]]))
    # The first group is set as 'default' - others vary with a fixed effect
    for(grp_val in grp_vals[2:length(grp_vals)]){
      in_data[, paste0('grp',grp_val) := 0 ]
      in_data[ get(grouping_field) == grp_val, paste0('grp',grp_val) := 1 ]
    }
    grp_cols <- paste0('grp', grp_vals[2:length(grp_vals)])
  } else {
    grp_cols <- c()
  }

  # Get a field representing successes as a proportion of exposure
  in_data[, rate_success := get(events_field) / get(exposure_field) ]

  # Set up formula with an offset based on link function, then run the GLM
  formula_char <- glue::glue(
    "rate_success ~ 0 + {paste(c(covar_names, grp_cols), collapse = ' + ')}"
  )
  .env <- environment()
  formula_parsed <- as.formula(formula_char, env = .env)
  glm_fit <- stats::glm(
    formula_parsed, data = in_data, family = distribution_family,
    weights = in_data[[exposure_field]]
  )

  # Return the full GLM fit, the covariate fixed effects, and (optionally) the
  #  grouped fixed effects
  covs_map <- glm_fit$coefficients[ covar_names ]
  if(length(grp_cols) > 0){
    grps_map <- c(0, glm_fit$coefficients[ grp_cols ])
    names(grps_map) <- paste0('grp',grp_vals)
  } else {
    grps_map <- NULL
  }
  return(list(
    glm_fit = glm_fit,
    fixed_effects_map = covs_map,
    fixed_effects_grouping = grps_map
  ))
}


#' Run TMB normalization
#'
#' @description Get the TMB normalization constant from random effects
#'
#' @param adfun The ADFunction to normalize
#' @param flag [char] Flagging variable to indicate whether random effects only have been
#'   incorporated into the joint negative log-likelihood
#' @param value [int, default 0] Value that the flagging variable takes when the data
#'   should NOT be incorporated into the jnll
#' @param verbose [bool, default FALSE] return a message about normalization?
#'
#' @return Normalized ADFunction
#'
#' @import TMB
#' @import tictoc
#' @export
normalize_adfun <- function(adfun, flag, value=0, verbose=FALSE){
  if(verbose) message(" - Running normalization")
  if(verbose) tictoc::tic("    Normalization")
  normalized <- TMB::normalize(adfun, flag=flag, value=value)
  if(verbose) tictoc::toc()
  return(normalized)
}


#' Run TMB precision matrix sparsity algorithm
#'
#' @description Run "symbolic analysis", a set of algorithms that prune the
#'   precision matrix to increase sparsity and speed up optimization
#'
#' @param adfun The ADFunction to normalize
#' @param verbose [bool, default FALSE] return a message about normalization?
#'
#' @import TMB
#' @import tictoc
#' @export
run_sparsity_algorithm <- function(adfun, verbose=FALSE){
  if(verbose) message(" - Running symbolic analysis to reduce run time")
  tictoc::tic("    Symbolic analysis")
  suppressMessages(suppressWarnings(TMB::runSymbolicAnalysis(adfun)))
  tictoc::toc()
  invisible()
}

#' Set up and run TMB
#'
#' @description Generic TMB model run handler. Sets up the ADFun object, applies
#'   model speedups and fixes as specified, and optimizes using `nlminb`. This
#'   is meant to be a helper function run by more specific model functions.
#'
#' @param tmb_data_stack List containing all data inputs expected in the TMB
#'   CPP file
#' @param params_list List containing all parameters expected in the TMB CPP file
#' @param tmb_random Character vector containing all random effects that will be
#'   optimized in the inner optimizer
#' @param tmb_map Named list containing parameters that will be treated in a
#'   particular way by the optimizer
#' @param tmb_outer_maxsteps Max number of steps taken by the outer optimizer
#' @param tmb_inner_maxsteps Max number of steps taken by the inner optimizer
#'   in a single outer optimizer step
#' @param normalize [boolean, default FALSE] Run TMB's automatic process normalization
#'   function? Adds two additional items to the TMB data stack: `auto_normalize` (1) and
#'   `early_return` (0). Both are used by the \code{\link{normalize_adfun}} function to
#'   get the normalizing constant prior to optimization. For more details, see
#'   \link{\code{TMB::normalize}}.
#' @param run_symbolic_analysis [boolean, default FALSE] run symbolic analysis
#'   to speed up model run time?
#' @param set_limits [boolean, default FALSE] Set limits for fixed effects?
#' @param limit_max [numeric, default 10] If limits are set in the model,
#'   maximum value for any fixed effect
#' @param limit_min [numeric, default -limit_max] If limits are set in the model,
#'   minimum value of any fixed effect
#' @param parallel_model [bool, default FALSE] Is the model implemented in parallel? If
#'   TRUE, opens multiple OMP threads before fitting
#' @param optimization_method [char, default 'nlminb'] Outer optimization method to use
#'   for fitting, implemented in the optimx library. Recommended options include 'nlminb'
#'   and 'L-BFGS-B'
#' @param model_name [char, default "model"] name of the model
#' @param verbose [boolean, default FALSE] Should this function return logging
#'   information about the stage of model fitting, including the outer optizimer
#'   sampling? This will be somewhat long (>100 lines)
#' @param inner_verbose [boolean, default FALSE] Should this function return
#'   logging information about inner optimizer sampling? This can be useful for
#'   debugging, but will show very verbose (>10k lines) output.
#'
#' @return list of two objects: obj (ADFunction object), and opt (optimized
#'   nlminb object)
#'
#' @useDynLib covidemr
#' @import TMB glue tictoc optimx
#' @export
setup_run_tmb <- function(
  tmb_data_stack, params_list, tmb_random, tmb_map, tmb_outer_maxsteps,
  tmb_inner_maxsteps, normalize=FALSE, run_symbolic_analysis=FALSE,
  set_limits=FALSE, limit_max=10, limit_min=-limit_max, parallel_model = FALSE,
  optimization_method = 'nlminb', model_name="model", verbose=FALSE, inner_verbose=FALSE
){
  # Helper function to send a message only if verbose
  vbmsg <- function(x) if(verbose) message(x)
  # Setup
  vbmsg(paste0(c("\n",rep("*",nchar(model_name)+14)),collapse=''))
  vbmsg(glue::glue("***  {model_name} RUN  ***"))
  vbmsg(paste0(c(rep("*",nchar(model_name)+14),"\n"),collapse=''))

  if(parallel_model){
    # Set up openmp threads
    threads <- system('echo $OMP_NUM_THREADS', intern = TRUE)
    if(threads != '') {
      vbmsg(sprintf('Detected %s threads in OMP environmental variable.',threads))
      openmp(as.numeric(threads))
    } else {
      vbmsg("Did not detect environmental OMP variable, defaulting to 2 cores. \n
             You can set this using OMP_NUM_THREADS.")
      openmp(2)
    }
  }

  # If necessary, add data flags for TMB::normalize() to use
  if(normalize){
    tmb_data_stack$auto_normalize <- 1L
    tmb_data_stack$early_return <- 0L
  }

  # Make Autodiff function
  vbmsg("Constructing ADFunction...")
  tictoc::tic("  Making Model ADFun")
  obj <- TMB::MakeADFun(
    data = tmb_data_stack, parameters = params_list, random = tmb_random,
    map = tmb_map, DLL = 'covidemr', silent = !inner_verbose,
    inner.control = list(trace=inner_verbose, tol=1E-11)
  )
  obj$env$tracemgc <- as.integer(verbose)
  tictoc::toc()

  # Optionally run a normalization fix for models with large random effect sets
  if(normalize) obj <- normalize_adfun(
    adfun=obj, flag='early_return', value=1, verbose=verbose
  )
  # Optionally run optimization algorithms to improve model run time
  if(run_symbolic_analysis) run_sparsity_algorithm(adfun=obj, verbose=verbose)
  # Optionally set upper and lower limits for fixed effects
  fe_names <- names(obj$par)
  if(set_limits==TRUE){
    fe_lower_vec = rep(limit_min, times=length(fe_names))
    fe_upper_vec = rep(limit_max, times=length(fe_names))
    names(fe_lower_vec) <- names(fe_upper_vec) <- fe_names
    vbmsg(glue::glue("Fixed effects limited to the range [{limit_min},{limit_max}]."))
  } else {
    fe_lower_vec <- -Inf
    fe_upper_vec <- Inf
  }

  # Optimize using the specified outer optimizer, implemented in optimx
  tictoc::tic("  Optimization")
  vbmsg(glue("\n** OPTIMIZING USING METHOD {optimization_method} **"))
  opt <- optimx(
    par = obj$par, fn = obj$fn, gr = obj$gr,
    lower = fe_lower_vec, upper = fe_upper_vec,
    method = optimization_method,
    itnmax = tmb_outer_maxsteps,
    hessian = FALSE,
    control = list(
      trace = as.integer(verbose), follow.on = FALSE,
      dowarn = as.integer(verbose), maxit = tmb_inner_maxsteps, starttests = FALSE,
      kkt = FALSE
    )
  )
  conv_code <- opt$convcode
  vbmsg(glue::glue(
    "{model_name} optimization finished with convergence code {conv_code}.\n"
  ))
  tictoc::toc()
  vbmsg(glue::glue("*** {model_name} RUN COMPLETE **************\n\n"))
  return(list(obj=obj, opt=opt))
}
njhenry/covidemr documentation built on Feb. 2, 2022, 2:31 a.m.