R/fit-model.R
In lineagefreq: Lineage Frequency Dynamics from Genomic Surveillance Counts

Documented in fit_model lfq_engines

#' Fit a lineage frequency model
#'
#' Unified interface for modeling lineage frequency dynamics. Supports
#' multiple engines that share the same input/output contract.
#'
#' @param data An [lfq_data] object.
#' @param engine Model engine to use:
#'   * `"mlr"` (default): Multinomial logistic regression. Fast,
#'     frequentist, no external dependencies.
#'   * `"hier_mlr"`: Hierarchical MLR with partial pooling across
#'     locations. Requires `.location` column in data.
#'   * `"piantham"`: Piantham approximation converting MLR growth
#'     rates to relative reproduction numbers. Requires
#'     `generation_time` argument.
#'   * `"fga"`: Fixed growth advantage model (Bayesian via CmdStan).
#'     Requires 'CmdStan'; check with [lfq_stan_available()].
#'   * `"garw"`: Growth advantage random walk model (Bayesian via
#'     CmdStan). Allows fitness to change over time.
#' @param pivot Reference lineage name. Growth rates are reported
#'   relative to this lineage (fixed at 0). Default: the lineage with
#'   the highest count at the earliest time point.
#' @param horizon Forecast horizon in days (stored for later use by
#'   [forecast()]). Default 28.
#' @param ci_level Confidence level for intervals. Default 0.95.
#' @param ... Engine-specific arguments passed to the internal engine
#'   function. For `engine = "mlr"`: `window`, `ci_method`,
#'   `laplace_smooth`. For `engine = "piantham"`: `generation_time`
#'   (required). For `engine = "hier_mlr"`: `shrinkage_method`.
#'
#' @return An `lfq_fit` object (S3 class), a list containing:
#' \describe{
#'   \item{engine}{Engine name (character).}
#'   \item{growth_rates}{Named numeric vector of growth rates per
#'     `time_scale` days (pivot = 0).}
#'   \item{intercepts}{Named numeric vector of intercepts.}
#'   \item{pivot}{Name of pivot lineage.}
#'   \item{lineages}{Character vector of all lineage names.}
#'   \item{fitted_values}{Tibble of fitted frequencies.}
#'   \item{residuals}{Tibble with observed, fitted, Pearson residuals.}
#'   \item{vcov_matrix}{Variance-covariance matrix.}
#'   \item{loglik, aic, bic}{Model fit statistics.}
#'   \item{nobs, n_timepoints, df}{Sample and model sizes.}
#'   \item{ci_level, horizon}{As specified.}
#'   \item{call}{The matched call.}
#' }
#'
#' @seealso [growth_advantage()] to extract fitness estimates,
#'   [forecast()] for frequency prediction, [backtest()] for
#'   rolling-origin evaluation.
#'
#' @details
#' The MLR engine models the frequency of lineage \eqn{v} at time
#' \eqn{t} as:
#' \deqn{p_v(t) = \frac{\exp(\alpha_v + \delta_v t)}{\sum_k \exp(\alpha_k + \delta_k t)}}
#' where \eqn{\alpha_v} is the intercept, \eqn{\delta_v} is the
#' growth rate per `time_scale` days (default 7), and the pivot
#' lineage has \eqn{\alpha = \delta = 0}. Parameters are estimated
#' by maximum likelihood via `optim(method = "BFGS")` with the
#' Hessian used for asymptotic Wald confidence intervals.
#'
#' The constant growth rate assumption is appropriate for monotonic
#' variant replacement periods (typically 2--4 months). For longer
#' periods or non-monotonic dynamics, use the `window` argument to
#' restrict the estimation window, or consider the `"garw"` engine
#' which allows time-varying growth advantages.
#'
#' @references
#' Abousamra E, Figgins M, Bedford T (2024). Fitness models provide
#' accurate short-term forecasts of SARS-CoV-2 variant frequency.
#' \emph{PLoS Computational Biology}, 20(9):e1012443.
#' \doi{10.1371/journal.pcbi.1012443}
#'
#' Piantham C, Linton NM, Nishiura H (2022). Predicting the
#' trajectory of replacements of SARS-CoV-2 variants using relative
#' reproduction numbers. \emph{Viruses}, 14(11):2556.
#' \doi{10.3390/v14112556}
#'
#' @examples
#' sim <- simulate_dynamics(
#'   n_lineages = 3,
#'   advantages = c("JN.1" = 1.3, "KP.3" = 0.9),
#'   n_timepoints = 15, seed = 42
#' )
#' fit <- fit_model(sim, engine = "mlr")
#' fit
#'
#' @export
fit_model <- function(data,
                      engine   = c("mlr", "hier_mlr", "piantham", "fga", "garw"),
                      pivot    = NULL,
                      horizon  = 28L,
                      ci_level = 0.95,
                      ...) {

  # Allow both built-in and registered engines
  builtin <- c("mlr", "hier_mlr", "piantham", "fga", "garw")

  # If default vector passed (length > 1), use match.arg
  if (length(engine) > 1L) {
    engine <- match.arg(engine, builtin)
  }

  registered_fn <- .get_registered_engine(engine)

  if (is.null(registered_fn) && !engine %in% builtin) {
    cli::cli_abort(c(
      "Unknown engine {.val {engine}}.",
      "i" = "Available: {.val {c(builtin, names(.get_registry()))}}"
    ))
  }

  if (!is_lfq_data(data)) {
    cli::cli_abort(
      "{.arg data} must be an {.cls lfq_data} object. Use {.fn lfq_data} first."
    )
  }
  assert_prob(ci_level, "ci_level")

  if (!is.null(registered_fn)) {
    result <- registered_fn(data, pivot = pivot, ci_level = ci_level, ...)
  } else {
    result <- switch(engine,
      mlr      = .engine_mlr(data, pivot = pivot, ci_level = ci_level, ...),
      hier_mlr = .engine_hier_mlr(data, pivot = pivot, ci_level = ci_level, ...),
      piantham = .engine_piantham(data, pivot = pivot, ci_level = ci_level, ...),
      fga      = .engine_fga(data, pivot = pivot, ci_level = ci_level, ...),
      garw     = .engine_garw(data, pivot = pivot, ci_level = ci_level, ...)
    )
  }

  result$engine   <- engine
  result$ci_level <- ci_level
  result$horizon  <- as.integer(horizon)
  result$call     <- match.call()

  structure(result, class = c(paste0("lfq_fit_", engine), "lfq_fit"))
}


#' List available modeling engines
#'
#' Returns information about all modeling engines available in
#' lineagefreq. Core engines (mlr, hier_mlr, piantham) are always
#' available. Bayesian engines (fga, garw) require 'CmdStan'.
#'
#' @return A tibble with columns: `engine`, `type`, `time_varying`,
#'   `available`, `description`.
#'
#' @examples
#' lfq_engines()
#'
#' @export
lfq_engines <- function() {
  stan_ok <- lfq_stan_available()
  builtin <- tibble::tibble(
    engine       = c("mlr", "hier_mlr", "piantham", "fga", "garw"),
    type         = c("frequentist", "frequentist", "frequentist",
                     "bayesian", "bayesian"),
    time_varying = c(FALSE, FALSE, FALSE, FALSE, TRUE),
    available    = c(TRUE, TRUE, TRUE, stan_ok, stan_ok),
    description  = c(
      "Multinomial logistic regression (MLE)",
      "Hierarchical MLR with empirical Bayes shrinkage",
      "Piantham Rt approximation from MLR growth rates",
      "Fixed growth advantage (Stan)",
      "Growth advantage random walk (Stan)"
    )
  )

  # Append registered engines
  reg <- .get_registry()
  if (length(reg) > 0) {
    custom <- tibble::tibble(
      engine       = vapply(reg, `[[`, "", "name"),
      type         = vapply(reg, `[[`, "", "type"),
      time_varying = vapply(reg, `[[`, FALSE, "time_varying"),
      available    = TRUE,
      description  = vapply(reg, `[[`, "", "description")
    )
    builtin <- dplyr::bind_rows(builtin, custom)
  }
  builtin
}