R/misim.R
In clarify: Simulation-Based Inference for Regression Models

Documented in misim

#' @title Simulate model coefficients after multiple imputation
#'
#' @description `misim()` simulates model parameters from multivariate normal or t distributions after multiple imputation that are then used by [sim_apply()] to calculate quantities of interest.
#'
#' @param fitlist a list of model fits, one for each imputed dataset, or a `mira` object (the output of a call to `with()` applied to a `mids` object in `mice`).
#' @param n the number of simulations to run for each imputed dataset; default is 1000. More is always better but resulting calculations will take longer.
#' @param vcov a square covariance matrix of the coefficient covariance estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::vcov()] or [insight::get_varcov()] if that doesn't work.
#' @param coefs a vector of coefficient estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::coef()] or [insight::get_parameters()] if that doesn't work.
#' @param dist a character vector containing the name of the multivariate distribution(s) to use to draw simulated coefficients. Should be one of `"normal"` (multivariate normal distribution) or `"t_{#}"` (multivariate t distribution), where `{#}` corresponds to the desired degrees of freedom (e.g., `"t_100"`). If `NULL`, the right distributions to use will be determined based on heuristics; see [sim()] for details.
#'
#' @return
#' A `clarify_misim` object, which inherits from `clarify_sim` and has the following components:
#'  \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation for each imputation}
#'  \item{coefs}{a matrix containing the original coefficients extracted from `fitlist` or supplied to `coefs`, with a row per imputation.}
#'  \item{fit}{the list of model fits supplied to `fitlist`}
#'  \item{imp}{a identifier of which imputed dataset each set of simulated coefficients corresponds to.}
#' The `"dist"` attribute contains `"normal"` if the coefficients were sampled from a multivariate normal distribution and `"t({df})"` if sampled from a multivariate t distribution. The `"clarify_hash"` attribute contains a unique hash generated by [rlang::hash()].
#'
#' @details
#' `misim()` essentially combines multiple `sim()` calls applied to a list of model fits, each fit in an imputed dataset, into a single combined pool of simulated coefficients. When simulation-based inference is to be used with multiply imputed data, many imputations are required; see Zhou and Reiter (2010).
#'
#' @references
#'
#' Zhou, X., & Reiter, J. P. (2010). A Note on Bayesian Inference After Multiple Imputation. *The American Statistician*, 64(2), 159–163. \doi{10.1198/tast.2010.09109}
#'
#' @examplesIf requireNamespace("Amelia", quietly = TRUE)
#' data("africa", package = "Amelia")
#'
#' # Multiple imputation using Amelia
#' a.out <- Amelia::amelia(x = africa, m = 10,
#'                         cs = "country",
#'                         ts = "year", logs = "gdp_pc",
#'                         p2s = 0)
#'
#' fits <- with(a.out, lm(gdp_pc ~ infl * trade))
#'
#' # Simulate coefficients
#' s <- misim(fits)
#' s
#'
#' @seealso
#' * [sim()] for simulating model coefficients for a single dataset
#' * [sim_apply()] for applying a function to each set of simulated coefficients
#' * [sim_ame()] for computing average marginal effects in each simulation draw
#' * [sim_setx()] for computing marginal predictions and first differences at typical values in each simulation draw
#' @export
#'
misim <- function(fitlist,
                  n = 1e3,
                  vcov = NULL,
                  coefs = NULL,
                  dist = NULL) {

  if (missing(fitlist)) fitlist <- NULL

  if (inherits(fitlist, "mira")) {
    fitlist <- fitlist$analyses
  }

  if (is.null(fitlist)) {
    if (is.null(coefs) || is.null(vcov)) {
      .err("when `fitlist` is not supplied, arguments must be supplied to both `coefs` and `vcov`")
    }
    if (!is.list(coefs) && !is.list(vcov)) {
      .err("when `fitlist` is not supplied, at least one of `coefs` or `vcov` must be a list")
    }
    nimp <- if (!is.list(coefs)) length(vcov) else length(coefs)
  }
  else {
    check_fitlist(fitlist)
    nimp <- length(fitlist)
  }

  chk::chk_count(n)

  if (!is.list(coefs)) {
    coefs <- lapply(seq_len(nimp), function(i) coefs)
  }
  else if (length(coefs) != nimp) {
    if (is.null(fitlist)) {
      .err("when `fitlist` is not supplied and `coefs` is supplied as a list, `coefs` must have as many entries as there are entries in `vcov`")
    }
    else {
      .err("when supplied as a list, `coefs` must have as many entries as there are models in `fitlist`")
    }
  }

  coef_supplied <- {
    if (all(vapply(coefs, is.null, logical(1L)))) "null"
    else if (all(vapply(coefs, is.function, logical(1L)))) "fun"
    else if (all(vapply(coefs, check_valid_coef, logical(1L)))) "num"
    else {
      .err("`coefs` must be a vector of coefficients, a function that extracts one from each model in `fitlist`, or a list thereof")
    }
  }

  if (!is.list(vcov)) {
    vcov <- lapply(seq_len(nimp), function(i) vcov)
  }
  else if (length(vcov) != nimp) {
    if (is.null(fitlist)) {
      .err("when `fitlist` is not supplied and `vcov` is supplied as a list, `vcov` must have as many entries as there are entries in `coefs`")
    }
    else {
      .err("when supplied as a list, `vcov` must have as many entries as there are models in `fitlist`")
    }
  }

  vcov_supplied <- {
    if (all(vapply(vcov, is.null, logical(1L)))) "null"
    else if (all(vapply(vcov, is.matrix, logical(1L)))) "num"
    else "marginaleffects_code"
  }

  for (i in seq_len(nimp)) {
    coefs[[i]] <- process_coefs(coefs[[i]], fitlist[[i]], coef_supplied)
  }

  for (i in seq_len(nimp)) {
    vcov[[i]] <- process_vcov(vcov[[i]], fitlist[[i]], vcov_supplied)
  }

  check_coefs_vcov_length_mi(vcov, coefs, vcov_supplied, coef_supplied)

  chk::chk_count(n)

  if (!is.null(dist)) {
    if (length(dist) == 1) {
      dist <- lapply(seq_len(nimp), function(i) dist)
    }
    else if (length(dist) != nimp) {
      .err("when supplied as a vector, `dist` must have as many values as there are imputations")
    }
    else {
      dist <- as.list(dist)
    }
  }

  samplers <- lapply(seq_len(nimp), function(i) {
    get_sampling_dist(fitlist[[i]], dist[[i]])
  })

  sim.coefs <- do.call("rbind", lapply(seq_len(nimp), function(i) {
    samplers[[i]](n, coefs[[i]], vcov[[i]])
  }))

  out <- list(sim.coefs = sim.coefs,
              coefs = do.call("rbind", coefs),
              fit = fitlist,
              imp = rep(seq_len(nimp), each = n))

  dists <- unlist(lapply(samplers, attr, "dist"))
  if (all_the_same(dists)) dists <- dists[1]

  attr(out, "dist") <- dists
  attr(out, "use_fit") <- !is.null(fitlist)
  attr(out, "sim_hash") <- rlang::hash(out$sim.coefs)
  class(out) <- c("clarify_misim", "clarify_sim")

  out
}

#' @export
print.clarify_misim <- function(x, ...) {
  obj <- deparse1(substitute(x))
  cat("A `clarify_misim` object\n")
  cat(sprintf(" - %s coefficients, %s imputations with %s simulated values each\n",
              ncol(x$sim.coefs), nrow(x$coefs), nrow(x$sim.coefs) / nrow(x$coefs)))
  cat(" - sampled distributions: ")
  if (length(attr(x, "dist")) == 1) {
    cat(sprintf("multivariate %s\n", attr(x, "dist")))
  }
  else {
    cat("multiple different multivariate distributions")
    if (exists(obj)) {
      cat(sprintf(" (use `attr(%s, \"dist\") to view them\n"), obj)
    }
    else {
      cat("\n")
    }
  }

  invisible(x)
}