R/misim.R

Defines functions print.clarify_misim misim

Documented in misim

#' Simulate model coefficients after multiple imputation
#'
#' `misim()` simulates model parameters from multivariate normal or t distributions after multiple imputation that are then used by [sim_apply()] to calculate quantities of interest.
#'
#' @param fitlist a list of model fits, one for each imputed dataset, or a `mira` object (the output of a call to `with()` applied to a `mids` object in `mice`).
#' @param n the number of simulations to run for each imputed dataset; default is 1000. More is always better but resulting calculations will take longer.
#' @param vcov a square covariance matrix of the coefficient covariance estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::vcov()] or [insight::get_varcov()] if that doesn't work.
#' @param coefs a vector of coefficient estimates, a function to use to extract it from `fit`, or a list thereof with an element for each imputed dataset. By default, uses [stats::coef()] or [insight::get_parameters()] if that doesn't work.
#' @param dist a character vector containing the name of the multivariate distribution(s) to use to draw simulated coefficients. Should be one of `"normal"` (multivariate normal distribution) or `"t_{#}"` (multivariate t distribution), where `{#}` corresponds to the desired degrees of freedom (e.g., `"t_100"`). If `NULL`, the right distributions to use will be determined based on heuristics; see [sim()] for details.
#'
#' @returns
#' A `clarify_misim` object, which inherits from `clarify_sim` and has the following components:
#'  \item{sim.coefs}{a matrix containing the simulated coefficients with a column for each coefficient and a row for each simulation for each imputation}
#'  \item{coefs}{a matrix containing the original coefficients extracted from `fitlist` or supplied to `coefs`, with a row per imputation.}
#'  \item{fit}{the list of model fits supplied to `fitlist`}
#'  \item{imp}{a identifier of which imputed dataset each set of simulated coefficients corresponds to.}
#' The `"dist"` attribute contains `"normal"` if the coefficients were sampled from a multivariate normal distribution and `"t({df})"` if sampled from a multivariate t distribution. The `"clarify_hash"` attribute contains a unique hash generated by [rlang::hash()].
#'
#' @details
#' `misim()` essentially combines multiple `sim()` calls applied to a list of model fits, each fit in an imputed dataset, into a single combined pool of simulated coefficients. When simulation-based inference is to be used with multiply imputed data, many imputations are required; see Zhou and Reiter (2010).
#'
#' @references
#' Zhou, X., & Reiter, J. P. (2010). A Note on Bayesian Inference After Multiple Imputation. *The American Statistician*, 64(2), 159–163. \doi{10.1198/tast.2010.09109}
#'
#' @seealso
#' * [sim()] for simulating model coefficients for a single dataset
#' * [sim_apply()] for applying a function to each set of simulated coefficients
#' * [sim_ame()] for computing average marginal effects in each simulation draw
#' * [sim_setx()] for computing marginal predictions and first differences at typical values in each simulation draw
#'
#' @examplesIf requireNamespace("Amelia", quietly = TRUE)
#' data("africa", package = "Amelia")
#'
#' # Multiple imputation using Amelia
#' a.out <- Amelia::amelia(x = africa, m = 10,
#'                         cs = "country",
#'                         ts = "year", logs = "gdp_pc",
#'                         p2s = 0)
#'
#' fits <- with(a.out, lm(gdp_pc ~ infl * trade))
#'
#' # Simulate coefficients
#' s <- misim(fits)
#' s

#' @export
misim <- function(fitlist, n = 1e3, vcov = NULL, coefs = NULL, dist = NULL) {

  if (missing(fitlist)) {
    fitlist <- NULL
  }

  if (inherits(fitlist, "mira")) {
    fitlist <- fitlist$analyses
  }

  if (is_null(fitlist)) {
    if (is_null(coefs) || is_null(vcov)) {
      .err("when `fitlist` is not supplied, arguments must be supplied to both `coefs` and `vcov`")
    }

    if (!is.list(coefs) && !is.list(vcov)) {
      .err("when `fitlist` is not supplied, at least one of `coefs` or `vcov` must be a list")
    }

    nimp <- if (is.list(coefs)) length(coefs) else length(vcov)
  }
  else {
    check_fitlist(fitlist)
    nimp <- length(fitlist)
  }

  chk::chk_count(n)

  if (!is.list(coefs)) {
    coefs <- rep.int(list(coefs), nimp)
  }
  else if (length(coefs) != nimp) {
    if (is_null(fitlist)) {
      .err("when `fitlist` is not supplied and `coefs` is supplied as a list, `coefs` must have as many entries as there are entries in `vcov`")
    }
    else {
      .err("when supplied as a list, `coefs` must have as many entries as there are models in `fitlist`")
    }
  }

  coef_supplied <- {
    if (all(lengths(coefs) == 0L)) "null"
    else if (all_apply(coefs, is.function)) "fun"
    else if (all_apply(coefs, check_valid_coef)) "num"
    else {
      .err("`coefs` must be a vector of coefficients, a function that extracts one from each model in `fitlist`, or a list thereof")
    }
  }

  if (!is.list(vcov)) {
    vcov <- rep.int(list(vcov), nimp)
  }
  else if (length(vcov) != nimp) {
    if (is_null(fitlist)) {
      .err("when `fitlist` is not supplied and `vcov` is supplied as a list, `vcov` must have as many entries as there are entries in `coefs`")
    }
    else {
      .err("when supplied as a list, `vcov` must have as many entries as there are models in `fitlist`")
    }
  }

  vcov_supplied <- {
    if (all_apply(vcov, is_null)) "null"
    else if (all_apply(vcov, is.matrix)) "num"
    else "marginaleffects_code"
  }

  for (i in seq_len(nimp)) {
    coefs[[i]] <- process_coefs(coefs[[i]], fitlist[[i]], coef_supplied)
  }

  for (i in seq_len(nimp)) {
    vcov[[i]] <- process_vcov(vcov[[i]], fitlist[[i]], vcov_supplied)
  }

  check_coefs_vcov_length_mi(vcov, coefs, vcov_supplied, coef_supplied)

  chk::chk_count(n)

  if (is_not_null(dist)) {
    if (length(dist) == 1L) {
      dist <- rep.int(list(dist), nimp)
    }
    else if (length(dist) == nimp) {
      dist <- as.list(dist)
    }
    else {
      .err("when supplied as a vector, `dist` must have as many values as there are imputations")
    }
  }

  samplers <- lapply(seq_len(nimp), function(i) {
    get_sampling_dist(fitlist[[i]], dist[[i]])
  })

  sim.coefs <- do.call("rbind", lapply(seq_len(nimp), function(i) {
    samplers[[i]](n, coefs[[i]], vcov[[i]])
  }))

  out <- list(sim.coefs = sim.coefs,
              coefs = do.call("rbind", coefs),
              fit = fitlist,
              imp = rep(seq_len(nimp), each = n))

  dists <- lapply(samplers, .attr, "dist") |>
    unlist()

  if (all_the_same(dists)) {
    dists <- dists[1L]
  }

  attr(out, "dist") <- dists
  attr(out, "use_fit") <- is_not_null(fitlist)
  attr(out, "sim_hash") <- rlang::hash(out$sim.coefs)
  class(out) <- c("clarify_misim", "clarify_sim")

  out
}

#' @exportS3Method print clarify_misim
print.clarify_misim <- function(x, ...) {
  obj <- deparse1(substitute(x))

  cat("A `clarify_misim` object\n")
  cat(sprintf(" - %s coefficients, %s imputations with %s simulated values each\n",
              ncol(x$sim.coefs), nrow(x$coefs), nrow(x$sim.coefs) / nrow(x$coefs)))
  cat(" - sampled distributions: ")
  if (length(.attr(x, "dist")) == 1L) {
    cat(sprintf("multivariate %s\n", .attr(x, "dist")))
  }
  else {
    cat("multiple different multivariate distributions")
    if (exists(obj)) {
      cat(sprintf(' (use `attr(%s, "dist") to view them\n', obj))
    }
    else {
      cat("\n")
    }
  }

  invisible(x)
}

Try the clarify package in your browser

Any scripts or data that you put into this service are public.

clarify documentation built on Nov. 5, 2025, 5:50 p.m.