# R/multiple_imputation_da.R

#' Estimate a \code{modsem} model using multiple imputation
#'
#' @param model.syntax \code{lavaan} syntax
#'
#' @param data A dataframe with observed variables used in the model.
#'
#' @param method Method to use:
#' \describe{
#'   \item{\code{"lms"}}{latent moderated structural equations (not passed to \code{lavaan}).}
#'   \item{\code{"qml"}}{quasi maximum likelihood estimation (not passed to \code{lavaan}).}
#' }
#'
#' @param m Number of imputations to perform. More imputations will yield better estimates
#'   but can also be (a lot) slower.
#'
#' @param verbose Should progress be printed to the console?
#'
#' @param se How should corrected standard errors be computed? Alternatives are:
#' \describe{
#'   \item{\code{"simple"}}{Uncorrected standard errors are only calculated once, in the first imputation.
#'                          The standard errors are thereafter corrected using the distribution of the
#'                          estimated coefficients from the different imputations.}
#'   \item{\code{"full"}}{Uncorrected standard errors are calculated and aggregated for each imputation.
#'                        This can give more accurate results, but can be (a lot) slower.
#'                        The standard errors are thereafter corrected using the distribution of the
#'                        estimated coefficients from the different imputations.}
#' }
#'
#' @param ... Arguments passed to \code{\link{modsem}}.
#'
#' @details
#'  \code{modsem_mimpute} is currently only available for the DA approaches
#'  (LMS and QML). It performs multiple imputation using \code{Amelia::amelia}
#'  and returns aggregated coefficients from the multiple imputations, along with
#'  corrected standard errors.
#'
#' @examples
#'
#' m1 <- '
#'   # Outer Model
#'   X =~ x1 + x2 + x3
#'   Y =~ y1 + y2 + y3
#'   Z =~ z1 + z2 + z3
#'
#'   # Inner model
#'   Y ~ X + Z + X:Z
#' '
#'
#' oneInt2 <- oneInt
#'
#' set.seed(123)
#' k <- 200
#' I <- sample(nrow(oneInt2), k, replace = TRUE)
#' J <- sample(ncol(oneInt2), k, replace = TRUE)
#' for (k_i in seq_along(I)) oneInt2[I[k_i], J[k_i]] <- NA
#'
#' \dontrun{
#' est <- modsem_mimpute(m1, oneInt2, m = 25)
#' summary(est)
#' }
#' @export
modsem_mimpute <- function(model.syntax,
                           data,
                           method = "lms",
                           m = 25,
                           verbose = interactive(),
                           se = c("simple", "full"),
                           ...) {
  stopif(!method %in% c("lms", "qml"),
         "Multiple imputation is only available for the LMS and QML approaches!")

  modsem_mimput_modsem_da(
    model.syntax = model.syntax,
    data = data,
    method = method,
    m = m,
    verbose = verbose,
    se = se,
    ...
  )
}


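# Internal workhorse behind modsem_mimpute() for the DA estimators (LMS/QML).
# Imputes the data with Amelia::amelia(), refits the model on each imputed
# data set (using the running mean of previous estimates as starting values
# after the first fit), and pools coefficients and standard errors across
# imputations via Rubin's rules. `calc.se` can be used to override the
# per-imputation standard-error behaviour implied by `se`.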
modsem_mimput_modsem_da <- function(model.syntax,
                                    data,
                                    method = "lms",
                                    m = 25,
                                    cov.syntax = NULL,
                                    verbose = TRUE,
                                    se = c("simple", "full"),
                                    calc.se = NULL,
                                    ...) {
  se <- tolower(se)
  se <- match.arg(se)

  parTable <- rbind(modsemify(model.syntax), modsemify(cov.syntax))
  ovs      <- getOVs(parTable)

  missing.cols <- !ovs %in% colnames(data)
  stopif(any(missing.cols),
         "Variables missing from data:\n",
         paste(ovs[missing.cols], collapse = "\n"))

  data <- data[, ovs, drop = FALSE]

  if (verbose) cat("Imputing data...\n")
  imputed  <- Amelia::amelia(data, m = m, p2s = 0)

  COEF.ALL  <- vector("list", length = m)
  COEF.FREE <- vector("list", length = m)
  VCOV.ALL  <- vector("list", length = m)
  VCOV.FREE <- vector("list", length = m)
  fits <- vector("list", length = m)

  THETA <- NULL

  for (i in seq_len(m)) {
    printedLines <- utils::capture.output(split = TRUE, {
      if (verbose) printf("Fitting imputation %d/%d...\n", i, m)

      if (is.null(calc.se))
        calc.se_i <- !(se == "simple" && i > 1)
      else
        calc.se_i <- calc.se

      if (is.null(THETA)) {
        optimize <- TRUE
        start    <- NULL
      } else {
        optimize <- FALSE
        start    <- apply(THETA, MARGIN = 2, FUN = mean, na.rm = TRUE) # na.rm shouldn't be necessary, ever...
      }

      data_i <- as.data.frame(imputed$imputations[[i]])

      fit_i <- tryCatch(
        modsem_da(
          model.syntax = model.syntax,
          data         = data_i,
          method       = method,
          cov.syntax   = cov.syntax,
          start        = start,
          verbose      = verbose,
          optimize     = optimize,
          calc.se      = calc.se_i,
          ...
        ), error = \(e) {print(e); NULL}
      )

      if (is.null(fit_i)) next


      fits[[i]]      <- fit_i
      COEF.ALL[[i]]  <- coef(fit_i, type = "all")
      COEF.FREE[[i]] <- coef(fit_i, type = "free")

      if (i > 1 && se == "simple") {
        VCOV.ALL[[i]]  <- VCOV.ALL[[1]]
        VCOV.FREE[[i]] <- VCOV.FREE[[1]]

      } else if (calc.se_i) {
        VCOV.ALL[[i]]  <- vcov(fit_i, type = "all")
        VCOV.FREE[[i]] <- vcov(fit_i, type = "free")

      } else {
        k.all  <- length(COEF.ALL[[i]])
        k.free <- length(COEF.FREE[[i]])
        d.all  <- names(COEF.ALL[[i]])
        d.free <- names(COEF.FREE[[i]])

        VCOV.ALL[[i]]  <- matrix(0, nrow = k.all, ncol = k.all,
                                 dimnames = list(d.all, d.all))
        VCOV.FREE[[i]] <- matrix(0, nrow = k.free, ncol = k.free,
                                 dimnames = list(d.free, d.free))
      }

      THETA <- rbind(
        THETA,
        matrix(fit_i$theta, nrow = 1, dimnames = list(NULL, names(fit_i$theta)))
      )
    })

    nprinted <- length(printedLines)
    if (i < m) eraseConsoleLines(nprinted)
  }

  failed <- vapply(fits, FUN.VALUE = logical(1L), FUN = is.null)

  if (any(failed)) {
    warning2(sprintf("Model estimation failed in %d out of %d imputations!",
                     sum(failed), m), immediate. = FALSE)

    fits      <- fits[!failed]
    COEF.ALL  <- COEF.ALL[!failed]
    COEF.FREE <- COEF.FREE[!failed]
    VCOV.ALL  <- VCOV.ALL[!failed]
    VCOV.FREE <- VCOV.FREE[!failed]
    m         <- sum(!failed)
  }

  pool.all  <- rubinPool(COEF.ALL,  VCOV.ALL)
  pool.free <- rubinPool(COEF.FREE, VCOV.FREE)

  coef.all  <- pool.all$theta.bar
  vcov.all  <- pool.all$Tvcov

  coef.free <- pool.free$theta.bar
  vcov.free <- pool.free$Tvcov

  # Re-do parameter estimates
  parTable1   <- parameter_estimates(fits[[1]])
  orig.labels <- parTable1$label
  parTable1   <- getMissingLabels(parTable1)
  parTableT   <- data.frame(label = names(coef.all),
                            est.t = coef.all,
                            std.error.t = sqrt(diag(vcov.all)))

  parTable    <- leftJoin(left = parTable1, right = parTableT, by = "label")
  match       <- !is.na(parTable$est.t)

  parTable$est[match]       <- parTable$est.t[match]
  parTable$std.error[match] <- parTable$std.error.t[match]
  parTable$est.t       <- NULL
  parTable$std.error.t <- NULL
  parTable$label[!parTable$label %in% orig.labels] <- ""
  parTable <- parTable[c("lhs", "op", "rhs", "label", "est", "std.error")] # remove z-statistics
  parTable <- addZStatsParTable(parTable)
  rownames(parTable) <- NULL # reset

  matrices    <- aggregateMatrices(fits, type = "main")
  covMatrices <- aggregateMatrices(fits, type = "cov")
  expected.matrices <- aggregateMatrices(fits, type = "expected")

  getScalarFit <- function(fits, field, dtype = numeric)
    vapply(fits, FUN.VALUE = dtype(1L), \(fit) fit[[field]])

  fit.out <- fits[[1]]
  fit.out$coefs.all       <- coef.all
  fit.out$coefs.free      <- coef.free
  fit.out$vcov.all        <- vcov.all
  fit.out$vcov.free       <- vcov.free
  fit.out$parTable        <- parTable
  fit.out$information     <- sprintf("Rubin-corrected (m=%d)", m)
  fit.out$FIM             <- solve(vcov.free)
  fit.out$theta           <- apply(THETA, MARGIN = 2, FUN = mean, na.rm = TRUE)
  fit.out$iterations      <- sum(getScalarFit(fits, field = "iterations"))
  fit.out$logLik          <- mean(getScalarFit(fits, field = "logLik"))
  fit.out$convergence     <- all(getScalarFit(fits, field = "convergence",
                                              dtype = logical))
  fit.out$convergence.msg <- getConvergenceMessage(fit.out$convergence,
                                                   fit.out$iterations)
  fit.out$model$matrices          <- matrices
  fit.out$model$covModel$matrices <- covMatrices
  fit.out$expected.matrices       <- expected.matrices

  fit.out$imputations <- list(fitted = fits, data = imputed)

  fit.out
}


matrixMeans <- function(matrices, na.rm = TRUE) {
  # Take the element-wise mean of a list of n x p matrices,
  # with the option of ignoring missing values
  if (!length(matrices))
    return(matrix(nrow = 0, ncol = 0))

  flattened <- as.data.frame(lapply(matrices, FUN = c))
  means     <- rowMeans(flattened, na.rm = na.rm)

  dims <- dimnames(matrices[[1]])
  n    <- NROW(matrices[[1]])
  p    <- NCOL(matrices[[1]])

  matrix(means, nrow = n, ncol = p, dimnames = dims)
}


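# Average model matrices element-wise across a list of fitted models. `type`
# selects which set to aggregate: the main model matrices ("main"), the
# covariance-model matrices ("cov"), or the expected matrices ("expected").
# For "main" and "cov" only the numeric parameter matrices listed in
# `namesParMatrices` / `namesParMatricesCov` (defined elsewhere in the
# package) are averaged; all remaining entries are returned as they appear
# in the first fit.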
aggregateMatrices <- function(fits, type) {
  switch(type,
    main = {
      matrices  <- fits[[1]]$model$matrices
      names.num <- namesParMatrices
    },
    cov = {
      matrices  <- fits[[1]]$model$covModel$matrices
      names.num <- namesParMatricesCov
    },
    expected = {
      matrices  <- fits[[1]]$expected.matrices
      names.num <- names(matrices)
    }
  )

  if (is.null(matrices) || !length(matrices))
    return(matrices)

  m <- length(fits)

  for (i in seq_len(m)[-1]) {
    fit_i      <- fits[[i]]
    matrices_i <- switch(type,
      main     = fit_i$model$matrices,
      cov      = fit_i$model$covModel$matrices,
      expected = fit_i$expected.matrices
    )
    matrices_i <- matrices_i[names(matrices_i) %in% names.num]

    for (mat in names(matrices_i)) {
      if (is.null(matrices_i[[mat]]) || !length(matrices_i[[mat]]))
        next

      matrices[[mat]] <- matrices[[mat]] + matrices_i[[mat]]
    }
  }

  for (mat in names.num)
    matrices[[mat]] <- matrices[[mat]] / m

  matrices
}


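# Pool point estimates and covariance matrices across m imputations using
# Rubin's (1987) rules: theta.bar is the mean of the per-imputation estimates,
# W the average within-imputation covariance, B the between-imputation
# covariance, and Tvcov = W + (1 + 1/m) * B the total (Rubin-corrected)
# covariance.
#
# Illustrative sketch (toy inputs, not taken from the package):
#   coefs <- list(c(a = 1.0, b = 2.1), c(a = 1.2, b = 1.9), c(a = 0.9, b = 2.0))
#   vcovs <- replicate(3, diag(0.1, 2), simplify = FALSE)
#   rubinPool(coefs, vcovs)$Tvcov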
rubinPool <- function(coef.list, vcov.list) {
  stopifnot(length(coef.list) == length(vcov.list))

  m     <- length(coef.list)
  THETA <- do.call(cbind, coef.list)
  p     <- nrow(THETA)

  # Pooled point estimate
  theta.bar <- rowMeans(THETA, na.rm = TRUE) # na.rm shouldn't be necessary
  # but just in case...

  # Within-imputation covariance (W)
  W <- matrixMeans(vcov.list, na.rm = TRUE)

  # Between-imputation covariance (B)
  THETAc <- sweep(THETA, 1, theta.bar, "-")
  B      <- (THETAc %*% t(THETAc)) / (m - 1)

  # Total Rubin-corrected covariance (T)
  Tvcov  <- W + (1 + 1/m) * B

  # Degrees of freedom per parameter (Rubin, 1987)
  df <- (m - 1) * (1 + diag(W) / ((1 + 1/m) * diag(B)))^2

  list(theta.bar = theta.bar,
       W         = W,
       B         = B,
       Tvcov     = Tvcov,
       df        = df)
}
