R/mifa-ci.R
In mifa: Multiple Imputation for Exploratory Factor Analysis

Documented in combine_rubin get_fieller_ci mifa_ci_boot mifa_ci_fieller

#' Bootstrap confidence intervals for explained variance
#'
#' Compute bootstrap confidence intervals for the proportion of explained
#' variance for the covariance of an incomplete data imputed using
#' multiple imputation.
#' For multiple imputation, Multivariate Imputation by Chained Equations
#' (MICE) from the \link[mice]{mice} package is used.
#'
#' This function uses the Shao and Sitter (1996) method to combine multiple
#' imputation and bootstrapping. The imputations are done using [mice::mice()].
#'
#' Normally, this function does not need to be called directly. Instead,
#' use `mifa(..., ci = "boot")`.
#'
#' @references
#' Shao, J. & Sitter, R. R. (1996). Bootstrap for imputed survey data.
#' Journal of the American Statistical Association 91.435 (1996): 1278-1288.
#' \doi{10.1080/01621459.1996.10476997}
#'
#' @param progress Logical. Whether to show progress bars for computation of
#' bootstrap confidence intervals. Default is FALSE.
#' @inheritParams mifa
#' @inheritDotParams mice::mice
#'
#' @seealso [mifa()], [mice::mice()]
#' @family mifa confidence intervals
#'
#' @return A data frame containing bootstrapped confidence intervals for
#' variance explained by different number of principal components.
#' @export
#' @examples
#' \donttest{
#' if(requireNamespace("psych")) {
#'   data <- psych::bfi[, 1:25]
#'   mifa_ci_boot(data, n_pc = 3:8, n_boot = 10, print = FALSE)
#' }
#' }
mifa_ci_boot <- function(data, cov_vars = dplyr::everything(), n_pc,
                         conf = .95, n_boot = 1000, progress = FALSE, ...) {
  # check and clean arguments
  checkmate::assert_data_frame(data, min.rows = 1, min.cols = 2)
  checkmate::assert_data_frame(dplyr::select(data, {{ cov_vars }}), min.cols = 2)
  checkmate::assert_number(conf, lower = 0, upper = 1)
  checkmate::assert_count(n_boot, positive = TRUE)
  checkmate::assert_flag(progress)

  n_cov_vars <- ncol(dplyr::select(data, {{ cov_vars }}))
  n_pc <- clean_n_pc(n_pc, n_cov_vars)

  # bootstrap samples and compute variance explained by principal components
  # (i.e. eigenvalues of eigenvectors)
  boot_eig <- matrix(0, n_cov_vars, n_boot)

  if (progress) {
    cat("\n\n  Computing bootstrap confidence intervals...\n")
    pb <- utils::txtProgressBar(min = 0, max = n_boot, initial = 0, style = 3)
  }
  for (i in 1:n_boot) {
    # draw bootstrap samples and impute until there are no constants
    repeat {
      boot_idx <- sample(nrow(data), replace = TRUE)
      data_boot <- data[boot_idx, ]
      try({
        imp <- stop_constants(mice_impute_once(data_boot, ...))
        break
      })
    }

    # extract imputed dataset, make sure no NAs are left
    data_imp <- mice::complete(imp)
    data_imp <- mice_impute_all_NA(data_imp, ...)

    # Select variables for calculation of covariance matrix
    data_imp <- dplyr::select(data_imp, {{ cov_vars }})

    # eigenvalues of covariance matrix
    boot_eig[, i] <- eigen(stats::cov(data_imp))$values

    if (progress) utils::setTxtProgressBar(pb, i)
  }

  # get confidence intervals for variance explained by  principal components
  # (i.e., eigenvalues of eigenvectors)
  var_expl <- t(apply(boot_eig, 2, cumsum)) / apply(boot_eig, 2, sum)
  probs_ci <- c((1 - conf) / 2, 1 - (1 - conf) / 2)
  boot_expl <- apply(var_expl, 2, stats::quantile, probs = probs_ci)
  boot_cis <- t(boot_expl)[n_pc, ]

  data.frame(
    n_pc = n_pc,
    lower = boot_cis[, 1],
    upper = boot_cis[, 2]
  )
}



#' Fieller confidence intervals for explained variance
#'
#' Computes parametric confidence intervals for proportion of explained
#' variance for given numbers of principal components using Fieller's method.
#' Note that by setting `ci = TRUE` in [mifa()], this confidence
#' interval can be computed as well.
#'
#' Normally, this function does not need to be called directly. Instead,
#' use `mifa(..., ci = "fieller")`.
#'
#' @references
#' Fieller, E. C. (1954). Some problems in interval estimation.
#' Journal of the Royal Statistical Society. Series B (Methodological): 175-185.
#'
#' @param cov_imps List containing the estimated covariance matrix within
#' each imputed data. One can use `cov_imputations` returned by [mifa()].
#' @param N A scalar specifying sample size.
#'
#' @inheritParams mifa
#'
#' @seealso [mifa()]
#' @family mifa confidence intervals
#'
#' @return A data frame containing confidence intervals for `n_pc` principal
#' components.
#' @export
#' @examples
#' \donttest{
#' if(requireNamespace("psych")) {
#'   data <- psych::bfi[, 1:25]
#'   mi <- mifa(data, print = FALSE)
#'   mifa_ci_fieller(mi$cov_imputations, n_pc = 3:8, N = nrow(data))
#' }
#' }
mifa_ci_fieller <- function(cov_imps, n_pc, conf = .95, N) {
  # check and clean arguments
  checkmate::assert_list(cov_imps, "matrix", any.missing = FALSE, min.len = 1)
  lapply(cov_imps, function(x) {
    checkmate::assert_matrix(x, ncols = nrow(x), any.missing = FALSE)
  })
  checkmate::assert_number(conf, lower = 0, upper = 1)
  checkmate::assert_count(N, positive = TRUE)

  n_cov_vars <- ncol(cov_imps[[1]])
  n_pc <- clean_n_pc(n_pc, n_cov_vars)

  # get eigenvalues
  m <- length(cov_imps)
  eig_imp <- matrix(0, m, dim(cov_imps[[1]])[1])

  for (i in 1:m) {
    eig_imp[i, ] <- eigen(cov_imps[[i]])$values
  }

  # compute Fieller confidence intervals for each number of principal components
  ci_fieller <- matrix(NA, length(n_pc), 2)
  for (i in seq_along(n_pc)) {
    try({
      ci_fieller[i, ] <- get_fieller_ci(eig_imp, n_pc[i], conf, N, m)
    })
  }

  out <- data.frame(cbind(n_pc, ci_fieller))
  colnames(out) <- c("n_pc", "lower", "upper")
  return(out)
}



#' Find the Fieller interval for each k
#'
#' This function is used by [mifa_ci_fieller()] to compute Fieller's confidence
#' intervals for each of the components of n_pc.
#'
#' @param eig_imp A matrix with each of its columns the eigenvalues of the
#' estimated covariance matrix for each imputed data.
#' @param m A scalar specifying number of multiple imputations.
#'
#' @inheritParams mifa_ci_fieller
#'
#' @return A vector of length 2, containing the lower and upper bounds of
#' estimated Fieller's interval.
#' @keywords internal
get_fieller_ci <- function(eig_imp, n_pc, conf, N, m) {

  # combine imputations
  eig_imp2 <- eig_imp^2
  mi_cov <- lapply(1:m, function(i) (2 / N) * diag(eig_imp2[i, ]))

  mi_comb <- combine_rubin(eig_imp, mi_cov)
  cov_lambda <- mi_comb$cov_param

  # compute Fieller's intervals
  P <- dim(cov_lambda)[1]
  A1 <- matrix(c(rep(1, n_pc), rep(0, P - n_pc)), 1, P)
  A <- matrix(1, 1, P)
  s11_comb <- (A1 %*% cov_lambda) %*% t(A1)
  s22_comb <- (A %*% cov_lambda) %*% t(A)
  s12_comb <- (A1 %*% cov_lambda) %*% t(A)
  S <- matrix(c(s11_comb, s12_comb, s12_comb, s22_comb), 2, 2)
  s11 <- S[1, 1]
  s22 <- S[2, 2]
  s12 <- S[1, 2]

  eigen.all <- sum(mi_comb$param_est)
  eigen.first <- sum(mi_comb$param_est[1:n_pc])

  C12 <- s11 / (eigen.first^2)
  C22 <- s22 / (eigen.all^2)
  r <- s12 / sqrt(s11 * s22)
  T.crit <- stats::qnorm(1 - ((1 - conf) / 2))
  A <- C12 + C22 - (2 * r * sqrt(C12 * C22))
  B <- (T.crit^2) * C12 * C22 * (1 - (r^2))

  if ((A - B) < 0) {
    stop("Computing Fieller CI is not possible. Try using more imputations.")
  }

  fieller1 <- eigen.first / eigen.all
  fieller2 <- 1 - (T.crit^2 * r * sqrt(C12 * C22))
  fieller3 <- T.crit * sqrt(A - B)
  fieller4 <- 1 - ((T.crit^2) * C22)

  c(
    lower = fieller1 * ((fieller2 - fieller3) / fieller4),
    upper = fieller1 * ((fieller2 + fieller3) / fieller4)
  )
}



#' Combine results from different imputations using Rubin's rules
#'
#' Applies Rubin's rules to combine estimates and
#' variance-covariance matrices from different imputations.
#'
#' @references
#' Rubin D. B. Multiple imputation for nonresponse in surveys (2004).
#' John Wiley & Sons.
#'
#' @param param_imps Matrix containing estimated parameters in each imputation
#' as its rows.
#' @param cov_imps List of estimated covariance matrices for each imputation.
#'
#' @return A list:
#' \describe{
#'   \item{param_est}{Vector of combined parameter estimates with the same length
#'   as columns in `param_imps`.}
#'   \item{cov_param}{Combined variance-covariance matrix of size n x n, where n
#'   is the number of columns in `param_imps`.}
#'   \item{between_cov}{Between imputations variance-covariance matrix of size
#'   n x n, where n is the number of columns in `param_imps`.}
#' }
#' @keywords internal
combine_rubin <- function(param_imps, cov_imps) {
  m <- length(cov_imps)

  est_diff <- scale(param_imps, center = TRUE, scale = FALSE)
  cov_param_sample1 <- lapply(1:m, function(i) est_diff[i, ] %*% t(est_diff[i, ]))
  cov_param_sample1 <- Reduce("+", cov_param_sample1) / (m - 1)

  cov_param_sample <- cov_param_sample1 * ((m + 1) / m)
  cov_param_mean <- Reduce(`+`, cov_imps) / m
  cov_param <- cov_param_mean + cov_param_sample

  list(
    param_est = apply(param_imps, 2, mean),
    cov_param = cov_param,
    cov_between = cov_param_sample1
  )
}