inst/doc/missing-data-and-imputation.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

## ----packages-----------------------------------------------------------------
library(miceFast)
library(dplyr)
library(ggplot2)
set.seed(2025)

## ----quickstart---------------------------------------------------------------
data(air_miss)

# 1. Impute m = 10 completed datasets
completed <- lapply(1:10, function(i) {
  air_miss %>%
    mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")))
})
# 2. Fit the analysis model on each
fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
# 3. Pool using Rubin's rules
summary(pool(fits))

## ----mi-basic-----------------------------------------------------------------
data(air_miss)

# Step 1: Create m = 10 completed datasets
m <- 10
completed <- lapply(1:m, function(i) {
  air_miss %>%
    mutate(
      Ozone_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Ozone",
        posit_x = c("Solar.R", "Wind", "Temp")
      )
    )
})

# Step 2: Fit the analysis model on each
fits <- lapply(completed, function(d) {
  lm(Ozone_imp ~ Wind + Temp, data = d)
})

# Step 3: Pool using Rubin's rules
pool_result <- pool(fits)
pool_result

# Detailed summary with confidence intervals
summary(pool_result)

## ----mi-mixed-----------------------------------------------------------------
data(air_miss)

impute_dataset <- function(data) {
  data %>%
    mutate(
      # Continuous: Bayesian linear regression
      Ozone_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Ozone",
        posit_x = c("Solar.R", "Wind", "Temp")
      ),
      Solar_R_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Solar.R",
        posit_x = c("Wind", "Temp", "Intercept")
      ),
      # Categorical: LDA with random ridge for stochasticity
      Ozone_chac_imp = fill_NA(
        x = ., model = "lda",
        posit_y = "Ozone_chac",
        posit_x = c("Wind", "Temp"),
        ridge = runif(1, 0.5, 50)
      )
    )
}

set.seed(777)
completed <- replicate(10, impute_dataset(air_miss), simplify = FALSE)

# Fit and pool a model for continuous outcome
fits_ozone <- lapply(completed, function(d) {
  lm(Ozone_imp ~ Wind + Temp + Solar_R_imp, data = d)
})
pool(fits_ozone)

## ----mi-glm-------------------------------------------------------------------
data(air_miss)

# Create a binary outcome for demonstration
completed <- lapply(1:10, function(i) {
  d <- air_miss %>%
    mutate(
      Ozone_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Ozone",
        posit_x = c("Solar.R", "Wind", "Temp")
      )
    )
  d$high_ozone <- as.integer(d$Ozone_imp > median(d$Ozone_imp, na.rm = TRUE))
  d
})

fits_logit <- lapply(completed, function(d) {
  glm(high_ozone ~ Wind + Temp, data = d, family = binomial)
})

pool(fits_logit)

## ----mi-grouped---------------------------------------------------------------
data(air_miss)

completed_grouped <- lapply(1:5, function(i) {
  air_miss %>%
    group_by(groups) %>%
    do(mutate(.,
      Ozone_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Ozone",
        posit_x = c("Wind", "Temp", "Intercept")
      )
    )) %>%
    ungroup()
})

fits <- lapply(completed_grouped, function(d) {
  lm(Ozone_imp ~ Wind + Temp + factor(groups), data = d)
})
pool(fits)

## ----mi-weighted--------------------------------------------------------------
data(air_miss)

completed_w <- lapply(1:5, function(i) {
  air_miss %>%
    mutate(
      Solar_R_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Solar.R",
        posit_x = c("Wind", "Temp", "Intercept"),
        w = weights
      )
    )
})

fits_w <- lapply(completed_w, function(d) {
  lm(Solar_R_imp ~ Wind + Temp, data = d)
})
pool(fits_w)

## ----mi-pmm-oop---------------------------------------------------------------
data(air_miss)

dat <- as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")])
dat <- cbind(dat, Intercept = 1)

m <- 10
completed <- lapply(1:m, function(i) {
  model <- new(miceFast)
  model$set_data(dat + 0)            # copy. set_data uses the matrix by reference.
  # impute("pmm", ...) draws from the Bayesian posterior and matches to nearest observed value
  model$update_var(1, model$impute("pmm", 1, c(3, 4, 5))$imputations)
  d <- as.data.frame(model$get_data())
  colnames(d) <- c("Ozone", "Solar.R", "Wind", "Temp", "Intercept")
  d
})

fits_pmm <- lapply(completed, function(d) {
  lm(Ozone ~ Wind + Temp, data = d)
})
pool(fits_pmm)

## ----sensitivity-models-------------------------------------------------------
data(air_miss)

air_sensitivity <- air_miss %>%
  mutate(
    Ozone_bayes = fill_NA(x = ., model = "lm_bayes",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
    Ozone_noise = fill_NA(x = ., model = "lm_noise",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
    Ozone_pmm = fill_NA_N(x = ., model = "pmm",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20),
    Ozone_pred = fill_NA(x = ., model = "lm_pred",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))
  )

compare_imp(air_sensitivity, origin = "Ozone",
  target = c("Ozone_bayes", "Ozone_noise", "Ozone_pmm", "Ozone_pred"))

## ----sensitivity-m------------------------------------------------------------
set.seed(2025)
results <- data.frame()

for (m_val in c(3, 5, 10, 20, 50)) {
  completed <- lapply(1:m_val, function(i) {
    air_miss %>%
      mutate(Ozone_imp = fill_NA(
        x = ., model = "lm_bayes",
        posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
      ))
  })
  fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
  s <- summary(pool(fits))
  s$m <- m_val
  results <- rbind(results, s)
}

results %>%
  filter(term == "Wind") %>%
  ggplot(aes(x = m, y = estimate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
  labs(title = "Stability of Wind coefficient across m",
       x = "Number of imputations (m)",
       y = "Pooled estimate") +
  theme_minimal()

## ----sensitivity-baselines----------------------------------------------------
data(air_miss)
set.seed(2025)

# 1. Complete cases (listwise deletion). Unbiased under MCAR.
fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ])
ci_cc <- confint(fit_cc)

# 2. Mean imputation. Biased; underestimates variance.
air_mean <- air_miss
air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE)
fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean)
ci_mean <- confint(fit_mean)

# 3. Deterministic regression imputation (lm_pred). No residual noise.
air_pred <- air_miss %>%
  mutate(Ozone_imp = fill_NA(
    x = ., model = "lm_pred",
    posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
  ))
fit_pred <- lm(Ozone_imp ~ Wind + Temp, data = air_pred)
ci_pred <- confint(fit_pred)

# 4. Proper MI with Rubin's rules (lm_bayes, m = 20)
completed <- lapply(1:20, function(i) {
  air_miss %>%
    mutate(Ozone_imp = fill_NA(
      x = ., model = "lm_bayes",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
    ))
})
fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
pool_s <- summary(pool(fits))

# Compare Wind coefficient across all methods
comparison <- data.frame(
  method = c("Complete cases", "Mean imputation", "Regression (lm_pred)", "MI (lm_bayes, m=20)"),
  estimate = c(coef(fit_cc)["Wind"], coef(fit_mean)["Wind"],
               coef(fit_pred)["Wind"], pool_s$estimate[pool_s$term == "Wind"]),
  ci_low = c(ci_cc["Wind", 1], ci_mean["Wind", 1],
             ci_pred["Wind", 1], pool_s$conf.low[pool_s$term == "Wind"]),
  ci_high = c(ci_cc["Wind", 2], ci_mean["Wind", 2],
              ci_pred["Wind", 2], pool_s$conf.high[pool_s$term == "Wind"]),
  n_used = c(nrow(air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]),
             nrow(air_miss), nrow(air_miss), nrow(air_miss))
)
comparison$ci_width <- comparison$ci_high - comparison$ci_low
comparison

## ----vonhippel----------------------------------------------------------------
# Pilot with m = 5
set.seed(2025)
pilot <- lapply(1:5, function(i) {
  air_miss %>%
    mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")))
})
pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)))

# Inspect FMI, the key input for deciding m
data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3))

# For the exact formula and its derivation see:
# von Hippel (2020) "How many imputations do you need?", Sociological Methods & Research
# R implementation: install.packages("howManyImputations")

Try the miceFast package in your browser

Any scripts or data that you put into this service are public.

miceFast documentation built on Feb. 26, 2026, 5:06 p.m.