Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
## ----packages-----------------------------------------------------------------
library(miceFast)
library(dplyr)
library(ggplot2)
set.seed(2025)
## ----quickstart---------------------------------------------------------------
data(air_miss)
# 1. Impute m = 10 completed datasets
completed <- lapply(1:10, function(i) {
air_miss %>%
mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")))
})
# 2. Fit the analysis model on each
fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
# 3. Pool using Rubin's rules
summary(pool(fits))
## ----mi-basic-----------------------------------------------------------------
data(air_miss)
# Step 1: Create m = 10 completed datasets
m <- 10
completed <- lapply(1:m, function(i) {
air_miss %>%
mutate(
Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)
)
})
# Step 2: Fit the analysis model on each
fits <- lapply(completed, function(d) {
lm(Ozone_imp ~ Wind + Temp, data = d)
})
# Step 3: Pool using Rubin's rules
pool_result <- pool(fits)
pool_result
# Detailed summary with confidence intervals
summary(pool_result)
## ----mi-mixed-----------------------------------------------------------------
data(air_miss)
impute_dataset <- function(data) {
data %>%
mutate(
# Continuous: Bayesian linear regression
Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
),
Solar_R_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept")
),
# Categorical: LDA with random ridge for stochasticity
Ozone_chac_imp = fill_NA(
x = ., model = "lda",
posit_y = "Ozone_chac",
posit_x = c("Wind", "Temp"),
ridge = runif(1, 0.5, 50)
)
)
}
set.seed(777)
completed <- replicate(10, impute_dataset(air_miss), simplify = FALSE)
# Fit and pool a model for continuous outcome
fits_ozone <- lapply(completed, function(d) {
lm(Ozone_imp ~ Wind + Temp + Solar_R_imp, data = d)
})
pool(fits_ozone)
## ----mi-glm-------------------------------------------------------------------
data(air_miss)
# Create a binary outcome for demonstration
completed <- lapply(1:10, function(i) {
d <- air_miss %>%
mutate(
Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)
)
d$high_ozone <- as.integer(d$Ozone_imp > median(d$Ozone_imp, na.rm = TRUE))
d
})
fits_logit <- lapply(completed, function(d) {
glm(high_ozone ~ Wind + Temp, data = d, family = binomial)
})
pool(fits_logit)
## ----mi-grouped---------------------------------------------------------------
data(air_miss)
completed_grouped <- lapply(1:5, function(i) {
air_miss %>%
group_by(groups) %>%
do(mutate(.,
Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Wind", "Temp", "Intercept")
)
)) %>%
ungroup()
})
fits <- lapply(completed_grouped, function(d) {
lm(Ozone_imp ~ Wind + Temp + factor(groups), data = d)
})
pool(fits)
## ----mi-weighted--------------------------------------------------------------
data(air_miss)
completed_w <- lapply(1:5, function(i) {
air_miss %>%
mutate(
Solar_R_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = weights
)
)
})
fits_w <- lapply(completed_w, function(d) {
lm(Solar_R_imp ~ Wind + Temp, data = d)
})
pool(fits_w)
## ----mi-pmm-oop---------------------------------------------------------------
data(air_miss)
dat <- as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")])
dat <- cbind(dat, Intercept = 1)
m <- 10
completed <- lapply(1:m, function(i) {
model <- new(miceFast)
model$set_data(dat + 0) # copy. set_data uses the matrix by reference.
# impute("pmm", ...) draws from the Bayesian posterior and matches to nearest observed value
model$update_var(1, model$impute("pmm", 1, c(3, 4, 5))$imputations)
d <- as.data.frame(model$get_data())
colnames(d) <- c("Ozone", "Solar.R", "Wind", "Temp", "Intercept")
d
})
fits_pmm <- lapply(completed, function(d) {
lm(Ozone ~ Wind + Temp, data = d)
})
pool(fits_pmm)
## ----sensitivity-models-------------------------------------------------------
data(air_miss)
air_sensitivity <- air_miss %>%
mutate(
Ozone_bayes = fill_NA(x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
Ozone_noise = fill_NA(x = ., model = "lm_noise",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
Ozone_pmm = fill_NA_N(x = ., model = "pmm",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20),
Ozone_pred = fill_NA(x = ., model = "lm_pred",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))
)
compare_imp(air_sensitivity, origin = "Ozone",
target = c("Ozone_bayes", "Ozone_noise", "Ozone_pmm", "Ozone_pred"))
## ----sensitivity-m------------------------------------------------------------
set.seed(2025)
results <- data.frame()
for (m_val in c(3, 5, 10, 20, 50)) {
completed <- lapply(1:m_val, function(i) {
air_miss %>%
mutate(Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
))
})
fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
s <- summary(pool(fits))
s$m <- m_val
results <- rbind(results, s)
}
results %>%
filter(term == "Wind") %>%
ggplot(aes(x = m, y = estimate)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
labs(title = "Stability of Wind coefficient across m",
x = "Number of imputations (m)",
y = "Pooled estimate") +
theme_minimal()
## ----sensitivity-baselines----------------------------------------------------
data(air_miss)
set.seed(2025)
# 1. Complete cases (listwise deletion). Unbiased under MCAR.
fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ])
ci_cc <- confint(fit_cc)
# 2. Mean imputation. Biased; underestimates variance.
air_mean <- air_miss
air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE)
fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean)
ci_mean <- confint(fit_mean)
# 3. Deterministic regression imputation (lm_pred). No residual noise.
air_pred <- air_miss %>%
mutate(Ozone_imp = fill_NA(
x = ., model = "lm_pred",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
))
fit_pred <- lm(Ozone_imp ~ Wind + Temp, data = air_pred)
ci_pred <- confint(fit_pred)
# 4. Proper MI with Rubin's rules (lm_bayes, m = 20)
completed <- lapply(1:20, function(i) {
air_miss %>%
mutate(Ozone_imp = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
))
})
fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))
pool_s <- summary(pool(fits))
# Compare Wind coefficient across all methods
comparison <- data.frame(
method = c("Complete cases", "Mean imputation", "Regression (lm_pred)", "MI (lm_bayes, m=20)"),
estimate = c(coef(fit_cc)["Wind"], coef(fit_mean)["Wind"],
coef(fit_pred)["Wind"], pool_s$estimate[pool_s$term == "Wind"]),
ci_low = c(ci_cc["Wind", 1], ci_mean["Wind", 1],
ci_pred["Wind", 1], pool_s$conf.low[pool_s$term == "Wind"]),
ci_high = c(ci_cc["Wind", 2], ci_mean["Wind", 2],
ci_pred["Wind", 2], pool_s$conf.high[pool_s$term == "Wind"]),
n_used = c(nrow(air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]),
nrow(air_miss), nrow(air_miss), nrow(air_miss))
)
comparison$ci_width <- comparison$ci_high - comparison$ci_low
comparison
## ----vonhippel----------------------------------------------------------------
# Pilot with m = 5
set.seed(2025)
pilot <- lapply(1:5, function(i) {
air_miss %>%
mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")))
})
pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)))
# Inspect FMI, the key input for deciding m
data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3))
# For the exact formula and its derivation see:
# von Hippel (2020) "How many imputations do you need?", Sociological Methods & Research
# R implementation: install.packages("howManyImputations")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.