Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
## ----load-packages------------------------------------------------------------
library(miceFast)
library(dplyr)
library(data.table)
library(ggplot2)
set.seed(123456)
## ----data---------------------------------------------------------------------
data(air_miss)
str(air_miss)
## ----upset, eval=requireNamespace("UpSetR", quietly=TRUE)---------------------
upset_NA(air_miss, 6)
## ----vif----------------------------------------------------------------------
VIF(
air_miss,
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp", "x_character", "Day", "weights", "groups")
)
## ----fill-na-basic------------------------------------------------------------
data(air_miss)
result <- air_miss %>%
# Continuous variable: Bayesian linear model (stochastic)
mutate(Ozone_imp = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)) %>%
# Categorical variable: LDA
mutate(x_char_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
))
head(result[, c("Ozone", "Ozone_imp", "x_character", "x_char_imp")])
## ----fill-na-grouped----------------------------------------------------------
data(air_miss)
result_grouped <- air_miss %>%
group_by(groups) %>%
do(mutate(.,
Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
)
)) %>%
ungroup()
head(result_grouped[, c("Solar.R", "Solar_R_imp", "groups")])
## ----fill-na-logreg-----------------------------------------------------------
data(air_miss)
result_log <- air_miss %>%
mutate(Ozone_imp = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp", "Intercept"),
logreg = TRUE
))
# Compare distributions: log imputation avoids negative values
summary(result_log[c("Ozone", "Ozone_imp")])
## ----fill-na-position---------------------------------------------------------
data(air_miss)
result_pos <- air_miss %>%
mutate(Ozone_imp = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
))
head(result_pos[, c("Ozone", "Ozone_imp")])
## ----fill-na-dt---------------------------------------------------------------
data(air_miss)
setDT(air_miss)
air_miss[, Ozone_imp := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)]
air_miss[, x_char_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)]
# With grouping
air_miss[, Solar_R_imp := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]]
), by = .(groups)]
## ----fill-na-n-dplyr----------------------------------------------------------
data(air_miss)
result_n <- air_miss %>%
# PMM with 20 draws. Imputes from observed values.
mutate(Ozone_pmm = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp"),
k = 20
)) %>%
# lm_noise with 30 draws and weights
mutate(Ozone_noise = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
))
## ----fill-na-n-dt-------------------------------------------------------------
data(air_miss)
setDT(air_miss)
air_miss[, Ozone_pmm := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]],
logreg = TRUE,
k = 20
), by = .(groups)]
## ----compare-imp--------------------------------------------------------------
data(air_miss)
air_miss_imp <- air_miss %>%
mutate(
Ozone_bayes = fill_NA(x = ., model = "lm_bayes",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
Ozone_pmm = fill_NA_N(x = ., model = "pmm",
posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20)
)
compare_imp(air_miss_imp, origin = "Ozone", target = c("Ozone_bayes", "Ozone_pmm"))
## ----mi-workflow--------------------------------------------------------------
data(air_miss)
# Select the 4 core variables: Ozone and Solar.R have NAs; Wind and Temp are complete.
df <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]
# Step 1: Generate m = 10 completed datasets.
# Impute Solar.R first (predictors fully observed), then Ozone
# (can now use the freshly imputed Solar.R). This sequential order
# ensures all NAs are filled in a single pass.
set.seed(1234)
completed <- lapply(1:10, function(i) {
df %>%
mutate(Solar.R = fill_NA(., "lm_bayes", "Solar.R", c("Wind", "Temp"))) %>%
mutate(Ozone = fill_NA(., "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp")))
})
# Step 2: Fit the analysis model on each completed dataset
fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))
# Step 3: Pool using Rubin's rules
pool_res <- pool(fits)
pool_res
summary(pool_res)
## ----full-impute-all----------------------------------------------------------
data(air_miss)
# Define a function that fills all variables with NAs in one pass.
# Order matters: Solar.R first (Wind, Temp are complete), then Ozone
# (uses the freshly imputed Solar.R), then categorical variables.
impute_all <- function(data) {
data %>%
# Continuous: Solar.R (predictors fully observed)
mutate(Solar.R = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp")
)) %>%
# Continuous: Ozone (Solar.R now complete)
mutate(Ozone = fill_NA(
x = ., model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)) %>%
# Categorical: x_character (LDA with random ridge for stochasticity)
mutate(x_character = fill_NA(
x = ., model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp"),
ridge = runif(1, 0.5, 50)
)) %>%
# Categorical: Ozone_chac (use tryCatch for safety with small groups)
group_by(groups) %>%
do(mutate(., Ozone_chac = tryCatch(
fill_NA(
x = ., model = "lda",
posit_y = "Ozone_chac",
posit_x = c("Temp", "Wind")
),
error = function(e) .[["Ozone_chac"]]
))) %>%
ungroup()
}
# MI: generate m = 10 completed datasets
set.seed(2024)
m <- 10
completed <- lapply(1:m, function(i) impute_all(air_miss))
# Fit the analysis model on each completed dataset
fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))
# Pool using Rubin's rules
pool_result <- pool(fits)
pool_result
summary(pool_result)
## ----full-impute-all-dt-------------------------------------------------------
data(air_miss)
setDT(air_miss)
impute_all_dt <- function(dt) {
d <- copy(dt)
# Order: Solar.R first (predictors complete), then Ozone, then categoricals
d[, Solar.R := fill_NA(
x = .SD, model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp")
)]
d[, Ozone := fill_NA(
x = .SD, model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp")
)]
d[, x_character := fill_NA(
x = .SD, model = "lda",
posit_y = "x_character", posit_x = c("Wind", "Temp"),
ridge = runif(1, 0.5, 50)
)]
d[, Ozone_chac := tryCatch(
fill_NA(
x = .SD, model = "lda",
posit_y = "Ozone_chac", posit_x = c("Temp", "Wind")
),
error = function(e) .SD[["Ozone_chac"]]
), by = .(groups)]
d
}
set.seed(2024)
completed_dt <- lapply(1:10, function(i) impute_all_dt(air_miss))
fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))
pool(fits_dt)
## ----oop-simple---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, 1:4]), intercept = 1, index = 1:nrow(airquality))
model <- new(miceFast)
model$set_data(data)
# Single imputation with linear model (col 1 = Ozone)
model$update_var(1, model$impute("lm_pred", 1, 5)$imputations)
# Averaged multiple imputation for Solar.R (col 2, Bayesian, k=10 draws)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5), k = 10)$imputations)
model$which_updated()
head(model$get_data(), 4)
## ----oop-groups---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(airquality[, 5])
model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), k = 10)$imputations)
# Recover original row order
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)
## ----oop-unsorted-------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(sample(1:8, nrow(data), replace = TRUE))
model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), 10)$imputations)
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)
## ----oop-mi-------------------------------------------------------------------
data(air_miss)
# Prepare a numeric matrix with an intercept and row index
mat <- cbind(
as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]),
intercept = 1,
index = seq_len(nrow(air_miss))
)
groups <- as.numeric(air_miss$groups)
impute_oop <- function(mat, groups) {
m <- new(miceFast)
m$set_data(mat + 0) # copy. set_data uses the matrix by reference.
m$set_g(groups)
# col 1 = Ozone, col 2 = Solar.R, col 3 = Wind, col 4 = Temp, col 5 = intercept
m$update_var(1, m$impute("lm_bayes", 1, c(2, 3, 4))$imputations)
m$update_var(2, m$impute("lm_bayes", 2, c(3, 4, 5))$imputations)
completed <- m$get_data()[order(m$get_index()), ]
as.data.frame(completed)
}
set.seed(2025)
completed <- lapply(1:10, function(i) impute_oop(mat, groups))
fits <- lapply(completed, function(d) lm(V1 ~ V3 + V4, data = d))
pool(fits)
summary(pool(fits))
## ----fcs-dt-------------------------------------------------------------------
data(air_miss)
setDT(air_miss)
fcs_dt <- function(dt, n_iter = 5) {
d <- copy(dt)
na_ozone <- is.na(d$Ozone)
na_solar <- is.na(d[["Solar.R"]])
d <- naive_fill_NA(d) # initialise
for (iter in seq_len(n_iter)) {
set(d, which(na_ozone), "Ozone", NA_real_) # restore NAs
d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))]
set(d, which(na_solar), "Solar.R", NA_real_)
d[, Solar.R := fill_NA_N(.SD, "pmm", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept"))]
}
d
}
set.seed(2025)
completed_dt <- lapply(1:10, function(i) fcs_dt(air_miss))
fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Wind + Temp, data = d))
pool(fits_dt)
## ----corrdata-example---------------------------------------------------------
# 3 continuous variables, 100 observations
means <- c(10, 20, 30)
cor_matrix <- matrix(c(
1.0, 0.7, 0.3,
0.7, 1.0, 0.5,
0.3, 0.5, 1.0
), nrow = 3)
cd <- new(corrData, 100, means, cor_matrix)
sim_data <- cd$fill("contin")
round(cor(sim_data), 2)
## ----corrdata-discrete--------------------------------------------------------
# With 2 categorical variables: first argument is nr_cat
cd2 <- new(corrData, 2, 200, means, cor_matrix)
sim_discrete <- cd2$fill("discrete")
head(sim_discrete)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.