inst/doc/miceFast-intro.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

## ----load-packages------------------------------------------------------------
library(miceFast)
library(dplyr)
library(data.table)
library(ggplot2)
set.seed(123456)

## ----data---------------------------------------------------------------------
data(air_miss)
str(air_miss)

## ----upset, eval=requireNamespace("UpSetR", quietly=TRUE)---------------------
upset_NA(air_miss, 6)

## ----vif----------------------------------------------------------------------
VIF(
  air_miss,
  posit_y = "Ozone",
  posit_x = c("Solar.R", "Wind", "Temp", "x_character", "Day", "weights", "groups")
)

## ----fill-na-basic------------------------------------------------------------
data(air_miss)

result <- air_miss %>%
  # Continuous variable: Bayesian linear model (stochastic)
  mutate(Ozone_imp = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Solar.R", "Wind", "Temp")
  )) %>%
  # Categorical variable: LDA
  mutate(x_char_imp = fill_NA(
    x = .,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp")
  ))

head(result[, c("Ozone", "Ozone_imp", "x_character", "x_char_imp")])

## ----fill-na-grouped----------------------------------------------------------
data(air_miss)

result_grouped <- air_miss %>%
  group_by(groups) %>%
  do(mutate(.,
    Solar_R_imp = fill_NA(
      x = .,
      model = "lm_pred",
      posit_y = "Solar.R",
      posit_x = c("Wind", "Temp", "Intercept"),
      w = .[["weights"]]
    )
  )) %>%
  ungroup()

head(result_grouped[, c("Solar.R", "Solar_R_imp", "groups")])

## ----fill-na-logreg-----------------------------------------------------------
data(air_miss)

result_log <- air_miss %>%
  mutate(Ozone_imp = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Solar.R", "Wind", "Temp", "Intercept"),
    logreg = TRUE
  ))

# Compare distributions: log imputation avoids negative values
summary(result_log[c("Ozone", "Ozone_imp")])

## ----fill-na-position---------------------------------------------------------
data(air_miss)

result_pos <- air_miss %>%
  mutate(Ozone_imp = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  ))

head(result_pos[, c("Ozone", "Ozone_imp")])

## ----fill-na-dt---------------------------------------------------------------
data(air_miss)
setDT(air_miss)

air_miss[, Ozone_imp := fill_NA(
  x = .SD,
  model = "lm_bayes",
  posit_y = "Ozone",
  posit_x = c("Solar.R", "Wind", "Temp")
)]

air_miss[, x_char_imp := fill_NA(
  x = .SD,
  model = "lda",
  posit_y = "x_character",
  posit_x = c("Wind", "Temp")
)]

# With grouping
air_miss[, Solar_R_imp := fill_NA(
  x = .SD,
  model = "lm_pred",
  posit_y = "Solar.R",
  posit_x = c("Wind", "Temp", "Intercept"),
  w = .SD[["weights"]]
), by = .(groups)]

## ----fill-na-n-dplyr----------------------------------------------------------
data(air_miss)

result_n <- air_miss %>%
  # PMM with 20 draws. Imputes from observed values.
  mutate(Ozone_pmm = fill_NA_N(
    x = .,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Solar.R", "Wind", "Temp"),
    k = 20
  )) %>%
  # lm_noise with 30 draws and weights
  mutate(Ozone_noise = fill_NA_N(
    x = .,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Solar.R", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  ))

## ----fill-na-n-dt-------------------------------------------------------------
data(air_miss)
setDT(air_miss)

air_miss[, Ozone_pmm := fill_NA_N(
  x = .SD,
  model = "pmm",
  posit_y = "Ozone",
  posit_x = c("Wind", "Temp", "Intercept"),
  w = .SD[["weights"]],
  logreg = TRUE,
  k = 20
), by = .(groups)]

## ----compare-imp--------------------------------------------------------------
data(air_miss)

air_miss_imp <- air_miss %>%
  mutate(
    Ozone_bayes = fill_NA(x = ., model = "lm_bayes",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")),
    Ozone_pmm = fill_NA_N(x = ., model = "pmm",
      posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20)
  )

compare_imp(air_miss_imp, origin = "Ozone", target = c("Ozone_bayes", "Ozone_pmm"))

## ----mi-workflow--------------------------------------------------------------
data(air_miss)

# Select the 4 core variables: Ozone and Solar.R have NAs; Wind and Temp are complete.
df <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]

# Step 1: Generate m = 10 completed datasets.
# Impute Solar.R first (predictors fully observed), then Ozone
# (can now use the freshly imputed Solar.R). This sequential order
# ensures all NAs are filled in a single pass.
set.seed(1234)
completed <- lapply(1:10, function(i) {
  df %>%
    mutate(Solar.R = fill_NA(., "lm_bayes", "Solar.R", c("Wind", "Temp"))) %>%
    mutate(Ozone   = fill_NA(., "lm_bayes", "Ozone",   c("Solar.R", "Wind", "Temp")))
})

# Step 2: Fit the analysis model on each completed dataset
fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))

# Step 3: Pool using Rubin's rules
pool_res <- pool(fits)
pool_res
summary(pool_res)

## ----full-impute-all----------------------------------------------------------
data(air_miss)

# Define a function that fills all variables with NAs in one pass.
# Order matters: Solar.R first (Wind, Temp are complete), then Ozone
# (uses the freshly imputed Solar.R), then categorical variables.
impute_all <- function(data) {
  data %>%
    # Continuous: Solar.R (predictors fully observed)
    mutate(Solar.R = fill_NA(
      x = ., model = "lm_bayes",
      posit_y = "Solar.R",
      posit_x = c("Wind", "Temp")
    )) %>%
    # Continuous: Ozone (Solar.R now complete)
    mutate(Ozone = fill_NA(
      x = ., model = "lm_bayes",
      posit_y = "Ozone",
      posit_x = c("Solar.R", "Wind", "Temp")
    )) %>%
    # Categorical: x_character (LDA with random ridge for stochasticity)
    mutate(x_character = fill_NA(
      x = ., model = "lda",
      posit_y = "x_character",
      posit_x = c("Wind", "Temp"),
      ridge = runif(1, 0.5, 50)
    )) %>%
    # Categorical: Ozone_chac (use tryCatch for safety with small groups)
    group_by(groups) %>%
    do(mutate(., Ozone_chac = tryCatch(
      fill_NA(
        x = ., model = "lda",
        posit_y = "Ozone_chac",
        posit_x = c("Temp", "Wind")
      ),
      error = function(e) .[["Ozone_chac"]]
    ))) %>%
    ungroup()
}

# MI: generate m = 10 completed datasets
set.seed(2024)
m <- 10
completed <- lapply(1:m, function(i) impute_all(air_miss))

# Fit the analysis model on each completed dataset
fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))

# Pool using Rubin's rules
pool_result <- pool(fits)
pool_result
summary(pool_result)

## ----full-impute-all-dt-------------------------------------------------------
data(air_miss)
setDT(air_miss)

impute_all_dt <- function(dt) {
  d <- copy(dt)
  # Order: Solar.R first (predictors complete), then Ozone, then categoricals
  d[, Solar.R := fill_NA(
    x = .SD, model = "lm_bayes",
    posit_y = "Solar.R",
    posit_x = c("Wind", "Temp")
  )]
  d[, Ozone := fill_NA(
    x = .SD, model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Solar.R", "Wind", "Temp")
  )]
  d[, x_character := fill_NA(
    x = .SD, model = "lda",
    posit_y = "x_character", posit_x = c("Wind", "Temp"),
    ridge = runif(1, 0.5, 50)
  )]
  d[, Ozone_chac := tryCatch(
    fill_NA(
      x = .SD, model = "lda",
      posit_y = "Ozone_chac", posit_x = c("Temp", "Wind")
    ),
    error = function(e) .SD[["Ozone_chac"]]
  ), by = .(groups)]
  d
}

set.seed(2024)
completed_dt <- lapply(1:10, function(i) impute_all_dt(air_miss))
fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d))
pool(fits_dt)

## ----oop-simple---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, 1:4]), intercept = 1, index = 1:nrow(airquality))
model <- new(miceFast)
model$set_data(data)

# Single imputation with linear model (col 1 = Ozone)
model$update_var(1, model$impute("lm_pred", 1, 5)$imputations)

# Averaged multiple imputation for Solar.R (col 2, Bayesian, k=10 draws)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5), k = 10)$imputations)

model$which_updated()
head(model$get_data(), 4)

## ----oop-groups---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(airquality[, 5])

model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)

model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), k = 10)$imputations)

# Recover original row order
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)

## ----oop-unsorted-------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(sample(1:8, nrow(data), replace = TRUE))

model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)

model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), 10)$imputations)

head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)

## ----oop-mi-------------------------------------------------------------------
data(air_miss)

# Prepare a numeric matrix with an intercept and row index
mat <- cbind(
  as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]),
  intercept = 1,
  index = seq_len(nrow(air_miss))
)
groups <- as.numeric(air_miss$groups)

impute_oop <- function(mat, groups) {
  m <- new(miceFast)
  m$set_data(mat + 0)  # copy. set_data uses the matrix by reference.
  m$set_g(groups)
  # col 1 = Ozone, col 2 = Solar.R, col 3 = Wind, col 4 = Temp, col 5 = intercept
  m$update_var(1, m$impute("lm_bayes", 1, c(2, 3, 4))$imputations)
  m$update_var(2, m$impute("lm_bayes", 2, c(3, 4, 5))$imputations)
  completed <- m$get_data()[order(m$get_index()), ]
  as.data.frame(completed)
}

set.seed(2025)
completed <- lapply(1:10, function(i) impute_oop(mat, groups))

fits <- lapply(completed, function(d) lm(V1 ~ V3 + V4, data = d))
pool(fits)
summary(pool(fits))

## ----fcs-dt-------------------------------------------------------------------
data(air_miss)
setDT(air_miss)

fcs_dt <- function(dt, n_iter = 5) {
  d <- copy(dt)
  na_ozone <- is.na(d$Ozone)
  na_solar <- is.na(d[["Solar.R"]])
  d <- naive_fill_NA(d)                          # initialise
  for (iter in seq_len(n_iter)) {
    set(d, which(na_ozone), "Ozone", NA_real_)   # restore NAs
    d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))]
    set(d, which(na_solar), "Solar.R", NA_real_)
    d[, Solar.R := fill_NA_N(.SD, "pmm", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept"))]
  }
  d
}

set.seed(2025)
completed_dt <- lapply(1:10, function(i) fcs_dt(air_miss))
fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Wind + Temp, data = d))
pool(fits_dt)

## ----corrdata-example---------------------------------------------------------
# 3 continuous variables, 100 observations
means <- c(10, 20, 30)
cor_matrix <- matrix(c(
  1.0, 0.7, 0.3,
  0.7, 1.0, 0.5,
  0.3, 0.5, 1.0
), nrow = 3)

cd <- new(corrData, 100, means, cor_matrix)
sim_data <- cd$fill("contin")
round(cor(sim_data), 2)

## ----corrdata-discrete--------------------------------------------------------
# With 2 categorical variables: first argument is nr_cat
cd2 <- new(corrData, 2, 200, means, cor_matrix)
sim_discrete <- cd2$fill("discrete")
head(sim_discrete)

Try the miceFast package in your browser

Any scripts or data that you put into this service are public.

miceFast documentation built on Feb. 26, 2026, 5:06 p.m.