Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
## ---- echo=TRUE, message=FALSE, warning=FALSE---------------------------------
pkgs <- c("miceFast", "mice", "ggplot2", "dplyr", "data.table")
inst <- lapply(pkgs, library, character.only = TRUE)
set.seed(123456)
## ---- echo=TRUE---------------------------------------------------------------
data(air_miss) # airquality with additional variables
## ---- echo=TRUE---------------------------------------------------------------
upset_NA(air_miss, 6)
## ---- echo=TRUE---------------------------------------------------------------
# VIF (Variance Inflation Factor) > ~10 suggests collinearity problems.
VIF(
air_miss,
posit_y = "Ozone",
posit_x = c("Solar.R", "Wind", "Temp", "x_character",
"Day", "weights", "groups")
)
## ---- echo=TRUE---------------------------------------------------------------
air_miss <- air_miss %>%
# Impute with grouping and weights
group_by(groups) %>%
do(mutate(.,
Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
)
)) %>%
ungroup() %>%
# Impute a discrete variable
mutate(x_character_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)) %>%
# Log regression because Ozone is roughly log-normal
mutate(Ozone_imp1 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)) %>%
# Impute by position indices
mutate(Ozone_imp2 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)) %>%
# Imputations (mean of 30 draws)
mutate(Ozone_imp3 = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
mutate(Ozone_imp4 = fill_NA_N(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
group_by(groups) %>%
# Single evaluation model
do(mutate(.,
Ozone_imp5 = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE
)
)) %>%
do(mutate(.,
Ozone_imp6 = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 20
)
)) %>%
ungroup() %>%
# Combine imputations
mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
# Protect against errors in small groups
group_by(groups) %>%
do(mutate(.,
Ozone_chac_imp = tryCatch(
fill_NA(
x = .,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c("Intercept", "Month", "Day", "Temp", "x_character_imp"),
w = .[["weights"]]
),
error = function(e) .[["Ozone_chac"]]
)
)) %>%
ungroup()
## ---- echo=TRUE---------------------------------------------------------------
# Compare original vs. imputed distributions
compare_imp(air_miss, origin = "Ozone", target = "Ozone_imp_mix")
# Or compare multiple imputation columns
compare_imp(air_miss, origin = "Ozone", target = c("Ozone_imp2", "Ozone_imp_mix"))
## ---- echo=TRUE---------------------------------------------------------------
data(air_miss)
setDT(air_miss)
## ---- echo=TRUE---------------------------------------------------------------
# Imputations
air_miss[, Solar_R_imp := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind","Temp","Intercept"),
w = .SD[["weights"]],
k = 100
), by = .(groups)] %>%
.[, x_character_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind","Temp","groups")
)] %>%
.[, Ozone_imp1 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)] %>%
.[, Ozone_imp2 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4,6),
logreg = TRUE
)] %>%
.[, Ozone_imp3 := fill_NA_N(
x = .SD,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept","x_character_imp","Wind","Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
.[, Ozone_imp4 := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept","x_character_imp","Wind","Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
.[, Ozone_imp5 := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept","x_character_imp","Wind","Temp"),
w = .SD[["weights"]],
logreg = TRUE
), by = .(groups)] %>%
.[, Ozone_imp6 := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept","x_character_imp","Wind","Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 10
), by = .(groups)] %>%
# Average across a set of methods
.[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
# tryCatch for small group issues or collinearity
.[, Ozone_chac_imp := tryCatch(
fill_NA(
x = .SD,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c("Intercept","Month","Day","Temp","x_character_imp"),
w = .SD[["weights"]]
),
error = function(e) .SD[["Ozone_chac"]]
), by = .(groups)]
## ---- eval=FALSE--------------------------------------------------------------
# cd <- new(corrData, nr_cat = 3, n_obs = 1000, means = c(0,1,2), cor_matrix = some_matrix)
# generated_data <- cd$fill("discrete")
## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(mice::nhanes), intercept = 1, index = 1:nrow(mice::nhanes))
model <- new(miceFast)
model$set_data(data)
# Single imputation, linear model
imp_result <- model$impute("lm_pred", posit_y = 2, posit_x = 5)
model$update_var(2, imp_result$imputations)
# LDA imputation for a categorical variable
model$update_var(3, model$impute("lda", 3, c(1,2))$imputations)
# Multiple imputation example
multi_imp <- model$impute_N("lm_bayes", 4, c(1,2,3), k = 10)
model$update_var(4, multi_imp$imputations)
# Check which variables were updated
model$which_updated()
# Always be cautious with 'update_var'
# since it permanently alters the object and your data matrix.
## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(airquality[,-5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(airquality[,5])
model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)
# Impute with group-wise weighting
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1,3,4,5,6), k = 10)$imputations)
# Inspect updated data, returning to original order via model$get_index()
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)
## ---- echo=TRUE---------------------------------------------------------------
data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality))
weights <- rgamma(nrow(data), 3, 3)
groups <- as.numeric(sample(1:8, nrow(data), replace = TRUE))
model <- new(miceFast)
model$set_data(data)
model$set_w(weights)
model$set_g(groups)
# The data will be automatically sorted by 'groups' during the first imputation:
model$update_var(1, model$impute("lm_pred", 1, 6)$imputations)
model$update_var(2, model$impute_N("lm_bayes", 2, c(1,3,4,5,6), 10)$imputations)
head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.