Nothing
context("predict_mi")
library(dplyr)
library(mice)
library(testthat)
suppressWarnings(RNGversion("3.5.0"))
# load data
dat <- mice::nhanes
# add indicator training and test
# first 20 for training, last 5 for testing
dat$set <- c(rep("train", 20), rep("test", 5))
# Make prediction matrix and ensure that set is not used as a predictor
predmat <- mice::make.predictorMatrix(dat)
predmat[,"set"] <- 0
# train imputation model
imp <- mice(dat, m = 5, maxit = 5 ,seed = 1,
predictorMatrix = predmat,
ignore = ifelse(dat$set == "test", TRUE, FALSE))
# extract the training and test data sets
impdats <- mice::complete(imp, "all")
traindats <- lapply(impdats, function(dat) subset(dat, set == "train", select = -set))
testdats <- lapply(impdats, function(dat) subset(dat, set == "test", select = -c(set)))
# predict age with other variables with training datasets
fits <- lapply(traindats, function(dat) lm(age ~ bmi + hyp + chl, data = dat))
# pool the predictions with function
pool_preds <- predict_mi(object = fits,
newdata = testdats,
pool = TRUE,
interval = "prediction",
level = 0.95)
# obtain the imputed value and the width of the prediction interval
predfunc <- function(model, data, level) {
summfit <- summary(model)
sigma <- summfit$sigma
xmat <- model.matrix(summfit$terms, data = as.data.frame(data))
se <- sqrt(1 + rowSums((xmat %*% summfit$cov.unscaled) * xmat))
xfit <- xmat %*% model$coefficients
# variance
var_pred = se^2 * sigma^2
return(cbind(model = xfit, var_pred = var_pred))
}
# obtain predictions for the different imputations
preds_all <- Map(predfunc, model = fits, data = testdats, level = 0.95)
# change the list to a df
preds <- unlist(preds_all) %>%
array(dim = c(nrow(testdats[[1]]), 2, 5))
# our estimand is the predicted value
# so Q bar is the mean over the predictions
Q_bar <- apply(preds[, 1, ], 1, mean)
# The uncertainty estimate is the width of the prediction interval
U_bar <- apply(preds[, 2, ], 1, mean)
# The between imputation variance is the variance of the predictions
B <- apply(preds[, 1, ], 1, var)
# The total variance is T + U_bar + B + B/M (with m the number of imputations)
T_var <- U_bar + B + B / 5
# calculate the degrees of freedom for all observations
df_vector <- numeric(length(Q_bar))
t_vector <- numeric(length(Q_bar))
for (i in 1:length(Q_bar)) {
df_vector[i] <- mice:::barnard.rubin(5, B[i], T_var[i])
t_vector[i] <- qt(1 - (1-0.95)/2, df_vector[i])
}
# Calculate bounds using individual t-values
lwr <- Q_bar - t_vector * sqrt(T_var)
upr <- Q_bar + t_vector * sqrt(T_var)
# check if the output structure is as expected
test_that("Output class is correct", {
expect_type(pool_preds, "double") # checks storage type
expect_true(is.matrix(pool_preds)) # checks it's a matrix
})
# check if result is the same, for by hand or in function
# Note pool_preds function depends on mice::pool.scalar
test_that("retains same numerical result", {
expect_equal(unname(pool_preds[, 1]), Q_bar, tolerance = 0.00001)
expect_equal(unname(pool_preds[, 2]), lwr, tolerance = 0.00001)
expect_equal(unname(pool_preds[, 3]), upr, tolerance = 0.00001)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.