Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(pminternal)
library(mice)
# make some data
set.seed(2345)
n <- 800
p <- 5
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
LP <- -1 + X %*% c(-1, 1, -.5, .5, 0)
y <- rbinom(n, 1, plogis(LP))
mean(y)
datcomp <- data.frame(y, X)
# add missingness
datmis <- datcomp
datmis$X1[sample(n, 200)] <- NA
datmis$X2[sample(n, 200)] <- NA
datmis$X3[sample(n, 200)] <- NA
# assess performance before introducing missingness
mod <- glm(y ~ ., data = datcomp, family = "binomial")
(comp_val <- validate(mod, method = "cv_o"))
## -----------------------------------------------------------------------------
model_mi <- function(data, ...){
imp <- mice::mice(data, printFlag = FALSE) # 5 imputed datasets via pmm
fits <- with(imp, glm(y ~ X1 + X2 + X3 + X4 + X5, family=binomial))
B <- mice::pool(fits)$pooled$estimate # pooled coefs for predictions
# save pooled coefficients and the data to do stacked imputation at validation
list(B, data)
}
pred_mi <- function(model, data, ...){
# extract pooled coefs
B <- model[[1]]
# stack the new data on the model fit data (model[[2]])
dstacked <- rbind(model[[2]], data)
i <- (nrow(model[[2]]) + 1):(nrow(dstacked))
# impute stacked data
# use ignore argument to ensure the test data doesn't influence
# the imputation model but still gets imputed
imp <- mice::mice(dstacked, printFlag = FALSE,
ignore = seq(nrow(dstacked)) %in% i)
# get logit predicted risks for each imputed data set
preds <- sapply(seq(imp$m), \(x){
# complete(imp, x)[i, ] = get complete data set x and extract the
# validation data i
X <- model.matrix(~ X1 + X2 + X3 + X4 + X5, data = mice::complete(imp, x)[i, ])
X %*% B
})
# average logit predictions, transform invlogit, and return
plogis(apply(preds, 1, mean))
}
## -----------------------------------------------------------------------------
(mi_val <- validate(data = datmis,
model_fun = model_mi, pred_fun = pred_mi,
outcome = "y", method = "cv_o"))
## ----fig.width=4, fig.height=5------------------------------------------------
md <- md.pattern(datmis)
## -----------------------------------------------------------------------------
# store missing data pattern in data
# as this will be useful
datmis$mdp <- apply(datmis[, paste0("X", 1:5)], 1, \(x) paste0(as.numeric(is.na(x)), collapse = ""))
table(datmis$mdp)
## -----------------------------------------------------------------------------
submodels <- list(
"00000" = y ~ X1 + X2 + X3 + X4 + X5,
"00100" = y ~ X1 + X2 + X4 + X5,
"01000" = y ~ X1 + X3 + X4 + X5,
"01100" = y ~ X1 + X4 + X5,
"10000" = y ~ X2 + X3 + X4 + X5,
"10100" = y ~ X2 + X4 + X5,
"11000" = y ~ X3 + X4 + X5,
"11100" = y ~ X4 + X5
)
model_ps <- function(data, ...){
# this example uses submodels as an additional argument to
# validate. But we could have put the submodels list in the
# model_sub function
dots <- list(...)
if ("submodels" %in% names(dots))
submodels <- dots[["submodels"]]
else
stop("no submodels")
patterns <- names(submodels)
# all patterns in data should be in submodels,
# but not necessarily vice versa (resampling
# could have omitted a pattern)
stopifnot(all(unique(data$mdp) %in% patterns))
fits <- lapply(patterns, \(pat){
f <- submodels[[pat]] # submodel formula
# if n with pattern > 2*p then fit submodel
n <- sum(data$mdp == pat)
if (n >= 2*nchar(pat)){
fit <- glm(f, data = subset(data, mdp == pat), family = binomial)
} else{
# otherwise fit on complete cases
fit <- glm(f, data = data, family = binomial)
# note this approach allows submodels for patterns not
# observed in development data. Though these will be
# fit using complete cases (obviously not pattern specific data)
}
fit
})
names(fits) <- patterns
fits
}
pred_ps <- function(model, data, ...){
patterns <- names(model)
# there needs to be a model for each pattern in data
stopifnot(all( unique(data$mdp) %in% patterns ))
patterns <- patterns[patterns %in% unique(data$mdp)]
preds <- lapply(patterns, \(pat){
i = which(data$mdp == pat) # for reordering later
pdat <- subset(data, mdp == pat)
p <- predict(model[[pat]], newdata = pdat, type = "response")
data.frame(y = pdat$y, p = p, i = i)
})
preds <- do.call(rbind, preds)
# reorder so same as original data
preds <- preds[order(preds$i),]
# all(data$y == preds$y)
preds$p
}
## -----------------------------------------------------------------------------
(sub_val <- validate(data = datmis,
model_fun = model_ps, pred_fun = pred_ps,
submodels = submodels,
outcome = "y", method = "cv_o"))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.