inst/doc/missing-data.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(pminternal)
library(mice)

# make some data
set.seed(2345)
n <- 800
p <- 5

X <- matrix(rnorm(n*p), nrow = n, ncol = p)
LP <- -1 + X %*% c(-1, 1, -.5, .5, 0)
y <- rbinom(n, 1, plogis(LP))

mean(y)

datcomp <- data.frame(y, X)

# add missingness
datmis <- datcomp
datmis$X1[sample(n, 200)] <- NA
datmis$X2[sample(n, 200)] <- NA
datmis$X3[sample(n, 200)] <- NA

# assess performance before introducing missingness
mod <- glm(y ~ ., data = datcomp, family = "binomial")
(comp_val <- validate(mod, method = "cv_o"))


## -----------------------------------------------------------------------------
model_mi <- function(data, ...){
  imp <- mice::mice(data, printFlag = FALSE) # 5 imputed datasets via pmm
  fits <- with(imp, glm(y ~ X1 + X2 + X3 + X4 + X5, family=binomial))
  B <- mice::pool(fits)$pooled$estimate # pooled coefs for predictions
  # save pooled coefficients and the data to do stacked imputation at validation
  list(B, data) 
}

pred_mi <- function(model, data, ...){
  # extract pooled coefs
  B <- model[[1]]
  # stack the new data on the model fit data (model[[2]])
  dstacked <- rbind(model[[2]], data)
  i <- (nrow(model[[2]]) + 1):(nrow(dstacked))
  # impute stacked data
  # use ignore argument to ensure the test data doesn't influence
  # the imputation model but still gets imputed
  imp <- mice::mice(dstacked, printFlag = FALSE, 
                    ignore = seq(nrow(dstacked)) %in% i)
  # get logit predicted risks for each imputed data set
  preds <- sapply(seq(imp$m), \(x){
    # complete(imp, x)[i, ] = get complete data set x and extract the 
    # validation data i
    X <- model.matrix(~ X1 + X2 + X3 + X4 + X5, data = mice::complete(imp, x)[i, ]) 
    X %*% B
  })
  # average logit predictions, transform invlogit, and return
  plogis(apply(preds, 1, mean)) 
}


## -----------------------------------------------------------------------------
(mi_val <- validate(data = datmis, 
                    model_fun = model_mi, pred_fun = pred_mi, 
                    outcome = "y", method = "cv_o"))

## ----fig.width=4, fig.height=5------------------------------------------------
md <- md.pattern(datmis)

## -----------------------------------------------------------------------------
# store missing data pattern in data
# as this will be useful
datmis$mdp <- apply(datmis[, paste0("X", 1:5)], 1, \(x) paste0(as.numeric(is.na(x)), collapse = ""))

table(datmis$mdp)

## -----------------------------------------------------------------------------
submodels <- list(
  "00000" = y ~ X1 + X2 + X3 + X4 + X5,
  "00100" = y ~ X1 + X2 + X4 + X5,
  "01000" = y ~ X1 + X3 + X4 + X5,
  "01100" = y ~ X1 + X4 + X5,
  "10000" = y ~ X2 + X3 + X4 + X5,
  "10100" = y ~ X2 + X4 + X5,
  "11000" = y ~ X3 + X4 + X5,
  "11100" = y ~ X4 + X5
)

model_ps <- function(data, ...){
  # this example uses submodels as an additional argument to 
  # validate. But we could have put the submodels list in the 
  # model_sub function
  dots <- list(...)
  if ("submodels" %in% names(dots)) 
    submodels <- dots[["submodels"]] 
  else 
    stop("no submodels")
  
  patterns <- names(submodels)
  # all patterns in data should be in submodels, 
  # but not necessarily vice versa (resampling 
  # could have omitted a pattern)
  stopifnot(all(unique(data$mdp) %in% patterns)) 
  
  fits <- lapply(patterns, \(pat){
    f <- submodels[[pat]] # submodel formula
    # if n with pattern > 2*p then fit submodel
    n <- sum(data$mdp == pat)
    if (n >= 2*nchar(pat)){
      fit <- glm(f, data = subset(data, mdp == pat), family = binomial)
    } else{
      # otherwise fit on complete cases
      fit <- glm(f, data = data, family = binomial)
      # note this approach allows submodels for patterns not 
      # observed in development data. Though these will be 
      # fit using complete cases (obviously not pattern specific data)
    }
    fit
  })
  
  names(fits) <- patterns
  fits
}

pred_ps <- function(model, data, ...){
  patterns <- names(model)
  # there needs to be a model for each pattern in data
  stopifnot(all( unique(data$mdp) %in% patterns ))
  
  patterns <- patterns[patterns %in% unique(data$mdp)]
  
  preds <- lapply(patterns, \(pat){
    i = which(data$mdp == pat) # for reordering later
    pdat <- subset(data, mdp == pat)
    p <- predict(model[[pat]], newdata = pdat, type = "response")
    data.frame(y = pdat$y, p = p, i = i)
  })
  
  preds <- do.call(rbind, preds)
  # reorder so same as original data
  preds <- preds[order(preds$i),]
  # all(data$y == preds$y)
  
  preds$p
}

## -----------------------------------------------------------------------------
(sub_val <- validate(data = datmis, 
                    model_fun = model_ps, pred_fun = pred_ps, 
                    submodels = submodels, 
                    outcome = "y", method = "cv_o"))

Try the pminternal package in your browser

Any scripts or data that you put into this service are public.

pminternal documentation built on April 4, 2025, 3:30 a.m.