inst/doc/precomputed-regressions.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message = FALSE---------------------------------------------------
library("vimp")
library("SuperLearner")

## ----load-vrc01-data----------------------------------------------------------
# read in the data
data("vrc01")
# subset to the columns of interest for this analysis
library("dplyr")
library("tidyselect")
# retain only the columns of interest for this analysis
y <- vrc01$ic50.censored
X <- vrc01 %>%
  select(starts_with("geog"), starts_with("subtype"), starts_with("length"))
set.seed(1234)
vrc01_folds <- make_folds(y = y, V = 2)

## ----est-regressions-lm, warning = FALSE--------------------------------------
library("rlang")
vrc01_subset <- vrc01 %>%
  select(ic50.censored, starts_with("geog"), starts_with("subtype"), starts_with("length")) %>%
  rename(y = ic50.censored)
# estimate prediction function on each subset, predict on held-out fold
full_fit <- vector("numeric", length = nrow(vrc01))
for (v in 1:2) {
  train_v <- subset(vrc01_subset, vrc01_folds != v)
  test_v <- subset(vrc01_subset, vrc01_folds == v)
  full_mod <- glm(y ~ ., data = train_v)
  full_fit[vrc01_folds == v] <- predict(full_mod, newdata = test_v)
}

# estimate the reduced conditional means for each of the individual variables
# remove the outcome for the predictor matrix
geog_indx <- max(which(grepl("geog", names(X))))
for (i in seq_len(ncol(X) - geog_indx)) {
  this_name <- names(X)[i + geog_indx]
  red_fit <- vector("numeric", length = nrow(vrc01))
  for (v in 1:2) {
    train_v <- subset(vrc01_subset, vrc01_folds != v)
    test_v <- subset(vrc01_subset, vrc01_folds == v)
    red_fit[vrc01_folds == v] <- suppressWarnings(
      predict(glm(y ~ .,  data = train_v %>% select(-!!this_name)), 
              newdata = test_v)  
    )
  }
  this_vim <- vim(Y = y, f1 = full_fit, f2 = red_fit, indx = i + geog_indx,
                  run_regression = FALSE, type = "r_squared",
                  sample_splitting_folds = vrc01_folds, scale = "logit")
  if (i == 1) {
    lm_mat <- this_vim
  } else {
    lm_mat <- merge_vim(lm_mat, this_vim)
  }
}
# print out the matrix
lm_mat

## ----estimate-full-regression-with-cf, message = FALSE, warning = FALSE-------
learners <- "SL.ranger"
# estimate the full regression function
V <- 2
set.seed(4747)
full_cv_fit <- suppressWarnings(
  SuperLearner::CV.SuperLearner(
  Y = y, X = X, SL.library = learners, cvControl = list(V = 2 * V),
  innerCvControl = list(list(V = V)), family = binomial()
)
)
# get a numeric vector of cross-fitting folds
cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds)
# get sample splitting folds
set.seed(1234)
sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2)
full_cv_preds <- full_cv_fit$SL.predict

## ----estimate-reduced-regressions-with-cf, message = FALSE, warning = FALSE----
vars <- names(X)[(geog_indx + 1):ncol(X)]
set.seed(1234)
for (i in seq_len(length(vars))) {
  # use "eval" and "parse" to assign the objects of interest to avoid duplicating code
  eval(parse(text = paste0("reduced_", vars[i], "_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
  Y = y, X = X[, -(geog_indx + i), drop = FALSE], SL.library = learners,
  cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
  innerCvControl = list(list(V = V)), family = binomial()
))")))
  eval(parse(text = paste0("reduced_", vars[i], "_cv_preds <- reduced_", vars[i], "_cv_fit$SL.predict")))
}

## ----cf-vims------------------------------------------------------------------
for (i in seq_len(length(vars))) {
  # again, use "eval" and "parse" to assign the objects of interest to avoid duplicating code
  eval(parse(text = paste0("cf_", vars[i], "_vim <- vimp_rsquared(Y = y,
  cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_", vars[i], "_cv_preds,
  indx = (geog_indx + i), cross_fitting_folds = cross_fitting_folds,
  sample_splitting_folds = sample_splitting_folds, run_regression = FALSE, alpha = 0.05,
  V = V, na.rm = TRUE, scale = 'logit')")))
}
cf_ests <- merge_vim(cf_subtype.is.01_AE_vim,
                     cf_subtype.is.02_AG_vim, cf_subtype.is.07_BC_vim,
                     cf_subtype.is.A1_vim, cf_subtype.is.A1C_vim,
                     cf_subtype.is.A1D_vim, cf_subtype.is.B_vim,
                     cf_subtype.is.C_vim, cf_subtype.is.D_vim,
                     cf_subtype.is.O_vim, cf_subtype.is.Other_vim,
                     cf_length.env_vim, cf_length.gp120_vim, cf_length.loop.e_vim,
                     cf_length.loop.e.outliers_vim, cf_length.v5_vim,
                     cf_length.v5.outliers_vim)
all_vars <- c(paste0("Subtype is ", c("01_AE", "02_AG", "07_BC", "A1", "A1C", "A1D",
                                      "B", "C", "D", "O", "Other")),
              paste0("Length of ", c("Env", "gp120", "V5", "V5 outliers", "Loop E",
                                     "Loop E outliers")))

## ----plot-cf-vim, fig.width = 8.5, fig.height = 8-----------------------------
library("ggplot2")
library("cowplot")
theme_set(theme_cowplot())
cf_est_plot_tib <- cf_ests$mat %>%
  mutate(
    var_fct = rev(factor(s, levels = cf_ests$mat$s,
                     labels = all_vars[as.numeric(cf_ests$mat$s) - geog_indx],
                     ordered = TRUE))
  )

# plot
cf_est_plot_tib %>%
  ggplot(aes(x = est, y = var_fct)) +
  geom_point() +
  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
  ylab("") +
  ggtitle("Estimated individual feature importance") +
  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")

## ----cf-group-vim, fig.width = 8.5, fig.height = 8, warning = FALSE-----------
set.seed(91011)
reduced_subtype_cv_fit <- suppressWarnings(
  SuperLearner::CV.SuperLearner(
  Y = y, X = X[, -c(5:15), drop = FALSE], SL.library = learners,
  cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
  innerCvControl = list(list(V = V)), family = binomial()
)
)
reduced_subtype_cv_preds <- reduced_subtype_cv_fit$SL.predict
reduced_geometry_cv_fit <- suppressWarnings(
  SuperLearner::CV.SuperLearner(
  Y = y, X = X[, -c(16:21), drop = FALSE], SL.library = learners,
  cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
  innerCvControl = list(list(V = V)), family = binomial()
)
)
reduced_geometry_cv_preds <- reduced_geometry_cv_fit$SL.predict
cf_subtype_vim <- vimp_rsquared(
  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_subtype_cv_preds,
  indx = 5:15, run_regression = FALSE, V = V,
  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
  scale = "logit"
)
cf_geometry_vim <- vimp_rsquared(
  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_geometry_cv_preds,
  indx = 16:21, run_regression = FALSE, V = V,
  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
  scale = "logit"
)
cf_groups <- merge_vim(cf_subtype_vim, cf_geometry_vim)
all_grp_nms <- c("Viral subtype", "Viral geometry")

grp_plot_tib <- cf_groups$mat %>%
  mutate(
    grp_fct = factor(case_when(
      s == "5,6,7,8,9,10,11,12,13,14,15" ~ "1",
      s == "16,17,18,19,20,21" ~ "2"
    ), levels = c("1", "2"),  labels = all_grp_nms, ordered = TRUE)
  )
grp_plot_tib %>%
  ggplot(aes(x = est, y = grp_fct)) +
  geom_point() +
  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
  ylab("") +
  ggtitle("Estimated feature group importance") +
  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")

Try the vimp package in your browser

Any scripts or data that you put into this service are public.

vimp documentation built on Aug. 29, 2023, 1:06 a.m.