# load required functions and packages
library("testthat")
suppressWarnings(library("SuperLearner"))
# generate the data -- note that this is a simple setting, for speed -----------
set.seed(4747)
p <- 2
n <- 5e4
x <- as.data.frame(replicate(p, sample(0:3, n, replace = TRUE)))
a <- rbinom(n, 1, 0.5 + 0.1 * x[, 1] + .05 * x[, 2])
y <- 1 + a * 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
# create the potential outcomes
f <- (1 + 1 * 0.5 * x[, 1] + 0.75 * x[, 2]) > (1 + 0 * 0.5 * x[, 1] + 0.75 * x[, 2])
f1 <- (1 + 1 + 0.75 * x[, 2]) > (1 + 0 + 0.75 * x[, 2])
f2 <- (1 + 1 * 0.5 * x[, 1]) > (1 + 0 * 0.5 * x[, 1])
y_f <- 1 + f * 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
y_f1 <- 1 + f1 * 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
y_f2 <- 1 + f2 * 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
# compute the true value
true_avg_value <- mean(y_f)
avg_value_1 <- true_avg_value - mean(y_f1)
avg_value_2 <- true_avg_value - mean(y_f2)
# set up a library for SuperLearner
learners <- c("SL.glm.interaction")
V <- 2
# cv folds
folds <- make_folds(y = y, V = 2)
sl_folds <- lapply(as.list(1:2), function(i) which(folds == i))
set.seed(1234)
# estimate the outcome regression
q_n <- SuperLearner::SuperLearner(Y = y, X = cbind.data.frame(a = a, x), SL.library = learners,
cvControl = list(V = V))
q_n1 <- SuperLearner::SuperLearner(Y = q_n$SL.predict, X = cbind.data.frame(a = a, x[, -1, drop = FALSE]), SL.library = learners,
cvControl = list(V = V))
q_n2 <- SuperLearner::SuperLearner(Y = q_n$SL.predict, X = cbind.data.frame(a = a, x[, -2, drop = FALSE]), SL.library = learners,
cvControl = list(V = V))
# estimate the optimal rule
f_n <- as.numeric(predict(q_n, newdata = cbind.data.frame(a = 1, x))$pred >
predict(q_n, newdata = cbind.data.frame(a = 0, x))$pred)
f_n1 <- as.numeric(predict(q_n1, newdata = cbind.data.frame(a = 1, x[, -1, drop = FALSE]))$pred >
predict(q_n1, newdata = cbind.data.frame(a = 0, x[, -1, drop = FALSE]))$pred)
f_n2 <- as.numeric(predict(q_n2, newdata = cbind.data.frame(a = 1, x[, -2, drop = FALSE]))$pred >
predict(q_n2, newdata = cbind.data.frame(a = 0, x[, -2, drop = FALSE]))$pred)
# estimate the propensity score
g_n <- SuperLearner::SuperLearner(Y = f_n, X = x, SL.library = learners, cvControl = list(V = V))
g_n1 <- SuperLearner::SuperLearner(Y = f_n1, X = x[, -1, drop = FALSE], SL.library = learners, cvControl = list(V = V))
g_n2 <- SuperLearner::SuperLearner(Y = f_n2, X = x[, -2, drop = FALSE], SL.library = learners, cvControl = list(V = V))
# also get cross-fitted versions
q_n_cv <- suppressWarnings(
SuperLearner::CV.SuperLearner(Y = y, X = cbind.data.frame(a = a, x),
SL.library = learners, cvControl = list(V = 2 * V),
innerCvControl = list(list(V = V)),
saveAll = TRUE, control = list(saveFitLibrary = TRUE))
)
cross_fitting_folds <- get_cv_sl_folds(q_n_cv$folds)
q_n_cv_preds_lst <- lapply(as.list(seq_len(length(q_n_cv$AllSL))), function(l) {
as.numeric(predict(q_n_cv$AllSL[[l]], newdata = cbind.data.frame(a = 1, x[cross_fitting_folds == l, ]))$pred >
predict(q_n_cv$AllSL[[l]], newdata = cbind.data.frame(a = 0, x[cross_fitting_folds == l, ]))$pred)
})
f_n_cv <- unlist(q_n_cv_preds_lst)[unlist(q_n_cv$folds)]
g_n_cv <- suppressWarnings(
SuperLearner::CV.SuperLearner(Y = f_n_cv, X = x, SL.library = learners,
cvControl = list(V = 2 * V, validRows = q_n_cv$folds), innerCvControl = list(list(V = V)),
saveAll = TRUE, control = list(saveFitLibrary = TRUE))
)
opt_q_n_cv <- unlist(lapply(as.list(seq_len(length(q_n_cv$AllSL))), function(l) {
predict(q_n_cv$AllSL[[l]], newdata = cbind.data.frame(a = f_n_cv, x))$pred
}))[unlist(q_n_cv$folds)]
opt_g_n_cv <- unlist(lapply(as.list(seq_len(length(g_n_cv$AllSL))), function(l) {
predict(g_n_cv$AllSL[[l]], newdata = cbind.data.frame(x))$pred
}))[unlist(q_n_cv$folds)]
nuisances_cv <- list(f_n = f_n_cv, q_n = opt_q_n_cv, g_n = opt_g_n_cv)
q_n_cv1 <- suppressWarnings(
SuperLearner::CV.SuperLearner(Y = y, X = cbind.data.frame(a = a, x[, -1, drop = FALSE]),
SL.library = learners, cvControl = list(V = 2 * V, validRows = q_n_cv$folds),
innerCvControl = list(list(V = V)),
saveAll = TRUE, control = list(saveFitLibrary = TRUE))
)
q_n_cv_preds_lst1 <- lapply(as.list(seq_len(length(q_n_cv1$AllSL))), function(l) {
as.numeric(predict(q_n_cv1$AllSL[[l]], newdata = cbind.data.frame(a = 1, x[cross_fitting_folds == l, -1, drop = FALSE]))$pred >
predict(q_n_cv1$AllSL[[l]], newdata = cbind.data.frame(a = 0, x[cross_fitting_folds == l, -1, drop = FALSE]))$pred)
})
f_n_cv1 <- unlist(q_n_cv_preds_lst1)[unlist(q_n_cv1$folds)]
g_n_cv1 <- suppressWarnings(
SuperLearner::CV.SuperLearner(Y = f_n_cv, X = x[, -1, drop = FALSE], SL.library = learners,
cvControl = list(V = 2 * V, validRows = q_n_cv$folds), innerCvControl = list(list(V = V)),
saveAll = TRUE, control = list(saveFitLibrary = TRUE))
)
opt_q_n_cv1 <- unlist(lapply(as.list(seq_len(length(q_n_cv1$AllSL))), function(l) {
predict(q_n_cv1$AllSL[[l]], newdata = cbind.data.frame(a = f_n_cv1, x[, -1, drop = FALSE]))$pred
}))[unlist(q_n_cv1$folds)]
opt_g_n_cv1 <- unlist(lapply(as.list(seq_len(length(g_n_cv1$AllSL))), function(l) {
predict(g_n_cv1$AllSL[[l]], newdata = cbind.data.frame(x[, -1, drop = FALSE]))$pred
}))[unlist(q_n_cv1$folds)]
nuisances_cv1 <- list(f_n = f_n_cv1, q_n = opt_q_n_cv1, g_n = opt_g_n_cv1)
# test average value -----------------------------------------------------------
# estimate the average value
nuisances <- list(f_n = f_n, q_n = predict(q_n, newdata = cbind.data.frame(a = f_n, x))$pred,
g_n = predict(g_n, newdata = cbind.data.frame(x))$pred)
est_avg_value <- measure_average_value(nuisance_estimators = nuisances, y = y,
a = a)
test_that("Estimating the average value works", {
expect_equal(est_avg_value$point_est, true_avg_value, tol = 0.05, scale = 1)
})
nuisances1 <- list(f_n = f_n1, q_n = predict(q_n1, newdata = cbind.data.frame(a = f_n1, x[, -1, drop = FALSE]))$pred,
g_n = predict(g_n1, newdata = cbind.data.frame(a = f_n1, x[, -1, drop = FALSE]))$pred)
est_avg_value1 <- measure_average_value(nuisance_estimators = nuisances1, y = y,
a = a)
nuisances2 <- list(f_n = f_n2, q_n = predict(q_n2, newdata = cbind.data.frame(a = f_n2, x[, -2, drop = FALSE]))$pred,
g_n = predict(g_n2, newdata = cbind.data.frame(a = f_n2, x[, -2, drop = FALSE]))$pred)
est_avg_value2 <- measure_average_value(nuisance_estimators = nuisances2, y = y,
a = a)
test_that("Estimating the difference in average values works", {
expect_equal(est_avg_value$point_est - est_avg_value1$point_est, avg_value_1, tol = 0.1, scale = 1)
expect_equal(est_avg_value$point_est - est_avg_value2$point_est, avg_value_2, tol = 0.1, scale = 1)
})
set.seed(1234)
test_that("Average-value-based VIM works", {
expect_warning(
vim_avg_value <- vim(Y = y, X = cbind.data.frame(a = a, x), f1 = f_n, f2 = f_n1, indx = 1, type = "average_value",
run_regression = FALSE, exposure_name = "a",
nuisance_estimators_full = nuisances,
nuisance_estimators_reduced = nuisances1)
)
expect_equal(vim_avg_value$est, avg_value_1, tol = 0.1, scale = 1)
})
set.seed(5678)
test_that("Average-value-based VIM with cross-fitting works", {
expect_warning(
vim_avg_value_cv <- cv_vim(Y = y, X = cbind.data.frame(a = a, x), cross_fitted_f1 = f_n_cv,
cross_fitted_f2 = f_n_cv1, indx = 1, type = "average_value",
run_regression = FALSE, exposure_name = "a",
nuisance_estimators_full = nuisances_cv,
nuisance_estimators_reduced = nuisances_cv1,
cross_fitting_folds = cross_fitting_folds, V = V)
)
expect_equal(vim_avg_value_cv$est, avg_value_1, tol = 0.1, scale = 1)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.