# load required functions and packages
library("testthat")
suppressWarnings(library("SuperLearner"))
# generate the data -- note that this is a simple setting, for speed -----------
set.seed(4747)
p <- 2
n <- 5e4
x <- as.data.frame(replicate(p, stats::rnorm(n, 0, 1)))
# apply the function to the x's
y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
true_var <- 1 + .5 ^ 2 + .75 ^ 2
# note that true difference in R-squareds for variable j, under independence, is
# beta_j^2 * var(x_j) / var(y)
r2_one <- 0.5 ^ 2 * 1 / true_var
r2_two <- 0.75 ^ 2 * 1 / true_var
# set up a library for SuperLearner
learners <- c("SL.glm", "SL.mean")
V <- 2
# no sample-splitting ----------------------------------------------------------
set.seed(4747)
test_that("CV-VIM without sample splitting works", {
est_no_ss <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, sample_splitting = FALSE)
# check variable importance estimate
expect_equal(est_no_ss$est, r2_two, tolerance = 0.1, scale = 1)
# expect_equal(sprintf("%.15f", est_no_ss$est), "0.305201823514129")
})
set.seed(1234)
test_that("CV-VIM with no SS and a single algorithm", {
est_no_ss_single_alg <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = "SL.glm",
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, sample_splitting = FALSE)
# check variable importance estimate
expect_equal(est_no_ss_single_alg$est, r2_two, tolerance = 0.1, scale = 1)
# expect_equal(sprintf("%.15f", est_no_ss_single_alg$est), "0.305073651770309")
})
set.seed(4747)
test_that("CV-VIM with no SS, non-cross-fitted SE, multiple algorithms works", {
est_no_ss_non_cf_se <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE,
sample_splitting = FALSE, cross_fitted_se = FALSE)
# check variable importance estimate
expect_equal(est_no_ss_non_cf_se$est, r2_two, tolerance = 0.1, scale = 1)
# expect_equal(sprintf("%.15f", est_no_ss_non_cf_se$est),
# "0.305202533328328")
})
set.seed(5678)
test_that("CV-VIM with no SS, non-cross-fitted SE, single algorithm works", {
est_no_ss_single_alg_non_cf_se <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = "SL.glm",
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE,
sample_splitting = FALSE, cross_fitted_se = FALSE)
# check variable importance estimate
expect_equal(est_no_ss_single_alg_non_cf_se$est, r2_two, tolerance = 0.1, scale = 1)
# expect_equal(sprintf("%.15f", est_no_ss_single_alg_non_cf_se$est),
# "0.305288025743816")
})
# cross-fitting and sample-splitting -------------------------------------------
set.seed(101112)
test_that("Cross-validated variable importance using internally-computed regressions works", {
est <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE)
# check variable importance estimate
expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est$ci, 2)
expect_length(est$se, 1)
# check that the p-value worked
expect_length(est$p_value, 1)
expect_true(est$test)
# check that printing, plotting, etc. work
expect_silent(format(est)[1])
expect_output(print(est), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_length(est$eif, length(y) / 2)
# check that the point estimate is what it is supposed to be
# expect_equal(sprintf("%.15f", est$est), "0.307418610446996")
})
test_that("CF VIM with sample-splitting, full and sampled-split average works", {
est_full <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, final_point_estimate = "full")
est_avg <- cv_vim(Y = y, X = x, indx = 2, V = V, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, final_point_estimate = "average")
# check variable importance estimate
expect_equal(est_full$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_avg$est, r2_two, tolerance = 0.1, scale = 1)
})
set.seed(101112)
test_that("Cross-validated variable importance with odd number of outer folds works", {
est <- cv_vim(Y = y, X = x, indx = 2, V = 3, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE)
est_full <- cv_vim(Y = y, X = x, indx = 2, V = 3, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, final_point_estimate = "full")
est_avg <- cv_vim(Y = y, X = x, indx = 2, V = 3, type = "r_squared",
run_regression = TRUE, SL.library = learners,
alpha = 0.05, delta = 0, cvControl = list(V = V),
env = environment(), na.rm = TRUE, final_point_estimate = "average")
# check variable importance estimate
expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_full$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_avg$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est$ci, 2)
expect_length(est$se, 1)
# check that the p-value worked
expect_length(est$p_value, 1)
expect_true(est$test)
# check that printing, plotting, etc. work
expect_silent(format(est)[1])
expect_output(print(est), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_equal(sum(!is.na(est$eif_full)), 24999)
expect_equal(sum(!is.na(est$eif_redu)), 25001)
# check that the point estimate is what it is supposed to be
# expect_equal(sprintf("%.15f", est$est), "0.307418610446996")
})
# using pre-computed regression functions --------------------------------------
# cross-fitted estimates of the full and reduced regressions,
# for point estimate of variable importance.
indx <- 2
Y <- matrix(y)
set.seed(4747)
full_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
Y = Y, X = x, SL.library = learners, cvControl = list(V = 2 * V),
innerCvControl = list(list(V = V))
))
# use the same cross-fitting folds for reduced
reduced_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
Y = Y, X = x[, -indx, drop = FALSE], SL.library = learners,
cvControl = SuperLearner::SuperLearner.CV.control(
V = 2 * V, validRows = full_cv_fit$folds
),
innerCvControl = list(list(V = V))
))
# set up folds for hypothesis testing
cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds)
set.seed(1234)
sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2)
full_cv_preds <- full_cv_fit$SL.predict
reduced_cv_preds <- reduced_cv_fit$SL.predict
set.seed(5678)
# refit without cross-fitting (for non-cross-fitted SE estimation)
# but fit using CV.SL, V = 2
unique_cf_folds <- sort(unique(cross_fitting_folds))
ss_folds_lst <- lapply(as.list(seq_len(2)), function(i) which(cross_fitting_folds %in% unique_cf_folds[sample_splitting_folds == i]))
full_fit_2 <- suppressWarnings(
SuperLearner::CV.SuperLearner(
Y = Y, X = x, SL.library = learners, cvControl = list(V = 2, validRows = ss_folds_lst),
innerCvControl = list(list(V = V)),
)
)
fhat_ful <- SuperLearner::predict.SuperLearner(full_fit_2, onlySL = TRUE)$pred
reduced_fit_2 <- suppressWarnings(
SuperLearner::CV.SuperLearner(
Y = Y, X = x[, -indx, drop = FALSE], SL.library = learners, cvControl = list(V = 2, validRows = ss_folds_lst),
innerCvControl = list(list(V = V)),
)
)
fhat_red <- SuperLearner::predict.SuperLearner(reduced_fit_2, onlySL = TRUE)$pred
test_that("Cross-validated variable importance using externally-computed regressions works", {
est_prefit <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = V,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE)
est_prefit_full <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = V,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE,
final_point_estimate = "full")
est_prefit_avg <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = V,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE,
final_point_estimate = "average")
# check variable importance estimate
expect_equal(est_prefit$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_prefit_full$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_prefit_avg$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est_prefit$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est_prefit$ci, 2)
expect_length(est_prefit$se, 1)
# check that the p-value worked
expect_length(est_prefit$p_value, 1)
expect_true(est_prefit$test)
# check that printing, plotting, etc. work
expect_silent(format(est_prefit)[1])
expect_output(print(est_prefit), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_length(est_prefit$eif, length(y) / 2)
# check the actual value of the point estimate
# expect_equal(sprintf("%.15f", est_prefit$est), "0.311079577886281")
})
test_that("Cross-validated variable importance using non-list externally-computed regressions works", {
est_prefit <- cv_vim(Y = y, cross_fitted_f1 = full_cv_fit$SL.predict,
cross_fitted_f2 = reduced_cv_fit$SL.predict, indx = 2,
delta = 0, V = V,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE)
# check variable importance estimate
expect_equal(est_prefit$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est_prefit$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est_prefit$ci, 2)
expect_length(est_prefit$se, 1)
# check that the p-value worked
expect_length(est_prefit$p_value, 1)
expect_true(est_prefit$test)
# check that printing, plotting, etc. work
expect_silent(format(est_prefit)[1])
expect_output(print(est_prefit), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_length(est_prefit$eif, length(y) / 2)
# check the actual value of the point estimate
# expect_equal(sprintf("%.15f", est_prefit$est), "0.311079577886281")
})
test_that("Cross-validated variable importance using non-list externally-computed regressions and no sample-splitting works", {
est_prefit <- cv_vim(Y = y, cross_fitted_f1 = full_cv_fit$SL.predict,
cross_fitted_f2 = reduced_cv_fit$SL.predict, indx = 2,
delta = 0, V = 2 * V,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting = FALSE,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE)
# check variable importance estimate
expect_equal(est_prefit$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est_prefit$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est_prefit$ci, 2)
expect_length(est_prefit$se, 1)
# check that the p-value worked
expect_length(est_prefit$p_value, 1)
expect_true(est_prefit$test)
# check that printing, plotting, etc. work
expect_silent(format(est_prefit)[1])
expect_output(print(est_prefit), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_length(est_prefit$eif, length(y))
# check the actual value of the point estimate
# expect_equal(sprintf("%.15f", est_prefit$est), "0.311079577886281")
})
# non-cross-fitted SEs
test_that("Cross-validated variable importance using externally-computed regressions and non-cross-fitted SEs works", {
est_prefit_no_cf_se <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, f1 = fhat_ful,
f2 = fhat_red, indx = 2, delta = 0, V = V, type = "r_squared",
cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE,
cross_fitted_se = FALSE)
# check variable importance estimate
expect_equal(est_prefit_no_cf_se$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est_prefit_no_cf_se$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est_prefit_no_cf_se$ci, 2)
expect_length(est_prefit_no_cf_se$se, 1)
# check that the p-value worked
expect_length(est_prefit_no_cf_se$p_value, 1)
expect_true(est_prefit_no_cf_se$test)
# check that printing, plotting, etc. work
expect_silent(format(est_prefit_no_cf_se)[1])
expect_output(print(est_prefit_no_cf_se), "Estimate", fixed = TRUE)
# check that influence curve worked
expect_length(est_prefit_no_cf_se$eif, length(y) / 2)
# check the actual value of the point estimate
# expect_equal(sprintf("%.15f", est_prefit_no_cf_se$est), "0.311079577886281")
})
# measures of predictiveness ---------------------------------------------------
full_cv_preds_list <- extract_sampled_split_predictions(
cvsl_obj = full_cv_fit, sample_splitting = TRUE,
sample_splitting_folds = sample_splitting_folds, full = TRUE, vector = FALSE
)
test_that("Measures of predictiveness work", {
k_fold_lst <- make_kfold(cross_fitting_folds, sample_splitting_folds)
full_test <- (k_fold_lst$sample_splitting_folds == 1)
full_rsquared <- est_predictiveness_cv(fitted_values = full_cv_preds_list,
y = y[full_test],
full_y = y,
folds = k_fold_lst$full,
type = "r_squared", na.rm = TRUE)
expect_equal(full_rsquared$point_est, 0.44, tolerance = 0.1, scale = 1)
expect_length(full_rsquared$all_ests, V)
expect_length(full_rsquared$eif, length(cross_fitting_folds) / 2)
expect_equal(length(full_rsquared$all_eifs), V)
})
# error messages ---------------------------------------------------------------
test_that("Error messages work", {
expect_error(cv_vim(X = x))
expect_error(cv_vim(Y = y))
expect_error(cv_vim(Y = y, X = x, SL.library = NULL))
expect_error(cv_vim(Y = y, X = x, run_regression = FALSE))
expect_error(cv_vim(Y = y, f1 = mean(y)))
expect_error(cv_vim(Y = y, f1 = rep(mean(y), length(y)), f2 = mean(y)))
})
# pre-computed regression functions with an odd number of folds ----------------
# with an odd number of outer folds
indx <- 2
Y <- matrix(y)
set.seed(4747)
full_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
Y = Y, X = x, SL.library = learners, cvControl = list(V = 5),
innerCvControl = list(list(V = V))
))
# use the same cross-fitting folds for reduced
reduced_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
Y = Y, X = x[, -indx, drop = FALSE], SL.library = learners,
cvControl = SuperLearner::SuperLearner.CV.control(
V = 5, validRows = full_cv_fit$folds
),
innerCvControl = list(list(V = V))
))
# extract the predictions on split portions of the data, for hypothesis testing
cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds)
set.seed(1234)
sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2)
full_cv_preds <- full_cv_fit$SL.predict
reduced_cv_preds <- reduced_cv_fit$SL.predict
test_that("Cross-validated VIM works with externally-computed regressions and an odd number of folds", {
est_prefit <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = 2,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE)
est_prefit_full <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = 2,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE, final_point_estimate = "full")
est_prefit_avg <- cv_vim(Y = y, cross_fitted_f1 = full_cv_preds,
cross_fitted_f2 = reduced_cv_preds, indx = 2, delta = 0, V = 2,
type = "r_squared", cross_fitting_folds = cross_fitting_folds,
sample_splitting_folds = sample_splitting_folds,
run_regression = FALSE, alpha = 0.05, na.rm = TRUE, final_point_estimate = "average")
# check variable importance estimate
expect_equal(est_prefit$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_prefit_full$est, r2_two, tolerance = 0.1, scale = 1)
expect_equal(est_prefit_avg$est, r2_two, tolerance = 0.1, scale = 1)
# check full predictiveness estimate
expect_equal(est_prefit$predictiveness_full, 0.44, tolerance = 0.1, scale = 1)
# check that the SE, CI work
expect_length(est_prefit$ci, 2)
expect_length(est_prefit$se, 1)
# check that the p-value worked
expect_length(est_prefit$p_value, 1)
expect_true(est_prefit$test)
# check that printing, plotting, etc. work
expect_silent(format(est_prefit)[1])
expect_output(print(est_prefit), "Estimate", fixed = TRUE)
# check that influence curve worked; note that we used 5-fold outer CV and
# 3 of those went to the full nuisance function
expect_length(est_prefit$eif, length(y) / 5 * 3)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.