# R/average_vim.R In vimp: Perform Inference on Algorithm-Agnostic Variable Importance

#### Documented in average_vim

#' Average multiple independent importance estimates
#'
#' Average the output from multiple calls to \code{vimp_regression}, for different independent groups, into a single estimate with a corresponding standard error and confidence interval.
#'
#' @param ... an arbitrary number of \code{vim} objects.
#' @param weights how to average the vims together, and must sum to 1; defaults to 1/(number of vims) for each vim, corresponding to the arithmetic mean
#'
#' @return an object of class \code{vim} containing the (weighted) average of the individual importance estimates, as well as the appropriate standard error and confidence interval.
#' This results in a list containing:
#' \itemize{
#'  \item{s}{ - a list of the column(s) to calculate variable importance for}
#'  \item{SL.library}{ - a list of the libraries of learners passed to \code{SuperLearner}}
#'  \item{full_fit}{ - a list of the fitted values of the chosen method fit to the full data}
#'  \item{red_fit}{ - a list of the fitted values of the chosen method fit to the reduced data}
#'  \item{est}{- a vector with the corrected estimates}
#'  \item{naive}{- a vector with the naive estimates}
#'  \item{update}{- a list with the influence curve-based updates}
#'  \item{mat}{ - a matrix with the estimated variable importance, the standard error, and the \eqn{(1-\alpha) \times 100}\% confidence interval}
#'  \item{full_mod}{ - a list of the objects returned by the estimation procedure for the full data regression (if applicable)}
#'  \item{red_mod}{ - a list of the objects returned by the estimation procedure for the reduced data regression (if applicable)}
#'  \item{alpha}{ - the level, for confidence interval calculation}
#'  \item{y}{ - a list of the outcomes}
#' }

#' @examples
#' # generate the data
#' p <- 2
#' n <- 100
#' x <- data.frame(replicate(p, stats::runif(n, -5, 5)))
#'
#' # apply the function to the x's
#' smooth <- (x[,1]/5)^2*(x[,1]+7)/5 + (x[,2]/3)^2
#'
#' # generate Y ~ Normal (smooth, 1)
#' y <- smooth + stats::rnorm(n, 0, 1)
#'
#' # set up a library for SuperLearner; note simple library for speed
#' library("SuperLearner")
#' learners <- c("SL.glm", "SL.mean")
#'
#' # get estimates on independent splits of the data
#' samp <- sample(1:n, n/2, replace = FALSE)
#'
#' # using Super Learner (with a small number of folds, for illustration only)
#' est_2 <- vimp_regression(Y = y[samp], X = x[samp, ], indx = 2, V = 2,
#'            run_regression = TRUE, alpha = 0.05,
#'            SL.library = learners, cvControl = list(V = 2))
#'
#' est_1 <- vimp_regression(Y = y[-samp], X = x[-samp, ], indx = 2, V = 2,
#'            run_regression = TRUE, alpha = 0.05,
#'            SL.library = learners, cvControl = list(V = 2))
#'
#' ests <- average_vim(est_1, est_2, weights = c(1/2, 1/2))
#'
#' @importFrom rlang "!!" sym
#' @export
average_vim <- function(..., weights = rep(1/length(list(...)), length(list(...)))) {
# capture the arguments
L <- list(...)
names(L) <- unlist(match.call(expand.dots=F)$...) p <- length(L) # check if weights sum to 1; if not, break if (sum(weights) != 1) stop("Weights must sum to one.") # extract the estimates and SEs from each element of the list # also get the sample sizes ests <- do.call(c, lapply(L, function(z) z$est))
naives <- do.call(c, lapply(L, function(z) z$naive)) ses <- do.call(c, lapply(L, function(z) z$se))
tests <- do.call(c, lapply(L, function(z) z$test)) p_values <- do.call(c, lapply(L, function(z) z$p_value))
predictivenesses_full <- do.call(c, lapply(L, function(z) z$predictiveness_full)) predictivenesses_reduced <- do.call(c, lapply(L, function(z) z$predictiveness_reduced))
predictiveness_cis_full <- do.call(rbind, lapply(L, function(z) z$predictiveness_ci_full)) predictiveness_cis_reduced <- do.call(rbind, lapply(L, function(z) z$predictiveness_ci_reduced))
test_statistics <- do.call(rbind, lapply(L, function(z) z$test_statistic)) delta <- min(do.call(c, lapply(L, function(z) z$delta)))
scale <- unique(unlist(lapply(L, function(z) z$scale))) names(ests) <- "est" names(ses) <- "se" names(naives) <- "naive" # create the (weighted) average est_avg <- sum(weights*ests) predictiveness_full <- sum(weights*predictivenesses_full) predictiveness_reduced <- sum(weights*predictivenesses_reduced) # combine the variances correctly # will need to use the covariance, if not independent se_avg <- sqrt(matrix(weights^2, nrow = 1)%*%as.matrix(ses^2)) # create a CI alpha <- min(unlist(lapply(L, function(z) z$alpha)))
ci_avg <- vimp_ci(est_avg, se_avg, level = 1 - alpha, scale = scale[1])

# hypothesis test:
test_statistic <- sum(weights * test_statistics)
p_value <- 1 - pnorm(test_statistic)
hyp_test <- p_value < alpha

# now get lists of the remaining components
eifs <- lapply(L, function(z) z$eif) s_lst <- lapply(L, function(z) z$s)
s <- paste0("avg_", paste(unlist(s_lst), collapse = "_"))
SL.library <- lapply(L, function(z) z$SL.library) full_fit <- lapply(L, function(z) z$full_fit)
red_fit <- lapply(L, function(z) z$red_fit) full_mod <- lapply(L, function(z) z$full_mod)
red_mod <- lapply(L, function(z) z$red_mod) # combine into a tibble mat <- tibble::tibble(s = s, est = est_avg, se = se_avg, cil = ci_avg[, 1], ciu = ci_avg[, 2], test = hyp_test, p_value = p_value) %>% dplyr::arrange(dplyr::desc(!! rlang::sym("est"))) # create output list output <- list(s = s, SL.library = SL.library, full_fit = full_fit, red_fit = red_fit, est = mat$est, naive = naives, eif = eifs,
se = mat$se, ci = cbind(mat$cil, mat\$ciu),
predictiveness_full = predictiveness_full,
predictiveness_reduced = predictiveness_reduced,
predictiveness_ci_full = sum(weights*predictiveness_cis_full),
predictiveness_ci_reduced = sum(weights*predictiveness_cis_reduced),
test = hyp_test,
p_value = p_value,
mat = mat,
full_mod = full_mod, red_mod = red_mod,
alpha = alpha,
delta = delta,
scale = scale)

tmp <- class(output)
classes <- unlist(lapply(L, function(z) class(z)[2]))
class(output) <- c("vim", classes, tmp)

return(output)
}


## Try the vimp package in your browser

Any scripts or data that you put into this service are public.

vimp documentation built on Aug. 16, 2021, 5:08 p.m.