#' Fit a learner on training folds and get predictions on validation folds
#'
#' @param task A named list identifying what training_folds to fit the learner
#' on. The function returns predictions from this fit on the remaining folds (i.e.,
#' the validation folds).
#' @param folds Vector identifying which fold observations fall into.
#' @param Y A matrix or data.frame of outcomes.
#' @param tmpX A matrix or data.frame of predictors.
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#'
#' @return A named list with task and output of super learner wrapper fit.
get_fit <- function(task, folds, tmpX, Y, sl_control){
train_idx <- folds %in% task$training_folds
valid_idx <- !train_idx
train_Y <- Y[train_idx, task$Yname]
train_X <- tmpX[train_idx, , drop = FALSE]
if(sum(valid_idx) > 0){
valid_X <- tmpX[valid_idx, , drop = FALSE]
}else{
# if learner being fit on all data, then return
# predictions on training sample
valid_X <- train_X
}
# fit super learner wrapper
opts <- list(Y = train_Y, X = train_X,
newX = valid_X, obsWeights = rep(1, length(train_Y)),
family = sl_control$family)
fit <- tryCatch(
do.call(task$SL_wrap, args = opts),
error = function(e){
do.call(SuperLearner::SL.mean, args = opts)
})
# split up validation predictions
if(sum(valid_idx) > 0){
fit$pred <- split(fit$pred, folds[valid_idx])
}else{
fit$pred <- fit$pred
}
# return(fit)
return(c(task, fit))
}
#' Get outcome weights based on cross-validated super learner fits
#'
#' @param task A named list identifying what training folds to use to
#' obtain outcome weights.
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param Ynames Names of the columns of \code{Y}.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_sl A list of all super learner fits (from \code{get_sl})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param y_weight_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn. Available functions can be viewed with \code{y_weight_control_options()}. See
#' \code{?y_weight_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#'
#' @return Named list identifying training folds used and the composite outcome weights.
get_y_weight <- function(task, Y, V, Ynames, all_fits, all_sl,
all_fit_tasks, sl_control, y_weight_control, folds, learners){
# add folds so split.data.frame works
split_Y <- split(data.frame(Y, folds), folds)
# remove folds column
split_Y <- lapply(split_Y, function(x){ x[,-ncol(x)] })
if(!all(1:V %in% task$training_folds)){
valid_folds <- (1:V)[-task$training_folds]
}else{
# evaluates when computing outer weights
valid_folds <- NULL
}
# get input needed to compute weights
input <- get_y_weight_input(valid_folds = valid_folds,
split_Y = split_Y, Ynames = Ynames,
all_fits = all_fits, all_sl = all_sl,
all_fit_tasks = all_fit_tasks, V = V,
sl_control = sl_control,
learners = learners)
y_weight <- do.call(y_weight_control$weight_fn,
args = list(input = input, y_weight_control = y_weight_control))
out <- c(list(training_folds = task$training_folds), y_weight)
return(out)
}
#' Get super learner weights based on cross-validated learner fits.
#'
#' @param task A named list identifying what training folds to use to
#' obtain outcome weights.
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param sl_control A list with named entries ensemble_fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#'
#' @return Named list identifying training folds used and the composite outcome weights.
#'
get_sl <- function(task, Y, V, all_fit_tasks, all_fits, folds, sl_control, learners){
split_Y <- split(Y[ , task$Yname], folds)
if(!all(1:V %in% task$training_folds)){
valid_folds <- (1:V)[-task$training_folds]
}else{
valid_folds <- NULL
}
# get input needed to compute sl ensemble weights
#Returns validation fold, actual Y outcome, and predicted Y outcome for each learner
if(length(learners) > 1){
input <- get_sl_input(split_Y = split_Y, valid_folds = valid_folds,
Yname = task$Yname, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, learners = learners)
# get sl ensemble weights
sl_weight <- do.call(sl_control$weight_fn,
args = list(input = input, sl_control = sl_control))
}else{
sl_weight <- list(weight = 1)
}
out <- list(training_folds = task$training_folds, Yname = task$Yname, sl_weight = sl_weight$weight)
# get sl predictions on valid_folds folds by searching for when
# (1:V)[-valid_folds] is training sample
# sl_pred <- get_sl_pred(valid_folds = valid_folds, V = V, all_fit_tasks = all_fit_tasks,
# all_fits = all_fits, y = Y_name, sl_weight = sl_weight,
# split_Y = split_Y)
return(out)
}
#' Get a super learner fit for a given outcome with more information.
#'
#' This is called if \code{return_outer_sl = TRUE}, in which case more
#' information on learner risks etc... is computed and returned.
#'
#' @param task A named list identifying what outcome to use.
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param return_learner_fits Should the \code{fit} component of the learners
#' be returned. Must be \code{TRUE} to obtain later predictions.
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#' @return Named list of super learner results.
get_formatted_sl <- function(task, Y, V, all_fit_tasks, all_fits, folds,
sl_control, return_learner_fits = TRUE,
learners){
split_Y <- split(Y[ , task$Yname], folds)
if(!all(1:V %in% task$training_folds)){
valid_folds <- (1:V)[-task$training_folds]
}else{
valid_folds <- NULL
}
# get input needed to compute sl ensemble weights
input <- get_sl_input(split_Y = split_Y, valid_folds = valid_folds,
Yname = task$Yname, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, learners = learners)
if(length(learners) > 1){
# get sl ensemble weights
sl_weight_list <- do.call(sl_control$weight_fn,
args = list(input = input, sl_control = sl_control))
sl_weight <- sl_weight_list$weight
M <- dim(input[[1]]$pred)[2]
}else{
M <- 1
sl_weight <- 1
}
# get risks for each learner
all_pred <- Reduce(rbind, lapply(input, '[[', "pred"))
all_y <- Reduce(c, lapply(input, '[[', "Y"))
risk_input <- list(Y = all_y, pred = all_pred)
risks <- rep(Inf, M)
for(m in 1:M){
weights <- rep(0, M)
weights[m] <- 1
risks[m] <- do.call(sl_control$optim_risk_fn,
args = list(sl_weight = weights, sl_control = sl_control,
input = risk_input))
}
# get fits for each learner
idx <- search_fits_for_training_folds(fits = all_fit_tasks,
y = task$Yname,
training_folds = task$training_folds)
fit_out <- sapply(idx, function(i){
lapply(all_fits[i], "[", "fit")
})
out <- list(Yname = task$Yname, training_folds = task$training_folds,
sl_weight = sl_weight, learner_names = learners,
learner_risks = risks, learner_fits = fit_out)
return(out)
}
#' Get cross-validated risk of the super learner for a univariate outcome
#'
#' @param task A named list identifying what training folds to use to
#' obtain outcome weights.
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_sl A list of all super learner fits (from \code{get_sl})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#' @param all_weight List of all outcome weight fits.
#'
#' @return Named list identifying which outcome and the cross-validated risk of the super learner.
get_risk_sl <- function(task, Y, V, all_sl, all_fit_tasks, all_fits, folds,
sl_control, learners, all_weight){
split_Y <- split(Y[ , task$Yname], folds)
if(!all(1:V %in% task$training_folds)){
valid_folds <- (1:V)[-task$training_folds]
}else{
valid_folds <- NULL
}
input <- get_risk_sl_input(split_Y = split_Y, Yname = task$Yname, V = V,
all_fits = all_fits,
all_weight = all_weight, all_sl = all_sl,
all_fit_tasks = all_fit_tasks, folds = folds,
sl_control = sl_control, learners = learners)
risk <- do.call(sl_control$cv_risk_fn, args = list(input = input, sl_control = sl_control))
# re-order influence function contributions
if(!is.null(risk$ic)){
if(length(risk$ic) != length(folds)){
stop(paste0("Length of influence curves returned by ",
sl_control$cv_risk_fn," is wrong. Check this",
"function for errors."))
}
n <- length(folds)
tmp <- rep(NA, n)
tmp[unlist(split(1:n, folds))] <- risk$ic
risk$ic <- tmp
}
out <- c(Yname = task$Yname, risk)
return(out)
}
#' Get cross-validated risk of the super learner for a univariate outcome
#'
#' @param task A named list identifying what training folds to use to
#' obtain outcome weights.
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_sl A list of all super learner fits (from \code{get_sl})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#' @param all_weight List of all outcome weight fits.
#'
#' @return Named list identifying which outcome and the cross-validated risk of the super learner.
get_risk_learner <- function(task, Y, V, all_sl, all_fit_tasks, all_fits, folds,
sl_control, learners, all_weight){
split_Y <- split(Y[ , task$Yname], folds)
if(!all(1:V %in% task$training_folds)){
valid_folds <- (1:V)[-task$training_folds]
}else{
valid_folds <- NULL
}
input <- get_risk_learner_input(split_Y = split_Y, Yname = task$Yname, V = V,
all_fits = all_fits,
all_weight = all_weight, all_sl = all_sl,
all_fit_tasks = all_fit_tasks, folds = folds,
sl_control = sl_control, learner = task$SL_wrap)
risk <- do.call(sl_control$cv_risk_fn, args = list(input = input, sl_control = sl_control))
# re-order influence function contributions
if(!is.null(risk$ic)){
if(length(risk$ic) != length(folds)){
stop(paste0("Length of influence curves returned by ",
sl_control$cv_risk_fn," is wrong. Check this",
"function for errors."))
}
n <- length(folds)
tmp <- rep(NA, n)
tmp[unlist(split(1:n, folds))] <- risk$ic
risk$ic <- tmp
}
out <- c(Yname = task$Yname, SL_wrap = task$SL_wrap, risk)
return(out)
}
#' Get cross-validated risk of entire procedure (i.e., outer-most cross-validation layer)
#'
#' @param Y A matrix or data.frame of outcomes
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param all_fits A list of all learner fits (from \code{get_fit})
#' @param all_sl A list of all super learner fits (from \code{get_sl})
#' @param all_fit_tasks A list of all learner fitting tasks (quicker to search over
#' than \code{all_fits}).
#' @param all_weight A list of all outcome weights (from \code{get_y_weight})
#' @param sl_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param y_weight_control A list with named entries ensemble.fn, optim_risk_fn, weight_fn,
#' cv_risk_fn. Available functions can be viewed with \code{y_weight_control_options()}. See
#' \code{?y_weight_control_options} for more on how users may supply their own functions.
#' @param folds Vector identifying which fold observations fall into.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#'
#' @return Numeric risk value
get_risk <- function(Y, V, all_fit_tasks, all_fits, all_weight, folds, all_sl,
sl_control, y_weight_control, learners){
split_Y <- split(data.frame(Y, folds), folds)
split_Y <- lapply(split_Y, function(x){ x[,-ncol(x)] })
input <- get_risk_input(split_Y = split_Y, Ynames = colnames(Y),
all_fits = all_fits, all_weight = all_weight,
all_sl = all_sl, all_fit_tasks = all_fit_tasks,
sl_control = sl_control, V = V,
learners = learners)
risk <- do.call(y_weight_control$cv_risk_fn,
args = list(input = input, y_weight_control = y_weight_control))
# re-sort ic contributions
if(!is.null(risk$ic)){
if(length(risk$ic) != length(folds)){
stop(paste0("Length of influence curves returned by ",
y_weight_control$cv_risk_fn," is wrong. Check this",
"function for errors."))
}
n <- length(folds)
tmp <- rep(NA, n)
tmp[unlist(split(1:n, folds))] <- risk$ic
risk$ic <- tmp
}
return(risk)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.