#' Cross-validated maximal association measures
#'
#' A flexible interface for computing cross-validation-based measures of maximal association.
#' In an outer layer of V-fold cross validation, training samples are used to train a prediction
#' algorithm for each outcome. Multiple algorithms may be ensembled using stacking (also known as
#' super learning) based on V-2 fold cross-validation. An inner layer of V-1 cross validation is used
#' to determine a user-specified combination of outcomes that maximizes a user-specified prediction
#' criteria. The outer layer
#' validation sample is used to compute a user-specified cross-validated measure of performance of
#' the prediction algorithm for predicting the combined outcome that was computed in the training
#' sample. Several common choices for outcome combinations (convex combination of
#' outcomes and single outcome that is most associated) and prediction criteria (nonparametric R^2,
#' negative log-likelihood, and area under ROC curve) are included; however, users may specify
#' their own criteria as well. The function returns the cross-validated summary measure for the
#' maximally combined outcome and, if desired, the cross-validated summary measure for each
#' outcome.
#'
#' TO DO: Figure out how future works (e.g., can plan() be specified internally
#' or externally?)
#'
#' @param Y A matrix or data.frame of outcomes
#' @param X A matrix or data.frame of predictors
#' @param V Number of outer folds of cross-validation (nested cross-validation
#' uses V-1 and V-2 folds), so must be at least four.
#' @param learners Super learner wrappers. See \code{SuperLearner::listWrappers}.
#' @param sl_control A list with named entries ensemble_fn, optim_risk_fn, weight_fn,
#' cv_risk_fn, family. Available functions can be viewed with \code{sl_control_options()}. See
#' \code{?sl_control_options} for more on how users may supply their own functions.
#' @param y_weight_control A list with named entries ensemble_fn, optim_risk_fn, weight_fn,
#' cv_risk_fn. Available functions can be viewed with \code{y_weight_control_options()}. See
#' \code{?y_weight_control_options} for more on how users may supply their own functions.
#' @param return_control A list with named entries \code{outer_weight} (whether to return outcome
#' weights for outer-most fold of CV, default \code{TRUE}), \code{outer_sl} (whether to return the
#' super learner fit for each outcome on all the data), \code{all_y} (whether to return cross-validated
#' performance metrics for all outcomes), \code{all_learner_assoc} (whether to return cross-validation
#' performance metrics for all learners), \code{all_learner_fits} (whether to return all learner fits,
#' which, while memory intensive, can be helpful if association measures based on different outcome
#' weighting schemes are desired). TO DO: For all the control options, it would be nice
#' if one could just input one of the entries and have the others set to default a la
#' SuperLearner control or caret trControl.
#' @param scale Standardize each outcome to be mean zero with standard deviation 1.
#'
#' @return \code{cv_assoc} returns risk for the entire procedure. The \code{cv_assoc_all_y} will return
#' cross-validated performance metric for all the outcomes, including the confidence interval, p-value and
#' influence curve. \code{all_learner_assoc} will return for each outcome and learner
#' cross-validated metric, confidence interval, associated p-value and influence curve.
#' The \code{sl_fit} will return Super Learner fit for each outcome and associated learner risks on
#' all the data. In addition, it will return the fit for all learners based on all folds.
#' The \code{outer_weight} will return the outcome weights obtained using outer-most fold of CV.
#' \code{inner_weight} returns outcome weights obtained using inner-most fold of CV. Additinally,
#' \code{all_learner_fits} returns all learner fits.
#' TO DO: Should cvma have $cv_measure, $ci_low, $ci_high, and $p_value returned?
#' These are seen when the objects are printed so it may be natural for users to think
#' that those are named in the cvma object.
#'
#' @export
#'
#' @seealso predict method
#'
#' @importFrom future.apply future_lapply
#' @importFrom stats gaussian
#' @import SuperLearner
#'
#' @examples
#' set.seed(1234)
#' library(SuperLearner)
#' library(future)
#' X <- data.frame(x1=runif(n=100,0,5), x2=runif(n=100,0,5))
#' Y1 <- rnorm(100, X$x1 + X$x2, 1)
#' Y2 <- rnorm(100, X$x1 + X$x2, 3)
#' Y <- data.frame(Y1 = Y1, Y2 = Y2)
#' fit <- cvma(Y = Y, X = X, V = 5,
#' learners = c("SL.glm","SL.mean"))
cvma <- function(Y, X, V = 5, learners,
sl_control = list(ensemble_fn = "ensemble_linear",
optim_risk_fn = "optim_risk_sl_se",
weight_fn = "weight_sl_convex",
cv_risk_fn = "cv_risk_sl_r2",
family = gaussian(),
alpha = 0.05),
y_weight_control = list(ensemble_fn = "ensemble_linear",
weight_fn = "weight_y_convex",
optim_risk_fn = "optim_risk_y_r2",
cv_risk_fn = "cv_risk_y_r2",
alpha = 0.05),
return_control = list(outer_weight = TRUE,
outer_sl = TRUE,
inner_sl = FALSE,
all_y = TRUE,
all_learner_assoc = TRUE,
all_learner_fits = FALSE),
scale = FALSE
){
# get initial parameter values
n <- length(Y[,1])
J <- ncol(Y)
M <- length(learners)
#scale all multivariate outcome by covariance matrix?
if(scale & !(all(Y==0 | Y==1))){
Ymat <- data.frame(scale(Y))
}else{
# put Y into the proper format
Ymat <- data.matrix(Y)
}
# Checks
if(!all(apply(Ymat,2,function(x) { all(x %in% 0:1) })) & sl_control$optim_risk_fn == "optim_risk_sl_auc"){
stop("Outcome should be binary.")
}
if(y_weight_control$weight_fn == "weight_y_convex" & y_weight_control$optim_risk_fn == "optim_risk_y_auc"){
stop("risk_y_auc requires all composite outcome to be either 0 or 1.")
}
# correct names if none
if(is.null(colnames(Ymat))){
colnames(Ymat) <- paste0("Y",1:J)
}
# TO DO: make_folds function with more options, possibly from origami?
if(!all(Y[,1] %in% c(0,1))){
folds <- rep(seq_len(V), length = n)
folds <- sample(folds)
}else{
within.split <- suppressWarnings(tapply(1:n,
INDEX = Y, FUN = split, rep(1:V)))
validRows <- vector("list", length = V)
names(validRows) <- paste(seq(V))
for (vv in seq(V)) {
validRows[[vv]] <- c(within.split[[1]][[vv]],
within.split[[2]][[vv]])
}
folds <- rep(NA, n)
for(v in 1:V){
folds[validRows[[v]]] <- v
}
}
# all learner fitting tasks
all_fit_tasks <- make_fit_task_list(Ynames = colnames(Ymat), learners = learners,
V = V, return_outer_sl = return_control$outer_sl)
all_fits <- future.apply::future_lapply(X = all_fit_tasks, FUN = get_fit, folds = folds,
tmpX = X, Y = Ymat, sl_control = sl_control)
# all super learner weight-getting tasks
all_sl_tasks <- make_sl_task_list(Ynames = colnames(Ymat), V = V)
# TO DO: I have a hunch that if future_lapply requires transferring
# all_fits between nodes that the communication overhead will make
# parallelization of this step slower than doing it sequentially
all_sl <- lapply(all_sl_tasks, FUN = get_sl,
Y = Ymat, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, folds = folds,
learners = learners, sl_control = sl_control)
# all outcome weight tasks
if(length(colnames(Ymat)) > 1){
all_y_weight_tasks <- make_y_weight_task_list(V = V)
}else{
all_y_weight_tasks <- list(list(training_folds = 1:V))
}
all_weight <- lapply(all_y_weight_tasks, FUN = get_y_weight,
Y = Ymat, V = V, Ynames = colnames(Ymat),
all_fits = all_fits, all_sl = all_sl,
all_fit_tasks = all_fit_tasks,
sl_control = sl_control, y_weight_control = y_weight_control, folds = folds,
learners = learners)
# get risk of entire procedure
risk <- get_risk(Y = Y, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, all_sl = all_sl, folds = folds,
all_weight = all_weight,
sl_control = sl_control, y_weight_control = y_weight_control, learners = learners)
# compute outer super learner
if(return_control$outer_sl){
outer_sl_tasks <- make_outer_sl_task_list(Ynames = colnames(Ymat), V = V)
all_outer_sl <- lapply(outer_sl_tasks, FUN = get_formatted_sl,
Y = Ymat, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, folds = folds,
sl_control = sl_control, learners = learners,
return_learner_fits = TRUE)
}else{
all_outer_sl <- NULL
}
# format inner super learner
if(return_control$inner_sl){
inner_sl_tasks <- make_sl_task_list(Ynames = colnames(Ymat), V = V,
fold_fits = V-1)
all_inner_sl <- lapply(inner_sl_tasks, FUN = get_formatted_sl,
Y = Ymat, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, folds = folds,
sl_control = sl_control, learners = learners,
return_learner_fits = TRUE)
}else{
all_inner_sl <- NULL
}
# compute outer weights
if(return_control$outer_weight){
outer_weight <- get_y_weight(task = list(training_folds = 1:V),
Y = Ymat, V = V, Ynames = colnames(Ymat),
all_fits = all_fits, all_sl = all_sl,
all_fit_tasks = all_fit_tasks,
sl_control = sl_control, y_weight_control = y_weight_control, folds = folds,
learners = learners)
}else{
outer_weight <- NULL
}
# get CV risk for each learner and each outcome
if(return_control$all_y){
outer_sl_tasks <- make_outer_sl_task_list(Ynames = colnames(Ymat), V = V)
risk_all_y <- lapply(outer_sl_tasks, FUN = get_risk_sl,
Y = Y, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, all_sl = all_sl,
folds = folds, sl_control = sl_control, learners = learners)
}else{
risk_all_y <- NULL
}
if(return_control$all_learner_assoc){
outer_learner_tasks <- make_outer_learner_task_list(Ynames = colnames(Ymat),
V = V, learners = learners)
risk_all_learners <- lapply(outer_learner_tasks, FUN = get_risk_learner,
Y = Y, V = V, all_fit_tasks = all_fit_tasks,
all_fits = all_fits, all_sl = all_sl,
folds = folds, sl_control = sl_control, learners = learners)
}else{
risk_all_learners <- NULL
}
if(!return_control$all_learner_fits){
all_fits <- NULL
}
# format output
out <- list(cv_assoc = risk,
cv_assoc_all_y = risk_all_y,
cv_assoc_all_learners = risk_all_learners,
sl_fits = all_outer_sl,
inner_sl_fits = all_inner_sl,
outer_weight = outer_weight,
inner_weight = all_weight,
folds = folds,
y_names = colnames(Ymat),
learners = learners,
all_learner_fits = all_fits,
V = V)
class(out) <- "cvma"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.