#' Cross-validated mean squared-error for computing super learner
#'
#' In general, the function passed to \code{sl_control$optim_risk} should expect a named list
#' of outcomes (Y) and predictions (pred) in validation folds and should return a criteria by
#' which super learner weights may be optimized. The weights are input to the function via
#' \code{sl_weight} and are optimized in the \code{sl_control$weight_fn}. See
#' the Examples section below for the format of the \code{input} list used
#' for \code{sl_control$optim_risk} functions.
#'
#' In this case, the function computes cross-validated mean squared-error.
#'
#' @param sl_weight A numeric vector of super learner weights corresponding to each
#' \code{learner}. Typically, this is what is maximized over in \code{sl_control$weight_fn}.
#' @param input A list where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control Super learner control options.
#' @export
#'
#' @return Numeric value of cross-validated mean squared-error
#' @examples
#'
#' # simulate data with proper format
#' # Y is one component of the multivariate outcome
#' # pred is the predictions made by learners
#' input <- list(Y = rnorm(50), pred = cbind(rnorm(50), rnorm(50)))
#'
#' # made up weights
#' sl_weight <- c(0.5, 0.5)
#'
#' # linear ensemble
#' sl_control <- list(ensemble_fn = "ensemble_linear")
#'
#' # get risk
#' risk <- optim_risk_sl_se(sl_weight, input, sl_control)
optim_risk_sl_se <- function(sl_weight, input, sl_control){
  # combine the learner predictions into a single ensemble prediction
  ensemble_prediction <- do.call(
    sl_control$ensemble_fn,
    args = list(weight = sl_weight, pred = input$pred)
  )
  # mean squared distance between outcomes and ensemble predictions
  mean((input$Y - ensemble_prediction)^2)
}
#' Cross-validated negative log-likelihood for computing super learner
#'
#' In general, the function passed to \code{sl_control$optim_risk} should expect a named list
#' of outcomes (Y) and predictions (pred) in validation folds and should return a criteria by
#' which super learner weights may be optimized. The weights are input to the function via
#' \code{sl_weight} and are optimized in the \code{sl_control$weight_fn}. See
#' the Examples section below for the format of the \code{input} list used
#' for \code{sl_control$optim_risk} functions.
#'
#' In this case, the function computes cross-validated (quasi-) log-likelihood for the outcomes
#' scaled to be between 0 and 1. The option \code{trim} must be a value greater than
#' zero in order that the loss is bounded. The bounds on the outcome are set via
#' \code{l} and \code{u}.
#'
#' @param sl_weight A numeric vector of super learner weights corresponding to each
#' \code{learner}. Typically, this is what is maximized over in \code{sl_control$weight_fn}.
#' @param input A list where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control Super learner control options.
#' @param l Lower bound on outcomes
#' @param u Upper bound on outcomes
#' @param trim Where to trim \code{qlogis} to avoid \code{NA}s.
#'
#' @export
#'
#' @examples
#'
#' #Simulate data with proper format:
#' #Y is one component of the multivariate outcome, and
#' #pred is the predictions made by learners.
#' input <- list(Y = rnorm(50), pred = cbind(rnorm(50), rnorm(50)))
#'
#' #Made up weights:
#' sl_weight <- c(0.5, 0.5)
#'
#' #Linear ensemble:
#' sl_control <- list(ensemble_fn = "ensemble_linear")
#'
#' #Get risk by setting l and u to min and max:
#' risk <- optim_risk_sl_nloglik(sl_weight, input, sl_control, l = min(input$Y), u = max(input$Y))
#'
#'
#' @return Numeric value of cross-validated negative log-likelihood
#'
optim_risk_sl_nloglik <- function(sl_weight, input, sl_control,
                                  l = 0, u = 1, trim = 0.001){
  # ensemble prediction from the weighted learners
  ens_pred <- do.call(sl_control$ensemble_fn,
                      args = list(weight = sl_weight, pred = input$pred))
  # rescale predictions and outcomes onto [l, u] -> [0, 1], trimmed away from
  # the boundary so the log-likelihood below stays finite
  # NOTE(review): the `trim` argument is documented but never forwarded to
  # trim_p -- confirm trim_p's own default matches the intent here.
  p_hat <- trim_p((ens_pred - l)/(u - l))
  y_scaled <- trim_p((input$Y - l)/(u - l))
  # negative Bernoulli quasi-log-likelihood
  -mean(y_scaled * log(p_hat) + (1 - y_scaled) * log(1 - p_hat))
}
#' Cross-validated area under receiver operating characteristic curve (AUC)
#'
#'
#' In general, the function passed to \code{sl_control$optim_risk} should expect a named list
#' of outcomes (Y) and predictions (pred) in validation folds and should return a criteria by
#' which super learner weights may be optimized. The weights are input to the function via
#' \code{sl_weight} and are optimized in the \code{sl_control$weight_fn}. See
#' Examples section below for an example of the format of the \code{input} list used
#' for \code{sl_control$optim_risk} functions.
#'
#' In this case, the function computes cross-validated area under the receiver operating
#' characteristics curve (AUC) using \code{cvAUC} and returns one minus the AUC, so that
#' smaller values of the returned risk correspond to better discrimination.
#'
#' @param sl_weight A numeric vector of super learner weights corresponding to each
#' \code{learner}. Typically, this is what is maximized over in \code{sl_control$weight_fn}.
#' @param input A list where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control Super learner control options.
#' @export
#' @importFrom cvAUC cvAUC
#' @return Numeric value of one minus cross-validated AUC.
#'
#' @examples
#'
#' #Simulate data with proper format:
#' #Y is one component of the multivariate outcome, pred is the predictions made by 2 learners
#' input <- list(Y = rbinom(50, 1, 0.5), pred = cbind(runif(50,0,1), runif(50,0,1)))
#'
#' #Example weights:
#' sl_weight <- c(0.5, 0.5)
#'
#' #Linear ensemble:
#' sl_control <- list(ensemble_fn = "ensemble_linear")
#'
#' #Risk:
#' risk <- optim_risk_sl_auc(sl_weight, input, sl_control)
#'
optim_risk_sl_auc <- function(sl_weight, input, sl_control){
  # ensemble prediction from the weighted learners
  ens_pred <- do.call(sl_control$ensemble_fn,
                      args = list(weight = sl_weight, pred = input$pred))
  ens_y <- input$Y
  # cross-validated AUC of the ensemble predictions
  auc <- cvAUC::cvAUC(predictions = ens_pred, labels = ens_y)
  # return a risk (to be minimized): 1 - AUC
  risk <- 1 - auc$cvAUC
  return(risk)
}
#' Cross-validated non-parametric R-squared of the super learner
#'
#' In general, the function passed to \code{sl_control$cv_risk} should expect a list
#' of outcomes and predictions in validation folds. The function should return a
#' list with names cv_measure, ci_low, ci_high, and p_value. The output of this function
#' is returned irrespective of the names of the list; however, the names are
#' necessary for \code{print} methods to work properly.
#'
#' In this case, the confidence intervals are computed on the scale of log(MSE/Var) and
#' back-transformed to the R-squared scale. Here, MSE is the cross-validated mean
#' squared-error of the super learner for predicting a univariate outcome (one of the
#' components of \code{Y}) and Var is the cross-validated marginal mean of this outcome. The p-value
#' is for the one-sided hypothesis test that cross-validated R-squared equals zero against
#' the alternative that it is greater than zero.
#'
#' @param input List where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control List of super learner control options.
#'
#' @export
#' @importFrom stats qnorm pnorm
#' @return A list with named entries cv_measure, ci_low, ci_high, and p_value. The list will
#' be returned by \code{max_assoc} irrespective of the named entries; however, the \code{print}
#' methods will only work if the function returns the above names.
#'
#' @examples
#'
#' # simulate data with proper format
#' input <- list(list(valid_folds=1, Y = rbinom(50,1,0.5),pred = runif(50,0,1)),
#' list(valid_folds=2, Y = rbinom(50,1,0.5),pred = runif(50,0,1)))
#'
#' # alpha value
#' sl_control= list(alpha= 0.05)
#'
#' # get risk
#' cv_risk <- cv_risk_sl_r2(input, sl_control)
cv_risk_sl_r2 <- function(input, sl_control){
  # cross-validated MSE: mean of per-fold mean squared-errors
  mse_list <- unlist(lapply(input, FUN = function(i){
    mean((i$Y-i$pred)^2)
  }), use.names = FALSE)
  cv_mse <- mean(mse_list)
  # marginal variance of the outcome across all validation folds
  all_y <- unlist(lapply(input, "[", "Y"), use.names = FALSE)
  ybar <- mean(all_y)
  y_var <- mean((all_y - ybar)^2)
  # cross-validated R-squared = 1 - MSE/Var
  cv_r2 <- 1 - cv_mse/y_var
  # influence functions for MSE and Var
  all_pred <- unlist(lapply(input, "[", "pred"), use.names = FALSE)
  ic_mse <- (all_y - all_pred)^2 - cv_mse
  ic_var <- (all_y - ybar)^2 - y_var
  # delta-method gradients: log_grad for log(MSE/Var), grad for 1 - MSE/Var
  log_grad <- matrix(c(1/cv_mse, -1/y_var), ncol = 1)
  grad <- matrix(c(-1/y_var, cv_mse/y_var^2), ncol = 1)
  ic_mat <- rbind(ic_mse, ic_var)
  log_ic <- crossprod(log_grad, ic_mat)
  ic <- crossprod(grad, ic_mat)
  # FIX: the CI is formed on the log(MSE/Var) scale, so the SE must come from
  # log_ic (previously log_ic was computed but unused and the R2-scale ic was
  # used here instead, giving a mis-scaled interval and p-value).
  se_1mlog_cv_r2 <- as.numeric(sqrt(tcrossprod(log_ic))/length(ic_mse))
  # back-transform log-scale Wald interval to the R-squared scale
  ci_low <- 1 - exp(
    log(cv_mse/y_var) + stats::qnorm(1-(sl_control$alpha/2)) * se_1mlog_cv_r2
  )
  ci_high <- 1 - exp(
    log(cv_mse/y_var) - stats::qnorm(1-(sl_control$alpha/2)) * se_1mlog_cv_r2
  )
  # one-sided test of R2 = 0 (i.e., log(MSE/Var) = 0) vs. R2 > 0
  p_value <- stats::pnorm(log(cv_mse/y_var)/se_1mlog_cv_r2)
  return(list(cv_measure = cv_r2, ci_low = ci_low, ci_high = ci_high, p_value = p_value,
              ic = as.numeric(ic)))
}
#' Cross-validated negative log-likelihood of the super learner
#'
#' In general, the function passed to \code{sl_control$cv_risk} should expect a list
#' of outcomes and predictions in validation folds. The function should return a
#' list with names cv_measure, ci_low, ci_high, and p_value. The output of this function
#' is returned irrespective of the names of the list; however, the names are
#' necessary for \code{print} methods to work properly.
#'
#' Confidence intervals are computed for negative log-likelihood. The p-value
#' is for the one-sided hypothesis test that cross-validated negative log-likelihood
#' is less than -log(0.5), which is what the value of negative log-likelihood if
#' every observation were predicted to have Y = 1 with probability 0.5.
#'
#' @param input List where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control List of super learner control options.
#'
#' @export
#' @importFrom stats qnorm pnorm
#' @return A list with named entries cv_measure, ci_low, ci_high, and p_value. The list will
#' be returned by \code{max_assoc} irrespective of the named entries; however, the \code{print}
#' methods will only work if the function returns the above names.
#'
#' @examples
#'
#' # simulate data with proper format
#' input <- list(list(valid_folds=1, Y = rbinom(50,1,0.5),pred = runif(50,0,1)),
#' list(valid_folds=2, Y = rbinom(50,1,0.5),pred = runif(50,0,1)))
#'
#' # alpha value
#' sl_control= list(alpha= 0.05)
#'
#' # get risk
#' cv_risk <- cv_risk_sl_nloglik(input, sl_control)
cv_risk_sl_nloglik <- function(input, sl_control){
  # boundary trim so log() stays finite when predictions hit exactly 0 or 1
  eps <- .Machine$double.neg.eps
  # per-fold negative log-likelihood, with predictions clamped off 0/1
  nloglik_list <- unlist(lapply(input, FUN = function(i){
    i$pred[i$pred == 0] <- eps
    i$pred[i$pred == 1] <- 1 - eps
    -mean(i$Y * log(unlist(i$pred)) + (1 - i$Y) * log(1 - unlist(i$pred)))
  }), use.names = FALSE)
  cv_nloglik <- mean(nloglik_list)
  # pooled outcomes/predictions across folds for the influence function
  all_y <- unlist(lapply(input, "[", "Y"), use.names = FALSE)
  all_pred <- unlist(lapply(input, "[", "pred"), use.names = FALSE)
  # FIX: apply the same clamping here; previously predictions of exactly 0/1
  # produced infinite influence-function values (and hence Inf/NaN se, ci, p).
  all_pred[all_pred == 0] <- eps
  all_pred[all_pred == 1] <- 1 - eps
  ic_nloglik <- matrix(ifelse(all_y == 1, -log(all_pred), -log(1 - all_pred)) - cv_nloglik, nrow = 1)
  se_cv_nloglik <- as.numeric(sqrt(tcrossprod(ic_nloglik))/length(ic_nloglik))
  # Wald interval for the negative log-likelihood
  ci_low <- cv_nloglik - stats::qnorm(1-(sl_control$alpha/2)) * se_cv_nloglik
  ci_high <- cv_nloglik + stats::qnorm(1-(sl_control$alpha/2)) * se_cv_nloglik
  # one-sided test vs. the no-information value -log(0.5)
  p_value <- stats::pnorm((cv_nloglik + log(0.5))/se_cv_nloglik)
  return(list(cv_measure = cv_nloglik, ci_low = ci_low, ci_high = ci_high, p_value = p_value,
              ic = as.numeric(ic_nloglik)))
}
#' Cross-validated area under the receiver operating characteristic curve
#' for predictions based on super learner
#'
#' In general, the function passed to \code{sl_control$cv_risk} should expect a list
#' of outcomes and predictions in validation folds. The function should return a
#' list with names cv_measure, ci_low, ci_high, and p_value. The output of this function
#' is returned irrespective of the names of the list; however, the names are
#' necessary for \code{print} methods to work properly.
#'
#' In this case, the confidence intervals are computed using the \code{cvAUC::cvAUC.ci}
#' function from the \code{cvAUC} package. The p-value
#' is for the one-sided hypothesis test that cross-validated AUC equals 0.5 against
#' the alternative that it is greater than 0.5.
#'
#' @param input List where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner} and columns correspond to different \code{learner}).
#' @param sl_control List of super learner control options.
#' @export
#' @importFrom stats pnorm
#' @return A list with named entries cv_measure, ci_low, ci_high, p_value, and ic.
#' @examples
#' # simulate data with proper format
#' input <- list(list(valid_folds=1, Y = rbinom(50,1,0.5), pred = rbinom(50,1,0.5)),
#' list(valid_folds=2, Y = rbinom(50,1,0.5), pred = rbinom(50,1,0.5)))
#'
#' # alpha value
#' sl_control= list(alpha= 0.05)
#'
#' # get risk
#' cv_risk <- cv_risk_sl_auc(input, sl_control)
cv_risk_sl_auc <- function(input, sl_control){
  # fold-wise lists of labels and predictions, as expected by cvAUC helpers
  all_y <- lapply(input, '[[', "Y")
  all_pred <- lapply(input, function(x){ unlist(x$pred, use.names = FALSE) })
  cv_auc_fit <- ci.cvAUC_withIC(labels = all_y, predictions = all_pred, confidence = 1 - sl_control$alpha)
  # p-value of one-sided test that cvAUC = 0.5 vs. cvAUC > 0.5
  p_value <- stats::pnorm((cv_auc_fit$cvAUC - 0.5) / cv_auc_fit$se, lower.tail = FALSE)
  out <- list(cv_measure = cv_auc_fit$cvAUC, ci_low = cv_auc_fit$ci[1],
              ci_high = cv_auc_fit$ci[2], p_value = p_value, ic = cv_auc_fit$ic)
  # FIX: return explicitly; the original ended on an assignment, which returns
  # the value only invisibly.
  return(out)
}
#' Convex ensemble weights for super learner
#'
#' In general, the function passed to \code{sl_control$weight_fn} should expect a list of
#' named lists of outcomes (Y), predictions (pred) in validation folds. Typically,
#' this function is used to maximize \code{sl_control$optim_risk_fn} over
#' weights. The function should return a named list. One of the names in the list should
#' be \code{weight}, which is the optimized weights. Other entries in the return list
#' are passed on to \code{sl_control$cv_risk_fn} (e.g., things needed to compute
#' cross-validated measure of association, though none are present for this particular
#' function).
#'
#' In this case, the function uses \code{Rsolnp::solnp} to minimize
#' \code{sl_control$optim_risk_fn} over convex weights.
#'
#' @param input A list where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner} and columns correspond to different \code{learner}).
#' @param sl_control Super learner control options.
#'
#' @examples
#'
#' #Simulate data with proper format:
#' inp <- list(list(Y = rnorm(50), pred = cbind(rnorm(50), rnorm(50))))
#'
#' #Linear ensemble minimizing mean squared-error:
#' sl_control <- list(ensemble_fn = "ensemble_linear", optim_risk_fn = "optim_risk_sl_se")
#'
#' #Get weights to minimize optim_risk:
#' sl_weight <- weight_sl_convex(inp, sl_control)
#'
#' @export
#'
#' @return A named list with entry \code{weight}, the numeric vector of convex
#' super learner weights.
#'
weight_sl_convex <- function(input, sl_control){
  # number of candidate learners = columns of the prediction matrix
  M <- dim(input[[1]]$pred)[2]
  # equality constraint for solnp: weights sum to 1 (eqfun - eqB == 0)
  constraint <- function(weight, input, sl_control){
    1 - sum(weight)
  }
  # stack all validation folds into a single outcome/prediction data set
  all_pred <- Reduce(rbind, lapply(input, '[[', "pred"))
  all_y <- Reduce(c, lapply(input, '[[', "Y"))
  solnp_input <- list(Y = all_y, pred = all_pred)
  # minimize the risk over the simplex; match.fun resolves the risk function
  # from its name (replaces fragile eval(parse(text = ...)))
  fit <- Rsolnp::solnp(pars = rep(1/M, M),
                       fun = match.fun(sl_control$optim_risk_fn),
                       LB = rep(0, M), UB = rep(1, M), eqfun = constraint,
                       control = list(trace = 0), eqB = 0, input = solnp_input,
                       sl_control = sl_control)
  return(list(weight = fit$pars))
}
#' 0/1 weights for super learner (i.e., discrete super learner)
#'
#' In general, the function passed to \code{sl_control$weight_fn} should expect a list of
#' named lists of outcomes (Y), predictions (pred) in validation folds. Typically,
#' this function is used to maximize \code{sl_control$optim_risk_fn} over
#' weights. The function should return a named list. One of the names in the list should
#' be \code{weight}, which is the optimized weights. Other entries in the return list
#' are passed on to \code{sl_control$cv_risk_fn} (e.g., things needed to compute
#' cross-validated measure of association, though none are present for this particular
#' function).
#'
#' In this case, the function selects the single learner with the lowest value
#' returned by \code{sl_control$optim_risk_fn}
#'
#' @param input A list where each entry corresponds to a validation fold. Each entry is a list
#' with entries: Y (univariate outcome for this validation fold), pred (matrix of predictions
#' from \code{learner}, with columns corresponding to different \code{learner}s).
#' @param sl_control Super learner control options.
#' @return A named list with entry \code{weight}, a 0/1 numeric vector placing
#' weight 1 on the single risk-minimizing learner.
#' @export
#' @examples
#'
#' #Simulate data and properly format:
#' input <- list(list(Y = rbinom(50,1,0.5), pred = cbind(rnorm(50), rnorm(50))))
#'
#' #Linear ensemble
#' sl_control <- list(ensemble_fn = "ensemble_linear", optim_risk_fn = "optim_risk_sl_auc")
#'
#' #Get weights to minimize optim_risk:
#' sl_weight <- weight_sl_01(input, sl_control)
weight_sl_01 <- function(input, sl_control){
  # number of candidate learners = columns of the prediction matrix
  M <- dim(input[[1]]$pred)[2]
  # stack all validation folds into a single outcome/prediction data set
  all_pred <- Reduce(rbind, lapply(input, '[[', "pred"))
  all_y <- Reduce(c, lapply(input, '[[', "Y"))
  risk_input <- list(Y = all_y, pred = all_pred)
  # risk of each individual learner: put all weight on learner m
  risks <- vapply(seq_len(M), function(m){
    single_weight <- rep(0, M)
    single_weight[m] <- 1
    do.call(sl_control$optim_risk_fn,
            args = list(sl_weight = single_weight,
                        input = risk_input,
                        sl_control = sl_control))
  }, numeric(1))
  # weight 1 on the learner with the lowest risk, 0 elsewhere
  final_weight <- rep(0, M)
  final_weight[which.min(risks)] <- 1
  return(list(weight = final_weight))
}