# silence R CMD check notes for the variables bound by foreach() below
utils::globalVariables(c("ind", "o", "s"))
#' Create an ensemble from models
#'
#' Combine the predictions of multiple models
#'
#' You can create an ensemble model based on the predictions of multiple models.
#' The ensemble prediction is calculated according to the \code{ensemble_method}:
#' \itemize{
#' \item \code{mean} - the mean of the models' predictions.
#' \item \code{median} - the median of the models' predictions.
#' \item \code{weighted_mean} - the weighted mean of the models' predictions. Weights are based on the \code{metric},
#' so models with a higher metric value have more weight in the mean.
#' \item \code{number_votes} - the number of predictions of the first class (considered positive or presence)
#' divided by the number of models. A prediction close to 1 means that all models
#' agree on the first class. Only models of type "Classification" are supported.
#' }
#' For classification, the predicted class probabilities are used to create the ensemble.
#' @param model.list A list of models returned by \code{\link[caret]{train}}.
#' @param ensemble_method One of 'mean', 'median', 'weighted_mean' or 'number_votes'. See Details.
#' @param metric A metric used to calculate the weights. Only used if \code{ensemble_method = "weighted_mean"}.
#' Must be one of the metrics provided by the 'summaryFunction' used to train the models.
#' If \code{NULL}, the metric of the first model is used.
#' @param calc.pred logical. Calculate predictions for the training data? If \code{TRUE}, \code{savePredictions}
#' is set to "final"; otherwise, \code{savePredictions} is set to \code{FALSE}.
#' @return An S3 object of class "ensemble.train" that also inherits from "train".
#' This object can be used in other functions, like \code{evaluate} or \code{confidence_map}.
#' @seealso \code{\link{confidence_map}}
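#' @examples
#' \dontrun{
#' # Minimal usage sketch (assumptions: `train_data` and `test_data` are
#' # placeholder data.frames with a two-level factor response `Class`; the
#' # methods "rf" and "glmnet" are illustrative). All models should share the
#' # same resampling indices, hence the explicit `index`.
#' folds <- caret::createFolds(train_data$Class, k = 5, returnTrain = TRUE)
#' ctrl <- caret::trainControl(method = "cv", index = folds, classProbs = TRUE,
#'                             savePredictions = "final",
#'                             summaryFunction = caret::twoClassSummary)
#' m1 <- caret::train(Class ~ ., data = train_data, method = "rf",
#'                    metric = "ROC", trControl = ctrl)
#' m2 <- caret::train(Class ~ ., data = train_data, method = "glmnet",
#'                    metric = "ROC", trControl = ctrl)
#' ens <- createEnsemble(list(m1, m2), ensemble_method = "weighted_mean",
#'                       metric = "ROC")
#' predict(ens, newdata = test_data, type = "prob")
#' }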
#' @export
createEnsemble <- function(model.list, ensemble_method = "weighted_mean", metric = NULL, calc.pred = FALSE) {
check_list(model.list)
check_list_ensemble(model.list)
out <- list()
out$model.list <- model.list
# add items in the list so it can be evaluated later
out$method <- "ensemble"
out$modelInfo <- list(label = "Ensemble",
parameters = data.frame(parameter = "params", class = "numeric", label = "dummy"))
out$modelType <- model.list[[1]]$modelType
out$levels <- model.list[[1]]$levels
out$coefnames <- model.list[[1]]$coefnames
out$xlevels <- model.list[[1]]$xlevels
out$trainingData <- model.list[[1]]$trainingData
out$control <- model.list[[1]]$control
# select algorithm
out$ensemble_method <- ensemble_method
out$algo <- switch(ensemble_method,
mean = function(x, w) mean(x),
median = function(x, w) stats::median(x),
weighted_mean = function(x, w) stats::weighted.mean(x, w),
number_votes = function(x, w) sum(x == w) / length(x)
)
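# Illustration (toy values, not part of the computation): for one row with
# predictions x = c(0.2, 0.6, 0.7) from three models and weights
# w = c(0.8, 0.9, 0.7):
#   mean          -> mean(x)                    = 0.5
#   median        -> stats::median(x)           = 0.6
#   weighted_mean -> stats::weighted.mean(x, w) ~ 0.496
# For number_votes, x holds the models' class labels and w the first (positive)
# level, so sum(x == w) / length(x) is the share of models voting for it.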
if (is.null(out$algo))
stop("ensemble_method should be one of 'mean', 'median',
'weighted_mean' or 'number_votes'")
# check model.type
if (ensemble_method == "number_votes") {
if (out$modelType != "Classification")
stop("All models in 'model.list' must be of type Classification for this algorithm.")
out$type <- "raw"
} else {
out$type <- if (out$modelType == "Classification") "prob1" else "raw"
}
# get weights
if (ensemble_method == "weighted_mean") {
if (is.null(metric)) metric <- model.list[[1]]$metric
tmp <- lapply(model.list, caret::getTrainPerf)
if (is.null(tmp[[1]][[paste0("Train", metric)]]))
stop("'metric' must be a valid metric from summaryFunction used to train the model.")
out$w <- sapply(tmp, `[[`, paste0("Train", metric))
} else if (ensemble_method == "number_votes") {
presence <- sapply(model.list, function(x) x$levels[1])
if (!all(presence == presence[1]))
stop("All models in 'model.list' should have the same first level as response.")
out$w <- presence[1]
} else {
out$w <- NULL
}
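# e.g. (illustrative): with metric = "ROC" and resampled TrainROC values of
# 0.85 and 0.92, out$w is c(0.85, 0.92), so the second model contributes
# proportionally more to the weighted mean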
# calculate predictions if possible
if (calc.pred) {
# check 1: all models must use the same number of resampling replications
check_control0 <- sapply(model.list, function(x) length(x$control$indexOut))
check_control0 <- all(check_control0 == check_control0[1])
if (!check_control0) {
message("Distinct number of replication detected. Skipping predictions calculation!")
calc.pred <- FALSE
}
# check 2: all models must have saved their resampled predictions
check_control1 <- sapply(model.list, function(x)
x$control$savePredictions == "none" || x$control$savePredictions == FALSE)
if (any(check_control1)) {
message("'savePredictions' is not set to TRUE or 'final' in all models.
Skipping predictions calculation!")
calc.pred <- FALSE
}
# check 3: a model's training folds must not overlap the held-out folds of another model
check_control2 <- sapply(model.list, function(x) {
any(mapply(function(x, y) any(y %in% x), x = x$control$index, y = out$control$indexOut))
})
if (any(check_control2)) {
message("indexOut of one model has records from index of onother model at any replicate.
You should train all models of the ensemble using the same index and indexOut in trainControl.
Skipping predictions calculation!")
calc.pred <- FALSE
}
}
if (calc.pred) {
check_control3 <- sapply(model.list, function(x, y) identical(y, x$control$indexOut), y = out$control$indexOut)
# if any model used different resampling indices, its resampled predictions must be recalculated
if (!all(check_control3)) {
message("Models with distinct traindata detected. Using traindata of the first model to evaluate.")
m <- which(!check_control3)
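# For each mismatching model, refit it fold-by-fold on its own training rows
# (its control$index) using the seeds of its best tune, then predict on the
# first model's held-out rows (out$control$indexOut) so that the resampled
# predictions line up across models.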
for (i in m) {
args <- create_args(model.list[[i]])
args$trControl$savePredictions <- "final"
# locate the row of the best tune so the matching seeds can be reused
results <- model.list[[i]]$results
results$order <- seq(nrow(results))
results <- merge(results, model.list[[i]]$bestTune)
M <- results$order
`%op%` <- if (out$control$allowParallel && getDoParWorkers() > 1) `%dopar%` else `%do%`
results <- foreach(ind = model.list[[i]]$control$index,
s = model.list[[i]]$control$seeds,
o = out$control$indexOut,
.packages = "caret") %op% {
args$trControl$seeds <- s[M]
args$data <- model.list[[i]]$trainingData[ind, ]
args$weights <- model.list[[i]]$trainingData$.weights[ind]
tmp.model <- invisible(do.call("train", args))
if (!is.null(model.list[[i]]$thr)) tmp.model$thr <- model.list[[i]]$thr
predict2(tmp.model, out$trainingData[o, ], type = "both")
}
# update model$pred data.frame
results <- as.data.frame(rbindlist(results))
results2 <- data.frame(
obs = out$trainingData$.outcome,
rowIndex = unlist(out$control$indexOut),
Resample = rep(names(out$control$index), sapply(out$control$indexOut, length)),
model.list[[i]]$bestTune
)
model.list[[i]]$pred <- cbind(results, results2)
# update model$results data.frame - not sure if needed
# tmp <- data.table(model.list[[i]]$pred)
# tmpfunc <- function(x) as.data.table(as.list(
# model.list[[i]]$control$summaryFunction(as.data.frame(x), model.list[[i]]$levels)
# ))
# tmp <- tmp[, tmpfunc(.SD), by = "Resample"]
# tmp1 <- tmp[, lapply(.SD, mean), .SDcols = -"Resample"]
# tmp2 <- tmp[, lapply(.SD, stats::sd), .SDcols = -"Resample"]
# colnames(tmp2) <- paste0(colnames(tmp2), "SD")
# model.list[[i]]$results <- as.data.frame(cbind(model.list[[i]]$bestTune, tmp1, tmp2))
}
}
preds <- lapply(model.list, function(x) merge(x$pred, x$bestTune))
out_tmp <- preds[[1]][, c('obs', 'rowIndex', 'Resample')]
preds <- lapply(preds, as.data.table)
col_join <- c('Resample', 'rowIndex')
preds <- lapply(preds, function(x) x[out_tmp[, col_join], on = col_join])
if (out$modelType == "Classification") {
preds <- lapply(preds, `[[`, out$levels[1])
} else {
preds <- lapply(preds, `[[`, "pred")
}
preds <- apply(data.frame(preds), 1, out$algo, w = out$w)
if (out$modelType == "Classification") {
out$pred <- data.frame(
pred = factor(preds > 0.5, levels = c(TRUE, FALSE), labels = out$levels),
obs = out_tmp$obs,
lvl1 = preds,
lvl2 = 1 - preds,
rowIndex = out_tmp$rowIndex,
params = 0,
Resample = out_tmp$Resample
)
colnames(out$pred)[3:4] <- out$levels
} else {
# regression: no class-probability columns, just the ensemble prediction
out$pred <- data.frame(
pred = preds,
obs = out_tmp$obs,
rowIndex = out_tmp$rowIndex,
params = 0,
Resample = out_tmp$Resample
)
}
out$bestTune <- data.frame(params = 0)
out$control$savePredictions <- "final"
} else {
out$control$savePredictions <- FALSE
}
# create results table
e <- evaluate.train(out, errorFunction = stats::sd)
out$results <- t(data.frame(c(0, e$eval$value, e$eval$error)))
row.names(out$results) <- 1
colnames(out$results) <- c("param", as.character(e$eval$metric), paste0(e$eval$metric, "SD"))
# get terms if available
if (inherits(model.list[[1]], "train.formula")) {
out$terms <- model.list[[1]]$terms
class(out) <- c("ensemble.train", "train", "train.formula")
} else {
class(out) <- c("ensemble.train", "train")
}
return(out)
}
#' @param object An object returned by \code{createEnsemble}.
#' @param newdata A data.frame containing data to predict.
#' @param type One of "raw" or "prob".
#' @param scale logical. Scale the predictions of each model between 0 and 1 before combining them in the ensemble?
#' @param ... Further arguments passed to \code{predict}.
#' @rdname createEnsemble
#' @export
predict.ensemble.train <- function(object, newdata = NULL, scale = TRUE, type = "raw", ...) {
if (is.null(newdata)) newdata <- object$trainingData
preds <- lapply(object$model.list, predict2, newdata = newdata, type = object$type, ...)
# scale
if (scale && object$type == "prob1") preds <- lapply(preds, range01)
preds <- apply(data.frame(preds), 1, object$algo, w = object$w)
if (object$modelType == "Classification") {
if (type == "raw") {
preds <- factor(preds > 0.5, levels = c(TRUE, FALSE), labels = object$levels)
} else {
preds <- data.frame(l1 = preds, l2 = 1 - preds)
colnames(preds) <- object$levels
}
}
return(preds)
}
range01 <- function(x) {
rg <- range(x)
return((x - rg[1]) / (rg[2] - rg[1]))
}
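# e.g. range01(c(2, 4, 6)) returns c(0, 0.5, 1); note that constant
# predictions (rg[2] == rg[1]) would divide by zero and yield NaN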