#' Find fitting models and test them using given metrics on the test dataset
#'
#' @param train The training dataset
#' @param test The testing dataset
#' @param response The response column as a string
#' @param models A list of models. Each model should be a list, containing at least a training function \code{.train} and a \code{.predict} function, plus named
#' vectors of parameters to explore.
#'
#' The \code{.train} function has to take a \code{data} argument that stores the training data and a \code{...} argument for the parameters.
#' The \code{.predict} function needs to take two arguments, where the first is the model and the second the new dataset.
#'
#' If a parameter only takes a single value, you can use a vector to store options. Otherwise use a list.
#'
#' You can use \code{\link{model_trainer}} as a wrapper for this list. It will also test your inputs.
#' @param metrics A list of metrics (functions) that need to be calculated on the train and test response and predictions
#' @param parameter_sample_rate Optional parameter. If set in the range \code{(0,1])}, it will be used to sample the possible combinations of parameters
#' @param seed Random seed to set each time before a model is trained
#' @param preprocess_pipes List of preprocessing pipelines generated using \code{\link{pipeline}}.
#' @param prepend_data_checker Flag indicating if \code{\link{pipe_check}} should be prepended before all pipelines.
#' @param on_missing_column See \code{\link{pipe_check}} for details.
#' @param on_extra_column See \code{\link{pipe_check}} for details.
#' @param on_type_error See \code{\link{pipe_check}} for details.
#' @param verbose Should intermediate updates be printed.
#' @param save_model Flag indicating if the generated models should be saved. Defaults to False.
#'
#' @return A dataframe containing the training function, a list of parameters used to train the function, and one column for each metric / dataset combination.
#' @export
#' @importFrom purrr map_dbl map_lgl pmap_df
find_model <- function(train, test, response,
models, metrics, parameter_sample_rate = 1, seed = 1,
prepend_data_checker = T,
on_missing_column = c("error", "add")[1],
on_extra_column = c("remove", "error")[1],
on_type_error = c("ignore", "error")[1],
verbose = T,
save_model = F,
preprocess_pipes = list(function(train, test) return(list(train = train, test = train, .predict = function(data) return(data))))
) {
stopifnot(
!missing(train), is.data.frame(train),
!missing(test) , is.data.frame(test),
is.character(response), length(response) == 1,
is.logical(prepend_data_checker),
is.logical(save_model),
is.list(models),
parameter_sample_rate <= 1, parameter_sample_rate > 0,
is.list(metrics), !any(!purrr::map_lgl(metrics, is.function)),
is.list(preprocess_pipes), !any(!purrr::map_lgl(preprocess_pipes, is.function))
)
models_have_valid_elements <- purrr::map_lgl(models, function(m) {
if(any(!c(".train", ".predict") %in% names(m))) return(F)
are_functions <- purrr::map_lgl(m[c(".train", ".predict")], is.function)
return(!any(!are_functions))
})
if(is.null(names(models))) model_names <- seq_along(models)
else model_names <- names(models)
if(is.null(names(preprocess_pipes))) pipe_names <- seq_along(preprocess_pipes)
else pipe_names <- names(preprocess_pipes)
if(any(!models_have_valid_elements)) stop("Error: all models must contain .train and .predict elements that are functions")
if(!is.null(names(metrics))) metric_names <- names(metrics)
else metric_names <- paste0("metric_", seq_along(metrics))
res <- data_frame(.train = list(), .predict = list(), .id = "", params = list(), .preprocess_pipe = list())[0,]
for(metric_name in metric_names) {
res[paste0("train_", metric_name)] <- numeric(0)
res[paste0("test_", metric_name)] <- numeric(0)
}
if(save_model) res[".model"] <- list()
# Try each preprocessing pipe
for(preprocess_index in seq_along(preprocess_pipes)){
preprocess_pipe <- preprocess_pipes[[preprocess_index]]
if(prepend_data_checker){
preprocess_pipe <- train_pipeline(
segment(.segment = pipe_check, response = response,
on_missing_column = on_missing_column, on_extra_column = on_extra_column, on_type_error = on_type_error),
segment(.segment = preprocess_pipe))
}
piped <- preprocess_pipe(train)
piped_train <- piped$train
trained_pipeline <- piped$pipe
piped_test <- invoke(trained_pipeline, test)
pipe_name <- pipe_names[preprocess_index]
# Make sure response is in the final training / testing dataset
stopifnot(
response %in% colnames(piped_train),
response %in% colnames(piped_test)
)
# Try each model
for(model_index in seq_along(models)) {
model <- models[[model_index]]
model_name <- model_names[model_index]
f_train <- model[[".train"]]
f_predict <- model[[".predict"]]
training_wrapper <- function(...) f_train(data = piped_train, ...)
parameter_grid <- expand.grid(stringsAsFactors = F, model[!names(model) %in% c(".train", ".predict")])
if(parameter_sample_rate < 1) {
n_samples <- ceiling(nrow(parameter_grid) * parameter_sample_rate)
subsection <- sample.int(n = nrow(parameter_grid), size = n_samples, replace = F)
parameter_grid <- parameter_grid[subsection,,drop = F]
}
if(nrow(parameter_grid) < 1) parameter_grid <- data_frame(1)[,0]
parameter_grid <- as_data_frame(parameter_grid)
# Try each combination of parameters
for(r in seq_len(nrow(parameter_grid))) {
if(verbose) {
update_message <- paste("\rComputing preprocess pipeline", preprocess_index, "/", length(preprocess_pipes), "model", model_index, "/", length(models), "iteration", r, "/", nrow(parameter_grid))
cat(update_message)
}
set.seed(seed)
args <- as.list(parameter_grid[r,])
requested_arguments <- formalArgs(f_train)
if(any(!names(args) %in% requested_arguments) && !"..." %in% requested_arguments) {
faulty_args <- names(args)[!names(args) %in% requested_arguments]
text_args <- paste0(collapse = ", ", faulty_args)
stop(paste0("Warning in preprocess pipeline ", preprocess_index, ", model ", model_index , ": arguments `", text_args, "` were not arguments of the provided .train function"))
}
model <- do.call(what = training_wrapper, args = args)
# Do train and test predictions and calculate metrics
train_preds <- f_predict(model, piped_train)
train_metrics_calculated <- purrr::map_dbl(.x = metrics, function(f) f(unlist(piped_train[response]), train_preds))
test_preds <- f_predict(model, piped_test)
test_metrics_calculated <- purrr::map_dbl(.x = metrics, function(f) f(unlist(piped_test[response]), test_preds))
tmp <- list(".train" = list(f_train), ".predict" = list(f_predict), ".id" = paste0(pipe_name, "_", model_name), "params" = list(parameter_grid[r,]), ".preprocess_pipe" = list(trained_pipeline))
tmp[paste0("train_", metric_names)] <- train_metrics_calculated
tmp[paste0("test_", metric_names)] <- test_metrics_calculated
if(save_model) tmp$.model <- list(model)
if("post_pipe" %in% names(piped)) tmp$.post_pipe <- list(piped$post_pipe)
res <- bind_rows(res, tmp)
}
}
if(verbose) cat("\n")
}
return(res)
}
#' Select top models from the find_model function
#'
#' @param train The training dataset
#' @param find_model_result Result from the find_model function
#' @param metric Target metric, as a string
#' @param higher_is_better Logical indicating if the results should be sorted from high to low.
#' @param per_model Logical indicating if we should take N models per model type or in total
#' @param top_n The top n models to return
#' @param aggregate_func Aggregation function to apply. Useful if you choose more than 1 model. Set this to NA to skip it.
#'
#' @return A pipeline to create predictions for new data
#' @export
find_best_models <- function(train, find_model_result, metric, higher_is_better, per_model = F, top_n = 1, aggregate_func = NA) {
stopifnot(
is.logical(per_model),
is.logical(higher_is_better),
is.data.frame(find_model_result),
is.data.frame(train),
is.character(metric), length(metric) == 1,
!any(!c(".id", ".train", metric, "params") %in% colnames(find_model_result)),
is.function(aggregate_func) || is.na(aggregate_func)
)
find_model_result <- find_model_result[order(unlist(find_model_result[,metric]), decreasing = higher_is_better),]
if(per_model) find_model_result %<>% group_by(.id)
find_model_result %<>% filter(row_number() <= top_n) %>%
mutate(model_name = paste0(.id, "_", row_number()))
if(".model" %in% colnames(find_model_result)) {
models <- find_model_result$.model
names(models) <- find_model_result$model_name
} else {
models <- apply(X = find_model_result, MARGIN = 1, function(r){
d <- invoke(r$.preprocess_pipe, train)
m <- do.call(what = r$.train, args = list(data = d) %>% c(unlist(r$params)))
return(m)
}) %>% as.list
names(models) <- find_model_result$model_name
}
model_pipe <- pipe(.function = model_prediction, models = models, pipes = find_model_result$.preprocess_pipe,
model_predict_functions = find_model_result$.predict)
if(is.function(aggregate_func)) {
aggreation_pipe_function <- function(data, func)
dplyr::as_data_frame(apply(X = data, MARGIN = 1, FUN = func))
full_pipe <- pipeline(
"model" = model_pipe,
"aggregate" = pipe(.function = aggreation_pipe_function, func = aggregate_func)
)
} else full_pipe <- pipeline("model" = model_pipe)
return(full_pipe)
}
model_prediction <- function(data, models, pipes, model_predict_functions) {
purrr::pmap_df(list(model = models, pred_func = model_predict_functions, trained_pipeline = pipes),
function(model, pred_func, trained_pipeline) pred_func(model, invoke(trained_pipeline, data)))
}
#' Wrapper function for model inputs to \code{find_model}
#'
#' @param .train The training function. This function has to take a \code{data} argument that stores the training data and a \code{...} argument for the parameters.
#' @param .predict The prediction function. This function needs to take two arguments, where the first is the model and the second the new dataset.
#' @param ... Other parameters to be passed to the training function.
#'
#' @return A list for use as a model input with \code{\link{find_model}}
#' @export
#'
#' @examples
#' d <- data.frame(x = 1:10, y = 1:10 + rnorm(10, sd = .3))
#' model_trainer(
#' .train = function(data, ...) lm(data = data, formula = y ~ ., ...),
#' .predict = function(x,y) predict.lm(x,y),
#' x = FALSE
#' )
model_trainer <- function(.train, .predict, ...) {
stopifnot(
is.function(.train), !any(!c("data", "...") %in% formalArgs(.train)),
is.function(.predict), length(formalArgs(.predict)) >= 2
)
return(list(
.train = .train,
.predict = .predict,
...
))
}
#' A convenient wrapper function for find_model for models that have a formula and data argument, like lm
#'
#' @param response The response column, as a string.
#' @param training_function The function that trains the algorithm, such as \code{\link{lm}}
#' @param ... Named vectors of parameters to explore.
#'
#' @return A list that's compatible with the \code{\link{find_model}} function
#' @export
#'
#' @examples
#' # uses svm function from library e1071
#' find_template_formula_and_data(response = "response", training_function = svm,
#' type = c("eps-regression", "nu-regression"),
#' kernel = c("radial"), gamma = c(.001, .002, .003, .0008))
find_template_formula_and_data <- function(response, training_function, ...) {
form <- paste(response, "~ .") %>% as.formula
return(model_trainer(
.train = function(data, ...) training_function(formula = form, data = data, ...),
.predict = function(x, y) {
if(response %in% colnames(y)) y <- select_(y, paste0("-", response))
predict(x, y)
},
...
))
}
#' A convenient wrapper function for find_model for xgboost
#'
#' @param response The response column, as a string.
#' @param ... Named vectors of parameters to explore.
#'
#' @return A list that's compatible with the \code{\link{find_model}} function
#' @export
#'
#' @importFrom xgboost xgb.train xgb.DMatrix
#' @examples
#' find_xgb(response = "response", nrounds = c(30, 50), eta = c(.015),
#' max_depth = c(2), colsample_bytree = c(.7), lambda = c(.1),
#' subsample = c(.7), base_score = c(100))
find_xgb <- function(response, ...) {
return(model_trainer(
.train = function(data, ...) xgboost::xgb.train(data = xgboost::xgb.DMatrix(as.matrix(select_(data, paste0("-", response))), label = unlist(data[response])), ...),
.predict = function(x, y) {
if(response %in% colnames(y)) y <- select_(y, paste0("-", response))
predict(x, as.matrix(y))
},
...
))
}
#' Expands the params column of the result of \code{find_model}
#'
#' @param find_model_result The result from \code{find_model}
#'
#' @return A dataframe with the params column expanded
#' @export
find_expand_results <- function(find_model_result) {
return(bind_cols(find_model_result[names(find_model_result) != "params"], bind_rows(find_model_result$params)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.