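# Cross-validates each model formula in `formulas` over the fold column(s) in
# `data`, using the supplied `model_fn`, `predict_fn` and (optionally)
# `preprocess_fn`, and returns a tibble with one evaluation row per model.
# Judging by the `caller` default and the comments below, this serves as the
# shared backend for cross_validate() and cross_validate_fn().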
cross_validate_list <- function(data,
formulas,
model_fn,
predict_fn,
preprocess_fn = NULL,
preprocess_once = FALSE,
hyperparameters = NULL,
fold_cols = ".folds",
family = "gaussian",
cutoff = 0.5,
positive = 2,
metrics = list(),
info_cols = list(),
rm_nc = FALSE,
verbose = FALSE,
parallel_ = FALSE,
caller = "cross_validate_fn()") {
if (checkmate::test_string(x = metrics, pattern = "^all$")) {
metrics <- list("all" = TRUE)
}
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_data_frame(
x = data,
min.rows = 2,
min.cols = 2,
add = assert_collection
)
checkmate::assert_character(
x = formulas,
min.len = 1,
any.missing = FALSE,
add = assert_collection
)
checkmate::assert_character(
x = fold_cols,
min.len = 1,
any.missing = FALSE,
add = assert_collection
)
checkmate::assert_choice(
x = family,
choices = c(
"gaussian",
"binomial",
"multinomial"
),
.var.name = "family/type",
add = assert_collection
)
checkmate::assert_number(
x = cutoff,
lower = 0,
upper = 1,
add = assert_collection
)
# Positive
checkmate::assert(
checkmate::check_choice(
x = positive,
choices = c(1, 2)
),
checkmate::check_string(
x = positive,
min.chars = 1
)
)
checkmate::assert_list(
x = metrics,
types = "logical",
any.missing = FALSE,
names = "named",
add = assert_collection
)
checkmate::assert(
checkmate::check_data_frame(
x = hyperparameters,
col.names = "named",
min.rows = 1,
min.cols = 1,
null.ok = TRUE
),
checkmate::check_list(
x = hyperparameters,
null.ok = TRUE,
any.missing = FALSE,
min.len = 1,
names = "named"
)
)
checkmate::assert_list(
x = info_cols,
any.missing = FALSE,
names = "named",
add = assert_collection
)
checkmate::assert_flag(x = verbose, add = assert_collection)
checkmate::assert_flag(x = rm_nc, add = assert_collection)
checkmate::assert_flag(x = preprocess_once, add = assert_collection)
checkmate::assert_flag(
x = parallel_,
add = assert_collection,
.var.name = "parallel"
)
checkmate::assert_string(x = caller, add = assert_collection)
checkmate::assert_function(
x = model_fn,
add = assert_collection
)
checkmate::assert_function(
x = predict_fn,
add = assert_collection
)
checkmate::assert_function(
x = preprocess_fn,
null.ok = TRUE,
add = assert_collection
)
checkmate::reportAssertions(assert_collection)
# Argument names
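# The user-supplied functions must have exactly these argument names, in this order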
checkmate::assert_names(
x = names(formals(model_fn)),
identical.to = c("train_data", "formula", "hyperparameters"),
what = "argument names",
.var.name = "model_fn argument names",
add = assert_collection)
checkmate::assert_names(
x = names(formals(predict_fn)),
identical.to = c(
"test_data", "model",
"formula", "hyperparameters",
"train_data"
),
what = "argument names",
.var.name = "predict_fn argument names",
add = assert_collection)
if (!is.null(preprocess_fn)){
checkmate::assert_names(
x = names(formals(preprocess_fn)),
identical.to = c(
"train_data", "test_data",
"formula", "hyperparameters"
),
what = "argument names",
.var.name = "preprocess_fn argument names",
add = assert_collection)
}
checkmate::reportAssertions(assert_collection)
if (length(setdiff(fold_cols, colnames(data))) > 0){
assert_collection$push(
paste0("the following 'fold_cols' columns were not in 'data': ", paste0(
setdiff(fold_cols, colnames(data)), collapse = ", "
))
)
}
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
# Convert to tibble
data <- dplyr::as_tibble(data) %>%
dplyr::ungroup()
# Add identifier for each observation so we can find which
# ones are hard to predict
tmp_observation_id_col <- create_tmp_name(data, name = ".observation")
data[[tmp_observation_id_col]] <- seq_len(nrow(data))
# Get evaluation type
evaluation_type <- family
# Check metrics # TODO Is this redundant?
check_metrics_list(metrics)
check_metrics_list(info_cols)
# Fill metrics with default values for non-specified metrics
# and get the names of the metrics to use
metrics <- set_metrics(
family = family, metrics_list = metrics,
include_model_object_metrics = TRUE
)
info_cols <- set_info_cols(
family = family,
info_cols_list = info_cols
)
# Check that the fold column(s) is/are factor(s)
check_fold_col_factor(data = data, fold_cols = fold_cols)
# When using cross_validate() we need to extract a few hparams
# Hyperparameters for REML, link, control, is_cross_validate
special_hparams <- extract_special_fn_specific_hparams(
hyperparameters = hyperparameters
)
is_cross_validate <- special_hparams[["is_special_fn"]]
REML <- special_hparams[["REML"]]
link <- special_hparams[["link"]]
control <- special_hparams[["control"]]
# Create model_specifics object
# Update to get default values when an argument was not specified
model_specifics <- list(
model_formula = "",
family = family,
REML = REML, # TODO Any way to safely remove this, along with link and control, from model_specifics?
link = link,
control = control,
cutoff = cutoff,
positive = positive,
model_verbose = FALSE, # TODO Should this be removed or renamed to verbose?
model_fn = model_fn,
predict_fn = predict_fn,
preprocess_fn = preprocess_fn,
preprocess_once = preprocess_once,
hparams = NULL,
observation_id_col = tmp_observation_id_col,
caller = caller
) %>%
update_model_specifics()
## Create computational grid
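# One row per model instance: each combination of formula,
# hyperparameter set, fold column and fold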
computation_grid <- create_computation_grid(
data = data,
hparams = hyperparameters,
formulas = formulas,
fold_cols = fold_cols
)
n_models <- length(unique(computation_grid[["model"]]))
n_model_instances <- nrow(computation_grid)
n_folds <- length(unique(computation_grid[["abs_fold"]]))
# TODO Perhaps add a progress bar?
if (isTRUE(verbose)){
message(
paste0(
"Will cross-validate ",
n_models,
" models. This requires fitting ",
n_model_instances,
" model instances."
)
)
}
if (isTRUE(preprocess_once)) {
data <- run_preprocess_once(
data = data,
computation_grid = computation_grid,
model_specifics = model_specifics,
fold_cols = fold_cols
)
}
# Set names of fold info columns
# Should match those in fold_info below
fold_info_cols <- list(
"rel_fold" = "rel_fold",
"abs_fold" = "abs_fold",
"fold_column" = "fold_column"
)
# cross_validate all the models
validated_folds <- plyr::llply(seq_len(nrow(computation_grid)),
.parallel = parallel_,
.fun = function(r) {
# Extract current row from computation grid
to_compute <- computation_grid[r, ]
model_specifics[["model_formula"]] <- to_compute[["Formula"]]
# TODO Could maybe be done elsewhere to avoid redundant checks:
# Check that formula contains dependent variable
y_col <- extract_y(model_specifics[["model_formula"]]) # Name of target column
if (is.null(y_col)) stop("The model formula does not contain a dependent variable.")
model_specifics[["hparams"]] <- to_compute[["hparams"]]
fold_info <- list(
"rel_fold" = to_compute[["rel_fold"]],
"abs_fold" = to_compute[["abs_fold"]],
"fold_column" = as.character(to_compute[["fold_col_name"]])
)
validate_fold(
data = data,
fold_info = fold_info,
fold_info_cols = fold_info_cols,
evaluation_type = evaluation_type,
model_specifics = model_specifics,
model_specifics_update_fn = NULL,
metrics = metrics,
fold_cols = fold_cols,
err_nc = FALSE,
return_model = FALSE
)
}
)
# Extract predictions and targets
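# (`%c%` is a package-internal operator that extracts the named element
# from each fold's result list)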
predictions_and_targets <- validated_folds %c% "predictions_and_targets"
# Extract model object metrics
model_evaluations <- validated_folds %c% "model_evaluation"
# Extract preprocessing parameters
preprocess_params <- validated_folds %c% "preprocess_parameters"
# Extract whether the models were NULL or not
model_was_null <- unlist(validated_folds %c% "model_was_null")
# Add to computation grid
computation_grid <- computation_grid %>%
dplyr::mutate(
model_eval = model_evaluations,
Predictions = predictions_and_targets,
model_was_null = model_was_null,
Preprocess = preprocess_params
)
# Evaluate predictions
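# One iteration per model: combine its fold-wise predictions, model object
# metrics, warnings/messages and coefficients into a single evaluation row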
cross_validations <- plyr::llply(seq_len(n_models),
.parallel = parallel_,
.fun = function(m) {
# Extract grid for current model
current_grid <- computation_grid[computation_grid[["model"]] == m, ]
# Extract current predictions
current_predictions <- dplyr::bind_rows(current_grid[["Predictions"]])
# Extract current model object evaluations
current_model_evals <- dplyr::bind_rows(current_grid[["model_eval"]])
# Extract current preprocessing parameters
current_preprocess_params <- dplyr::bind_rows(current_grid[["Preprocess"]])
nested_current_preprocess_params <- current_preprocess_params %>%
dplyr::group_nest() %>%
dplyr::pull(.data$data)
# Extract current model metrics + some fold cols
current_model_metrics <- current_model_evals %>%
base_select(cols = c(
fold_info_cols[["fold_column"]],
fold_info_cols[["rel_fold"]],
intersect(metrics, colnames(current_model_evals))
))
# Average the model metrics
# first within each fold column,
# then across fold columns
average_model_metrics <- current_model_metrics %>%
base_deselect(cols = fold_info_cols[["rel_fold"]]) %>%
dplyr::group_by(!!as.name(fold_info_cols[["fold_column"]])) %>%
dplyr::summarise_all(.funs = ~ mean(.)) %>%
base_deselect(cols = fold_info_cols[["fold_column"]]) %>%
dplyr::summarise_all(.funs = ~ mean(.))
# Prepare and nest the warnings and messages
current_warnings_and_messages <- current_model_evals %>%
base_select(cols = "Warnings and Messages") %>%
legacy_unnest()
nested_current_warnings_and_messages <- current_warnings_and_messages %>%
dplyr::group_nest() %>%
dplyr::pull(.data$data)
# Sum the warning and message counts
current_warnings_and_messages_counts <- current_model_evals %>%
base_select(
cols = c(
"Convergence Warnings",
"Singular Fit Messages",
"Other Warnings"
)
) %>%
dplyr::summarise_all(.funs = ~ sum(.))
# Prepare and nest the coefficients
current_coefficients <- current_model_evals[["Coefficients"]] %>%
dplyr::bind_rows()
nested_current_coefficients <- current_coefficients %>%
dplyr::group_nest() %>%
dplyr::pull(.data$data)
# Evaluate the predictions
prediction_evaluation <- internal_evaluate_predictions(
data = current_predictions,
prediction_col = "prediction",
target_col = "target",
model_was_null_col = "model_was_null",
type = family,
fold_info_cols = fold_info_cols,
model_specifics = model_specifics,
metrics = metrics,
include_fold_columns = TRUE,
include_predictions = TRUE # TODO Perhaps should be arg in main fn?
)
if (family == "gaussian") {
# Extract the prediction fold results tibble
fold_results <- prediction_evaluation[["Results"]][[1]]
prediction_evaluation[["Results"]] <- NULL
# Add the model metric object results
fold_results <- fold_results %>%
dplyr::full_join(current_model_metrics,
by = c(
`Fold Column` = fold_info_cols[["fold_column"]],
Fold = fold_info_cols[["rel_fold"]]
)
)
} else if (family %in% c("binomial", "multinomial")) {
# In classification, we evaluate the collected (all folds) predictions
# per fold column. So if the Results column exists,
# we join the model metrics to it per fold column.
# TODO: Make sure Results is always included in prediction_evaluation!
fold_results <- prediction_evaluation[["Results"]][[1]]
prediction_evaluation[["Results"]] <- NULL
# Prepare model metrics for joining with the prediction results
# Extract model metric names
model_metric_names <- intersect(names(current_model_metrics), metrics)
if (length(model_metric_names) > 0) {
# TODO The new tidyr::nest or chop() interface might be able to do this part
# without the loop below (which is kind of messy). It requires tidyr v1.0.0 though,
# so for now we do it this way and change it if profiling
# marks it as problematic. Note: It seems to be fairly taxing, so
# perhaps it is worth checking the tidyr version and only using
# this when necessary?
if (tidyr_new_interface()) { # Chop is new in tidyr 1.0.0
# TODO What's the effect of using chop on speed?
# It's at least a LOT prettier than the below version
fold_col_model_metrics_nested <- current_model_metrics %>%
base_deselect(cols = "rel_fold") %>%
dplyr::group_by(!!as.name(fold_info_cols[["fold_column"]])) %>%
tidyr::chop(cols = dplyr::all_of(model_metric_names))
} else {
fold_col_model_metrics_nested <- plyr::ldply(model_metric_names, function(mn) {
current_model_metrics %>%
base_select(cols = c(fold_info_cols[["fold_column"]], mn)) %>%
dplyr::group_by(!!as.name(fold_info_cols[["fold_column"]])) %>%
legacy_nest(2, .key = "value") %>%
dplyr::mutate(metric = mn)
}) %>%
dplyr::as_tibble() %>%
tidyr::spread(
key = "metric",
value = "value"
)
}
fold_results <- fold_results %>%
dplyr::full_join(fold_col_model_metrics_nested,
by = c(`Fold Column` = fold_info_cols[["fold_column"]])
)
fold_results <- fold_results %>%
base_select(cols = c(
"Fold Column", metrics,
setdiff(
colnames(fold_results),
c("Fold Column", metrics)
)
))
}
}
if (!is.data.frame(fold_results) && is.na(fold_results)) {
nested_fold_results <- list(fold_results)
} else {
# Nest fold results
nested_fold_results <- fold_results %>%
dplyr::group_nest() %>%
dplyr::pull(.data$data)
}
# Combine the various columns
evaluation <- prediction_evaluation %>%
dplyr::bind_cols(average_model_metrics) %>%
tibble::add_column(
"Results" = nested_fold_results,
"Coefficients" = nested_current_coefficients,
"Preprocess" = nested_current_preprocess_params
) %>%
reposition_column("Predictions", .before = "Results") %>%
tibble::add_column(
Folds = n_folds,
`Fold Columns` = length(fold_cols)
) %>%
dplyr::bind_cols(current_warnings_and_messages_counts) %>%
dplyr::mutate(`Warnings and Messages` = nested_current_warnings_and_messages)
if (is.null(preprocess_fn) || ncol(current_preprocess_params) == 0) {
evaluation[["Preprocess"]] <- NULL
}
evaluation
}
) %>%
dplyr::bind_rows() %>%
tibble::as_tibble() %>%
dplyr::mutate(Family = model_specifics[["family"]])
# Extract the first row for each model in the computation grid
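# (formula and hyperparameters are identical across a model's grid rows,
# which differ only by fold, so the first row suffices)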
grid_first_rows <- computation_grid %>%
dplyr::group_by(.data$model) %>%
dplyr::slice(1)
grid_first_rows <- grid_first_rows[
order(grid_first_rows$model,
method = "radix"), ]
# Extract hparams from grid
hparams <- grid_first_rows[["hparams"]]
# Extract formulas from grid
model_formulas <- grid_first_rows[["Formula"]]
# Now we split each formula into fixed effects and random effects.
# Some users might want to mix models with and without random effects,
# so we first try to separate into fixed and random effects,
# and if no random effects are found for any of the models,
# we remove the "random" column.
# Models without random effects get NA in the random column.
# Extract model effects from the original formulas
# (as we need their order in a moment)
original_formula_order <- extract_model_effects(formulas) %>%
dplyr::mutate(Formula = formulas)
# Get model effects for the current output rows
mixed_effects <- tibble::tibble("Formula" = model_formulas) %>%
dplyr::left_join(original_formula_order,
by = "Formula"
) %>%
base_deselect(cols = "Formula")
# Remove Formula column
original_formula_order[["Formula"]] <- NULL
# We put the two data frames together
output <- dplyr::bind_cols(
cross_validations,
mixed_effects
)
if (!is.null(hyperparameters)) {
output <- output %>%
tibble::add_column(
"HParams" = hparams,
.before = "Dependent"
)
}
# Reorder rows by the original formula order
# and select/reorder the output columns
# (this also removes unwanted columns)
new_col_order <- c(metrics, intersect(info_cols, colnames(output)))
output <-
original_formula_order %>%
dplyr::left_join(output,
by = names(original_formula_order)) %>%
base_select(cols = new_col_order) %>%
position_first("Fixed")
# If asked to remove non-converged models from output
if (isTRUE(rm_nc)) {
output <- output[output[["Convergence Warnings"]] == 0, ]
}
# and return it
output
}
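# Returns `hyperparameters[[param]]` when present, otherwise NULL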
extract_from_hparams_for_cross_validate <- function(hyperparameters, param) {
if (!is.null(hyperparameters) &&
param %in% names(hyperparameters)) {
return(hyperparameters[[param]])
}
NULL
}
extract_from_hparams_for_validate <- extract_from_hparams_for_cross_validate
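# Example usage (a minimal sketch, not from the package): assumes a data frame
# `dat` with a numeric target `score`, a predictor `age`, and a factor fold
# column ".folds" (e.g. created with groupdata2::fold()). Since the `caller`
# default suggests this is an internal helper, it would normally be reached
# via cross_validate_fn() rather than called directly.
#
# lm_model_fn <- function(train_data, formula, hyperparameters) {
#   lm(formula = formula, data = train_data)
# }
# lm_predict_fn <- function(test_data, model, formula, hyperparameters, train_data) {
#   predict(model, newdata = test_data)
# }
#
# cross_validate_list(
#   data = dat,
#   formulas = "score ~ age",
#   model_fn = lm_model_fn,
#   predict_fn = lm_predict_fn,
#   family = "gaussian",
#   fold_cols = ".folds"
# )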