Nothing
#' Helper function for computing ensemble model outputs as a linear pool
#' (distributional mixture) of component model outputs for the `sample`
#' output type.
#'
#' @inheritParams linear_pool
#'
#' @details The resulting output type ID values are character strings, generated by
#' a concatenation of the component model ID and initial output type ID, unless
#' the input `model_out_tbl` is detected to have a numeric `output_type_id`
#' column. In the latter case, a factor representation of this character string
#' is coerced to a numeric value.
#' @noRd
#'
#' @return a `model_out_tbl` object of ensemble predictions for the `sample`
#' output type. Note that the output type ID values will not match those of the
#' input model_out_tbl but do preserve relationships across combinations of
#' task ID variables.
#'
#' @importFrom rlang .data
linear_pool_sample <- function(model_out_tbl, weights = NULL,
weights_col_name = "weight",
model_id = "hub-ensemble",
task_id_cols = NULL,
compound_taskid_set,
n_output_samples = NULL) {
validate_sample_inputs(model_out_tbl, weights, weights_col_name, compound_taskid_set, n_output_samples)
# assign equal weight to all models if weights were not provided
num_models <- length(unique(model_out_tbl$model_id))
if (is.null(weights)) {
weights <- data.frame(
model_id = unique(model_out_tbl$model_id),
stringsAsFactors = FALSE
)
weights[[weights_col_name]] <- 1 / num_models
}
# ensure that the weights are equal for every model;
# we only support this setting for now
unique_weights <- unique(weights[[weights_col_name]])
if (length(unique_weights) != 1) {
cli::cli_abort("{.arg weights} must be {.val NULL} or equal for every model")
}
if (!is.null(n_output_samples)) {
# calculate the number of samples to draw from each model for each unique
# combination of compound task ID set variables
samples_per_combo <- calc_samples_per_combo(
model_out_tbl,
weights,
weights_col_name,
task_id_cols,
compound_taskid_set,
n_output_samples
)
# ensure that requested output samples per compound un doesn't exceed amount provided;
# we only support this setting for now
if (any(samples_per_combo$provided_samples < samples_per_combo$target_samples)) {
cli::cli_abort("Requested {.arg n_output_samples} cannot exceed the provided samples per compound unit.")
}
# draw the target number of samples from each model for each unique
# combination of compound task ID set variables
split_model_compound_set_tbl <- split(
model_out_tbl,
f = model_out_tbl[, c("model_id", compound_taskid_set)]
)
model_out_tbl <- split_model_compound_set_tbl |>
purrr::map(.f = draw_target_samples, samples_per_combo, compound_taskid_set) |>
purrr::list_rbind()
}
model_out_tbl |>
make_sample_indices_unique() |>
dplyr::select(-"model_id") |>
dplyr::mutate(model_id = model_id, .before = 1)
}
#' Make the output type ID values of sample forecasts distinct for different
#' models
#'
#' @param model_out_tbl an object of class `model_out_tbl` with component
#' model outputs (e.g., predictions).
#'
#' @details The new `output_type_id` column values will follow one of two patterns,
#' depending on whether the column is detected to be numeric:
#' 1. If the output type ID is not numeric (may be a character):
#' A concatenation of the prediction's model ID and original output type ID
#' 2. If the output type ID is numeric: A numeric representation of the above
#' pattern rendered as a factor.
#'
#' @return a model_out_tbl object with unique output type ID values for different
#' models but otherwise identical to the input model_out_tbl.
make_sample_indices_unique <- function(model_out_tbl) {
numeric_output_type_ids <- is.numeric(model_out_tbl$output_type_id)
new_indices_outputs <- model_out_tbl |>
dplyr::mutate(output_type_id = paste0(.data[["model_id"]], .data[["output_type_id"]]))
if (numeric_output_type_ids) {
dplyr::mutate(
new_indices_outputs,
output_type_id = as.integer(factor(.data[["output_type_id"]], levels = unique(.data[["output_type_id"]])))
)
} else {
new_indices_outputs
}
}
#' Helper function for drawing the requested number of samples from each model for
#' every unique combination of compound task ID set variables when requesting a
#' linear pool of the `sample` output type.
#'
#' @inheritParams linear_pool
#' @param model_compound_set_tbl a `model_out_tbl` of all the sample predictions for
#' a single, unique combination of compound task ID set variables for a particular
#' model
#' @param samples_per_combo a `data.frame` giving the number of provided and
#' target samples for every unique combination of compound task ID set variables
#' for each model, with columns: "model_id", compound task id set variables,
#' "provided_samples", "weight", "effective_weight", and "target_samples".
#'
#' @noRd
#'
#' @return a `model_out_tbl` containing the requested number sample predictions for a
#' single, unique combination of compound task ID set variables for a particular
#' model, drawn from the provided `model_compound_set_tbl`
#'
#' @importFrom rlang .data
draw_target_samples <- function(model_compound_set_tbl, samples_per_combo, compound_taskid_set) {
if (nrow(model_compound_set_tbl) == 0) {
return(model_compound_set_tbl)
}
# current_compound_taskid_set has 1 row, where the column
# `target_samples` is the number of samples to draw for this
# combination of model_id and compound task ID set variables
current_compound_taskid_set <- dplyr::left_join(
model_compound_set_tbl[1, ],
samples_per_combo,
by = c("model_id", compound_taskid_set)
)
provided_indices <- unique(model_compound_set_tbl$output_type_id)
sample_idx <- sample(x = provided_indices, size = current_compound_taskid_set$target_samples, replace = FALSE)
dplyr::filter(model_compound_set_tbl, .data[["output_type_id"]] %in% sample_idx)
}
#' Helper function for computing the requested number of samples from each model for
#' every unique combination of compound task ID set variables when requesting a
#' linear pool of the `sample` output type.
#'
#' @inheritParams linear_pool
#' @noRd
#'
#' @return a `data.frame` giving the number of provided and target samples for every
#' unique combination of compound task ID set variables for each model, with columns:
#' "model_id", compound task id set variables, "provided_samples", "weight",
#' "effective_weight", and "target_samples".
#'
#' @importFrom rlang .data
calc_samples_per_combo <- function(model_out_tbl, weights,
weights_col_name,
task_id_cols,
compound_taskid_set,
n_output_samples) {
weight_by_cols <- colnames(weights)[colnames(weights) != weights_col_name]
samples_per_combo <- model_out_tbl |>
# for each unique combination of model_id and compound task ID set variables,
# calculate the number of unique output type IDs (i.e., samples) provided
dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", compound_taskid_set)))) |>
dplyr::summarize(provided_samples = length(unique(.data[["output_type_id"]]))) |>
dplyr::ungroup() |>
tidyr::complete(
!!!rlang::syms(c("model_id", compound_taskid_set)),
fill = list(provided_samples = 0)
) |>
# add model weights
dplyr::left_join(weights, weight_by_cols) |>
# calculate the target number of samples to draw from each model for each unique
# combination of compound task ID set variables.
# for each compound unit, models with no provided samples will have 0 target samples
dplyr::group_by(dplyr::across(dplyr::all_of(compound_taskid_set))) |>
dplyr::mutate(
effective_weight = as.integer(.data[["provided_samples"]] > 0) * .data[[weights_col_name]],
effective_weight = .data[["effective_weight"]] / sum(.data[["effective_weight"]]),
target_samples = floor(.data[["effective_weight"]] * n_output_samples)
) |>
dplyr::ungroup()
if (is.null(compound_taskid_set)) {
samples_per_combo <- list(samples_per_combo)
} else {
samples_per_combo <- split(samples_per_combo, f = samples_per_combo[, compound_taskid_set])
}
# deal with n_output_samples not divisible evenly among component models
samples_per_combo <- samples_per_combo |>
purrr::map(.f = function(split_per_combo) {
actual_output_samples <- sum(split_per_combo$target_samples)
remainder_samples <- n_output_samples - actual_output_samples
valid_models <- split_per_combo$model_id[split_per_combo$provided_samples > 0]
models_to_resample <- sample(x = valid_models, size = remainder_samples)
dplyr::mutate(
split_per_combo,
target_samples = ifelse(
.data[["model_id"]] %in% models_to_resample,
.data[["target_samples"]] + 1,
.data[["target_samples"]]
)
)
}) |>
purrr::list_rbind()
}
#' Perform simple validations on the inputs used to calculate a linear pool
#' of samples
#'
#' @inheritParams linear_pool
#' @param model_out_tbl an object of class `model_out_tbl` with component
#' model outputs (e.g., predictions). May only contain the "sample" output type.
#' @param weights an optional `data.frame` with component model weights. If
#' provided, it should have a column named `model_id` and a column containing
#' model weights. Optionally, it may contain additional columns corresponding to
#' variables in the compound task ID set for weights specific to the sample output
#' type. Defaults to `NULL`, which specifies an equally-weighted ensemble.
#'
#' @return no return value
#'
#' @noRd
validate_sample_inputs <- function(model_out_tbl, weights = NULL,
weights_col_name = "weight",
compound_taskid_set,
n_output_samples = NULL) {
if (!identical(unique(model_out_tbl$output_type), "sample")) {
cli::cli_abort("{.arg model_out_tbl} should only contain the sample output type")
}
if (!is.null(n_output_samples) && !is.numeric(n_output_samples) && trunc(n_output_samples) != n_output_samples) {
cli::cli_abort("{.arg n_output_samples} must be {.val NULL} or an integer value")
}
if (!is.null(weights) && length(unique(weights[[weights_col_name]])) != 1) {
if (is.null(n_output_samples)) {
cli::cli_abort("Component model weights were provided,
so a value for {.arg n_output_samples} must be provided")
}
if (!all(colnames(weights) %in% c("model_id", compound_taskid_set, weights_col_name))) {
cli::cli_abort("{.arg weights} may only vary for variables in the {.arg compound_taskid_set}.")
}
}
# check that for each compound unit defined by the compound task ID set variables,
# all models provide the same number of samples
same_num_output_ids <- model_out_tbl |>
dplyr::group_by(dplyr::across(dplyr::all_of(c(compound_taskid_set, "model_id")))) |>
dplyr::summarize(num_output_type_id = length(.data[["output_type_id"]])) |>
dplyr::ungroup() |>
dplyr::group_split(dplyr::across(dplyr::all_of(c(compound_taskid_set)))) |>
purrr::map_lgl(.f = function(split_outputs) {
length(unique(split_outputs$num_output_type_id)) == 1
})
false_counter <- sum(!same_num_output_ids)
if (false_counter != 0) {
cli::cli_abort(c(
"x" = "{.arg model_out_tbl} contains {.val {false_counter}} sample distribution{?s} that cannot be ensembled.",
"i" = "Within each group defined by a combination of the compound task ID set
variables, all models must provide the same number of sample forecasts"
))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.