R/linear_pool_sample.R

Defines functions validate_sample_inputs calc_samples_per_combo draw_target_samples make_sample_indices_unique linear_pool_sample

Documented in make_sample_indices_unique

#' Helper function for computing ensemble model outputs as a linear pool
#' (distributional mixture) of component model outputs for the `sample`
#' output type.
#'
#' @inheritParams linear_pool
#'
#' @details The resulting output type ID values are character strings, generated by
#'   a concatenation of the component model ID and initial output type ID, unless
#'   the input `model_out_tbl` is detected to have a numeric `output_type_id`
#'   column. In the latter case, a factor representation of this character string
#'   is coerced to a numeric value.
#' @noRd
#'
#' @return a `model_out_tbl` object of ensemble predictions for the `sample`
#' output type. Note that the output type ID values will not match those of the
#' input model_out_tbl but do preserve relationships across combinations of
#' task ID variables.
#'
#' @importFrom rlang .data
linear_pool_sample <- function(model_out_tbl, weights = NULL,
                               weights_col_name = "weight",
                               model_id = "hub-ensemble",
                               task_id_cols = NULL,
                               compound_taskid_set,
                               n_output_samples = NULL) {
  validate_sample_inputs(model_out_tbl, weights, weights_col_name, compound_taskid_set, n_output_samples)

  # assign equal weight to all models if weights were not provided
  num_models <- length(unique(model_out_tbl$model_id))
  if (is.null(weights)) {
    weights <- data.frame(
      model_id = unique(model_out_tbl$model_id),
      stringsAsFactors = FALSE
    )
    weights[[weights_col_name]] <- 1 / num_models
  }

  # ensure that the weights are equal for every model;
  # we only support this setting for now
  unique_weights <- unique(weights[[weights_col_name]])
  if (length(unique_weights) != 1) {
    cli::cli_abort("{.arg weights} must be {.val NULL} or equal for every model")
  }

  if (!is.null(n_output_samples)) {
    # calculate the number of samples to draw from each model for each unique
    # combination of compound task ID set variables
    samples_per_combo <- calc_samples_per_combo(
      model_out_tbl,
      weights,
      weights_col_name,
      task_id_cols,
      compound_taskid_set,
      n_output_samples
    )

    # ensure that requested output samples per compound un doesn't exceed amount provided;
    # we only support this setting for now
    if (any(samples_per_combo$provided_samples < samples_per_combo$target_samples)) {
      cli::cli_abort("Requested {.arg n_output_samples} cannot exceed the provided samples per compound unit.")
    }

    # draw the target number of samples from each model for each unique
    # combination of compound task ID set variables
    split_model_compound_set_tbl <- split(
      model_out_tbl,
      f = model_out_tbl[, c("model_id", compound_taskid_set)]
    )
    model_out_tbl <- split_model_compound_set_tbl |>
      purrr::map(.f = draw_target_samples, samples_per_combo, compound_taskid_set) |>
      purrr::list_rbind()
  }

  model_out_tbl |>
    make_sample_indices_unique() |>
    dplyr::select(-"model_id") |>
    dplyr::mutate(model_id = model_id, .before = 1)
}


#' Make the output type ID values of sample forecasts distinct for different
#' models
#'
#' @param model_out_tbl an object of class `model_out_tbl` with component
#'   model outputs (e.g., predictions).
#'
#' @details The new `output_type_id` column values will follow one of two patterns,
#' depending on whether the column is detected to be numeric:
#'   1. If the output type ID is not numeric (may be a character):
#'      A concatenation of the prediction's model ID and original output type ID
#'   2. If the output type ID is numeric: A numeric representation of the above
#'      pattern rendered as a factor.
#'
#' @return a model_out_tbl object with unique output type ID values for different
#'   models but otherwise identical to the input model_out_tbl.
make_sample_indices_unique <- function(model_out_tbl) {
  numeric_output_type_ids <- is.numeric(model_out_tbl$output_type_id)

  new_indices_outputs <- model_out_tbl |>
    dplyr::mutate(output_type_id = paste0(.data[["model_id"]], .data[["output_type_id"]]))

  if (numeric_output_type_ids) {
    dplyr::mutate(
      new_indices_outputs,
      output_type_id = as.integer(factor(.data[["output_type_id"]], levels = unique(.data[["output_type_id"]])))
    )
  } else {
    new_indices_outputs
  }
}


#' Helper function for drawing the requested number of samples from each model for
#' every unique combination of compound task ID set variables when requesting a
#' linear pool of the `sample` output type.
#'
#' @inheritParams linear_pool
#' @param model_compound_set_tbl a `model_out_tbl` of all the sample predictions for
#'   a single, unique combination of compound task ID set variables for a particular
#'   model
#' @param samples_per_combo a `data.frame` giving the number of provided and
#'   target samples for every unique combination of compound task ID set variables
#'   for each model, with columns: "model_id", compound task id set variables,
#'   "provided_samples", "weight", "effective_weight", and "target_samples".
#'
#' @noRd
#'
#' @return a `model_out_tbl` containing the requested number sample predictions for a
#' single, unique combination of compound task ID set variables for a particular
#' model, drawn from the provided `model_compound_set_tbl`
#'
#' @importFrom rlang .data
draw_target_samples <- function(model_compound_set_tbl, samples_per_combo, compound_taskid_set) {
  if (nrow(model_compound_set_tbl) == 0) {
    return(model_compound_set_tbl)
  }

  # current_compound_taskid_set has 1 row, where the column
  # `target_samples` is the number of samples to draw for this
  # combination of model_id and compound task ID set variables
  current_compound_taskid_set <- dplyr::left_join(
    model_compound_set_tbl[1, ],
    samples_per_combo,
    by = c("model_id", compound_taskid_set)
  )
  provided_indices <- unique(model_compound_set_tbl$output_type_id)

  sample_idx <- sample(x = provided_indices, size = current_compound_taskid_set$target_samples, replace = FALSE)
  dplyr::filter(model_compound_set_tbl, .data[["output_type_id"]] %in% sample_idx)
}


#' Helper function for computing the requested number of samples from each model for
#' every unique combination of compound task ID set variables when requesting a
#' linear pool of the `sample` output type.
#'
#' @inheritParams linear_pool
#' @noRd
#'
#' @return a `data.frame` giving the number of provided and target samples for every
#' unique combination of compound task ID set variables for each model, with columns:
#' "model_id", compound task id set variables, "provided_samples", "weight",
#' "effective_weight", and "target_samples".
#'
#' @importFrom rlang .data
calc_samples_per_combo <- function(model_out_tbl, weights,
                                   weights_col_name,
                                   task_id_cols,
                                   compound_taskid_set,
                                   n_output_samples) {
  weight_by_cols <- colnames(weights)[colnames(weights) != weights_col_name]
  samples_per_combo <- model_out_tbl |>
    # for each unique combination of model_id and compound task ID set variables,
    # calculate the number of unique output type IDs (i.e., samples) provided
    dplyr::group_by(dplyr::across(dplyr::all_of(c("model_id", compound_taskid_set)))) |>
    dplyr::summarize(provided_samples = length(unique(.data[["output_type_id"]]))) |>
    dplyr::ungroup() |>
    tidyr::complete(
      !!!rlang::syms(c("model_id", compound_taskid_set)),
      fill = list(provided_samples = 0)
    ) |>
    # add model weights
    dplyr::left_join(weights, weight_by_cols) |>
    # calculate the target number of samples to draw from each model for each unique
    # combination of compound task ID set variables.
    # for each compound unit, models with no provided samples will have 0 target samples
    dplyr::group_by(dplyr::across(dplyr::all_of(compound_taskid_set))) |>
    dplyr::mutate(
      effective_weight = as.integer(.data[["provided_samples"]] > 0) * .data[[weights_col_name]],
      effective_weight = .data[["effective_weight"]] / sum(.data[["effective_weight"]]),
      target_samples = floor(.data[["effective_weight"]] * n_output_samples)
    ) |>
    dplyr::ungroup()

  if (is.null(compound_taskid_set)) {
    samples_per_combo <- list(samples_per_combo)
  } else {
    samples_per_combo <- split(samples_per_combo, f = samples_per_combo[, compound_taskid_set])
  }
  # deal with n_output_samples not divisible evenly among component models
  samples_per_combo <- samples_per_combo |>
    purrr::map(.f = function(split_per_combo) {
      actual_output_samples <- sum(split_per_combo$target_samples)
      remainder_samples <- n_output_samples - actual_output_samples
      valid_models <- split_per_combo$model_id[split_per_combo$provided_samples > 0]
      models_to_resample <- sample(x = valid_models, size = remainder_samples)

      dplyr::mutate(
        split_per_combo,
        target_samples = ifelse(
          .data[["model_id"]] %in% models_to_resample,
          .data[["target_samples"]] + 1,
          .data[["target_samples"]]
        )
      )
    }) |>
    purrr::list_rbind()
}

#' Perform simple validations on the inputs used to calculate a linear pool
#' of samples
#'
#' @inheritParams linear_pool
#' @param model_out_tbl an object of class `model_out_tbl` with component
#'   model outputs (e.g., predictions). May only contain the "sample" output type.
#' @param weights an optional `data.frame` with component model weights. If
#'   provided, it should have a column named `model_id` and a column containing
#'   model weights. Optionally, it may contain additional columns corresponding to
#'   variables in the compound task ID set for weights specific to the sample output
#'   type. Defaults to `NULL`, which specifies an equally-weighted ensemble.
#'
#' @return no return value
#'
#' @noRd
validate_sample_inputs <- function(model_out_tbl, weights = NULL,
                                   weights_col_name = "weight",
                                   compound_taskid_set,
                                   n_output_samples = NULL) {
  if (!identical(unique(model_out_tbl$output_type), "sample")) {
    cli::cli_abort("{.arg model_out_tbl} should only contain the sample output type")
  }

  if (!is.null(n_output_samples) && !is.numeric(n_output_samples) && trunc(n_output_samples) != n_output_samples) {
    cli::cli_abort("{.arg n_output_samples} must be {.val NULL} or an integer value")
  }

  if (!is.null(weights) && length(unique(weights[[weights_col_name]])) != 1) {
    if (is.null(n_output_samples)) {
      cli::cli_abort("Component model weights were provided,
                      so a value for {.arg n_output_samples} must be provided")
    }

    if (!all(colnames(weights) %in% c("model_id", compound_taskid_set, weights_col_name))) {
      cli::cli_abort("{.arg weights} may only vary for variables in the {.arg compound_taskid_set}.")
    }
  }

  # check that for each compound unit defined by the compound task ID set variables,
  # all models provide the same number of samples
  same_num_output_ids <- model_out_tbl |>
    dplyr::group_by(dplyr::across(dplyr::all_of(c(compound_taskid_set, "model_id")))) |>
    dplyr::summarize(num_output_type_id = length(.data[["output_type_id"]])) |>
    dplyr::ungroup() |>
    dplyr::group_split(dplyr::across(dplyr::all_of(c(compound_taskid_set)))) |>
    purrr::map_lgl(.f = function(split_outputs) {
      length(unique(split_outputs$num_output_type_id)) == 1
    })

  false_counter <- sum(!same_num_output_ids)
  if (false_counter != 0) {
    cli::cli_abort(c(
      "x" = "{.arg model_out_tbl} contains {.val {false_counter}} sample distribution{?s} that cannot be ensembled.",
      "i" = "Within each group defined by a combination of the compound task ID set 
        variables, all models must provide the same number of sample forecasts"
    ))
  }
}

Try the hubEnsembles package in your browser

Any scripts or data that you put into this service are public.

hubEnsembles documentation built on June 8, 2025, 1:26 p.m.