covidHubUtils: Utility functions for the COVID-19 forecast hub

Documented in score_forecasts

#' Score forecasts
#'
#' @param forecasts required data.frame with forecasts in the format returned
#' by \code{\link{load_forecasts}} 
#' @param truth required data.frame with forecasts in the format returned
#' by \code{\link{load_truth}}
#' @param return_format string: `"long"` returns long format with a column for
#' `"score_name"` and a column for `"score_value"`; `"wide"` returns wide format with
#' a separate column for each score. Defaults to `"wide"`.
#' @param metrics character vector of the metrics to be returned with options
#' `"abs_error"`, `"wis"`, `"wis_components"`,`"interval_coverage"`, and `"quantile_coverage"`
#' @param use_median_as_point logical: `TRUE` uses the median as the point
#' forecast when scoring; `FALSE` uses the point forecasts from the data when
#' scoring. Defaults to `FALSE`
#'
#' @importFrom dplyr any_of
#' @importFrom scoringutils score summarise_scores
#' @return data.frame with scores. The result will have some columns that
#' define the observation, namely, `model`, `forecast_date`, `location`,
#' `horizon`, `temporal_resolution`, `target_variable`, `horizon`, and
#' `target_end_date`.
#' Other columns will contain scores dependent on metrics parameter:
#'  - `true_value` is the observed truth at that `location` and `target_end_date` (always returned)
#'  - `abs_error` is the absolute error based on median estimate if
#'  use_median_as_point is TRUE, and absolute error based on point forecast
#'  if use_median_as_point is FALSE
#'  - `wis` is the weighted interval score
#'  - `dispersion` the component of WIS made up of interval widths
#'  - `overprediction` the component of WIS made up of overprediction of intervals
#'  - `underprediction` the component of WIS made up of underprediction of intervals
#'  - `coverage_X` are prediction interval coverage at alpha level X
#'  - `quantile_coverage_0.X` are one-sided quantile coverage at quantile X
#'
#' If return_format is `"long"`, also contains columns score_name and score_value
#' where `score_name` is the type of score calculated and `score_value` has the numeric
#' value of the score.
#' If return_format is `"wide"`, each calculated score is in its own column.
#'
#' @references
#' Bracher J, Ray EL, Gneiting T, Reich NG. (2020) Evaluating epidemic forecasts
#' in an interval format. arXiv:2005.12881.
#' \url{https://arxiv.org/abs/2005.12881}.
#'
#' @examples
#' library(scoringutils)
#' forecasts <- load_latest_forecasts(
#'   models = c("COVIDhub-ensemble", "UMass-MechBayes"),
#'   last_forecast_date = "2020-12-14",
#'   forecast_date_window_size = 7,
#'   locations = c("US"),
#'   targets = paste(1:4, "wk ahead inc death"),
#'   source = "zoltar"
#' )
#' truth <- load_truth("JHU", target_variable = "inc death", locations = "US")
#' score_forecasts(forecasts, truth)
#' 
#' forecasts <- load_latest_forecasts(
#'   models = c("ILM-EKF"),
#'   hub = c("ECDC", "US"), last_forecast_date = "2021-03-08",
#'   forecast_date_window_size = 0,
#'   locations = c("GB"),
#'   targets = paste(1:4, "wk ahead inc death"),
#'   source = "zoltar"
#' )
#' truth <- load_truth("JHU",
#'   hub = c("ECDC", "US"),
#'   target_variable = "inc death", locations = "GB"
#' )
#' score_forecasts(forecasts, truth)
#'
#' @export
score_forecasts <- function(
                            forecasts,
                            truth,
                            return_format = "wide",
                            metrics = c(
                              "abs_error", "wis", "wis_components",
                              "interval_coverage", "quantile_coverage"
                            ),
                            use_median_as_point = FALSE) {

  # forecasts data.frame format
  # columns: model, forecast_date, location, horizon, temporal_resolution,
  #          target_variable, target_end_date, type, quantile, value
  forecasts_colnames <- c(
    "model", "forecast_date", "location", "horizon", "temporal_resolution",
    "target_variable", "target_end_date", "type", "quantile", "value"
  )

  # validate forecasts
  # as long as forecasts contains the columns above, it will pass the check
  if (missing(forecasts) || is.null(forecasts)) {
    stop("Forecast dataframe missing", call. = TRUE)
  } else if (!any(is.element(colnames(forecasts), forecasts_colnames))) {
    stop("Forecast dataframe columns malformed", call. = TRUE)
  }

  # truth data.frame format
  # columns: model, target_variable, target_end_date, location and value
  truth_colnames <- c(
    "model", "target_variable", "target_end_date", "location", "value"
  )

  # validate truth
  # as long as forecasts contains the columns above, it will pass the check
  if (missing(truth) || is.null(truth)) {
    stop("Truth dataframe missing", call. = TRUE)
  } else if (!any(is.element(colnames(truth), truth_colnames))) {
    stop("Truth dataframe columns malformed", call. = TRUE)
  }

  # validate return_format
  # match.arg returns error if arg does not match choice
  # which is a bit more complicated to deal with
  if (!is.element(return_format, c("long", "wide"))) {
    return_format <- "wide"
  }

  # validate metrics
  metrics <- match.arg(metrics,
    choices = c("abs_error", "wis", "wis_components", "interval_coverage", "quantile_coverage"),
    several.ok = TRUE
  )

  # validate use_median_as_point
  if (is.null(use_median_as_point)) {
    stop("use_median_as_point is NULL and should be one of (TRUE,FALSE)")
  }

  # match.arg does not like logical input
  if (!(use_median_as_point %in% c(FALSE, TRUE))) {
    stop("use_median_as_point should be one of (TRUE,FALSE)")
  }

  if (length(use_median_as_point) != 1) {
    stop("use_median_as_point should only have a length of 1")
  }

  if (use_median_as_point == FALSE && !("point" %in% unique(forecasts[["type"]]))) {
    stop("Want to use point forecast when scoring but no point forecast in forecast data")
  }


  # get dataframe into scoringutil format
  joint_df <- dplyr::left_join(
    x = forecasts, y = truth,
    by = c("location", "target_variable", "target_end_date")
  ) %>%
    dplyr::select(-c("model.y")) %>%
    dplyr::rename(model = model.x, prediction = value.x, true_value = value.y) %>%
    dplyr::filter(!is.na(true_value))

  # score using scoringutil
  observation_cols <- c(
    "model",
    "location",
    "horizon", "temporal_resolution", "target_variable",
    "forecast_date", "target_end_date" 
  )

  # creates placeholder variables to store the name of the column from scoringutils::score() to
  # take values from (`abs_var`) and the column name to rename as "abs_error" (`abs_var_rename`)
  if (use_median_as_point) {
    abs_var <- "ae_median"
    abs_var_rename <- "ae_median_0"
  } else {
    abs_var <- "ae_point"
    abs_var_rename <- "ae_point_NA"
  }
  
 # column names always included in final output
  col_names_include <- c("model", "location", "horizon", "temporal_resolution",
                        "target_variable", "forecast_date", "target_end_date")
    
  
  #two sided
  scores <- NULL
  for (var in unique(joint_df[["target_variable"]])) {
    joint_df_target <- suppressMessages(joint_df %>%
                                          dplyr::filter(target_variable == var))
    var_scores <- scoringutils::score(
      data = joint_df_target) %>% 
      scoringutils::summarise_scores(by = c(observation_cols, "range")) %>%
      tidyr::pivot_wider(
        id_cols = observation_cols,
        names_from = c("range"),
        values_from = c("coverage", "interval_score", abs_var, "dispersion", "overprediction", "underprediction")
      ) %>%
      purrr::set_names(~ sub(abs_var_rename, "abs_error", .x)) %>%
      ## need to remove all columns ending with NA to not affect WIS calculations
      dplyr::select(
        -dplyr::ends_with("_NA")
      ) %>%
      ## before next lines: do we need to check to ensure interval_score columns exist?
      ## the following lines ensure that we use denominator for the wis of
      ## (# of interval_scores)-0.5
      ## which is written in the paper and elsewhere as
      ## (# of interval_scores at level >0 ) + 0.5 or (K + 1/2)
      ## to make sure that the median only gets half the weight of the other
      ## intervals, multiply its value by 0.5
      dplyr::mutate(
        n_interval_scores = rowSums(!is.na(dplyr::select(., dplyr::starts_with("interval_score")))),
        exists_interval_score_0 = "interval_score_0" %in% names(.),
        interval_score_0 = ifelse(exists_interval_score_0, 0.5 * interval_score_0, NA_real_),
        dispersion_0 = ifelse(exists_interval_score_0, 0.5 * dispersion_0, NA_real_),
        underprediction_0 = ifelse(exists_interval_score_0, 0.5 * underprediction_0, NA_real_),
        overprediction_0 = ifelse(exists_interval_score_0, 0.5 * overprediction_0, NA_real_)
      ) %>%
      dplyr::mutate(
        wis = rowSums(dplyr::select(., dplyr::starts_with("interval_score")), na.rm = FALSE) / (n_interval_scores - 0.5 * (exists_interval_score_0)),
      ) %>%
      dplyr::mutate(
        dispersion = rowSums(dplyr::select(., dplyr::starts_with("dispersion")), na.rm = FALSE) / (n_interval_scores - 0.5 * (exists_interval_score_0)),
        overprediction = rowSums(dplyr::select(., dplyr::starts_with("overprediction")), na.rm = FALSE) / (n_interval_scores - 0.5 * (exists_interval_score_0)),
        underprediction = rowSums(dplyr::select(., dplyr::starts_with("underprediction")), na.rm = FALSE) / (n_interval_scores - 0.5 * (exists_interval_score_0))
      ) %>%
      dplyr::select(
        -dplyr::starts_with("ae_median_"),
        -dplyr::starts_with("ae_point_"),
        -dplyr::starts_with("interval_score"),
        -dplyr::starts_with("dispersion_"),
        -dplyr::starts_with("underprediction_"),
        -dplyr::starts_with("overprediction_")
      ) %>%
      dplyr::select(
        col_names_include, dplyr::starts_with("coverage_"),
        dplyr::starts_with("abs_error"),
        "n_interval_scores", "exists_interval_score_0", "wis",
        "dispersion", "overprediction", "underprediction"
      )
    scores <- rbind(scores,var_scores)
  }

  # one-sided quantile coverage only calculated if needed
  if ("quantile_coverage" %in% metrics) {
    sq <- scoringutils::score(
      data = joint_df_target) %>% 
      scoringutils::summarise_scores(by = c(observation_cols, "quantile"))  %>%
      tidyr::pivot_wider(
        id_cols = observation_cols,
        names_from = c("quantile"),
        values_from = c("quantile_coverage", "interval_score", abs_var, "dispersion", "overprediction", "underprediction")
      ) %>%
      purrr::set_names(~ sub(abs_var_rename, "abs_error", .x)) %>%
      dplyr::select(
        -dplyr::ends_with("_NA")
      ) %>%
      dplyr::select(
        -dplyr::starts_with("abs_error."),
        -dplyr::starts_with("ae_median_"),
        -dplyr::starts_with("ae_point_"),
        -dplyr::starts_with("interval_score"),
        -dplyr::starts_with("dispersion_"),
        -dplyr::starts_with("underprediction_"),
        -dplyr::starts_with("overprediction_")
      )
    # order one-sided quantiles to ascending order
    quantile_coverage_columns <-
      sort(colnames(sq %>%
        dplyr::select(dplyr::starts_with("quantile_coverage_"))))

    # select necessary columns and the one-sided quantiles in ascending order
    scores_one_sided <- sq %>%
      dplyr::select(col_names_include, dplyr::all_of(quantile_coverage_columns))

    # combine one and two sided
    scores <- suppressMessages(dplyr::full_join(
      scores_one_sided,
      scores
    ))
  }
  
  # add back in true_value column
  scores <- truth %>% 
    dplyr::select(c("location", "target_variable", "target_end_date","value")) %>% 
    dplyr::left_join(x = scores, y = .,
    by = c("location", "target_variable", "target_end_date")) %>%
    dplyr::rename(true_value = value)


  # remove unwanted columns note if quantile coverage not wanted value is not calculated
  scores <- scores %>%
    dplyr::select(-c(
      if (!("abs_error" %in% metrics)) {
        c("abs_error")
      },
      if (!("wis" %in% metrics)) {
        c("wis")
      },
      if (!("wis_components" %in% metrics)) {
        c("dispersion", "overprediction", "underprediction")
      },
      if (!("interval_coverage" %in% metrics)) {
        dplyr::starts_with("coverage_")
      },
      if (!("interval_coverage" %in% metrics)) {
        c("n_interval_scores", "exists_interval_score_0")
      }
    ))

  if ("coverage_0" %in% names(scores)) {
    scores <- scores %>%
      dplyr::select(-c("coverage_0"))
  }


  # manipulate return format:
  #   summarise_scores(), by default, returns in wide format
  #   only change if user specifies long return format
  if (return_format == "long") {
    scores <- scores %>%
      tidyr::pivot_longer(
        cols = !dplyr::any_of(observation_cols[observation_cols != "true_value"]),
        names_to = "score_name",
        values_to = "score_value"
      )
  }


  return(scores)
}