inst/create-example-data.R

library(data.table)
library(dplyr)
library(devtools)
library(here)
library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils@v0.1.8") # nolint
library(purrr)
library(data.table)
library(stringr)
library(scoringutils)


# download data from the European Forecast Hub Github Repository using
# subversion. You can also download the folders manually instead.

system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-ensemble") # nolint
system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-baseline") # nolint
system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/UMass-MechBayes") # nolint
system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/epiforecasts-EpiNow2") # nolint

# load truth data using the covidHubutils package ------------------------------
truth <- covidHubUtils::load_truth(hub = "ECDC") |>
  filter(target_variable %in% c("inc case", "inc death")) |>
  mutate(target_variable = ifelse(target_variable == "inc case",
    "Cases", "Deaths"
  )) |>
  rename(
    target_type = target_variable,
    observed = value
  ) |>
  select(-model)

# get the correct file paths to all forecasts ----------------------------------
folders <- here(c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline", "UMass-MechBayes", "epiforecasts-EpiNow2"))

file_paths <- purrr::map(folders,
  .f = function(folder) {
    files <- list.files(folder)
    out <- here::here(folder, files)
    return(out)
  }
) %>%
  unlist()
file_paths <- file_paths[grepl(".csv", file_paths)]

# load all past forecasts ------------------------------------------------------
# ceate a helper function to get model name from a file path
get_model_name <- function(file_path) {
  split <- str_split(file_path, pattern = "/")[[1]]
  model <- split[length(split) - 1]
  return(model)
}

# load forecasts
prediction_data <- map_dfr(file_paths,
  .f = function(file_path) {
    data <- fread(file_path)
    data[, `:=`(
      target_end_date = as.Date(target_end_date),
      quantile = as.numeric(quantile),
      forecast_date = as.Date(forecast_date),
      model = get_model_name(file_path)
    )]
    return(data)
  }
) %>%
  filter(grepl("case", target) | grepl("death", target)) %>%
  mutate(
    target_type = ifelse(grepl("death", target),
      "Deaths", "Cases"
    ),
    horizon = as.numeric(substr(target, 1, 1))
  ) %>%
  rename(predicted = value) %>%
  filter(
    type == "quantile",
    grepl("inc", target)
  ) %>%
  select(
    location, forecast_date, quantile, predicted,
    model, target_end_date, target, target_type, horizon
  )

# harmonise forecast dates to be the date a submission was made
hub_data <- mutate(prediction_data,
  forecast_date = calc_submission_due_date(forecast_date)
)

hub_data <- hub_data |>
  filter(
    horizon <= 3,
    forecast_date > "2021-05-01",
    forecast_date < "2021-07-15",
    # quantile %in% c(seq(0.05, 0.45, 0.1), 0.5, seq(0.55, 0.95, 0.1)),
    location %in% c("DE", "GB", "FR", "IT")
  ) |>
  select(-target) |>
  rename(quantile_level = quantile)

truth <- truth |>
  filter(
    target_end_date > "2021-01-01",
    target_end_date < max(hub_data$target_end_date),
    location %in% c("DE", "GB", "FR", "IT")
  ) |>
  select(-population)

# save example data with forecasts only
example_quantile_forecasts_only <- hub_data
example_quantile_forecasts_only[, quantile_level := round(quantile_level, 3)]

usethis::use_data(example_quantile_forecasts_only, overwrite = TRUE)

example_truth_only <- truth
usethis::use_data(example_truth_only, overwrite = TRUE)

# merge forecast data and truth data and save
example_quantile <- merge_pred_and_obs(hub_data, truth)
data.table::setDT(example_quantile)
usethis::use_data(example_quantile, overwrite = TRUE)


# create data with point forecasts ---------------------------------------------
example_point <- data.table::copy(example_quantile)
example_point <- example_point[quantile %in% c(NA, 0.5)][, quantile_level := NULL]
usethis::use_data(example_point, overwrite = TRUE)


# get continuous sample data ---------------------------------------------------
# define gamma function
fn_gamma <- function(par, x) {
  quantiles <- as.numeric(names(x))
  quantiles <- quantiles[!is.na(x)]
  x <- x[!is.na(x)]
  return(sum((qgamma(quantiles, shape = par[1], rate = par[2]) - x)**2))
}

# define function to fit gamma
fit_gamma <- function(values, quantiles, init) {
  x <- values
  names(x) <- quantiles

  if (missing(init)) {
    init <- c(shape = 1, rate = 1)
  }

  res <- nloptr::sbplx(
    x0 = init, fn = fn_gamma, x = x,
    lower = c(shape = 0, rate = 0),
    control = list(xtol_rel = 1.0e-6, ftol_rel = 1.0e-6)
  )
  sol <- res$par
  names(sol) <- names(init)

  return(as.list(sol))
}

# function to obtain samples
get_samples <- function(values, quantiles, n_samples = 1000) {
  if (any(is.na(values))) {
    return(NA_real_)
  }
  fit <- fit_gamma(values, quantiles)
  samples <- rgamma(n = n_samples, rate = fit$rate, shape = fit$shape)
}

# calculate samples
setDT(example_quantile)
n_samples <- 40
example_sample_continuous <- example_quantile[, .(
  predicted = get_samples(
    predicted,
    quantile,
    n_samples = n_samples
  ),
  sample_id = 1:n_samples,
  observed = unique(observed)
),
by = c(
  "location", "location_name",
  "target_end_date", "target_type",
  "forecast_date", "model", "horizon"
)
]
# remove unnecessary rows where no predictions are available
example_sample_continuous[is.na(predicted), sample_id := NA]
example_sample_continuous <- unique(example_sample_continuous)
usethis::use_data(example_sample_continuous, overwrite = TRUE)


# get integer sample data ------------------------------------------------------
example_sample_discrete <- data.table::copy(example_sample_continuous)
example_sample_discrete <- example_sample_discrete[, predicted := round(predicted)]
usethis::use_data(example_sample_discrete, overwrite = TRUE)


# get binary example data ------------------------------------------------------
# construct a binary prediction by looking at the number of samples below the
# mean prediction. Construct the outcome as whether or not the actually
# observed value was below or above that mean prediction.
# Take this as a way to create example data, not as sound statistical practice

example_binary <- data.table::copy(example_sample_continuous)

# store grouping variable
by <- c(
  "location", "target_end_date", "target_type",
  "forecast_date", "model", "horizon"
)

# calculate mean value
example_binary[, mean_val := mean(predicted),
  by = by
]

# calculate binary prediction as percentage above mean
example_binary[, predicted := mean(predicted > mean_val),
  by = by
]

# calculate true value as whether or not observed was above mean
example_binary[, observed := observed > mean_val]

# delete unnecessary columns and take unique values
example_binary[, `:=`(
  sample_id = NULL, mean_val = NULL,
  observed = factor(as.numeric(observed))
)]
example_binary <- unique(example_binary)

usethis::use_data(example_binary, overwrite = TRUE)
epiforecasts/scoringutils documentation built on May 9, 2024, 12:52 a.m.