library(here)
library(magrittr)
library(stringr)
library(data.table)
library(scoringutils)
# ==============================================================================
# ------------------------------ update data -----------------------------------
# ==============================================================================
# update forecast data from server ---------------------------------------------
# system(
# paste(". data-raw/paper.sh")
# )
# get the correct file paths to all forecasts ----------------------------------
folders <- here::here("data-raw", list.files("data-raw"))
folders <- folders[
!(grepl("\\.R", folders) | grepl(".sh", folders) | grepl(".csv", folders))
]
file_paths <- purrr::map(folders,
.f = function(folder) {
files <- list.files(folder)
out <- here::here(folder, files)
return(out)}) %>%
unlist()
file_paths <- file_paths[grepl(".csv", file_paths)]
file_paths <- file_paths[!(grepl("EpiExpert_Rt", file_paths)) &
!(grepl("included_models", file_paths))]
# load all past forecasts ------------------------------------------------------
# ceate a helper function to get model name from a file path
get_model_name <- function(file_path) {
split <- str_split(file_path, pattern = "/")[[1]]
model <- split[length(split) - 1]
return(model)
}
prediction_data <- purrr::map_dfr(file_paths,
.f = function(file_path) {
data <- data.table::fread(file_path)
data[, `:=`(
target_end_date = as.Date(target_end_date),
forecast_date = as.Date(forecast_date),
model = get_model_name(file_path)
)]
return(data)
}) %>%
dplyr::mutate(target_type = ifelse(grepl("death", target), "death", "case")) %>%
dplyr::rename(prediction = value) %>%
dplyr::mutate(location_name = ifelse(location == "GM", "Germany", "Poland")) %>%
dplyr::filter(type == "quantile",
grepl("inc", target),
location %in% c("GM", "PL")) %>%
dplyr::select(location, location_name, forecast_date, quantile, prediction, model, target_end_date, target, target_type)
# change model names -----------------------------------------------------------
# helper function to change model names
change_model_name <- function(names, old_name, new_name) {
names <- ifelse (names == old_name, new_name, names)
return(names)
}
change_names <- list(
c("KITCOVIDhub-median_ensemble",
"Hub-ensemble-realised"),
c("KITCOVIDhub-mean_ensemble",
"Hub-ensemble-realised-mean"),
c("KIT-baseline",
"Baseline"),
c("epiforecasts-EpiExpert",
"Crowd forecast"),
c("epiforecasts-EpiNow2",
"Renewal"),
c("epiforecasts-EpiNow2-retrospective",
"Renewal-retrospective"),
c("epiforecasts-EpiNow2_secondary",
"Convolution"),
c("epiforecasts-EpiNow2_secondary-retrospective",
"Convolution-retrospective"),
c("KITCOVIDhub-median_ensemble_exclude_both",
"Hub-ensemble"),
c("KITCOVIDhub-mean_ensemble_exclude_both",
"Hub-ensemble-mean"),
c("KITCOVIDhub-median_ensemble_exclude_EpiExpert",
"Hub-ensemble-with-renewal"),
c("KITCOVIDhub-mean_ensemble_exclude_EpiExpert",
"Hub-ensemble-with-renewal-mean"),
c("KITCOVIDhub-median_ensemble_exclude_EpiNow2",
"Hub-ensemble-with-crowd"),
c("KITCOVIDhub-mean_ensemble_exclude_EpiNow2",
"Hub-ensemble-with-crowd-mean"),
c("KITCOVIDhub-median_ensemble_include_EpiNow2_secondary_additionally",
"Hub-ensemble-with-all"),
c("KITCOVIDhub-mean_ensemble_include_EpiNow2_secondary_additionally",
"Hub-ensemble-with-all-mean"),
c("KITCOVIDhub-median_ensemble_include_only_EpiNow2_secondary",
"Hub-ensemble-with-convolution"),
c("KITCOVIDhub-mean_ensemble_include_only_EpiNow2_secondary",
"Hub-ensemble-with-convolution-mean")
) %>%
# not sure why this is not unique?
unique()
purrr::walk(change_names,
.f = function(change) {
prediction_data[, model := ifelse(model == change[1],
change[2],
model)]
})
usethis::use_data(prediction_data, overwrite = TRUE)
# store names of regular models and ensemble models ----------------------------
regular_models <- c("Renewal", "Hub-ensemble", "Crowd forecast", "Convolution")
usethis::use_data(regular_models, overwrite = TRUE)
ensemble_models <- unique(prediction_data$model)
ensemble_models <- ensemble_models[grepl("ensemble", ensemble_models)]
usethis::use_data(ensemble_models, overwrite = TRUE)
# update truth data ------------------------------------------------------------
# source(here("data-raw", "update.R"))
# weekly truth data
weekly_cases <- fread(here("data-raw", "weekly-incident-cases.csv"))
weekly_cases[, target_type := "case"]
weekly_deaths <- fread(here("data-raw", "weekly-incident-deaths.csv"))
weekly_deaths[, target_type := "death"]
truth <- rbindlist(list(weekly_cases, weekly_deaths))
truth <- truth[location_name %in% c("Germany", "Poland")]
truth_data <- truth[, `:=` (location = NULL,
target_end_date = as.Date(target_end_date))]
setnames(truth_data, old = "value", new = "true_value")
usethis::use_data(truth_data, overwrite = TRUE)
# daily truth data
daily_cases <- fread(here("data-raw", "daily-incidence-cases.csv"))
daily_cases[, target_type := "case"]
daily_deaths <- fread(here("data-raw", "daily-incidence-deaths.csv"))
daily_deaths[, target_type := "death"]
dailytruth <- rbindlist(list(daily_cases, daily_deaths))
dailytruth <- dailytruth[location_name %in% c("Germany", "Poland")]
dailytruth_data <- dailytruth[, `:=` (location = NULL,
target_end_date = as.Date(date))]
setnames(dailytruth_data, old = "value", new = "true_value")
usethis::use_data(dailytruth_data, overwrite = TRUE)
# combine raw data and store ---------------------------------------------------
combined_data <- merge_pred_and_obs(
prediction_data,
truth_data,
by = c("location_name", "target_end_date", "target_type")
)[,
forecast_date := as.character(forecast_date)
]
# update data about the number of ensembles included in the Hub ----------------
root_dir <- here("data-raw", "included_models")
filenames <- list.files(root_dir)
get_date <- function(filename) {
date <- substr(filename, 17, 26)
return(as.Date(date))
}
dates <- get_date(filenames)
filenames <- filenames[dates >= "2020-10-12" & dates <= "2021-03-01"]
ensemble_members <- purrr::map_dfr(filenames,
.f = function(x) {
dt <- fread(here(root_dir, x))
dt[, Date := get_date(x)]
return(dt)
})
del_cols <- names(ensemble_members)[grepl("XX", names(ensemble_members)) |
grepl("cum", names(ensemble_members))]
ensemble_members[, (del_cols) := NULL]
usethis::use_data(ensemble_members, overwrite = TRUE)
# ==============================================================================
# -------------------- filter data and perform stratification ------------------
# ==============================================================================
# define different date periods relevant for the paper--------------------------
forecast_dates <- list()
# all unfiltered dates
forecast_dates[["unfiltered"]] <- c(
seq.Date(from = as.Date("2020-10-12"), to = as.Date("2021-03-01"), "week")
)
# all dates relevant to the hubs
forecast_dates[["full_hub_period"]] <- c(
seq.Date(from = as.Date("2020-10-12"), to = as.Date("2020-12-14"), "week"),
seq.Date(from = as.Date("2021-01-11"), to = as.Date("2021-03-01"), "week")
)
# first period in the Hub paper
forecast_dates[["first_period"]] <-
forecast_dates$cases[as.Date(forecast_dates$cases) <= "2020-12-19"]
# second period in the hub paper
forecast_dates[["second_period"]] <-
forecast_dates$cases[as.Date(forecast_dates$cases) >= "2021-01-04"]
# christmas period
forecast_dates[["christmas"]] <-
seq.Date(as.Date("2020-12-19"), as.Date("2021-01-07"), "week")
# dates not scored for death forecasts
forecast_dates[["death_not_scored"]] <-
forecast_dates$unfiltered[as.Date(forecast_dates$unfiltered) < "2020-12-14"]
# forecast_dates[["cases"]] <-
# forecast_dates$unfiltered[
# !(forecast_dates$unfiltered %in% c("2020-12-21", "2020-12-28"))
# ]
#
# forecast_dates[["deaths"]] <-
# forecast_dates$cases[as.Date(forecast_dates$cases) >= "2020-12-14"]
usethis::use_data(forecast_dates, overwrite = TRUE)
# classify epidemic into rising and falling ------------------------------------
# classify epidemic according to whether, on a given forecast date, the two
# weeks before that have seen monotonic rise, decline, or an unclear trend
classify_epidemic <- function(data, cutoff = 0.05, growth_cutoff = 2) {
dt <- as.data.table(data)
# calculate differences and set differences smaller than
# a certain cutoff to zero
dt[, diff_1 := c(NA, diff(true_value, 1)),
by = c("location_name", "target_type")]
dt[abs(diff_1) < (cutoff * true_value), diff_1 := 0]
dt[, diff_prev := c(shift(diff_1, 1)),
by = c("location_name", "target_type")]
# assign a label depending on observed differences
dt[, c("classification", "speed") := "unclear"]
dt[(diff_1 >= 0 & diff_prev >= 0),
classification := "increasing"]
dt[(diff_1 <= 0 & diff_prev <= 0 ),
classification := "decreasing"]
dt[(diff_1 == 0 & diff_prev == 0),
classification := "unclear"]
dt[(classification == "increasing") &
diff_1 > growth_cutoff * diff_prev,
speed := "accelerating"]
dt[(classification == "increasing") &
diff_1 < 1/growth_cutoff * diff_prev,
speed := "decelerating"]
dt[(classification == "decreasing") &
abs(diff_1) > growth_cutoff * abs(diff_prev),
speed := "accelerating"]
dt[(classification == "decreasing") &
abs(diff_1) < 1/growth_cutoff * abs(diff_prev),
speed := "decelerating"]
dt[, c("diff_1", "diff_prev") := NULL]
return(dt)
}
# obtain classification
epitrend <- classify_epidemic(truth_data)
epitrend[, forecast_date := as.character(target_end_date + 2)]
usethis::use_data(epitrend, overwrite = TRUE)
# save unfiltered data and filtered data ---------------------------------------
# save unfiltered data
unfiltered_data <- combined_data[forecast_date %in%
as.character(forecast_dates$unfiltered)]
unfiltered_data[, horizon := substring(target, 1, 1)]
unfiltered_data[
, location_target := paste0(str_to_sentence(target_type),
"s", " in ", location_name)
]
# add classification (as of the forecast date)
# to get this as of target end date, keep the target_end_date column in epitrend
# and remove the forecast_date and merge by target_end_date instead
unfiltered_data <- merge(
unfiltered_data,
copy(epitrend)[, c("target_end_date", "true_value") := NULL],
by = c("location_name", "target_type", "forecast_date")
)
unfiltered_data[, target_phase := str_to_title(paste0(target_type, "s - ",
classification, " phase"))]
unfiltered_data[, target_phase := factor(target_phase,
levels = str_to_title(c("Cases - decreasing phase",
"Cases - unclear phase",
"Cases - increasing phase",
"Deaths - decreasing phase",
"Deaths - unclear phase",
"Deaths - increasing phase")))]
usethis::use_data(unfiltered_data, overwrite = TRUE)
# load and set up data
filtered_cases <- unfiltered_data[
target_type == "case" &
!(as.Date(forecast_date) %in% forecast_dates$christmas)
]
filtered_deaths <- unfiltered_data[
target_type == "death" &
!(as.Date(forecast_date) %in% forecast_dates$christmas) &
!(as.Date(forecast_date) %in% forecast_dates$death_not_scored)
]
filtered_data <- rbindlist(list(filtered_cases, filtered_deaths))
usethis::use_data(filtered_data, overwrite = TRUE)
# additional analysis for individual forecasters -------------------------------
# check number of experts ------------------------------------------------------
purrr::map_dfr(file_paths,
.f = function(file_path) {
data <- data.table::fread(file_path)
data[, `:=`(
target_end_date = as.Date(target_end_date),
submission_date = as.Date(submission_date),
forecast_date = as.Date(forecast_date),
model = get_model_name(file_path)
)]
return(data)
}) %>%
select(board_name, expert) %>%
unique() %>%
summarise(experts = sum(expert),
nonexpert = sum(!expert))
# store prediction data for individual forecasters
file_paths <- list.files(here::here("crowd-forecast", "processed-forecast-data"))
file_dates <- substr(file_paths, 1, 10)
indices <- (1:length(file_dates))[as.Date(file_dates) <= "2021-03-15" &
!is.na(as.Date(file_dates))]
file_paths <- here::here("crowd-forecast", "processed-forecast-data", file_paths)
file_paths <- file_paths[indices]
crowdforecast_data <- purrr::map_dfr(file_paths,
.f = function(x) {
data <- data.table::fread(x) %>%
dplyr::mutate(target_end_date = as.Date(target_end_date),
submission_date = as.Date(submission_date),
forecast_date = as.Date(forecast_date))
}) %>%
dplyr::mutate(target_type = ifelse(grepl("death", target), "death", "case")) %>%
dplyr::rename(prediction = value) %>%
dplyr::mutate(forecast_date = as.Date(submission_date)) %>%
dplyr::rename(model = board_name) %>%
dplyr::filter(type == "quantile",
location_name %in% c("Germany", "Poland")) %>%
dplyr::select(location, location_name, forecast_date, quantile, prediction, model, target_end_date, horizon, target, target_type, expert)
usethis::use_data(crowdforecast_data, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.