In ben18785/incidenceinflation: Estimate R0 Given Case Reporting Delays

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(incidenceinflation)
library(ggplot2)
library(magrittr)
library(dplyr)
library(tidyr)
library(purrr)

We first generate a true case series using a negative binomial renewal process. Here, we assume a time-varying Rt.

days_total <- 100

# allow Rt to vary over time and construct interpolation function
v_Rt <- c(rep(1.5, 40), rep(0.4, 20), rep(1.5, 40))
Rt_function <- stats::approxfun(1:days_total, v_Rt)

# serial interval distribution parameters (assumed gamma)
s_params <- list(mean=5, sd=3)

# negative binomial over-dispersion parameter
kappa <- 10

# generate series
cases <- true_cases(days_total, Rt_function, kappa, s_params,
                    initial_parameters=list(mean=30, length=30))

# plot series
tibble(d=1:days_total,
       cases=cases,
       Rt=v_Rt) %>%
  ggplot(aes(x=d)) +
  geom_point(aes(y=cases)) +
  geom_line(aes(y=cases)) +
  geom_line(aes(y=Rt * 500), colour="orange") +
  scale_y_continuous(sec.axis = sec_axis(~./500, name = "Rt"))

In our model, we assume that cases arising on a given day aren't necessarily observed until some time in the future. In other words, there are imperfections in case reporting meaning that cases may not be uncovered until much after they arise.

We now use the true case series generated previous to simulate observed case trajectories: there is a trajectory for each onset day.

# reporting delay distribution (assumed gamma)
r_params <- list(mean=10, sd=3)

df <- observed_cases(cases, r_params, days_max_follow_up=40)
glimpse(df)

For example, consider those cases arising on day 10. We can plot the number of reported cases which arised on that day subsequently. Over time, reported cases tends to true case counts as move cases arising on day 10 are retroactively uncovered.

df %>% 
  filter(time_onset == 10) %>% 
  pivot_longer(cols=c("cases_reported", "cases_true")) %>% 
  ggplot(aes(x=time_reported, y=value, colour=name)) +
  geom_line() +
  scale_color_brewer("Cases", palette = "Dark2")

Now plotting the same but for each onset day, we obtain a series of trajectories for reported cases.

df %>% 
  filter(time_onset <= 10) %>% 
  ggplot(aes(x=time_reported, y=cases_reported,
             colour=time_onset, group=as.factor(time_onset))) +
  geom_line()

Another way of understanding these data is that the historical snapshots of the epidemic change as time goes on and more cases are retroactively reported. We imagine we are at $t=40$ and consider how the history of cases changes as time goes on.

t <- 40
df %>% 
  filter(time_onset <= t) %>% 
  filter(time_reported >= t) %>% 
  ggplot(aes(x=time_onset, y=cases_reported)) +
  geom_line(aes(colour=time_reported,
                group=time_reported)) +
  geom_point(aes(y=cases_true), colour="black") +
  scale_colour_viridis_c(begin = 0.2)

The process of generated these types of snapshot data is automated in a single function.

df <- generate_snapshots(days_total, Rt_function, s_params, r_params,
                         kappa=kappa)

df %>% 
  filter(time_onset <= t) %>% 
  filter(time_reported >= t) %>% 
  ggplot(aes(x=time_onset, y=cases_reported)) +
  geom_line(aes(colour=time_reported,
                group=time_reported)) +
  geom_point(aes(y=cases_true), colour="black") +
  geom_line(aes(y=cases_true), colour="black") +
  scale_colour_viridis_c(begin = 0.2)

These snapshots can alternatively be visualised using a so-called reporting trapezoid. Here, we draw this at reporting time $t=50$.

df %>% 
  filter(time_reported <= 50) %>% 
  mutate(time_delay=time_reported-time_onset) %>% 
  ggplot(aes(x=time_onset, y=time_delay,
             fill=cases_reported)) +
  geom_tile() +
  scale_fill_distiller(direction=1)

Trying to estimate cases but assuming we know the Rt values over time, the reporting delay distribution and the serial delay distribution.

days_total <- 100
kappa <- 1000
df <- generate_snapshots(days_total, Rt_function,
                         s_params, r_params,
                          kappa=kappa, thinned=T) %>% 
  mutate(reporting_piece_index=1) # denotes a single reporting delay distribution
max_cases <- 5000
r_params_df <- tibble(mean=r_params$mean, sd=r_params$sd,
                      reporting_piece_index=1)
df_est <- sample_cases_history(df, max_cases, Rt_function, s_params, r_params_df, maximise = F)
df_est %>% 
  group_by(time_onset) %>% 
  mutate(cases_reported=max(cases_reported)) %>% 
  pivot_longer(c(cases_true, cases_estimated, cases_reported)) %>%
  ggplot(aes(x=time_onset, y=value, colour=as.factor(name))) +
  geom_line() +
  scale_color_brewer("Series", palette = "Dark2") +
  xlab("Onset time") +
  ylab("Cases") +
  theme(legend.position = c(0.6, 0.6))

Sampling a value of Rt but assuming we know the true case history (and everything else).

cases_history_df <- df

# we split Rt up into intervals of duration 20 days
Rt_indices <- unlist(map(seq(1, 5, 1), ~rep(., 20)))

Rt_index_lookup <- tibble(
  time_onset=seq_along(Rt_indices),
  Rt_index=Rt_indices)

cases_history_df <- cases_history_df %>% 
  left_join(Rt_index_lookup, by = "time_onset") %>% 
  select(time_onset, cases_true, Rt_index) %>% 
  unique()

# Prior on each Rt segment value
Rt_prior <- list(shape=5, rate=5)

Rt_df_po <- sample_Rt(cases_history_df,
  Rt_prior, s_params,
  ndraws = 1000,
  serial_max = 20) %>% 
  mutate(method="poisson")

# graph posterior samples vs actual Rt values
Rt_df_po %>% 
  right_join(Rt_index_lookup, by = "Rt_index") %>%
  mutate(type="estimated") %>% 
  bind_rows(tibble(Rt=v_Rt, time_onset=seq(1, days_total, 1),
                   type="actual")) %>% 
  ggplot(aes(x=time_onset,
             y=Rt, group=as.factor(draw_index))) +
  geom_line(data=. %>% filter(type=="estimated"), alpha=0.1) +
  geom_line(data=. %>% filter(type=="actual"), colour="orange") +
  xlab("Onset time")

Inference on simulated data

snapshot_with_Rt_index_df <- df
initial_cases_true <- df %>% select(time_onset, cases_true) %>% unique()
snapshot_with_Rt_index_df <- snapshot_with_Rt_index_df %>%  select(-cases_true)
initial_Rt <- tribble(~Rt_index, ~Rt,
                      1, 1.5,
                      2, 1.5,
                      3, 0.4,
                      4, 1.5,
                      5, 1.5)
Rt_indices <- unlist(map(seq(1, 5, 1), ~rep(., 20)))

Rt_index_lookup <- tibble(
  time_onset=seq_along(Rt_indices),
  Rt_index=Rt_indices)
snapshot_with_Rt_index_df <- snapshot_with_Rt_index_df %>% 
  left_join(Rt_index_lookup)

initial_reporting_parameters <- list(mean=5, sd=3)
serial_parameters <- list(mean=5, sd=3)
priors <- list(Rt=Rt_prior,
               reporting=list(mean_mu=5,
                              mean_sigma=10,
                              sd_mu=3,
                              sd_sigma=5),
               max_cases=5000)

# running MCMC across 2 chains
## uncomment to run in parallel
# library(doParallel)
# cl <- makeCluster(2)
# registerDoParallel(cl)
res <- mcmc(niterations=200,
            snapshot_with_Rt_index_df,
            priors,
            serial_parameters,
            initial_cases_true,
            initial_reporting_parameters,
            initial_Rt,
            reporting_metropolis_parameters=list(mean_step=0.25, sd_step=0.1),
            serial_max=40, p_gamma_cutoff=0.99, maximise=FALSE,
            nchains = 2, is_parallel = FALSE)
## uncomment if running in parallel
# stopCluster(cl)

Check convergence diagnostics

library(posterior)

results_posterior_format <- convert_results_to_posterior_format(res)

# only look at Rt here due to space
summarise_draws(results_posterior_format$Rt)

Examine results

onsets <- 90:100
cases_true_df <- df %>% 
  filter(time_onset %in% onsets) %>% 
  rename(cases_known=cases_true)
cases_df <- res$cases %>% 
  filter(time_onset %in% onsets) %>% 
  left_join(cases_true_df %>% select(-c(cases_reported, time_reported))) %>% 
  pivot_longer(c(cases_true, cases_known))

cases_df %>% 
  ggplot(aes(x=iteration, y=value, colour=name)) +
  geom_line() +
  facet_wrap(~time_onset)

res_df <- res$Rt

# initialisers were at true values
Rt_true <- initial_Rt %>% 
  rename(Rt_true=Rt)

res_df %>% 
  left_join(Rt_true) %>% 
  pivot_longer(c(Rt, Rt_true)) %>% 
  ggplot(aes(x=iteration, y=value, colour=name)) +
  geom_line() +
  facet_wrap(~Rt_index)

rep_df <- res$reporting %>% 
  mutate(type="estimated") %>% 
  bind_rows(tibble(mean=r_params$mean, sd=r_params$sd, type="true")) %>% 
  pivot_longer(-c(iteration, type))

ggplot(data=rep_df %>% filter(type=="estimated"),
       aes(x=iteration, y=value)) +
  geom_hline(data=rep_df %>% filter(type=="true"),
             aes(yintercept=value), linetype=2) +
  geom_line() +
  facet_wrap(~name, scales="free")

MCMC on real measles data in the Netherlands 2013-14

measles_NL_2013 %>% 
  mutate(time_delay=time_reported-time_onset) %>%
  ggplot(aes(x=time_onset, y=time_delay,
             fill=cases_reported)) +
  geom_tile() +
  scale_fill_distiller(direction=1)

measles_NL_2013 %>% 
  group_by(time_onset) %>% 
  summarise(cases_reported=sum(cases_reported)) %>% 
  ggplot(aes(x=time_onset, y=cases_reported)) +
  geom_line()

# alternative view
measles_NL_2013 %>% 
  mutate(time_delay=time_reported-time_onset) %>%
  group_by(time_onset) %>% 
  summarise(time_delay=sum(time_delay * cases_reported) / sum(cases_reported)) %>% 
  ggplot(aes(x=time_onset, y=time_delay)) +
  geom_line()

Prepare data for fitting

snapshot_with_Rt_index_df <- thin_series(measles_NL_2013) %>% 
  select(-date_onset) %>% 
  group_by(time_onset) %>% 
  mutate(time_reported=time_reported,
         cases_reported=cumsum(cases_reported))

# filter and rebase time to start when first case appears
df_sum <- snapshot_with_Rt_index_df %>% 
  group_by(time_onset) %>% 
  summarise(cases_reported=last(cases_reported)) %>% 
  ungroup() %>% 
  filter(cases_reported>0)
snapshot_with_Rt_index_df <- snapshot_with_Rt_index_df %>% 
  ungroup() %>% 
  filter(time_onset >= df_sum$time_onset[1]) %>% 
  mutate(time_reported=time_reported - df_sum$time_onset[1] + 1,
         time_onset=time_onset - df_sum$time_onset[1] + 1)

initial_cases_true <- snapshot_with_Rt_index_df %>% 
  group_by(time_onset) %>% 
  summarise(cases_true=last(cases_reported)) 

initial_Rt <- tribble(~Rt_index, ~Rt,
                      1, 1,
                      2, 1,
                      3, 1,
                      4, 1,
                      5, 1)
Rt_indices <- c(unlist(map(seq(1, 5, 1), ~rep(., 26))), 5)

Rt_index_lookup <- tibble(
  time_onset=seq_along(Rt_indices),
  Rt_index=Rt_indices)
snapshot_with_Rt_index_df <- snapshot_with_Rt_index_df %>% 
  left_join(Rt_index_lookup)

initial_reporting_parameters <- list(mean=10, sd=3)
serial_parameters <- list(mean=12, sd=3)
priors <- list(Rt=Rt_prior,
               reporting=list(mean_mu=10,
                              mean_sigma=10,
                              sd_mu=3,
                              sd_sigma=5),
               max_cases=100)

Artificially remove hindsight for those onset times near end; we do this by assuming that time_reported=138

hindsight_removed_df <- snapshot_with_Rt_index_df %>% 
  filter(time_reported <= 138)

res <- mcmc(niterations=400,
            hindsight_removed_df,
            priors,
            serial_parameters,
            initial_cases_true,
            initial_reporting_parameters,
            initial_Rt,
            reporting_metropolis_parameters=list(mean_step=0.25, sd_step=0.1),
            serial_max=40, p_gamma_cutoff=0.99, maximise=FALSE)

Rt estimates

res_df <- res$Rt

res_df %>% 
  ggplot(aes(x=iteration, y=Rt)) +
  geom_line() +
  facet_wrap(~Rt_index)

Cases

onsets <- 111:119
cases_df <- res$cases %>% 
  filter(time_onset %in% onsets)

cases_df %>% 
  ggplot(aes(x=iteration, y=cases_true)) +
  geom_line() +
  facet_wrap(~time_onset)