In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

Introduction

knitr::opts_chunk$set(
  fig.width = 12, fig.height = 6, echo = FALSE,
  warning = FALSE, message = FALSE, fig.path = "figures/"
)

library(ggplot2)
library(dplyr)
library(EpiEstim)
devtools::load_all()

pow_dist <- params$pow_dist
t.proj <- params$t.proj
n.sim <- params$n.sim
n.dates.sim <- params$n.dates.sim
p.stay <- params$p.stay

w.africa <- c("Guinea", "Liberia", "Sierra Leone")

Parameters for Ebola and Reproduction Number Estimation

Culled from literature

mean_SI <- 14.2
CV_SI <- 9.6 / 14.2
SItrunc <- 40
SI_Distr <- sapply(0:SItrunc, function(e) discr_si(e, mean_SI, mean_SI * CV_SI))
SI_Distr <- SI_Distr / sum(SI_Distr)
time_window <- 7 * 7

Gravity model parameters

pow_N_to <- pow_N_from <- 1
K <- 1

Wide Data load

Read in cleaned-up and wide formatted data.

hm_wide <- here::here(
  "data/CaseCounts/processed",
  "HealthMap_Ebola_wide.csv"
  ) %>%
    readr::read_csv()

We now use the incidence count to estimate reproduction number.

start <- 2:(length(hm_wide$date) - time_window)
end <- start + time_window
end.dates <- hm_wide[end, "date"]
r.estim <- select(hm_wide, -date) %>%
    map(function(x) {
    res <- EstimateR(
      x,
      T.Start = start,
      T.End = end,
      method = "NonParametricSI",
      SI.Distr = SI_Distr,
      plot = FALSE,
      CV.Posterior = 1,
      Mean.Prior = 1,
      Std.Prior = 0.5
    )
    res$R <- cbind(res$R, date = end.dates)
    res$R
    })

lapply(names(r.estim), function(country) {
  R <- r.estim[[country]]
  outfile <- paste0("output/", country, "-R.csv")
  write.csv(R, file = outfile, row.names = F, quote = F)
})

imap(r.estim, function(R, country) {
  p <- ggplot(R, aes(date)) +
    geom_ribbon(
      aes(
        ymin = `Quantile.0.025(R)`,
        ymax = `Quantile.0.975(R)`
      ),
      fill = "grey70"
    ) +
    geom_line(aes(y = `Mean(R)`)) +
      theme_classic() +
      xlab("") +
    ylab("Reproduction Number R")
  ggsave(file = here::here("output", paste0(country, "-R.pdf")), p)
})

We assume that the reproduction number remains unchanged for the time period over which we wish to project. For each location, distribution of r_t at t.proj is r_t over the next n.days.sim.

r.j.t <- r.estim %>%
           lapply(function(R){
                     cutoff <- which(R$Date %in% by.location_incid[t.proj, "Date"])
                     shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                     scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                     return(rgamma(n.sim, shape = shape,
                                          scale = scale))}) %>% data.frame

Determine the flow matrix for the countries of interest only.

adm0_centroids <- here::here("data", "Geography/GravityModel/raw/adm0_centroids.tsv") %>%
                   read.csv(stringsAsFactors = FALSE, sep = "\t", header = FALSE) %>%
                   filter(V1 %in% w.africa)
names(adm0_centroids) <- c("country", "id", "lon", "lat", "pop")
flow.matrix           <- flow_matrix(longitude = adm0_centroids[, "lon"],
                                     latitude  = adm0_centroids[, "lat"],
                                     population = adm0_centroids[, "pop"],
                                     place_names = adm0_centroids[, "country"],
                                     model = "gravity",
                                     K = K, pow_N_from = pow_N_from,
                                     pow_N_to = pow_N_to, pow_dist = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)

## matrix characterising the population movement between geographical units
#p.stay      <- 0.99 # this can be a vector
p.movement  <- probability_movement(relative.risk, p.stay)

At this point, all the pieces are in place. by.location_incid contains the incidence count r.j.t contains the estimates of reproduction numbers. p.movement conatins the probabilities. SI_Distr is the serial interval distribution. The model is: lambda_j_t = p.movement * (incidence * r_t) * serial_interval taking care of the dimensions of course. Now divide the dataset into training and validation sets.

We will now split our data into training and validation sets.

training   <- by.location_incid[1:t.proj, ]
validation <- by.location_incid[(t.proj + 1):nrow(by.location_incid), ]

incidence.count <- select(training, -Date)
incid           <- as.matrix(incidence.count)
dates.all       <- training$Date %>%
    c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))

dates_pred <-  seq(from = max(training$Date) + 1,
                   length.out = n.dates.sim,
                   by = 1)

t.max           <- nrow(incidence.count) + n.dates.sim - 1

daily.projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim)

                                    out <- data.frame(out)
                                    out$Date <- dates_pred
                                    out})

weekly.available <- c(training    = list(training),
                       validation = list(validation)) %>%
                       lapply(daily.to.weekly) %>%
                       bind_rows(.id = "Category")

weekly.projections <- lapply(daily.projections, daily.to.weekly) %>% bind_rows(.)



projections_distr <- projection_quantiles(weekly.projections)
#outfile <- paste0("hm_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".csv")
#outfile <- here::here("output", outfile)
#write.csv(projections_distr, file = outfile, row.names = F, quote = F)

#outfile <- paste0("hm_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".png")
#outfile <- here::here("output", outfile)

Aggregate projections for West African Region

num_samples <- 4550
countries <- select(daily.projections[[1]], -Date) %>% colnames()
wafrica_all <- purrr::map_dfr(countries, function(x){
    out <- sample_projections(daily.projections, x, num_samples)
    out$date <- dates_pred
    out
})
aggregated <- split(wafrica_all, wafrica_all$date) %>%
             purrr::map(function(x)
                 select(x, starts_with("sim")) %>%
                 shuffle_cols() %>%
                        colSums) 
aggregated <- purrr::map_dfr(aggregated, function(x)
    data.frame(matrix(x, nrow = 1, dimnames = list("1", names(x)))),
    .id = "Date") 



aggregated$Date %<>% as.Date
wafrica_weekly <- daily.to.weekly(aggregated)

wafrica_distr <- tidyr::gather(wafrica_weekly, sim, aggregated, -Date) %>%
                   projection_quantiles

Stack them together and write them out for later.

projections_distr <- rbind(projections_distr, wafrica_distr)

projections_distr_wide <- reshape(projections_distr,
                                  idvar = 'Date',
                                  timevar = 'Country',
                                  direction = 'wide')
projections_distr_wide$projection_from <- max(training$Date)
projections_distr_wide <- select(projections_distr_wide,
                                 projection_from,
                                 Date,
                                 y.Guinea:ymax.aggregated)

outfile <- paste0("hm_projections_",
                  p.stay, "_",
                  pow_dist, "_",
                  n.dates.sim, ".csv")
outfile <- here::here("output", outfile)
append  <- file.exists(outfile)
readr::write_csv(x = projections_distr_wide,
                 path = outfile,
                 append = append)

tmp <- select(weekly.available, -Category) %>%
        tidyr::gather(Country, Incidence, -Date)
tmp$Date %<>% as.Date
tmp2 <- dplyr::left_join(projections_distr, tmp,
                         by = c("Date", "Country"))

here::here("output",
           paste0("healthmap_goodness_fit_", t.proj, ".csv")) %>%
   readr::write_csv(x = tmp2, path = .)

outfile <- paste0("hm_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".png")
outfile <- here::here("output", outfile)



## write.table(tmp, outfile,  
##            sep = ",", col.names = col.names, append = T,
##            row.names = F, quote = F)

aggregated$place <- "aggregated"
aggregated <- select(aggregated, starts_with("sim"), place, date = Date)
wafrica_all <- rbind(wafrica_all, aggregated)
wafrica_tall <- tidyr::gather(wafrica_all, sim, val, -date, -place)
projections_range <-  wafrica_tall %>%
  group_by(date, place) %>%
  summarise(sim_range = list(range(val))) %>%
  mutate(sim_range = purrr::map(sim_range, tibble::enframe, name = "range")) %>%
    tidyr::unnest() %>%
  tidyr::spread(range, value)

outfile <- paste0("hm_projections_",
                  p.stay, "_",
                  pow_dist, "_",
                  t.proj, "_",
                  n.dates.sim, "_range.csv")
outfile <- here::here("output", outfile)
readr::write_csv(x = projections_range, path = outfile)