In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

knitr::opts_chunk$set(fig.width=12, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE, fig.path = "figures/")

library(magrittr)
library(ggthemes)
library(ggplot2)
library(dplyr)
library(EpiEstim)
devtools::load_all()

pow_dist    <- params$pow_dist
t.proj      <- params$t.proj
n.sim       <- params$n.sim
n.dates.sim <- params$n.dates.sim
p.stay      <- params$p.stay
ADM0        <- params$ADM0

infile     <- paste0(ADM0, "_wide.csv") 
infile     <- here::here("data/CaseCounts/processed", infile)
incid_wide <- readr::read_csv(infile)

Split the data into training and validation sets.

training   <- incid_wide[1:t.proj, ]
validation <- incid_wide[(t.proj + 1):(t.proj + n.dates.sim), ]

Parameters for Ebola

Culled from literature.

mean_SI     <- 14.2
CV_SI       <- 9.6 / 14.2
SItrunc     <- 40
SI_Distr    <- sapply(0:SItrunc, function(e) EpiEstim::DiscrSI(e, mean_SI, mean_SI * CV_SI))
SI_Distr    <- SI_Distr / sum(SI_Distr)
time_window <- 7 * 7

Gravity model parameters

pow_N_to <- pow_N_from <- 1
K        <- 1

Estimating the reproduction number

For the predictions, we need to estimate R only in the training window.

start     <- 2:(t.proj - time_window)
end       <- start + time_window
end.dates <- training[end, "Date"]
r.estim   <- select(training, -Date)  %>%
             plyr::alply(2, .dims = TRUE,
                            function(incid) {
                              I   <- pull(incid, 1) 
                              res <- EpiEstim::EstimateR(I,
                                                         T.Start = start ,
                                                         T.End = end,
                                                         method = "NonParametricSI",
                                                         SI.Distr = SI_Distr,
                                                         plot = FALSE ,
                                                         CV.Posterior = 1 ,
                                                         Mean.Prior = 1 ,
                                                         Std.Prior = 0.5)
                              res$R %<>% cbind(Date = end.dates)
                              return(res$R)})

Write the reproduction numbers out for future analysis.

district_rs <- lapply(r.estim, function(df)
                                  filter(df,
                                         Date == training$Date[t.proj]))

district_rs <- bind_rows(district_rs, .id = "Country")

here::here("output", paste0("district_Rs_", t.proj, ".csv")) %>%
    readr::write_csv(x = district_rs, path = .)

We use the estimated R at t.proj to be the R for the window over which predictions are being carried out.

r.j.t  <- r.estim %>%
           lapply(function(R){
                    cutoff <- which(R$Date %in% training[t.proj, "Date"])
                    shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                    scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                    return(rgamma(n.sim, shape = shape,
                                  scale = scale))}) %>%
            data.frame(check.names = F)

Calculate the probability of movement between locations. There is some extra arranging of variables here to ensure that the columns in the incidence data and movement matrix are in the same order.

centroids <- here::here("data/Geography/GravityModel/raw",
                     "adm0_centroids.tsv") %>%
             readr::read_tsv(., col_names = c("CL_DistrictRes", "id",
                                     "lon", "lat", "pop")) %>% 
                   filter(CL_DistrictRes %in% names(incid_wide)[-1])

infile <- here::here("data/Geography/GravityModel/processed",
                     "all_centroids.csv")
centroids <- readr::read_csv(infile) %>% 
                   filter(ADM0 == ADM0) 
districts <- data.frame(CL_DistrictRes = colnames(incid_wide)[-1])
centroids <- left_join(districts, centroids)

flow.matrix  <- flow_matrix(longitude   = centroids$lon,
                            latitude    = centroids$lat,
                            population  = centroids$pop,
                            place_names = centroids$CL_DistrictRes,
                            model = "gravity", K = K,
                            pow_N_from = pow_N_from,
                            pow_N_to   = pow_N_to,
                            pow_dist   = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)
p.movement <- probability_movement(relative.risk, p.stay)

incid     <- as.matrix(select(training, -Date))
dates.all <- pull(training, Date) %>%
             c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))
t.max     <- t.proj + n.dates.sim - 1
dates_forecast <-  seq(from = max(training$Date) + 1,
                       length.out = n.dates.sim,
                       by = 1)

daily_projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim)
                                    out <- data.frame(out)
                                    out$Date <- dates_forecast
                                    out })


weekly_projections <- lapply(daily_projections, daily.to.weekly) %>% bind_rows(.)

Aggregating district predictions

We have $N$ simulations for predictions. This means that for each district we have $N$ simulated numbers. To obtain the aggregated national counts, for each date, draw a random sample of size $k$ from the $N$ simulated numbers for each district. Shuffle each sample and add.

num_samples <- 200
districts <- select(incid_wide, -Date) %>% colnames
national <- purrr::map_dfr(districts, function(x){
    out <- sample_projections(daily_projections, x, num_samples)
    out$date <- dates_forecast
    out
})
aggregated <- split(national, national$date) %>%
             purrr::map(function(x)
                       select(x, starts_with("sim")) %>%
                        shuffle_cols() %>%
                        colSums) 
aggregated <- purrr::map_dfr(aggregated, function(x)
                         data.frame(matrix(x, nrow = 1)), .id = "Date") 



aggregated$Date %<>% as.Date
national_weekly <- daily.to.weekly(aggregated)

national_distr <- tidyr::gather(national_weekly, sim, val, -Date) %>%
                   projection_quantiles

And write them out.

paste0(ADM0, "_aggregated_projections_weekly_", p.stay,
       "_", pow_dist, "_", t.proj, ".csv") %>%
here::here("output", .) %>%
readr::write_csv(x = national_distr, path = .)

WHO weekly country data

incid_country <- select(incid_wide, -Date) %>% rowSums

incid_weekly <- data.frame(Date = incid_wide$Date,
                           incid = incid_country) %>% daily.to.weekly

date_min <- "2014-08-01"
small <- filter(incid_weekly, Date > date_min)
p <- ggplot(small, aes(as.Date(Date), incid)) + geom_point()
p <- p + geom_point(data = national_distr, aes(Date, y), col = "red")
p <- p + geom_ribbon(data = national_distr,
                aes(x = Date,
                    ymin = ymin,
                    ymax = ymax,
                    group = 1),
                    inherit.aes = FALSE,
                    alpha = 0.3)
p <- p + theme_tufte()
p <- p + xlab("Date") + ylab("Weekly Incidence")

Quantiles for district predictions

projections_distr <- projection_quantiles(weekly_projections)

outfile <- paste0(ADM0, "_projections_weekly_", p.stay, "_", pow_dist, "_", t.proj, ".csv")
outfile <- here::here("output", outfile)
readr::write_csv(projections_distr, path = outfile)


daily_projections_distr <- bind_rows(daily_projections) %>%
                           projection_quantiles

outfile <- paste0(ADM0, "_projections_daily_", p.stay, "_", pow_dist, "_", t.proj, ".csv")
outfile <- here::here("output", outfile)
readr::write_csv(daily_projections_distr, path = outfile)

Forecast visualisation

The plot consists of the following elements: the training and validation sets, distribution of the predictions (quantiles) and some data beyond the last date for which prediction has been carried out so that we can see the trend.

trng_weekly  <- daily.to.weekly(training) 
vldtn_weekly <- daily.to.weekly(validation)

tmp <- list(Training = trng_weekly, Validation = vldtn_weekly) %>%
        bind_rows(.id = "Category")  %>%
        tidyr::gather(Country, Incidence, -c(Date, Category))

tmp$Date %<>% as.Date

date_min <- "2014-08-01"
date_max <- max(projections_distr$Date) + 45
full_df  <- filter(incid_wide, Date >= date_min & Date <= date_max) %>%
            daily.to.weekly %>%
            tidyr::gather(Country, Incidence, -Date)

full_df$Date %<>% as.Date

p <- ggplot(full_df, aes(Date, Incidence)) +
    geom_point(size = 1.2, stroke = 0, shape = 16, color = "gray") +
    facet_wrap(~Country, scales = "free_y")

p <- p +  theme_tufte() + geom_rangeframe() 


p <- p  +
    geom_point(data = tmp,
               aes(Date, Incidence, color = Category, group = 1),
               size = 1.1, stroke = 0, shape = 16) 



## plot the median projections and the 95% CI

p <- p +
    geom_line(data = projections_distr,
              aes(x = Date, y = y, group = 1),
                  color = 'black', size = 0.5)
p <- p +
    geom_ribbon(data = projections_distr,
                aes(x = Date,
                    ymin = ymin,
                    ymax = ymax,
                    group = 1),
                    inherit.aes = FALSE,
                    alpha = 0.3)


p <- p +  xlab("") + theme(axis.text.x = element_text(angle = 45)) +
     theme(legend.position="none")

p <- p + scale_x_date(date_labels =  "%m-%y")

outfile <- paste0(ADM0, "_projections_", p.stay, "_", pow_dist, "_", t.proj, ".png")
outfile <- here::here("output", outfile)

ggsave(outfile, p)

Evaluating Goodness-of-Fit

Proportion of weekly incidence points that fall inside the 95% CI

tmp2 <- dplyr::left_join(projections_distr, tmp,
                         by = c("Date", "Country"))

#num_districts <- length(unique(tmp2$Country))

#goodness <- filter(tmp2, ymin < Incidence & Incidence < ymax) %>%
#            group_by(Date) %>% summarise(n()/num_districts)



here::here("output", paste0(ADM0, "_goodness_fit_", t.proj, ".csv")) %>%
       readr::write_csv(x = tmp2, path = .)

Coefficient of Determination

For $k$ spatial units,

[ R^2 = 1 - \frac{SS_{res}}{SS_{tot}},]

where

[ SS_{res} = \sum_{i = 1}^n{y_i - \hat{y_i}},]

and [ SS_{tot} = \sum_{k}{\sum_{i = 1}^n{y_{i, k} - \bar{y_k}}}.]

ss_res <- (tmp2$Incidence - tmp2$y)^2 %>% sum
group_means <- group_by(tmp2, Country) %>%
               summarise_at("Incidence", mean) %>%
               rename(mean = Incidence)

tmp2 <- left_join(tmp2, group_means, by = "Country")
ss_tot <- (tmp2$Incidence - tmp2$mean)^2 %>% sum

r2 <- 1 - (ss_res / ss_tot)