In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

WHO_SL <- WHO_bydistricts %>%
              filter(Country == "Sierra Leone") %>%
              droplevels %>% na.omit

ggplot(WHO_SL, aes(Date, incid)) +
    geom_point() +
    facet_wrap(~CL_DistrictRes, ncol = 3) +
    theme_minimal() +
    ggtitle("Incidence in the districts of Sierra Leone")

We will proceed as we did with HealthMap data - split the data set into training and validation sets, set up the parameters and press play.

SL_wide <-  WHO_SL %>%
            ungroup %>%
            select(-Country) %>%
            tidyr::spread(CL_DistrictRes, incid, fill = 0)

SL_wide <- readr::read_csv("data/CaseCounts/who/sl_wide.csv")

We are now ready to estimate R and move forward with the projection exactly as we did for HealthMap data.

start     <- 2:(length(SL_wide$Date) - time_window)
end       <- start + time_window
end.dates <- SL_wide[end, "Date"]
r.estim   <- SL_wide  %>%
                   select(-Date) %>%
    plyr::alply(2, .dims = TRUE, function(incid) {
                                   I   <- pull(incid, 1) 
                                   res <- EstimateR(I, T.Start = start , T.End = end,
                                                    method = "NonParametricSI",
                                                    SI.Distr = SI_Distr,
                                                    plot = FALSE ,
                                                    CV.Posterior = 1 ,
                                                    Mean.Prior = 1 ,
                                                    Std.Prior = 0.5)
                                   res$R %<>% cbind(Date = end.dates)
                                   return(res$R)})

r.estim %>%
    bind_rows(.id = "District") %>%
    ggplot(aes(Date, `Mean(R)`)) +
    geom_point() +
    facet_wrap(~District) +
    ggtitle("Mean R for each district in Sierra Leone")

Next, calculate the population flow matrix from gravity model and hence the relative risk for each district.

adm2_centroids <- "data/Geography/GravityModel/raw/adm2.txt" %>%
                   read.csv(stringsAsFactors = FALSE,
                            sep = "\t",
                            header = TRUE) %>%
                   filter(ADM0 == "Sierra Leone")  %>%
                   filter(ADM2 != "Western Rural") 

adm2_centroids$ADM2 %<>% gsub("Western Urban", "Western", .)
adm2_centroids$ADM2  %<>%
    gsub(' ', '', .) %<>%
    toupper %<>%
    factor


adm2_centroids$ADM2 %<>%
    factor(levels = levels(factor(WHO_SL$CL_DistrictRes)))

adm2_centroids %<>% arrange(ADM2)

flow.matrix  <- flow_matrix(longitude   = adm2_centroids[, "Centroid_Lon"],
                            latitude    = adm2_centroids[, "Centroid_Lat"],
                            population  = adm2_centroids[, "Pop"],
                            place.names = adm2_centroids[, "ADM2"],
                            model = "gravity", K = K,
                            pow_N_from = pow_N_from,
                            pow_N_to   = pow_N_to,
                            pow_dist   = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)
p.movement <- probability_movement(relative.risk, p.stay)

## Test with random matrix. Delete later

#prandom <- runif(169)
#p.movement <- matrix(prandom, nrow = nrow(p.movement))

At this point, all the pieces are in place. SL_training contains the incidence count we will use for projecting. Estimate of R is in the data frame r.j.t with a row for each simulation we want to run. p.movement conatins the probabilities. We will project over a period of 6 weeks at three different time points - at the start of the epidemic, during the middle and towards the end.

r.j.t  <- r.estim %>%
           lapply(function(R){
                    cutoff <- which(R$Date %in% SL_wide[t.proj, "Date"])
                    shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                    scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                    return(rgamma(n.sim, shape = shape,
                                  scale = scale))}) %>%
            data.frame(check.names = F)


SL_training   <- SL_wide[1:t.proj, ]
validation    <- nrow(SL_wide)
SL_validation <- SL_wide[(1 + t.proj):validation, ]

incid     <- as.matrix(select(SL_training, -Date))
dates.all <- SL_training[1:t.proj, ] %>%
             pull(Date) %>%
             c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))
t.max     <- t.proj + n.dates.sim - 1


sl_daily_projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim) 
                                    incidence.proj <- rbind(incid, out)
                                    incidence.proj %<>% data.frame(check.names = F)
                                    incidence.proj %<>% cbind(Date = dates.all)
                                    return(incidence.proj[(nrow(SL_training) + 1):t.max, ])})


sl_weekly_projections <- lapply(sl_daily_projections, daily.to.weekly) %>% bind_rows(.)

At the same time, we will also obtain the weekly incidence count from available data.

weekly.available <- c(training    = list(SL_training),
                       validation = list(SL_validation)) %>%
                       lapply(daily.to.weekly) %>%
    bind_rows(.id = "Category")

projections_distr <- projection_quantiles(sl_weekly_projections)
outfile <- paste0("output/sl_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".csv")
write.csv(projections_distr, file = outfile, row.names = F, quote = F)

outfile <- paste0("output/sl_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".png")

t.min <- min(projections_distr$Date) - 63
t.max <- max(projections_distr$Date) + 49
weekly.available %<>% filter(Date >= t.min & Date < t.max)
trng.start <- min(projections_distr$Date) - time_window
valdtn.end <- max(projections_distr$Date)
p <- plot.weekly3(weekly.available, projections_distr, trng.start, valdtn.end)
ggsave(outfile, p)

end <- max(dates.all) + 42
weekly.available %<>% filter(Date < end)
plots.list <- SL_training %>%
                 select(-Date) %>%
                 colnames %>%
                 lapply(function(location){
                         available  <- weekly.available %>%
                                       `[`(, c("Date", "Category", location))
                         projection <- sl_weekly_projections %>%
                                       `[`(, c("Date", location))
                         plot.weekly(available, projection)})

cowplot::plot_grid(plotlist = plots.list)

Evaluating goodness of fit

sl_projections  <- bind_rows(sl_daily_projections)
start           <- min(sl_projections$Date)
end             <- max(sl_projections$Date)
available_small <- filter(SL_wide, Date >= start & Date <= end)

districts <- select(sl_projections, -Date) %>% colnames
names(districts) <- districts
sl_loglh   <- lapply(districts, function(district){
                                      observed  <- available_small[, c("Date", district)]
                                      predicted <- sl_projections[, c("Date", district)]

                                      colnames(observed)  <- c("Date", "incid")
                                      colnames(predicted) <- c("Date", "incid")

                                      log_likelihoods_atj(observed, predicted) %>% sum})

sl_loglh %<>% data.frame
sl_loglh %<>% cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)
outfile   <-  "output/sierra-lh.csv" 
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)
write.table(sl_loglh, outfile,  
            sep = ",", col.names = col.names, append = T, row.names = F, quote = F)

projection_50   <- sl_projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5)


## --- goodness of fit ---
error <- select(projection_50, -Date) - select(available_small, -Date)
sl_rms   <- summarise_all(error, funs(rms)) %>%
            cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)

outfile   <-  "output/sierra-rms.csv" 
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)
write.table(sl_rms, outfile,  
            sep = ",", col.names = col.names, append = T, row.names = F, quote = F)

Visualising the projections on a map

We can compare the spatial spread of the predicted case counts with the available for the period over which we are projecting. which we are projecting

sierra         <- rgdal::readOGR(dsn = "data/Geography/SLE_adm_shp",
                                 layer = "SLE_adm2")
sierra@data$id <- sierra@data$NAME_2
sierra.points  <- broom::tidy(sierra, region="id")
sierra.points$id %<>% factor
sierra.df      <- left_join(sierra.points, sierra@data, by="id")

sierra.df$NAME_2 %<>%
    toupper %<>%
    gsub(' ', '', .) %<>%
    factor


sierra.df$NAME_2 <- plyr::revalue(sierra.df$NAME_2,
                                  c("WESTERNURBAN" = "WESTERN",
                                    "WESTERNRURAL" = "WESTERN"))


start           <- min(sl_weekly_projections$Date)
end             <- max(sl_weekly_projections$Date)
available_small <- filter(weekly.available, Date > start & Date < end) %>%
                   select(-c(Category, Date)) %>%
                   colSums %>% data.frame 

available_small %<>% tibble::rownames_to_column()
colnames(available_small) <- c("NAME_2", "incid")

projection_50   <- sl_weekly_projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5) %>%
                   ungroup %>%
                   select(-Date) %>% colSums %>% data.frame 

projection_50 %<>% tibble::rownames_to_column()
colnames(projection_50) <- c("NAME_2", "incid")



total_incid    <- list(Observed = data.frame(available_small),
                       Predicted = data.frame(projection_50)) %>%
                   bind_rows(.id = "Category")


incid_map  <- left_join(sierra.df, total_incid)

incid_map$Category %<>% factor(levels = c("Observed", "Predicted"))
incid_map %<>% arrange(Category)


p <- ggplot(incid_map) + 
    aes(long, lat, group = group, fill = incid) +
    geom_polygon() +
    geom_path(color = "black", alpha = 0.1 ) +
    coord_equal() +
    facet_wrap(~Category, strip.position = "top") +
    scale_fill_gradient2(low = "grey50",
                         high = "firebrick")

p <- p + theme_bw()
p <- p + theme(strip.text.x = element_text(size = 12,
                                           face = "bold"))
p <- p + theme(panel.border = element_blank(),
               panel.grid.major = element_blank(),
               panel.grid.minor = element_blank(),
               axis.line = element_blank(),
               strip.background = element_blank())

p <- p +  theme(axis.title = element_blank(),
                axis.text = element_blank(),
                axis.ticks = element_blank())