In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

WHO_Liberia <- WHO_bydistricts %>%
               filter(Country == "Liberia") %>%
               droplevels %>%
               na.omit

ggplot(WHO_Liberia, aes(Date, incid)) +
    geom_point() +
    facet_wrap(~CL_DistrictRes, ncol = 3) +
    theme_minimal() +
    ggtitle("Incidence in districts of Liberia")

Clean up the names of the districts because they will become column names further down. Remove all spaces, make them upper case.

WHO_Liberia$CL_DistrictRes %<>%
    gsub(' ', '', .)

WHO_Liberia$CL_DistrictRes %<>% factor

WHO_Liberia_wide <- WHO_Liberia %>%
                    ungroup %>%
                    select(-Country) %>%
                    tidyr::spread(CL_DistrictRes, incid, fill = 0)

We are now ready to estimate R and move forward with the projection exactly as we did for HealthMap data.

start     <- 2:(length(WHO_Liberia_wide$Date) - time_window)
end       <- start + time_window
end.dates <- WHO_Liberia_wide[end, "Date"]
r.estim   <- WHO_Liberia_wide  %>%
                   select(-Date) %>%
    plyr::alply(2, .dims = TRUE, function(incid) {
                                   I   <- pull(incid, 1) 
                                   res <- EstimateR(I, T.Start = start , T.End = end,
                                                    method = "NonParametricSI",
                                                    SI.Distr = SI_Distr,
                                                    plot = FALSE ,
                                                    CV.Posterior = 1 ,
                                                    Mean.Prior = 1 ,
                                                    Std.Prior = 0.5)
                                   res$R %<>% cbind(Date = end.dates)
                                   return(res$R)})

r.j.t <- r.estim %>%
           lapply(function(R){
                    cutoff <- which(R$Date %in% WHO_Liberia_wide[t.proj, "Date"])
                    shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                    scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                    return(rgamma(n.sim, shape = shape,
                                         scale = scale))}) %>% data.frame

r.estim %>%
    bind_rows(.id = "County") %>%
    ggplot(aes(Date, `Mean(R)`)) +
    geom_point() +
    facet_wrap(~County) +
    ggtitle("Mean R for each county in Liberia")

Calculate the population flow matrix and hence the relative risk profile.

adm1_centroids <- "data/Geography/GravityModel/raw/adm1_centroids_fixed.tsv" %>%
                   read.csv(stringsAsFactors = FALSE,
                            sep = "\t",
                            header = TRUE) %>%
                   filter(ADM0 == "Liberia")  


adm1_centroids$ADM1  %<>%
    gsub(' ', '', .) %<>%
    toupper %<>%
    factor


adm1_centroids$ADM1 %<>%
    factor(levels = levels(WHO_Liberia$CL_DistrictRes))

adm1_centroids %<>% arrange(ADM1)
flow.matrix  <- flow_matrix(longitude   = adm1_centroids[, "Centroid_Lon"],
                            latitude    = adm1_centroids[, "Centroid_Lat"],
                            population  = adm1_centroids[, "Pop"],
                            place.names = adm1_centroids[, "ADM1"],
                            model = "gravity", K = K,
                            pow_N_from = pow_N_from,
                            pow_N_to   = pow_N_to,
                            pow_dist   = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)

p.movement    <- probability_movement(relative.risk, p.stay)

At this point, all the pieces are in place. Liberia_training contains the incidence count we will use for projecting. Estimate of R is in the data frame r.j.t with a row for each simulation we want to run. p.movement conatins the probabilities.

Liberia_training   <- WHO_Liberia_wide[1:t.proj, ]
Liberia_validation <- WHO_Liberia_wide[(1 + t.proj):nrow(WHO_Liberia_wide), ]

incid     <- as.matrix(select(Liberia_training, -Date))
dates.all <- Liberia_training[1:t.proj, ] %>%
             pull(Date) %>%
             c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))
t.max     <- t.proj + n.dates.sim - 1


daily.projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim) 
                                    incidence.proj <- rbind(incid, out)
                                    incidence.proj %<>% data.frame
                                    incidence.proj %<>% cbind(Date = dates.all, .)
                                    colnames(incidence.proj) <- colnames(Liberia_training)
                                    return(incidence.proj[(nrow(Liberia_training) + 1):t.max, ])})


weekly.projections <- lapply(daily.projections, daily.to.weekly) %>% dplyr::bind_rows(.)

At the same time, we will also obtain the weekly incidence count from available data.

weekly.available <- c(training    = list(Liberia_training),
                       validation = list(Liberia_validation)) %>%
                       lapply(daily.to.weekly) %>%
                       dplyr::bind_rows(.id = "Category")

weekly.available_small <- filter(weekly.available, Date < "2014-09-13")

plots.list <- Liberia_training %>%
                 select(-Date) %>%
                 colnames %>%
                 lapply(function(location){
                         available  <- weekly.available_small %>%
                                       `[`(, c("Date", "Category", location))
                         projection <- weekly.projections %>%
                                       `[`(, c("Date", location))
                         plot.weekly(available, projection)})

cowplot::plot_grid(plotlist = plots.list)

Evaluating goodness of fit

weekly.available <- c(training    = list(Liberia_training),
                       validation = list(Liberia_validation)) %>%
                       lapply(daily.to.weekly) %>%
                       bind_rows(.id = "Category")

start           <- min(weekly.projections$Date)
end             <- max(weekly.projections$Date)
available_small <- filter(weekly.available, Date >= start & Date <= end) %>%
                   select(-Category)

projection_50   <- weekly.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5)


## --- goodness of fit ---
error <- select(projection_50, -Date) - select(available_small, -Date)
liberia_rms   <- summarise_all(error, funs(rms)) %>%
         cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)

outfile   <-  "output/liberia-rms.csv" 
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)
write.table(liberia_rms, outfile,  
            sep = ",", col.names = col.names, append = T, row.names = F, quote = F)

Visualising the projections on a map

We will only plot the incidence data from 1 week before we started projection to the end of the projection period.

liberia         <- rgdal::readOGR(dsn = "data/Geography/LBR_adm_shp",
                                 layer = "LBR_adm1")
liberia@data$id <- liberia@data$NAME_1
liberia.points  <- broom::tidy(liberia, region="id")
liberia.points$id %<>% factor
liberia.df      <- left_join(liberia.points, liberia@data, by="id")

liberia.df$NAME_1 %<>%
    toupper %<>%
    gsub(' ', '', .) %<>%
    factor


start           <- min(weekly.projections$Date)
end             <- max(weekly.projections$Date)
available_small <- filter(weekly.available, Date > start & Date < end) %>%
                   select(-c(Category, Date)) %>%
                   colSums %>% data.frame

available_small %<>% tibble::rownames_to_column()
colnames(available_small) <- c("NAME_1", "incid")

projection_50   <- weekly.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5) %>%
                   ungroup %>%
                   select(-Date) %>% colSums %>% data.frame

projection_50 %<>% tibble::rownames_to_column()
colnames(projection_50) <- c("NAME_1", "incid")



total_incid    <- list(training = data.frame(available_small),
                        projection = data.frame(projection_50)) %>%
                   bind_rows(.id = "Category")

total_incid$NAME_1 %<>% gsub("GBARPOLU", "GBAPOLU", .)
incid_map  <- left_join(liberia.df, total_incid)

incid_map$Category %<>% factor(levels = c("training", "projection"))
incid_map %<>% arrange(Category)


 ggplot(incid_map) +
    aes(long, lat, group = group, fill = incid) +
    geom_polygon() +
    geom_path(color = "white" ) +
    coord_equal() +
    facet_grid(.~Category) +
    scale_fill_gradient2(low = "grey50",
                         high = "firebrick")