In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

WHO_Guinea <- WHO_bydistricts %>%
               filter(Country == "Guinea") %>%
               droplevels %>%
               na.omit

ggplot(WHO_Guinea, aes(Date, incid)) +
    geom_point() +
    facet_wrap(~CL_DistrictRes, ncol = 3) +
    theme_minimal() +
    ggtitle("Incidence in districts of Guinea")

adm2_centroids <- "data/Geography/GravityModel/raw/adm2-fixed.txt" %>%
                   read.csv(stringsAsFactors = FALSE,
                            sep = "\t",
                            header = TRUE) %>%
                   filter(ADM0 == "Guinea")  


adm2_centroids$ADM2  %<>%
    iconv(to='ASCII//TRANSLIT')  %<>%
    gsub("\\'", '', .)  %<>%
    toupper %<>%
    factor

WHO_Guinea$CL_DistrictRes %<>%
    plyr::mapvalues(from = c("N'ZEREKORE", "KISSIDOUGO", "YOMOU" ),
                    to   = c("NZEREKORE", "KISSIDOUGOU", "YAMOU"))

Some of the prefectures of Guinea are not represented in the WHO data. These are: Gaoual, Koubia, Koundara, Labe, Lelouma, Mamou and Mandiana. Hence we will drop these from the data frame of centroids.

adm2_centroids %<>%
    filter(ADM2 %in% WHO_Guinea$CL_DistrictRes) %<>%
    droplevels


adm2_centroids$ADM2 %<>%
    factor(levels = levels(WHO_Guinea$CL_DistrictRes))

adm2_centroids %<>% arrange(ADM2)

flow.matrix  <- flow_matrix(longitude   = adm2_centroids[, "Centroid_Lon"],
                            latitude    = adm2_centroids[, "Centroid_Lat"],
                            population  = adm2_centroids[, "Pop"],
                            place.names = adm2_centroids[, "ADM2"],
                            model = "gravity", K = K,
                            pow_N_from = pow_N_from,
                            pow_N_to   = pow_N_to,
                            pow_dist   = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)
p.movement    <- probability_movement(relative.risk, p.stay)

We are now ready to estimate R and move forward with the projection exactly as we did for HealthMap data.

WHO_Guinea_wide <- WHO_Guinea %>%
                    ungroup %>%
                    select(-Country) %>%
                    tidyr::spread(CL_DistrictRes, incid, fill = 0)


start     <- 2:(length(WHO_Guinea_wide$Date) - time_window)
end       <- start + time_window
end.dates <- WHO_Guinea_wide[end, "Date"]
r.estim   <- WHO_Guinea_wide  %>%
                   select(-Date) %>%
    plyr::alply(2, .dims = TRUE, function(incid) {
                                   I   <- pull(incid, 1) 
                                   res <- EstimateR(I, T.Start = start , T.End = end,
                                                    method = "NonParametricSI",
                                                    SI.Distr = SI_Distr,
                                                    plot = FALSE ,
                                                    CV.Posterior = 1 ,
                                                    Mean.Prior = 1 ,
                                                    Std.Prior = 0.5)
                                   res$R %<>% cbind(Date = end.dates)
                                   return(res$R)})

r.j.t <- r.estim %>%
           lapply(function(R){
                    cutoff <- which(R$Date %in% WHO_Guinea_wide[t.proj, "Date"])
                    shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                    scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                    return(rgamma(n.sim, shape = shape,
                                         scale = scale))}) %>% data.frame

r.estim %>%
    bind_rows(.id = "County") %>%
    ggplot(aes(Date, `Mean(R)`)) +
    geom_point() +
    facet_wrap(~County) +
    ggtitle("Mean R for each county in Guinea")

At this point, all the pieces are in place. Guinea_training contains the incidence count we will use for projecting. Estimate of R is in the data frame r.j.t with a row for each simulation we want to run. p.movement conatins the probabilities.

Guinea_training   <- WHO_Guinea_wide[1:t.proj, ]
Guinea_validation <- WHO_Guinea_wide[(1 + t.proj):nrow(WHO_Guinea_wide), ]

incid     <- as.matrix(select(Guinea_training, -Date))
dates.all <- Guinea_training[1:t.proj, ] %>%
             pull(Date) %>%
             c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))
t.max     <- t.proj + n.dates.sim - 1


daily.projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim) 
                                    incidence.proj <- rbind(incid, out)
                                    incidence.proj %<>% data.frame
                                    incidence.proj %<>% cbind(Date = dates.all, .)
                                    colnames(incidence.proj) <- colnames(Guinea_training)
                                    return(incidence.proj[(nrow(Guinea_training) + 1):t.max, ])})


weekly.projections <- lapply(daily.projections, daily.to.weekly) %>% dplyr::bind_rows(.)

At the same time, we will also obtain the weekly incidence count from available data.

weekly.available <- c(training    = list(Guinea_training),
                       validation = list(Guinea_validation)) %>%
                       lapply(daily.to.weekly) %>%
                       dplyr::bind_rows(.id = "Category")

end <- max(dates.all) + 21
weekly.available_small <- filter(weekly.available, Date < end)

plots.list <- Guinea_training %>%
                 select(-Date) %>%
                 colnames %>%
                 lapply(function(location){
                         available  <- weekly.available_small %>%
                                       `[`(, c("Date", "Category", location))
                         projection <- weekly.projections %>%
                                       `[`(, c("Date", location))
                         plot.weekly(available, projection)})

cowplot::plot_grid(plotlist = plots.list)

Evaluating goodness of fit

weekly.available <- c(training    = list(Guinea_training),
                       validation = list(Guinea_validation)) %>%
                       lapply(daily.to.weekly) %>%
                       bind_rows(.id = "Category")

start           <- min(weekly.projections$Date)
end             <- max(weekly.projections$Date)
available_small <- filter(weekly.available, Date >= start & Date <= end) %>%
                   select(-Category)

projection_50   <- weekly.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5)


## --- goodness of fit ---
error        <- select(projection_50, -Date) - select(available_small, -Date)
guinea_rms   <- summarise_all(error, funs(rms)) %>%
                cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)

outfile   <-  "output/guinea-rms.csv" 
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)
write.table(guinea_rms, outfile,  
            sep = ",", col.names = col.names, append = T, row.names = F, quote = F)

Visualising the spread on the map of Guinea

guinea         <- rgdal::readOGR(dsn = "data/Geography/GIN_adm_shp",
                                 layer = "GIN_adm2")
guinea@data$id <- guinea@data$NAME_2
guinea.points  <- broom::tidy(guinea, region="id")
guinea.points$id %<>% factor
guinea.df      <- left_join(guinea.points, guinea@data, by="id")

guinea.df$NAME_2 %<>%
    iconv(to='ASCII//TRANSLIT')  %<>%
    gsub("\\'", '', .)  %<>%
    toupper %<>%
    factor

start           <- min(weekly.projections$Date)
end             <- max(dates.all) 

available_small <- filter(weekly.available, Date > start & Date < end) %>%
                   select(-c(Category, Date)) %>%
                   colSums %>% data.frame 

available_small %<>% tibble::rownames_to_column()
colnames(available_small) <- c("NAME_2", "incid")

projection_50   <- weekly.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5) %>%
                   ungroup %>%
                   select(-Date) %>% colSums %>% data.frame 

projection_50 %<>% tibble::rownames_to_column()
colnames(projection_50) <- c("NAME_2", "incid")


missing   <- setdiff(unique(guinea.df$NAME_2), unique(total_incid$NAME_2))
available_small %<>% rbind(data.frame(NAME_2 = missing, incid = rep(0, length(missing))))
projection_50 %<>% rbind(data.frame(NAME_2 = missing, incid = rep(0, length(missing))))

total_incid    <- list(training = data.frame(available_small),
                        projection = data.frame(projection_50)) %>%
                   bind_rows(.id = "Category")


incid_map  <- left_join(guinea.df, total_incid)

incid_map$Category %<>% factor(levels = c("training", "projection"))
incid_map %<>% arrange(Category)


 ggplot(incid_map) + 
    aes(long, lat, group = group, fill = incid) +
    geom_polygon() +
    geom_path(color = "white" ) +
    coord_equal() +
    facet_grid(.~Category) +
    scale_fill_gradient2(low = "grey50",
                         high = "firebrick")