In annecori/mRIIDSprocessData: Epidemiological forcasting using real-time data

The key thing is to get the latitudes, longitudes and population estimates for all the districts and determine population flow matrix.

guinea_sierra <- "data/Geography/GravityModel/raw/adm2-fixed.txt" %>%
                     read.csv(stringsAsFactors = FALSE,
                              sep = "\t",
                              header = TRUE) %>%
                    filter(ADM0 %in% c("Guinea", "Sierra Leone"))  

liberia <- "data/Geography/GravityModel/raw/adm1_centroids_fixed.tsv" %>%
                   read.csv(stringsAsFactors = FALSE,
                            sep = "\t",
                            header = TRUE) %>%
                   filter(ADM0 == "Liberia")  

guinea_sierra %<>%
    select(ADM2, Centroid_Lon, Centroid_Lat, Pop) %<>%
    rename(CL_DistrictRes = ADM2)

liberia %<>%
    select(ADM1, Centroid_Lon, Centroid_Lat, Pop) %<>%
    rename(CL_DistrictRes = ADM1)


all_centroids <- bind_rows(guinea_sierra, liberia) 


all_centroids$CL_DistrictRes     %<>%
    iconv(to='ASCII//TRANSLIT')  %<>%
    gsub("\\'", '', .)  %<>%
    gsub(' ', '', .) %<>%
    toupper %<>%
    factor


all_centroids$CL_DistrictRes %<>% gsub("WESTERNURBAN", "WESTERN", .)

Some of the prefectures of Guinea are not represented in the WHO data. These are: Gaoual, Koubia, Koundara, Labe, Lelouma, Mamou and Mandiana. Hence we will drop these from the data frame of centroids.

missing <- setdiff(unique(all_centroids$CL_DistrictRes),
                   unique(WHO_bydistricts$CL_DistrictRes))

notmissing    <- !(all_centroids$CL_DistrictRes %in% missing)
all_centroids <- all_centroids[notmissing, ]


all_centroids$CL_DistrictRes %<>% factor

After this data clean-up we can finally determine the flow matrix.

all_centroids <- read.csv("data/Geography/GravityModel/processed/WHOcentroids.csv")

WHO_bydistricts$CL_DistrictRes %<>% factor
WHO_bydistricts$CL_DistrictRes %<>%
    plyr::mapvalues(from = c("KISSIDOUGO",  "YOMOU",  "N'ZEREKORE", "GBARPOLU" ),
                    to   = c("KISSIDOUGOU", "YAMOU",  "NZEREKORE", "GBAPOLU"))

all_centroids$CL_DistrictRes %<>%
    factor(levels = levels(WHO_bydistricts$CL_DistrictRes))

all_centroids %<>% arrange(CL_DistrictRes)
flow.matrix  <- flow_matrix(longitude   = all_centroids[, "Centroid_Lon"],
                            latitude    = all_centroids[, "Centroid_Lat"],
                            population  = all_centroids[, "Pop"],
                            place.names = all_centroids[, "CL_DistrictRes"],
                            model = "gravity", K = K,
                            pow_N_from = pow_N_from,
                            pow_N_to   = pow_N_to,
                            pow_dist   = pow_dist)


## Relative risk
relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE)
p.movement    <- probability_movement(relative.risk, p.stay)

To estimate R, we convert from long format to the wide format.

infile <- here::here("data",
                      "CaseCounts/processed/WHO_wide.csv")
WHO_wide <- readr::read_csv(infile)


start     <- 2:(length(WHO_wide$Date) - time_window)
end       <- start + time_window
end.dates <- WHO_wide[end, "Date"]
r.estim   <- WHO_wide  %>%
                   select(-Date) %>%
    plyr::alply(2, .dims = TRUE, function(incid) {
                                   I   <- pull(incid, 1) 
                                   res <- EstimateR(I, T.Start = start , T.End = end,
                                                    method = "NonParametricSI",
                                                    SI.Distr = SI_Distr,
                                                    plot = FALSE ,
                                                    CV.Posterior = 1 ,
                                                    Mean.Prior = 1 ,
                                                    Std.Prior = 0.5)
                                   res$R %<>% cbind(Date = end.dates)
                                   return(res$R)})

r.j.t <- r.estim %>%
           lapply(function(R){
                    cutoff <- which(R$Date %in% WHO_wide[t.proj, "Date"])
                    shape  <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2
                    scale  <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"]
                    return(rgamma(n.sim, shape = shape,
                                         scale = scale))}) %>% data.frame

r.estim %>%
    bind_rows(.id = "District") %>%
    ggplot(aes(Date, `Mean(R)`)) +
    geom_point() +
    facet_wrap(~District) +
    ggtitle("Mean R for each district in Sierra Leone, Liberia and Guinea")

At this point, all the pieces are in place. alldistricts_training contains the incidence count we will use for projecting. Estimate of R is in the data frame r.j.t with a row for each simulation we want to run. p.movement conatins the probabilities.

alldistricts_training   <- WHO_wide[1:t.proj, ]
alldistricts_validation <- WHO_wide[(1 + t.proj):nrow(WHO_wide), ]

incid     <- as.matrix(select(alldistricts_training, -Date))
dates.all <- alldistricts_training[1:t.proj, ] %>%
             pull(Date) %>%
             c(seq(max(.) + 1, length.out = n.dates.sim, by = 1))
t.max     <- t.proj + n.dates.sim - 1


daily.projections <- plyr::alply(r.j.t, 1, function(r.t){
                                    r.t   <- as.matrix(r.t)
                                    out   <- project(incid, r.t, SI_Distr,
                                                     p.movement, n.dates.sim) 
                                    incidence.proj <- rbind(incid, out)
                                    incidence.proj %<>% data.frame
                                    incidence.proj %<>% cbind(Date = dates.all, .)
                                    colnames(incidence.proj) <- colnames(alldistricts_training)
                                    return(incidence.proj[(nrow(alldistricts_training) + 1):t.max, ])})

At the same time, we will also obtain the weekly incidence count from available data.

weekly.projections <- lapply(daily.projections, daily.to.weekly) %>% dplyr::bind_rows(.)

weekly.available <- c(training    = list(alldistricts_training),
                       validation = list(alldistricts_validation)) %>%
                       lapply(daily.to.weekly) %>%
                       dplyr::bind_rows(.id = "Category")

projections_distr <- projection_quantiles(weekly.projections)
outfile <- paste0("output/alldistricts_summary_projections_", p.stay, "_", pow_dist, "_", t.proj, ".csv")
write.csv(projections_distr, file = outfile, row.names = F, quote = F)

end <- max(dates.all) + 49
weekly.available_small <- filter(weekly.available, Date < end)

plots.list <- alldistricts_training %>%
                 select(-Date) %>%
                 colnames %>%
                 lapply(function(location){
                         available  <- weekly.available_small %>%
                                       `[`(, c("Date", "Category", location))
                         projection <- weekly.projections %>%
                                       `[`(, c("Date", location))
                         plot.weekly(available, projection)})

cowplot::plot_grid(plotlist = plots.list)

Evaluating goodness of fit

daily.projections %<>% bind_rows(.)
start           <- min(daily.projections$Date)
end             <- max(daily.projections$Date)
available_small <- filter(WHO_wide, Date >= start & Date <= end)

We now determine the likelihood of the parameters using the observed data.

incid  <- as.matrix(select(WHO_wide, -Date))
t.max  <- t.proj + n.dates.sim - 1
R      <- r.j.t[1, ]
all_loglh <- log_likelihood_1step(incid,
                                  window = t.proj,
                                  start = t.proj + 1,
                                  end = t.max,
                                  pij = p.movement,
                                  R = as.matrix(R),
                                  si = SI_Distr) %>% sum

districts <- select(daily.projections, -Date) %>% colnames
names(districts) <- districts
all_loglh   <- lapply(districts, function(district){
                                      observed  <- available_small[, c("Date", district)]
                                      predicted <- daily.projections[, c("Date", district)]

                                      colnames(observed)  <- c("Date", "incid")
                                      colnames(predicted) <- c("Date", "incid")

                                      log_likelihoods_atj(observed, predicted) %>% sum})

all_loglh %<>% data.frame
all_loglh %<>% cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)

outfile   <-  paste0("output/alldistricts-lh-tproj-", t.proj, ".csv")
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)

write.table(all_loglh, outfile,  
            sep = ",", col.names = col.names, append = T, row.names = F, quote = F)

projection_50   <- daily.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5)

error     <- select(projection_50, -Date) - select(available_small, -Date)
all_rms   <- summarise_all(error, funs(rms)) %>%
                cbind(data.frame(pow_dist = pow_dist, p.stay = p.stay), .)

outfile   <-  paste0("output/alldistricts-rms-", t.proj, ".csv")
col.names <- ifelse(file.exists(outfile), FALSE, TRUE)
write.table(all_rms, outfile,
            sep = ",", col.names = col.names, append = T, row.names = FALSE, quote = FALSE)

Visualising the spread on the map of West Africa

liberia         <- rgdal::readOGR(dsn = "data/Geography/LBR_adm_shp",
                                 layer = "LBR_adm2")
liberia@data$id <- liberia@data$NAME_1
liberia.points  <- broom::tidy(liberia, region="id")
liberia.points$id %<>% factor
liberia.df      <- left_join(liberia.points, liberia@data, by="id")
liberia.df$CL_DistrictRes <- liberia.df$NAME_1


sierra         <- rgdal::readOGR(dsn = "data/Geography/SLE_adm_shp",
                                 layer = "SLE_adm2")
sierra@data$id <- sierra@data$NAME_2
sierra.points  <- broom::tidy(sierra, region="id")
sierra.points$id %<>% factor
sierra.df      <- left_join(sierra.points, sierra@data, by="id")
sierra.df$NAME_2 <- plyr::revalue(sierra.df$NAME_2,
                                  c("Western Rural" = "WESTERN",
                                    "Western Urban" = "WESTERN"))

sierra.df$CL_DistrictRes <- sierra.df$NAME_2

guinea         <- rgdal::readOGR(dsn = "data/Geography/GIN_adm_shp",
                                 layer = "GIN_adm2")
guinea@data$id <- guinea@data$NAME_2
guinea.points  <- broom::tidy(guinea, region="id")
guinea.points$id %<>% factor
guinea.df      <- left_join(guinea.points, guinea@data, by="id")
guinea.df$CL_DistrictRes <- guinea.df$NAME_2

wafrica <- rbind(guinea.df, liberia.df) %>% rbind(sierra.df)
wafrica$CL_DistrictRes %<>%
    iconv(to='ASCII//TRANSLIT')  %<>%
    gsub("\\'", '', .)  %<>%
    gsub(' ', '', .) %<>%
    toupper %<>%
    factor



start           <- min(weekly.projections$Date)
end             <- max(dates.all) 

available_small <- filter(weekly.available, Date > start & Date < end) %>%
                   select(-c(Category, Date)) %>%
                   colSums %>% data.frame 

available_small %<>% tibble::rownames_to_column()
colnames(available_small) <- c("CL_DistrictRes", "incid")

projection_50   <- weekly.projections %>%
                   group_by(Date) %>%
                   summarise_all(quantile, probs = 0.5) %>%
                   ungroup %>%
                   select(-Date) %>% colSums %>% data.frame 

projection_50 %<>% tibble::rownames_to_column()
colnames(projection_50) <- c("CL_DistrictRes", "incid")




missing   <- setdiff(unique(wafrica$CL_DistrictRes),
                     unique(available_small$CL_DistrictRes))

available_small %<>% rbind(data.frame(CL_DistrictRes = missing,
                                      incid = rep(0, length(missing))))
projection_50 %<>% rbind(data.frame(CL_DistrictRes = missing,
                                    incid = rep(0, length(missing))))

total_incid    <- list(training = data.frame(available_small),
                        projection = data.frame(projection_50)) %>%
                   bind_rows(.id = "Category")

incid_map  <- left_join(wafrica, total_incid)


incid_map$Category %<>% factor(levels = c("training", "projection"))
incid_map %<>% arrange(Category)


 ggplot(incid_map) + 
    aes(long, lat, group = group, fill = incid) +
    geom_polygon() +
    geom_path(color = "white" ) +
    coord_equal() +
    facet_grid(.~Category) +
    scale_fill_gradient2(low = "grey50",
                         high = "firebrick")