knitr::opts_chunk$set(fig.width=12, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE, fig.path = "figures/")
library(magrittr) library(ggthemes) library(ggplot2) library(dplyr) library(EpiEstim) devtools::load_all() pow_dist <- params$pow_dist t.proj <- params$t.proj n.sim <- params$n.sim n.dates.sim <- params$n.dates.sim p.stay <- params$p.stay ADM0 <- params$ADM0
infile <- paste0(ADM0, "_wide.csv") infile <- here::here("data/CaseCounts/processed", infile) incid_wide <- readr::read_csv(infile)
Split the data into training and validation sets.
training <- incid_wide[1:t.proj, ] validation <- incid_wide[(t.proj + 1):(t.proj + n.dates.sim), ]
Culled from literature.
mean_SI <- 14.2 CV_SI <- 9.6 / 14.2 SItrunc <- 40 SI_Distr <- sapply(0:SItrunc, function(e) EpiEstim::DiscrSI(e, mean_SI, mean_SI * CV_SI)) SI_Distr <- SI_Distr / sum(SI_Distr) time_window <- 7 * 7
pow_N_to <- pow_N_from <- 1 K <- 1
For the predictions, we need to estimate R only in the training window.
start <- 2:(t.proj - time_window) end <- start + time_window end.dates <- training[end, "Date"] r.estim <- select(training, -Date) %>% plyr::alply(2, .dims = TRUE, function(incid) { I <- pull(incid, 1) res <- EpiEstim::EstimateR(I, T.Start = start , T.End = end, method = "NonParametricSI", SI.Distr = SI_Distr, plot = FALSE , CV.Posterior = 1 , Mean.Prior = 1 , Std.Prior = 0.5) res$R %<>% cbind(Date = end.dates) return(res$R)})
Write the reproduction numbers out for future analysis.
district_rs <- lapply(r.estim, function(df) filter(df, Date == training$Date[t.proj])) district_rs <- bind_rows(district_rs, .id = "Country") here::here("output", paste0("district_Rs_", t.proj, ".csv")) %>% readr::write_csv(x = district_rs, path = .)
We use the estimated R at t.proj to be the R for the window over which predictions are being carried out.
r.j.t <- r.estim %>% lapply(function(R){ cutoff <- which(R$Date %in% training[t.proj, "Date"]) shape <- R[cutoff, "Mean(R)"]^2 / R[cutoff, "Std(R)"]^2 scale <- R[cutoff, "Std(R)"]^2 / R[cutoff, "Mean(R)"] return(rgamma(n.sim, shape = shape, scale = scale))}) %>% data.frame(check.names = F)
Calculate the probability of movement between locations. There is some extra arranging of variables here to ensure that the columns in the incidence data and movement matrix are in the same order.
centroids <- here::here("data/Geography/GravityModel/raw", "adm0_centroids.tsv") %>% readr::read_tsv(., col_names = c("CL_DistrictRes", "id", "lon", "lat", "pop")) %>% filter(CL_DistrictRes %in% names(incid_wide)[-1])
infile <- here::here("data/Geography/GravityModel/processed", "all_centroids.csv") centroids <- readr::read_csv(infile) %>% filter(ADM0 == ADM0) districts <- data.frame(CL_DistrictRes = colnames(incid_wide)[-1]) centroids <- left_join(districts, centroids)
flow.matrix <- flow_matrix(longitude = centroids$lon, latitude = centroids$lat, population = centroids$pop, place_names = centroids$CL_DistrictRes, model = "gravity", K = K, pow_N_from = pow_N_from, pow_N_to = pow_N_to, pow_dist = pow_dist) ## Relative risk relative.risk <- flow.matrix / rowSums(flow.matrix, na.rm=TRUE) p.movement <- probability_movement(relative.risk, p.stay)
incid <- as.matrix(select(training, -Date)) dates.all <- pull(training, Date) %>% c(seq(max(.) + 1, length.out = n.dates.sim, by = 1)) t.max <- t.proj + n.dates.sim - 1 dates_forecast <- seq(from = max(training$Date) + 1, length.out = n.dates.sim, by = 1) daily_projections <- plyr::alply(r.j.t, 1, function(r.t){ r.t <- as.matrix(r.t) out <- project(incid, r.t, SI_Distr, p.movement, n.dates.sim) out <- data.frame(out) out$Date <- dates_forecast out }) weekly_projections <- lapply(daily_projections, daily.to.weekly) %>% bind_rows(.)
We have $N$ simulations for predictions. This means that for each district we have $N$ simulated numbers. To obtain the aggregated national counts, for each date, draw a random sample of size $k$ from the $N$ simulated numbers for each district. Shuffle each sample and add.
num_samples <- 200 districts <- select(incid_wide, -Date) %>% colnames national <- purrr::map_dfr(districts, function(x){ out <- sample_projections(daily_projections, x, num_samples) out$date <- dates_forecast out }) aggregated <- split(national, national$date) %>% purrr::map(function(x) select(x, starts_with("sim")) %>% shuffle_cols() %>% colSums) aggregated <- purrr::map_dfr(aggregated, function(x) data.frame(matrix(x, nrow = 1)), .id = "Date") aggregated$Date %<>% as.Date national_weekly <- daily.to.weekly(aggregated) national_distr <- tidyr::gather(national_weekly, sim, val, -Date) %>% projection_quantiles
And write them out.
paste0(ADM0, "_aggregated_projections_weekly_", p.stay, "_", pow_dist, "_", t.proj, ".csv") %>% here::here("output", .) %>% readr::write_csv(x = national_distr, path = .)
incid_country <- select(incid_wide, -Date) %>% rowSums incid_weekly <- data.frame(Date = incid_wide$Date, incid = incid_country) %>% daily.to.weekly date_min <- "2014-08-01" small <- filter(incid_weekly, Date > date_min) p <- ggplot(small, aes(as.Date(Date), incid)) + geom_point() p <- p + geom_point(data = national_distr, aes(Date, y), col = "red") p <- p + geom_ribbon(data = national_distr, aes(x = Date, ymin = ymin, ymax = ymax, group = 1), inherit.aes = FALSE, alpha = 0.3) p <- p + theme_tufte() p <- p + xlab("Date") + ylab("Weekly Incidence")
projections_distr <- projection_quantiles(weekly_projections) outfile <- paste0(ADM0, "_projections_weekly_", p.stay, "_", pow_dist, "_", t.proj, ".csv") outfile <- here::here("output", outfile) readr::write_csv(projections_distr, path = outfile) daily_projections_distr <- bind_rows(daily_projections) %>% projection_quantiles outfile <- paste0(ADM0, "_projections_daily_", p.stay, "_", pow_dist, "_", t.proj, ".csv") outfile <- here::here("output", outfile) readr::write_csv(daily_projections_distr, path = outfile)
The plot consists of the following elements: the training and validation sets, distribution of the predictions (quantiles) and some data beyond the last date for which prediction has been carried out so that we can see the trend.
trng_weekly <- daily.to.weekly(training) vldtn_weekly <- daily.to.weekly(validation) tmp <- list(Training = trng_weekly, Validation = vldtn_weekly) %>% bind_rows(.id = "Category") %>% tidyr::gather(Country, Incidence, -c(Date, Category)) tmp$Date %<>% as.Date
date_min <- "2014-08-01" date_max <- max(projections_distr$Date) + 45 full_df <- filter(incid_wide, Date >= date_min & Date <= date_max) %>% daily.to.weekly %>% tidyr::gather(Country, Incidence, -Date) full_df$Date %<>% as.Date p <- ggplot(full_df, aes(Date, Incidence)) + geom_point(size = 1.2, stroke = 0, shape = 16, color = "gray") + facet_wrap(~Country, scales = "free_y") p <- p + theme_tufte() + geom_rangeframe() p <- p + geom_point(data = tmp, aes(Date, Incidence, color = Category, group = 1), size = 1.1, stroke = 0, shape = 16) ## plot the median projections and the 95% CI p <- p + geom_line(data = projections_distr, aes(x = Date, y = y, group = 1), color = 'black', size = 0.5) p <- p + geom_ribbon(data = projections_distr, aes(x = Date, ymin = ymin, ymax = ymax, group = 1), inherit.aes = FALSE, alpha = 0.3) p <- p + xlab("") + theme(axis.text.x = element_text(angle = 45)) + theme(legend.position="none") p <- p + scale_x_date(date_labels = "%m-%y") outfile <- paste0(ADM0, "_projections_", p.stay, "_", pow_dist, "_", t.proj, ".png") outfile <- here::here("output", outfile) ggsave(outfile, p)
tmp2 <- dplyr::left_join(projections_distr, tmp, by = c("Date", "Country")) #num_districts <- length(unique(tmp2$Country)) #goodness <- filter(tmp2, ymin < Incidence & Incidence < ymax) %>% # group_by(Date) %>% summarise(n()/num_districts) here::here("output", paste0(ADM0, "_goodness_fit_", t.proj, ".csv")) %>% readr::write_csv(x = tmp2, path = .)
For $k$ spatial units,
[ R^2 = 1 - \frac{SS_{res}}{SS_{tot}},]
where
[ SS_{res} = \sum_{i = 1}^n{y_i - \hat{y_i}},]
and [ SS_{tot} = \sum_{k}{\sum_{i = 1}^n{y_{i, k} - \bar{y_k}}}.]
ss_res <- (tmp2$Incidence - tmp2$y)^2 %>% sum group_means <- group_by(tmp2, Country) %>% summarise_at("Incidence", mean) %>% rename(mean = Incidence) tmp2 <- left_join(tmp2, group_means, by = "Country") ss_tot <- (tmp2$Incidence - tmp2$mean)^2 %>% sum r2 <- 1 - (ss_res / ss_tot)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.