In annecori/mRIIDSprocessData: Epidemiological forecasting using real-time data

# Global chunk options: 12 x 6 figures written under "figures/", with
# code, warnings and messages hidden in the rendered document.
knitr::opts_chunk$set(
  fig.width = 12,
  fig.height = 6,
  echo = FALSE,
  warning = FALSE,
  message = FALSE,
  fig.path = "figures/"
)

library(magrittr)
library(ggplot2)
library(ggthemes)
library(dplyr)
devtools::load_all()

Introduction

The pre-processing steps consist of turning the line count into a daily incidence series and turning the given tall data into wide data.

The current data set has three possible values for the epidemiological case definition: confirmed, probable and suspected. The definitions of probable and suspected cases differed across countries, while confirmed cases were those confirmed through a lab report. For the purpose of the current analysis, one approach could be to lump all of them together. The inferred date of onset (rather than the date reported) is used for estimation. The columns we use are: Country, EpiCaseDef (probably), DateOnsetInferred and CL_DistrictRes.

# Read the WHO case records, keep only the columns used downstream and
# drop every record that has a missing value in any of them.
WHO_raw <- here::here("data", "CaseCounts/who/rstb20160308supp1.csv") %>%
  read.csv(colClasses = c(Country = "factor",
                          EpiCaseDef = "character",
                          DateOnsetInferred = "Date",
                          CL_DistrictRes = "factor")) %>%
  select(Country, EpiCaseDef, DateOnsetInferred, CL_DistrictRes) %>%
  na.omit()

# Remove spaces from the district names and rebuild the factor so that its
# levels reflect the cleaned names. A single explicit assignment is used
# instead of chaining several `%<>%` operators, where it is unclear which
# operator performs the assignment.
WHO_raw$CL_DistrictRes <- factor(gsub(" ", "", WHO_raw$CL_DistrictRes))

Some spelling variations between the WHO data and the data set containing the district co-ordinates need to be taken into account.

# Harmonise district spellings: each name in `from` is replaced by the
# name at the same position in `to`.
WHO_raw$CL_DistrictRes <- plyr::mapvalues(
  factor(WHO_raw$CL_DistrictRes),
  from = c("KISSIDOUGO", "YOMOU", "N'ZEREKORE", "GBARPOLU"),
  to = c("KISSIDOUGOU", "YAMOU", "NZEREKORE", "GBAPOLU")
)

Grouping by districts and interpolating missing data

For each district in a country, count the number of records for each date to get the incidence count.

# One row per (country, district, onset date); `incid` is the number of
# case records falling in that combination.
WHO_bydistricts <- summarise(
  group_by(WHO_raw, Country, CL_DistrictRes, DateOnsetInferred),
  incid = n()
)

Within each district, if there is a date on which no cases are recorded, we assume that there were no cases on that date and add this to the record. At the end of this step, the incidence time series for each district should be a daily time series.

# Split the counts by district, apply `add_0incid` to each piece and stack
# the results back into one data frame.
# NOTE(review): `add_0incid` is defined elsewhere in the package; per the
# surrounding text it is expected to insert zero-incidence rows for dates
# with no recorded cases -- confirm against its definition.
# The assignment is written once, with ordinary `%>%` pipes inside, rather
# than as a chain of `%<>%` operators.
WHO_bydistricts <- WHO_bydistricts %>%
  split(.$CL_DistrictRes) %>%
  lapply(add_0incid) %>%
  bind_rows()


# Use the shorter column name `Date` from here on.
WHO_bydistricts <- rename(WHO_bydistricts, Date = DateOnsetInferred)

# Country-level incidence: sum the district counts for each country and date.
WHO_bycountry <- summarise(
  group_by(WHO_bydistricts, Country, Date),
  incid = sum(incid)
)

# Daily incidence, one panel per district.
# theme_tufte() is a complete theme, so it must be added BEFORE the
# theme() tweak: a complete theme added afterwards replaces every earlier
# theme setting and would discard the rotated x-axis labels.
ggplot(WHO_bydistricts, aes(Date, incid)) +
  geom_point() +
  geom_rangeframe() +
  facet_wrap(~CL_DistrictRes) +
  ggtitle("Incidence in each district") +
  xlab("") +
  theme_tufte() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.2))

# Daily incidence, one panel per country.
# Only theme_tufte() is kept as the base theme: the earlier theme_minimal()
# was entirely overridden by it. The axis-text tweak comes last so that the
# complete theme does not discard it.
ggplot(WHO_bycountry, aes(Date, incid)) +
  geom_point() +
  geom_rangeframe() +
  facet_wrap(~Country) +
  ggtitle("Incidence in each country") +
  xlab("") +
  theme_tufte() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.2))

Write the files.

# District-level incidence in tall format.
outfile <- here::here(
  "data",
  "CaseCounts/processed/WHO_bydistricts.csv"
)
readr::write_csv(WHO_bydistricts, outfile)

# Country-level incidence in tall format.
outfile <- here::here(
  "data",
  "CaseCounts/processed/WHO_bycountry.csv"
)
readr::write_csv(WHO_bycountry, outfile)

Now we will convert the data from the tall to the wide format.

# One data frame per country, used for the per-country plots and exports.
WHO_bydistricts_split <- WHO_bydistricts %>%
  split(.$Country)

# Save one faceted incidence plot per country as "figures/<country>-incid.png".
# The loop is run for its side effect only, so the result is wrapped in
# invisible() to keep the list of returned file paths out of the rendered
# document.
invisible(lapply(WHO_bydistricts_split, function(df) {
  country <- as.character(df$Country[1])
  outfile <- paste0("figures/", country, "-incid.png")
  p <- ggplot(df, aes(Date, incid)) +
    geom_point(size = 1.1, stroke = 0, shape = 16) +
    geom_rangeframe() +
    facet_wrap(~CL_DistrictRes) +
    ggtitle(paste("Incidence in the districts of", country)) +
    theme_tufte()
  ggsave(outfile, p)
}))

# For each country, reshape the tall district series into wide format (one
# row per date, one column per district, 0 where a district has no entry)
# and write it to "<COUNTRY>_wide.csv", where <COUNTRY> is the upper-cased
# country name with spaces removed.
# The file name is passed to here::here() as a path component instead of
# being pasted onto a directory string, so the result does not depend on a
# trailing "/" surviving path construction. The loop is run for its side
# effect only, hence invisible().
invisible(lapply(WHO_bydistricts_split, function(df) {
  df <- na.omit(df)
  country <- toupper(gsub(" ", "", df$Country[1]))
  outfile <- here::here("data", "CaseCounts/processed",
                        paste0(country, "_wide.csv"))
  df_wide <- df %>%
    ungroup() %>%
    select(-Country) %>%
    tidyr::spread(CL_DistrictRes, incid, fill = 0)
  readr::write_csv(df_wide, outfile)
}))

Also write out the data for all districts for later analysis.

# Wide table covering every district: one row per date, one column per
# district, with 0 filled in where a district has no entry for a date.
WHO_wide <- tidyr::spread(
  select(ungroup(WHO_bydistricts), -Country),
  CL_DistrictRes,
  incid,
  fill = 0
)

outfile <- here::here("data", "CaseCounts/processed/WHO_wide.csv")
readr::write_csv(WHO_wide, outfile)