library(dplyr) library(magrittr) library(stringr) library(ggplot2) library(ggthemes)
The two different data sources are Patient Database and Situation Report. According to the website, Patient Database ``provides information on the symptoms, diagnosis and outcomes of treatment for each suspected, probable or confirmed Ebola patient; typically gives the fullest representation of the epidemic, but is often incomplete for the most recent days or weeks.''
The Situation Report ``contains daily district information on number of suspected, probable or confirmed Ebola patients; lacks many of the details found in the patient database, but provides the best data on the status of the epidemic now and for recent days.''
# Load the processed WHO website case counts for Sierra Leone from the
# project's data directory.
sl_website <- read.csv(
  here::here(
    "data/CaseCounts/processed",
    "processed_who_sl_2015-08-19.csv"
  )
)

# Keep the week label as a factor.
sl_website <- transform(sl_website, week_of_year = factor(week_of_year))
Next, read in the WHO linelist data from Tini's paper.
# Read the WHO line list, forcing the column types needed below
# (onset dates are parsed as Date by read.csv).
sl_linelist <- read.csv(
  here::here("data/CaseCounts/who", "rstb20160308supp1.csv"),
  colClasses = c(
    Country = "factor",
    EpiCaseDef = "character",
    DateOnsetInferred = "Date"
  )
)

# Keep only the columns used in this analysis (the district of residence is
# renamed to `location`), restrict to Sierra Leone records with a known
# inferred onset date, and order them chronologically.
sl_linelist <- sl_linelist %>%
  select(
    Country,
    EpiCaseDef,
    DateOnsetInferred,
    location = CL_DistrictRes
  ) %>%
  filter(Country == "Sierra Leone", !is.na(DateOnsetInferred)) %>%
  arrange(DateOnsetInferred)

# Label every record with a "YYYY-Www" week string.
# NOTE(review): lubridate::week() counts 7-day periods from 1 January; it is
# not the ISO 8601 week (lubridate::isoweek()). Confirm this matches how
# `week_of_year` was built in the website data.
sl_weeks <- stringr::str_pad(
  string = lubridate::week(sl_linelist$DateOnsetInferred),
  width = 2,
  pad = "0"
)
sl_linelist$week_of_year <- with(
  sl_linelist,
  paste0(lubridate::year(DateOnsetInferred), "-W", sl_weeks)
)
Are the dates/weeks missing from the above linelist the same as the dates for which data from website contains blanks?
# Every calendar day between the first and last inferred onset date.
onset_range <- range(sl_linelist$DateOnsetInferred, na.rm = TRUE)
all_dates <- seq(from = onset_range[1], to = onset_range[2], by = "1 day")

# Days on which the linelist records no onset at all.
has_onset <- all_dates %in% sl_linelist$DateOnsetInferred
dates_na_linelist <- all_dates[!has_onset]

# Week labels ("YYYY-Www") for those days; a week appears here as soon as
# any one of its days has no recorded onset.
weeks_padded <- stringr::str_pad(
  string = lubridate::week(dates_na_linelist),
  width = 2,
  pad = "0"
)
weeks_na_linelist <- paste0(
  lubridate::year(dates_na_linelist), "-W", weeks_padded
)
And for data from the website.
# Rows of the website data with a blank (NA) case count, and the distinct
# weeks those rows belong to.
idx <- which(is.na(sl_website$new_cases))
weeks_na_website <- unique(sl_website$week_of_year[idx])
What weeks are NA for the linelist but not in website data?
# Weeks flagged as missing in the linelist that are not blank on the website.
setdiff(x = weeks_na_linelist, y = weeks_na_website)
Weeks from 2013. We won't worry about this. What weeks are NA for the website data but not in linelist?
# Weeks that are blank on the website but not flagged as missing in the
# linelist.
setdiff(x = weeks_na_website, y = weeks_na_linelist)
Dig a little deeper. For each district, for each case type, where we have a blank in the website, what info do we have in the linelist?
# Split the blank (NA) website rows by district and case type.
# `drop = TRUE` discards district/case combinations that have no blank rows;
# without it the empty pieces would yield a zero-length district below and
# make filter() fail.
by_district <- filter(sl_website, is.na(new_cases)) %>%
  split(list(.$location, .$case), drop = TRUE)

# For each district/case piece, pull the linelist records from the same
# district and the same weeks.
# The district column was renamed from `CL_DistrictRes` to `location` when
# the linelist was read in, so the match must use `location`.
tmp <- lapply(by_district, function(x) {
  district <- as.character(unique(x$location))
  blank_weeks <- as.character(unique(x$week_of_year))
  filter(
    sl_linelist,
    location %in% district,
    week_of_year %in% blank_weeks
  )
})
Let us treat NAs as 0 and compare weekly data.
# Treat blank (NA) website counts as zero reported cases.
idx <- which(is.na(sl_website$new_cases))
sl_website$new_cases <- replace(sl_website$new_cases, idx, 0)
Add confirmed and probable cases.
# Sum the case counts within each week, data source and district, collapsing
# the separate case types into a single total.
sl_website_total <- sl_website %>%
  group_by(week_of_year, data_source, location) %>%
  summarise(total_cases = sum(new_cases))
Aggregate the linelist to weekly counts.
# Aggregate the linelist to one row per district and week.
# summarise() (rather than mutate()) collapses the individual case records,
# so `total_cases` is a true weekly count instead of being repeated on every
# case row. The result is ungrouped so later steps do not inherit the
# grouping.
sl_linelist_weekly <- sl_linelist %>%
  group_by(location, week_of_year) %>%
  summarise(total_cases = n()) %>%
  ungroup() %>%
  mutate(data_source = "linelist") %>%
  # Same column order as the website totals so the two can be stacked.
  select(week_of_year, data_source, location, total_cases)
# Stack the linelist and website weekly totals.
# Both inputs are ungrouped first: base rbind() is not aware of dplyr's
# grouping metadata, so combining grouped tibbles with it is fragile.
all_sources <- bind_rows(
  ungroup(sl_linelist_weekly),
  ungroup(sl_website_total)
)
all_sources$week_of_year <- factor(all_sources$week_of_year)

# Make sure the output directory exists before saving into it.
dir.create(here::here("output"), showWarnings = FALSE, recursive = TRUE)

# One scatter plot per district, comparing weekly totals across data sources.
# invisible() suppresses the list returned by lapply(); only the saved files
# matter here.
invisible(
  lapply(split(all_sources, all_sources$location), function(x) {
    p <- ggplot(x, aes(week_of_year, total_cases, col = data_source)) +
      geom_point() +
      theme_bw() +
      # Week labels are dense, so rotate and shrink them.
      theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 2))
    outfile <- here::here("output", paste0(unique(x$location), ".png"))
    ggsave(outfile, plot = p)
  })
)
So there is significant discrepancy between the three data sources. Let's just focus on the weeks where the website says blank (or 0 now).
# Website rows with a zero total (originally blank, or reported as zero).
sl_website_0s <- sl_website_total %>%
  ungroup() %>%
  filter(total_cases == 0) %>%
  droplevels()

# Distinct (week, district) pairs where the website total is zero. The keys
# are converted to character so the join below does not depend on whether
# either source stores them as factors.
zero_keys <- sl_website_0s %>%
  transmute(
    week_of_year = as.character(week_of_year),
    location = as.character(location)
  ) %>%
  distinct()

# Linelist counts for exactly those district-weeks. Matching on week alone
# would also keep linelist rows for a district in weeks where only some
# other district was blank on the website.
sl_linelist_0s <- sl_linelist_weekly %>%
  ungroup() %>%
  mutate(
    week_of_year = as.character(week_of_year),
    location = as.character(location)
  ) %>%
  semi_join(zero_keys, by = c("week_of_year", "location")) %>%
  droplevels()

# Make sure the output directory exists before saving into it.
dir.create(here::here("output"), showWarnings = FALSE, recursive = TRUE)

# One plot per district, restricted to the zero/blank website weeks.
zero_sources <- bind_rows(sl_linelist_0s, sl_website_0s)
invisible(
  lapply(split(zero_sources, zero_sources$location), function(x) {
    p <- ggplot(x, aes(week_of_year, total_cases, col = data_source)) +
      geom_point() +
      theme_bw() +
      # Week labels are dense, so rotate and shrink them.
      theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 2))
    outfile <- here::here(
      "output", paste0("0_", unique(x$location), ".png")
    )
    ggsave(outfile, plot = p)
  })
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.