First read in and clean up the data from ProMed. Since these data are at national level, we will also aggregate the WHO data at country level.

promed <- read.csv("data/CaseCounts/raw/ProMED_Ebola_2014-2016.csv",
                   stringsAsFactors = F)
promed$Issue.Date %<>% lubridate::dmy_hm(.)
promed$Issue.Date %<>% format(format = "%m/%d/%y %H:%M")

The categories in ProMed data are: Cumulative SC (suspected cases),Cumulative SD, Cumulative CC, and Cumulative CD. For data from the WHO, the case status could be confirmed, probable or suspected. The final clinical outcome (alive/dead) for each case is also recorded. In comparing the two data sets, we sum across all four categories in ProMed and derive incidence data from cumulative data.

promed %<>% rename("SC" = "Cumulative.SC",
                   "SD" = "Cumulative.SD",
                   "CC" = "Cumulative.CC",
                   "CD" = "Cumulative.CD",
                   "HealthMap.Alert.ID" = "HM.Alert.ID")

species   <- "Humans"
disease   <- "Ebola"
case.type <- "SCC"
promed    %<>% filter(species == species,
                      disease == disease,
                      Country %in% unique(WHO_raw$Country) )

promed_bycountry <- split(promed, promed$Country) 

promed_weekly    <- promed_bycountry %>%
                       lapply(function(case.count){
                               location <- case.count$Country[1]
                               case.count %<>%
                                  incidence.from.DS1(species, disease,
                                                   case.type,
                                                   location,
                                                   merge_rule = "median") %<>%
                                 daily.to.weekly }) %>%
                        bind_rows(.id = "Country")

promed_weekly$Date %<>% lubridate::ymd(.)
ggplot(promed_weekly, aes(Date, incid)) +
    facet_grid(Country~.) +
    geom_point() +
    theme_minimal()
WHO_weekly <- split(WHO_bycountry, WHO_bycountry$Country) %>%
              lapply(function(df){
                      df      %>%
                      ungroup %>%
                      select(Date, incid) %>%
                      daily.to.weekly}) %>%
                bind_rows(.id = "Country")    
WHO_weekly$Date       %<>% as.Date


color_scale <- c("Promed" = "black", "WHO" = "green")
list( WHO = WHO_weekly,
      Promed = promed_weekly[, c("Country", "Date", "incid")]) %>%
    bind_rows(.id = "Source") %>%
    ggplot(aes(Date, incid, color = Source)) +
    geom_point() +
    facet_grid(Country ~.) +
    scale_x_date(date_labels="%d/%m/%y",date_breaks = "3 weeks") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = color_scale)

Estimating Reproduction number from WHO and Promed Data

promed_R <- promed_bycountry %>%
               lapply(function(case.count){
                        location <- case.count$Country[1]
                        case.count %<>%
                        incidence.from.DS1(species, disease,
                                           case.type,
                                           location,
                                           merge_rule = "median")
                        start     <- 1:(length(case.count$Date) - time_window)
                        end       <- start + time_window
                        end.dates <- case.count$Date[end] 
                        res       <- EstimateR(case.count$incid,
                                               T.Start  = start ,
                                               T.End    = end,
                                               method   = "NonParametricSI",
                                               SI.Distr = SI_Distr,
                                               plot     = FALSE ,
                                               CV.Posterior = 1 ,
                                               Mean.Prior   = 1 ,
                                               Std.Prior    = 0.5)
                        res$R %<>% cbind(Date = end.dates)
                        return(res$R)}) %>%
              bind_rows(.id = "Country")

WHO_R <- WHO_bycountry    %>%
         split(.$Country) %>%
         lapply(function(case.count){
                  location  <- case.count$Country[1]
                  start     <- 1:(length(case.count$Date) - time_window)
                  end       <- start + time_window
                  end.dates <- case.count$Date[end] 
                  res       <- EstimateR(case.count$incid,
                                         T.Start  = start ,
                                         T.End    = end,
                                         method   = "NonParametricSI",
                                         SI.Distr = SI_Distr,
                                         plot     = FALSE ,
                                         CV.Posterior = 1 ,
                                         Mean.Prior   = 1 ,
                                         Std.Prior    = 0.5)
                  res$R %<>% cbind(Date = end.dates)
                  return(res$R)}) %>%
           bind_rows(.id = "Country")


list( WHO = WHO_R, Promed = promed_R) %>%
    bind_rows(.id = "Source") %>%
    ggplot(aes(Date, `Mean(R)`, color = Source)) +
    geom_line() +
    facet_grid(Country ~.) +
    scale_x_date(date_labels="%d/%m/%y",date_breaks = "3 weeks") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = color_scale)

The graphs show that the estimates from the two different sources closely track each other. Here's another way to visualise this.

promed_vs_WHO <- inner_join(WHO_R, promed_R,
                        by = c("Date", "Country")) %>%
              select(Country, Date, `Mean(R).x`,  `Mean(R).y`) %>%
              rename("WHO" = `Mean(R).x`,
                     "Promed" = `Mean(R).y`)

ggplot(promed_vs_WHO, aes(Promed, WHO)) +
    geom_point() +
    facet_wrap(~Country, nrow = 3) +
    xlab("R estimated from ProMed data") +
    ylab("R estimated from WHO data") +
    geom_abline(slope = 1)

As the above graph indicates, estimates of the reproduction number from Promed and WHO data are highly correlated (correlation coefficient = 0.76, 95% CI = (0.74, 0.78)).

cor.test(HM_vs_WHO$WHO, HM_vs_WHO$Promed)


annecori/mRIIDSprocessData documentation built on May 29, 2019, 1:16 p.m.