knitr::opts_chunk$set(echo = FALSE,
                      message = FALSE,
                      warning = FALSE)
library(plyr)
library(dplyr)
library(magrittr)
library(lubridate)
library(ggplot2)
library(devtools)

load_all()
get_current_sos()
import_country_codes()
data(sos_raw)
sosid <- paste0("SOS", 1:nrow(sos_raw))
name <- clean_name(sos_raw)

status <- clean_status(sos_raw)
date_created <- clean_date_created(sos_raw)
date_terminated <- clean_date_terminated(sos_raw)
countries <- clean_countries(sos_raw)
humans <- clean_humans(sos_raw)
animals <- clean_animals(sos_raw)
plants <- clean_plants(sos_raw)
syndromic <- clean_syndromic(sos_raw)

sos_surveillance_class <- data.frame(sosid,
                                     humans,
                                     animals,
                                     plants,
                                     syndromic)

sos_surveillance_class %<>%
  # select(-sosid) %>%
  mutate(humans = factor(humans,
                         levels = c("yes", "no", "nf"),
                         labels = c("Yes", "No", "Not found"),
                         ordered = TRUE),
        animals = factor(animals,
                         levels = c("yes", "no", "nf"),
                         labels = c("Yes", "No", "Not found"),
                         ordered = TRUE),
        plants = factor(plants,
                         levels = c("yes", "no", "nf"),
                         labels = c("Yes", "No", "Not found"),
                         ordered = TRUE),
        # I'm removing "blank" values from the Syndromic column, because
        # I don't think the difference is useful for this table.
        syndromic = revalue(syndromic, c("blank" = "nf")),
        syndromic = factor(syndromic,
                         levels = c("yes", "no", "nf"),
                         labels = c("Yes", "No", "Not found"),
                         ordered = TRUE))
entity_df <- clean_entity_type(sos_raw, return_type = "data.frame")
entity_factor <- clean_entity_type(sos_raw, return_type = "factor")

sos_entity <- data.frame(sosid, entity_df, entity_factor)

The final database contains information on r nrow(sos_raw) biosurveillance systems. Of these, r sum(sos_surveillance_class$humans == "Yes") study humans, r sum(sos_surveillance_class$animals == "Yes") study animals, and r sum(sos_surveillance_class$plants == "Yes") study plants. We found r sum(sos_surveillance_class$syndromic == "Yes") systems which conduct syndromic surveillance and r sum(sos_surveillance_class$syndromic == "No") which did not, but were unable to find syndromic surveillance information for r sum(sos_surveillance_class$syndromic == "Not found") systems. Government entities were involved in r sum(sos_entity$gov) surveillance systems, non-profits in r sum(sos_entity$np), and for-profits in r sum(sos_entity$fp). More than one of these entity types were involved in r sum(apply(sos_entity[2:4], 1, function(x) sum(x) > 1)) systems.

knitr::kable(summary(sos_surveillance_class[2:5]),
             col.names = c("Humans", "Animals", "Plants", "Syndromic"),
             caption = "Table 1. Classes of biosurveillance conducted. Note: for \"Syndromic\" variable, \"Blank\" and \"Not Found\" entries were combined.")
library(reshape2)
ggsurvclass <- sos_surveillance_class %>%
  melt(id.vars = "sosid") %>%
  mutate(variable = factor(variable, labels = c("Humans", "Animals", "Plants", "Syndromic Surveillance")),
         value = factor(value, levels = c("Yes", "No", "Not found"), ordered = TRUE)) %>%
  ggplot()

ggsurvclass + geom_bar(aes(x = value, stat = "bin"), position = "dodge") + facet_grid(. ~ variable) + theme_bw() + labs(x = NULL, y = "Number of Systems", title = "Types of Biosurveillance Conducted")
ggentity <- sos_entity %>%
  select(sosid, entity_factor) %>%
  ggplot()

ggentity + geom_bar(aes(x = entity_factor, stat = "bin"), position = "dodge") + theme_bw() + labs(x = "Classes of entity", y = "Number of Systems", title = "Types of entities conducting biosurviellance")
sos_dates <- data.frame(date_created, date_terminated)
sos_dates$active_interval <- new_interval(sos_dates$date_created, sos_dates$date_terminated)


sum_active_systems_for_year <- function(year, intervals) {
  sum(year %within% intervals, na.rm = TRUE)
}

# Create a data frame for a time-series plot
time_series <- data.frame(year = parse_date_time(1900:2015, orders = "y"))

time_series$number_active <- sapply(time_series$year,
                                    sum_active_systems_for_year,
                                    intervals = sos_dates$active_interval)

# Create a data frame for the creation-termination histogram.
date_hist <- data.frame(sosid, select(sos_dates, date_created, date_terminated)) %>%
  melt(id.vars = "sosid") %>%
  mutate(year = year(value)) %>%
  filter(year != 2015, year >= 1950) %>%
  group_by(year, variable) %>%
  summarize(count = n()) %>%
  dcast(year ~ variable)

# We have to append rows for years not in the data frame, otherwise our over-under line will do weird things.
years_not_in_df <- seq(1950, 2014)[!seq(1950, 2014) %in% date_hist$year]
date_hist <- rbind(date_hist, data.frame(year = years_not_in_df, date_created = NA, date_terminated = NA))
date_hist[is.na(date_hist)] <- 0

max_creation <- date_hist %>%
  filter(date_created == max(date_created)) %>%
  select(year)
max_termination <- date_hist %>%
  filter(date_terminated == max(date_terminated)) %>%
  select(year)
most_systems_year <- time_series %>%
  filter(number_active == max(number_active)) %>%
  select(year) %>%
  mutate(year = year(year)) # Convert the "year" variable to a "year" from a full "date."

We examined data collected on surveillance system termination and creation. Of the r nrow(sos_raw) surveillance systems in the database, r sum(!is.na(sos_dates$active_interval)) have information on both year of creation and, unless currently active, year of termination. Surveillance system creation peaked in r max_creation, tailing off in subsequent years. Termination of active systems increased significantly after 2000, peaking in r max_termination. Accounting for system creation and termination, the largest concurrent number of active surveillance systems was r max(time_series$number_active), occurring in r most_systems_year.

ggplot(data = date_hist, aes(x = year)) +
  geom_bar(aes(y = date_created), stat = "identity", fill = "#00BFC4") +
  geom_bar(aes(y = -date_terminated), stat = "identity", fill = "#F8766D") +
  geom_line(aes(y = date_created - date_terminated), size = 0.5) +
  theme_bw() +
  labs(x = "Year", y = "Number of Surveillance Systems", title = "Number of Surveillance Systems Created and Terminated")
ggplot(time_series, aes(x = year, y = number_active)) + geom_line() + theme_bw() + labs(x = "Year", y = "Count of Active Systems", title = "Number of Active Surveillance Systems over Time")
active_period <- as.period(sos_dates$active_interval, unit = "years")
active_duration <- as.duration(sos_dates$active_interval)

active_years <- year(active_period)

sos_dates %<>%
  mutate(years_active = year(as.period(active_interval, unit = "years")))

xlabs <- c(seq(0, 40, by = 10), ">50")

ggplot(sos_dates, aes(x = pmin(sos_dates$years_active, 50), fill = current)) +
  layer(geom = "bar", stat = "bin", breaks = seq(0, 50)) +
  scale_x_continuous(labels = c(seq(0, 40, by = 10), ">50")) +
  labs(x = "Years Active", y = "Count", title = "Lifespan of Biosurveillance Systems") +
  theme_bw()

ggplot(sos_dates, aes(x = pmin(sos_dates$years_active, 50), fill = current)) +
  layer(geom = "bar",
        stat = "bin",
        width=.1,
        breaks = seq(0, 50, by = 5),
        position = position_dodge(width = 4),
        alpha = 0.9) +
  guides(fill = guide_legend(title = "Currently Active")) +
  scale_x_continuous(labels = c(seq(0, 40, by = 10), ">50")) +
  labs(x = "Years Active", y = "Count", title = "Lifespan of Biosurveillance Systems") +
  theme_bw()
sos_dates$entity_type <- clean_entity_type(sos_raw, return_type = "factor")
time_series_entity <- expand.grid(year = time_series$year, entity_type = levels(sos_dates$entity_type))
time_series_entity %<>%
  group_by(entity_type, year) %>%
  mutate(number_active = sum_active_systems_for_year(year,
                                                     sos_dates[sos_dates$entity_type == entity_type,
                                                               "active_interval"])) %>%
  ungroup() %>%
  filter(year(year) >= 1990)


ggplot(time_series_entity, aes(x = year, y = number_active, fill = entity_type, color = entity_type, order = desc(entity_type))) + 
  geom_line() + 
  theme_bw() +
  labs(x = "Year", y = "Count of Active Systems", title = "Active Surveillance Systems by Entity Class")
sos_dates$syndromic <- clean_syndromic(sos_raw)
sos_dates$syndromic <- factor(sos_dates$syndromic, levels = c("yes", "no"))
sos_dates$syndromic <- revalue(sos_dates$syndromic, replace = c("yes" = "Syndromic", "no" = "Traditional"))
time_series_entity <- expand.grid(year = time_series$year, syndromic = levels(sos_dates$syndromic))
time_series_entity %<>%
  group_by(syndromic, year) %>%
  mutate(number_active = sum_active_systems_for_year(year,
                                                     sos_dates[sos_dates$syndromic == syndromic,
                                                               "active_interval"])) %>%
  ungroup() %>%
  filter(year(year) >= 1980)


ggplot(time_series_entity, aes(x = year, y = number_active, fill = syndromic, color = syndromic, order = desc(syndromic))) + 
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw() +
  labs(x = "Year", y = "Count of Systems", title = "Syndromic versus Traditional Surveillance Systems")

ggplot(time_series_entity, aes(x = year, y = number_active, fill = syndromic, color = syndromic, order = desc(syndromic))) + 
  geom_bar(position = "fill", stat = "identity") + 
  theme_bw() +
  labs(x = "Year", y = "Proportion of Systems", title = "Syndromic versus Traditional Surveillance Systems")
max_systems_in_country <- countries %>%
  summarise_each(funs = funs(sum)) %>%
  max()

The United States was covered by the largest number of surveillane systems (r max_systems_in_country). However, the number of surveillance systems per country is not by itself a meaningful measure of surveillance effort, as the United States also covers a larger area and a larger population than many other countries. Plotting the number of surveillance systems per capita for each country shows a different aspect of surveillance system distribution.

countries_current <- countries[sos_dates$current, ]
library(curl)

by_country <- countries_current %>%
  summarise_each(funs(n = sum(as.numeric(.)))) %>%
  t() %>% # Transpose it so that countries are rows.
  as.data.frame() %>%
  mutate(iso3 = rownames(.)) %>%
  arrange(desc(V1))
names(by_country)[1] <- "n_systems"

pop_csv <- curl_download(url = "https://raw.githubusercontent.com/datasets/population/master/data/population.csv",
                         destfile = tempfile())
population <- read.csv(pop_csv, stringsAsFactors = FALSE)
population %<>%
  filter(Year == 2014) %>%
  select(iso3 = Country.Code, population = Value) 
by_country %<>%
  left_join(population) %>%
  mutate(systems_per_capita = n_systems / population)
library(rworldmap)
library(RColorBrewer)

sos_map <- joinCountryData2Map(dF = by_country, joinCode = "ISO3", nameJoinColumn = "iso3", verbose = FALSE)

sos_map$log_n_systems <- log(sos_map$n_systems)
spplot(sos_map, zcol = "log_n_systems", main = "Count of Surveillance Systems")

sos_map$log_systems_per_capita <- log(sos_map$systems_per_capita)
spplot(sos_map, zcol = "log_systems_per_capita", main = "Surveillance Systems Per Capita")

sos_map$systems_per_100000 <- sos_map$systems_per_capita * 100000


sos_map$systems_pretty <- cut(sos_map$systems_per_100000,
                              breaks = exp(pretty(log(sos_map$systems_per_100000))), dig.lab = 2)

spplot(sos_map, zcol = "systems_pretty", col.regions = rev(brewer.pal(6, "YlGnBu")), main = "Surveillance Systems Per 10,000 People")

One Health analysis

onehealth <- sos_surveillance_class[, c("humans", "animals", "plants")] %>%
  mutate_each(funs(revalue(., replace = c("Yes" = "TRUE", "No" = "FALSE")))) %>%
  mutate_each(funs(as.logical))

# Version treating NAs as FALSE
onehealth_nona <- onehealth
onehealth_nona[is.na(onehealth_nona)] <- FALSE

onehealth %<>%
  mutate(humans_only = humans & !(plants | animals),
         animals_only = animals & !(humans | plants),
         plants_only = plants & !(humans | animals)) %>%
  mutate(ha = humans & animals,
         pa = animals & plants,
         hap = humans & animals & plants)


onehealth_nona %<>%
  mutate(humans_only = humans & !(plants | animals),
         animals_only = animals & !(humans | plants),
         plants_only = plants & !(humans | animals)) %>%
  mutate(ha = humans & animals,
         pa = animals & plants,
         hap = humans & animals & plants)


knitr::kable(summary(onehealth))

knitr::kable(summary(onehealth_nona))
sos_map@data %>%
  select(ISO3, systems_per_100000) %>%
  arrange(-systems_per_100000) %>%
  knitr::kable()


ecohealthalliance/sos documentation built on May 15, 2019, 7:56 p.m.