knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
library(plyr) library(dplyr) library(magrittr) library(lubridate) library(ggplot2) library(devtools) load_all()
get_current_sos() import_country_codes() data(sos_raw) sosid <- paste0("SOS", 1:nrow(sos_raw))
name <- clean_name(sos_raw) status <- clean_status(sos_raw) date_created <- clean_date_created(sos_raw) date_terminated <- clean_date_terminated(sos_raw) countries <- clean_countries(sos_raw)
humans <- clean_humans(sos_raw) animals <- clean_animals(sos_raw) plants <- clean_plants(sos_raw) syndromic <- clean_syndromic(sos_raw) sos_surveillance_class <- data.frame(sosid, humans, animals, plants, syndromic) sos_surveillance_class %<>% # select(-sosid) %>% mutate(humans = factor(humans, levels = c("yes", "no", "nf"), labels = c("Yes", "No", "Not found"), ordered = TRUE), animals = factor(animals, levels = c("yes", "no", "nf"), labels = c("Yes", "No", "Not found"), ordered = TRUE), plants = factor(plants, levels = c("yes", "no", "nf"), labels = c("Yes", "No", "Not found"), ordered = TRUE), # I'm removing "blank" values from the Syndromic column, because # I don't think the difference is useful for this table. syndromic = revalue(syndromic, c("blank" = "nf")), syndromic = factor(syndromic, levels = c("yes", "no", "nf"), labels = c("Yes", "No", "Not found"), ordered = TRUE))
entity_df <- clean_entity_type(sos_raw, return_type = "data.frame") entity_factor <- clean_entity_type(sos_raw, return_type = "factor") sos_entity <- data.frame(sosid, entity_df, entity_factor)
The final database contains information on r nrow(sos_raw)
biosurveillance systems. Of these, r sum(sos_surveillance_class$humans == "Yes")
study humans, r sum(sos_surveillance_class$animals == "Yes")
study animals, and r sum(sos_surveillance_class$plants == "Yes")
study plants. We found r sum(sos_surveillance_class$syndromic == "Yes")
systems which conduct syndromic surveillance and r sum(sos_surveillance_class$syndromic == "No")
which did not, but were unable to find syndromic surveillance information for r sum(sos_surveillance_class$syndromic == "Not found")
systems. Government entities were involved in r sum(sos_entity$gov)
surveillance systems, non-profits in r sum(sos_entity$np)
, and for-profits in r sum(sos_entity$fp)
. More than one of these entity types were involved in r sum(apply(sos_entity[2:4], 1, function(x) sum(x) > 1))
systems.
knitr::kable(summary(sos_surveillance_class[2:5]), col.names = c("Humans", "Animals", "Plants", "Syndromic"), caption = "Table 1. Classes of biosurveillance conducted. Note: for \"Syndromic\" variable, \"Blank\" and \"Not Found\" entries were combined.")
library(reshape2) ggsurvclass <- sos_surveillance_class %>% melt(id.vars = "sosid") %>% mutate(variable = factor(variable, labels = c("Humans", "Animals", "Plants", "Syndromic Surveillance")), value = factor(value, levels = c("Yes", "No", "Not found"), ordered = TRUE)) %>% ggplot() ggsurvclass + geom_bar(aes(x = value, stat = "bin"), position = "dodge") + facet_grid(. ~ variable) + theme_bw() + labs(x = NULL, y = "Number of Systems", title = "Types of Biosurveillance Conducted")
ggentity <- sos_entity %>% select(sosid, entity_factor) %>% ggplot() ggentity + geom_bar(aes(x = entity_factor, stat = "bin"), position = "dodge") + theme_bw() + labs(x = "Classes of entity", y = "Number of Systems", title = "Types of entities conducting biosurviellance")
sos_dates <- data.frame(date_created, date_terminated) sos_dates$active_interval <- new_interval(sos_dates$date_created, sos_dates$date_terminated) sum_active_systems_for_year <- function(year, intervals) { sum(year %within% intervals, na.rm = TRUE) } # Create a data frame for a time-series plot time_series <- data.frame(year = parse_date_time(1900:2015, orders = "y")) time_series$number_active <- sapply(time_series$year, sum_active_systems_for_year, intervals = sos_dates$active_interval) # Create a data frame for the creation-termination histogram. date_hist <- data.frame(sosid, select(sos_dates, date_created, date_terminated)) %>% melt(id.vars = "sosid") %>% mutate(year = year(value)) %>% filter(year != 2015, year >= 1950) %>% group_by(year, variable) %>% summarize(count = n()) %>% dcast(year ~ variable) # We have to append rows for years not in the data frame, otherwise our over-under line will do weird things. years_not_in_df <- seq(1950, 2014)[!seq(1950, 2014) %in% date_hist$year] date_hist <- rbind(date_hist, data.frame(year = years_not_in_df, date_created = NA, date_terminated = NA)) date_hist[is.na(date_hist)] <- 0 max_creation <- date_hist %>% filter(date_created == max(date_created)) %>% select(year) max_termination <- date_hist %>% filter(date_terminated == max(date_terminated)) %>% select(year) most_systems_year <- time_series %>% filter(number_active == max(number_active)) %>% select(year) %>% mutate(year = year(year)) # Convert the "year" variable to a "year" from a full "date."
We examined data collected on surveillance system termination and creation. Of the r nrow(sos_raw)
surveillance systems in the database, r sum(!is.na(sos_dates$active_interval))
have information on both year of creation and, unless currently active, year of termination. Surveillance system creation peaked in r max_creation
, tailing off in subsequent years. Termination of active systems increased significantly after 2000, peaking in r max_termination
. Accounting for system creation and termination, the largest concurrent number of active surveillance systems was r max(time_series$number_active)
, occurring in r most_systems_year
.
ggplot(data = date_hist, aes(x = year)) + geom_bar(aes(y = date_created), stat = "identity", fill = "#00BFC4") + geom_bar(aes(y = -date_terminated), stat = "identity", fill = "#F8766D") + geom_line(aes(y = date_created - date_terminated), size = 0.5) + theme_bw() + labs(x = "Year", y = "Number of Surveillance Systems", title = "Number of Surveillance Systems Created and Terminated")
ggplot(time_series, aes(x = year, y = number_active)) + geom_line() + theme_bw() + labs(x = "Year", y = "Count of Active Systems", title = "Number of Active Surveillance Systems over Time")
active_period <- as.period(sos_dates$active_interval, unit = "years") active_duration <- as.duration(sos_dates$active_interval) active_years <- year(active_period) sos_dates %<>% mutate(years_active = year(as.period(active_interval, unit = "years"))) xlabs <- c(seq(0, 40, by = 10), ">50") ggplot(sos_dates, aes(x = pmin(sos_dates$years_active, 50), fill = current)) + layer(geom = "bar", stat = "bin", breaks = seq(0, 50)) + scale_x_continuous(labels = c(seq(0, 40, by = 10), ">50")) + labs(x = "Years Active", y = "Count", title = "Lifespan of Biosurveillance Systems") + theme_bw() ggplot(sos_dates, aes(x = pmin(sos_dates$years_active, 50), fill = current)) + layer(geom = "bar", stat = "bin", width=.1, breaks = seq(0, 50, by = 5), position = position_dodge(width = 4), alpha = 0.9) + guides(fill = guide_legend(title = "Currently Active")) + scale_x_continuous(labels = c(seq(0, 40, by = 10), ">50")) + labs(x = "Years Active", y = "Count", title = "Lifespan of Biosurveillance Systems") + theme_bw()
sos_dates$entity_type <- clean_entity_type(sos_raw, return_type = "factor") time_series_entity <- expand.grid(year = time_series$year, entity_type = levels(sos_dates$entity_type)) time_series_entity %<>% group_by(entity_type, year) %>% mutate(number_active = sum_active_systems_for_year(year, sos_dates[sos_dates$entity_type == entity_type, "active_interval"])) %>% ungroup() %>% filter(year(year) >= 1990) ggplot(time_series_entity, aes(x = year, y = number_active, fill = entity_type, color = entity_type, order = desc(entity_type))) + geom_line() + theme_bw() + labs(x = "Year", y = "Count of Active Systems", title = "Active Surveillance Systems by Entity Class")
sos_dates$syndromic <- clean_syndromic(sos_raw) sos_dates$syndromic <- factor(sos_dates$syndromic, levels = c("yes", "no")) sos_dates$syndromic <- revalue(sos_dates$syndromic, replace = c("yes" = "Syndromic", "no" = "Traditional")) time_series_entity <- expand.grid(year = time_series$year, syndromic = levels(sos_dates$syndromic)) time_series_entity %<>% group_by(syndromic, year) %>% mutate(number_active = sum_active_systems_for_year(year, sos_dates[sos_dates$syndromic == syndromic, "active_interval"])) %>% ungroup() %>% filter(year(year) >= 1980) ggplot(time_series_entity, aes(x = year, y = number_active, fill = syndromic, color = syndromic, order = desc(syndromic))) + geom_bar(position = "dodge", stat = "identity") + theme_bw() + labs(x = "Year", y = "Count of Systems", title = "Syndromic versus Traditional Surveillance Systems") ggplot(time_series_entity, aes(x = year, y = number_active, fill = syndromic, color = syndromic, order = desc(syndromic))) + geom_bar(position = "fill", stat = "identity") + theme_bw() + labs(x = "Year", y = "Proportion of Systems", title = "Syndromic versus Traditional Surveillance Systems")
max_systems_in_country <- countries %>% summarise_each(funs = funs(sum)) %>% max()
The United States was covered by the largest number of surveillane systems (r max_systems_in_country
). However, the number of surveillance systems per country is not by itself a meaningful measure of surveillance effort, as the United States also covers a larger area and a larger population than many other countries. Plotting the number of surveillance systems per capita for each country shows a different aspect of surveillance system distribution.
countries_current <- countries[sos_dates$current, ]
library(curl) by_country <- countries_current %>% summarise_each(funs(n = sum(as.numeric(.)))) %>% t() %>% # Transpose it so that countries are rows. as.data.frame() %>% mutate(iso3 = rownames(.)) %>% arrange(desc(V1)) names(by_country)[1] <- "n_systems" pop_csv <- curl_download(url = "https://raw.githubusercontent.com/datasets/population/master/data/population.csv", destfile = tempfile()) population <- read.csv(pop_csv, stringsAsFactors = FALSE) population %<>% filter(Year == 2014) %>% select(iso3 = Country.Code, population = Value) by_country %<>% left_join(population) %>% mutate(systems_per_capita = n_systems / population)
library(rworldmap) library(RColorBrewer) sos_map <- joinCountryData2Map(dF = by_country, joinCode = "ISO3", nameJoinColumn = "iso3", verbose = FALSE) sos_map$log_n_systems <- log(sos_map$n_systems) spplot(sos_map, zcol = "log_n_systems", main = "Count of Surveillance Systems") sos_map$log_systems_per_capita <- log(sos_map$systems_per_capita) spplot(sos_map, zcol = "log_systems_per_capita", main = "Surveillance Systems Per Capita") sos_map$systems_per_100000 <- sos_map$systems_per_capita * 100000 sos_map$systems_pretty <- cut(sos_map$systems_per_100000, breaks = exp(pretty(log(sos_map$systems_per_100000))), dig.lab = 2) spplot(sos_map, zcol = "systems_pretty", col.regions = rev(brewer.pal(6, "YlGnBu")), main = "Surveillance Systems Per 10,000 People")
onehealth <- sos_surveillance_class[, c("humans", "animals", "plants")] %>% mutate_each(funs(revalue(., replace = c("Yes" = "TRUE", "No" = "FALSE")))) %>% mutate_each(funs(as.logical)) # Version treating NAs as FALSE onehealth_nona <- onehealth onehealth_nona[is.na(onehealth_nona)] <- FALSE onehealth %<>% mutate(humans_only = humans & !(plants | animals), animals_only = animals & !(humans | plants), plants_only = plants & !(humans | animals)) %>% mutate(ha = humans & animals, pa = animals & plants, hap = humans & animals & plants) onehealth_nona %<>% mutate(humans_only = humans & !(plants | animals), animals_only = animals & !(humans | plants), plants_only = plants & !(humans | animals)) %>% mutate(ha = humans & animals, pa = animals & plants, hap = humans & animals & plants) knitr::kable(summary(onehealth)) knitr::kable(summary(onehealth_nona))
sos_map@data %>% select(ISO3, systems_per_100000) %>% arrange(-systems_per_100000) %>% knitr::kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.