library(tidyverse) library(stringr) library(ISOweek)
The common flu, although different from year to year often exhibits common patterns from year to year. We use CMU's Delphi group's API to collect data from 2000 to 2018 flu seasons in the United States. We will specifically focus on a weekly "wILI" value, which captures a weighted percent of influenza-like illness cases seen in the hospital in that given week.
CMU's Delphi group's API can be sourced by running the following command.
source(paste0("https://raw.githubusercontent.com/", "cmu-delphi/delphi-epidata/main/src/client/delphi_epidata.R"))
This file provides a lists of functions in a more python
ic way - all functions are avaiable in the Epidata
list, and can be accessed through the notation Epidata$function_name
.
We aim to examine flu seasons from 2001 to 2018. Although the API can pull data from as far back as 2001. Note that influenza death estimation (which we are not looking at), change in 2010^[CDC: https://www.cdc.gov/flu/about/burden/faq.htm]. Additionally, flu seasons often are examined October of the previous year to August of the year.
flu_data_raw <- Epidata$fluview(list('nat'), Epidata$range(200010, 201809))
Delphi has documentation on the outcome of the fluview
function (and other functions),
on their website. Regions can be found here and is related to this.
We can extract the desired information from the second item in the list (under the epidata
name), into a data.frame with the following function extract_df
. We've folded the code as it's messy and not that informative.
extract_df
code:
#' extract data frame from delphi's epidata "fluview" function #' #' @param epidata "epidata" output as desired by the fluview function (see https://cmu-delphi.github.io/delphi-epidata/api/fluview.html) #' #' returns data.frame with the same information extract_df <- function(epidata){ # basic structure of data frame ----------------- nr <- length(epidata) nc <- length(epidata[[1]]) colnames <- names(epidata[[1]]) # creating an empty data frame ------------------ ## character and numeric columns c_idx <- which(sapply(epidata[[1]], class) %in% c("character", "factor")) n_idx <- (1:nc)[!(1:nc %in% c_idx)] data <- data.frame(matrix(nrow = nr, ncol = 0)) for (col_idx in 1:nc){ if (col_idx %in% c_idx){ data <- cbind(data, data.frame(matrix("c", nrow = nr, ncol = 1))) } else { data <- cbind(data, data.frame(matrix(0, nrow = nr, ncol = 1))) } } names(data) <- colnames for (row_idx in 1:nr){ inner <- epidata[[row_idx]] inner[sapply(inner, function(x) length(x) == 0L)] <- NA # breaking into parts data[row_idx,c_idx] <- unlist(inner[c_idx]) data[row_idx,n_idx] <- unlist(inner[n_idx]) } return(data) }
We create a data.frame for the flu data (percent of new cases) and clean it up a bit to emphasis the week of the year we're looking at.
flu_data <- extract_df(flu_data_raw$epidata) # convert epiweek to day flu_data <- flu_data %>% rowwise() %>% mutate(epiweek_year = str_sub(epiweek, start = 1L, end = 4L), epiweek_week = str_sub(epiweek, start = 5L, end = 6L), epiweek2 = paste0(epiweek_year, "-W", epiweek_week,"-1"), epiweek_date = ISOweek2date(epiweek2))
Below is an example of the data.
flu_data %>% head() %>% DT::datatable(options = list(scrollX = TRUE))
start_year = 1999 end_year = 2017 flu_season_info_df <- data.frame(year = start_year:end_year) %>% mutate(start_date = paste0(as.character(year), "-09-16"), # splitting september equally end_date = paste0(as.character(year+1), "-09-15"), even_year = c( "odd", "even")[(year %% 2) + 1]) %>% mutate(start_date = as.Date(start_date, format= "%Y-%m-%d"), end_date = as.Date(end_date, format= "%Y-%m-%d"))
# visual flu_data %>% ggplot() + geom_line(aes(x = epiweek_date, y = wili)) + labs(x = "time", y = "wILI percentage", title = "wILI percentage across multiple seasons") + geom_rect(data = flu_season_info_df %>% filter(even_year == "even"), mapping = aes(xmin = start_date, xmax = end_date, ymin = 0, ymax = 8), alpha = .1) + theme_minimal()
library(dygraphs) periods_df <- flu_season_info_df %>% filter(even_year == "even") add_shades <- function(x, periods_df, ...) { for( r_idx in 1:nrow(periods_df) ) { x <- dyShading(x, from = periods_df[r_idx, "start_date"] , to = periods_df[r_idx, "end_date"], ... ) } x } flu_xts <- xts::xts(x = flu_data$wili, order.by = flu_data$epiweek_date) names(flu_xts) <- "wILI percentage" flu_xts %>% dygraph(data = ., main = "wILI percentage across multiple seasons", ylab = "wILI percentage", width = "100%") %>% dyRangeSelector() %>% add_shades(periods_df = periods_df)
season_index <- rep(0, nrow(flu_data)) for (start_date in flu_season_info_df$start_date){ season_index <- season_index + 1*(flu_data$epiweek_date > start_date) } flu_data$season_index <- season_index flu_data$after_start <- flu_data$epiweek_date - flu_season_info_df$start_date[flu_data$season_index] flu_data$season_year <- flu_season_info_df$year[season_index] # visual by_season_vis <- flu_data %>% mutate(season_year = factor(season_year)) %>% ggplot() + geom_line(aes(x = after_start, y = wili, color = season_year)) + labs(x = "Days after start of season", y = "wILI percentage", title = "wILI percentage across multiple seasons \n (2008 & 2009 odd season)", color = "") + theme_minimal() by_season_vis %>% plotly::ggplotly()
The set of code below is messing but let's talk about a few things.
df <- flu_data %>% tibble::rownames_to_column() #df # usa population (by month) from https://fred.stlouisfed.org/series/POPTHM pop <- read.csv("POPTHM.csv") %>% mutate(MONTH = lubridate::month(DATE), YEAR = lubridate::year(DATE)) df <- df %>% mutate(epiweek_month = lubridate::month(epiweek_date)) df_plus <- df %>% mutate(epiweek_year = as.numeric(epiweek_year)) %>% left_join(pop, by = c("epiweek_month" = "MONTH", "epiweek_year" = "YEAR")) %>% mutate(scaled_num_ili = POPTHM * wili/100)
# needs to be updated to do tidyverse also what is par? # and what is confirmed (cumulated relative to what?) # negative values for some output... devtools::load_all(path = "../") #?EpiCompare::cases_to_SIR
# change to days (to get 3 day flu) # expand? df2 <- df_plus %>% mutate(n_wili = round(POPTHM * wili / 100), rowname = as.numeric(rowname)*7, d_wili = wili/7) %>% select(d_wili, epiweek_date) stepwise_fill_in <- function(df2, step = 7){ # fill in with zeros df_new <- data.frame(df2[1,]) for (r_idx in 1:nrow(df2)){ inner_df <- data.frame(d_wili = df2$d_wili[r_idx], epiweek_date = df2$epiweek_date[r_idx] + 0:(step-1)) df_new <- rbind(df_new, inner_df) } df_new <- df_new[-1,] return(df_new) } df3 <- stepwise_fill_in(df2) df3 %>% head() df4 <- df3 %>% mutate(month = lubridate::month(epiweek_date), year = lubridate::year(epiweek_date)) %>% left_join(pop, by = c("month" = "MONTH", "year" = "YEAR")) %>% mutate(n_wili = round(POPTHM * d_wili / 100)) new <- df4 %>% tibble::rownames_to_column() %>% rename(t = rowname, N = POPTHM, current = n_wili) %>% mutate(confirmed = cumsum(current)) %>% EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days) select(t, X0, X1, X2) new %>% mutate(t = as.numeric(t)) %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) + ggplot2::theme_minimal() new2 <- new %>% mutate(N = X0 + X1 + X2, confirmed = X2) new2_R_to_S <- new2 %>% select(t, confirmed, N) %>% cases_to_SIR(par = c("gamma" = 1/60)) # 3 months to be suspectible again (to the flu) new2_R_to_S %>% head new$X3 <- new2_R_to_S$X2 new_SIRS <- new new_SIRS <- new_SIRS %>% mutate( X0 = X0 + X3) %>% dplyr::select(t, X0, X1, X2) %>% dplyr::rename(S = X0, I = X1, R = X2) new_SIRS %>% ggplot(aes(x = S, y = I, z = R)) + geom_path() + coord_tern() + theme_minimal() new_SIRS %>% pivot_longer(cols = c(S, I, R)) %>% ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) + theme_minimal() new %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) + theme_minimal()
# change to days (to get 3 day flu) # expand? df2 <- df_plus %>% mutate(n_wili = round(POPTHM * wili / 100), rowname = as.numeric(rowname)*7, d_wili = wili/7) %>% select(d_wili, epiweek_date, season_year) stepwise_fill_in <- function(df2, step = 7){ # fill in with zeros df_new <- data.frame(df2[1,]) for (r_idx in 1:nrow(df2)){ inner_df <- data.frame(d_wili = df2$d_wili[r_idx], epiweek_date = df2$epiweek_date[r_idx] + 0:(step-1), season_year = df2$season_year[r_idx]) df_new <- rbind(df_new, inner_df) } df_new <- df_new[-1,] return(df_new) } df3 <- df2 %>% group_by(season_year) %>% stepwise_fill_in() df4 <- df3 %>% mutate(month = lubridate::month(epiweek_date), year = lubridate::year(epiweek_date)) %>% left_join(pop, by = c("month" = "MONTH", "year" = "YEAR")) %>% mutate(n_wili = round(POPTHM * d_wili / 100)) new <- df4 %>% tibble::rownames_to_column() %>% rename(t = rowname, N = POPTHM, current = n_wili) %>% group_by(season_year) %>% mutate(confirmed = cumsum(current)) %>% EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days) select(t, X0, X1, X2) new_other <- df4 %>% tibble::rownames_to_column() %>% rename(t = rowname, N = POPTHM, current = n_wili) %>% mutate(confirmed = cumsum(current)) %>% EpiCompare::cases_to_SIR(data = ., par = c("gamma" = 1/3)) %>% #unit relative to days (3 days) select(t, X0, X1, X2)
new %>% mutate(t = as.numeric(t)) %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) + ggplot2::theme_minimal() new_other %>% mutate(t = as.numeric(t)) %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot2::ggplot() + ggplot2::geom_line(ggplot2::aes(x = t, y = value, color = name)) + ggplot2::theme_minimal()
new2 <- new %>% mutate(N = X0 + X1 + X2, confirmed = X2) new2_R_to_S <- new2 %>% select(t, confirmed, N) %>% cases_to_SIR(par = c("gamma" = 1/60)) # 3 months to be suspectible again (to the flu) new2_R_to_S %>% head new$X3 <- new2_R_to_S$X2 new_SIRS <- new new_SIRS <- new_SIRS %>% mutate( X0 = X0 + X3) %>% dplyr::select(t, X0, X1, X2) %>% dplyr::rename(S = X0, I = X1, R = X2) df_lims = data.frame(S = c(1,.4,.4), I = c(0,.6,0), R = c(0,0,.6)) new_SIRS %>% ggplot(aes(x = S, y = I, z = R, color = factor(season_year))) + geom_path() + coord_tern() + theme_minimal() + tern_limits(T=max(df_lims$I), L=max(df_lims$S), R=max(df_lims$R))
plotly::ggplotly(new_SIRS %>% pivot_longer(cols = c(S, I, R)) %>% ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) + theme_minimal() )
new %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) + theme_minimal() new_other %>% pivot_longer(cols = c(X0, X1, X2)) %>% filter(name == "X1") %>% ggplot() + geom_line(aes(x = as.numeric(t), y = value, group = name, color = name)) + theme_minimal()
how to connect wILI to the total US (https://www.cdc.gov/flu/about/burden/how-cdc-estimates.htm)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.