knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Load the Data

library(tidyverse)
library(covdata)
library(ggrepel)

Data for the United States come from a variety of sources:

COVID Tracking Project data

covus

Draw a log-linear graph of cumulative reported US cases.

## Which n states are leading the count of positive cases or deaths?
top_n_states <- function(data, n = 5, measure = c("positive", "death")) {
  meas <- match.arg(measure)
  data %>%
  group_by(state) %>%
  filter(measure == meas, date == max(date)) %>%
  drop_na(count) %>%
  ungroup() %>%
  top_n(n, wt = count) %>%
  pull(state)
}

state_cols <- c("gray70", "#195F90FF", "#D76500FF", "#238023FF", "#AB1F20FF", "#7747A3FF", 
                "#70453CFF", "#D73EA8FF", "#666666FF", "#96971BFF", "#1298A6FF", "#6F9BD6FF", 
                "#FF952DFF", "#66CF51FF", "#FF4945FF", "#A07DBAFF", "#AC7368FF", "#EF69A2FF", 
                "#9F9F9FFF", "#CACA56FF", "#61C3D5FF")

covus %>%
  group_by(state) %>%
  mutate(core = case_when(state %nin% top_n_states(covus) ~ "",
                          TRUE ~ state),
         end_label = ifelse(date == max(date), core, NA)) %>%
  arrange(date) %>%
  filter(measure == "positive", date > "2020-03-09") %>%
  ggplot(aes(x = date, y = count, group = state, color = core, label = end_label)) + 
  geom_line(size = 0.5) + 
  geom_text_repel(segment.color = NA, nudge_x = 0.2, nudge_y = 0.1) + 
  scale_color_manual(values = state_cols) + 
  scale_x_date(date_breaks = "1 month", date_labels = "%b" ) + 
  scale_y_continuous(trans = "log2",
                     labels = scales::comma_format(accuracy = 1),
                     breaks = 2^c(seq(1, 22, 1))) +
  guides(color = "none") + 
  labs(title = "COVID-19 Cumulative Recorded Cases by US State",
       subtitle = paste("Data as of", format(max(covus$date), "%A, %B %e, %Y")),
       x = "Date", y = "Count of Cases (log 2 scale)", 
       caption = "Data: COVID Tracking Project, http://covidtracking.com | Graph: @kjhealy") + 
  theme_minimal()

Calculating daily counts

The COVID Tracking Project reports cumulative counts for key measures such as positive tests and deaths. For example, for New York State:

measures <- c("positive", "death")

covus %>%
  filter(measure %in% measures, state == "NY") %>%
  select(date, state, measure, count) %>%
  pivot_wider(names_from = measure, values_from = count) 

To calculate daily counts from these cumulative measures, use lag().

covus %>%
  filter(measure %in% measures, state == "NY") %>%
  select(date, state, measure, count) %>%
  pivot_wider(names_from = measure, values_from = count) %>%
  mutate(across(positive:death, ~.x - lag(.x, order_by = date), 
                .names = "daily_{col}"))

Draw a graph of the weekly rolling average death rate, by state

state_pops <- uspop %>%
  filter(sex_id == "totsex", hisp_id == "tothisp") %>%
  select(state_abbr, statefips, pop, state) %>%
  rename(name = state, 
         state = state_abbr, fips = statefips) %>%
  mutate(state = replace(state, fips == "11", "DC"))

## Using a convenience function to do something similar
## to the lambda version above
get_daily_count <- function(count, date){
  count - lag(count, order_by = date)
}


covus %>%
  filter(measure == "death", state %in% unique(state_pops$state)) %>%
  group_by(state) %>%
  mutate(
    deaths_daily = get_daily_count(count, date), 
    deaths7 = slider::slide_dbl(deaths_daily, mean, .before = 7, .after = 0, na.rm = TRUE)) %>%
  left_join(state_pops) %>%
  filter(date > lubridate::ymd("2020-03-15")) %>%
  ggplot(mapping = aes(x = date, y = (deaths7/pop)*1e5)) + 
  geom_line(size = 0.5) + 
  scale_y_continuous(labels = scales::comma_format(accuracy = 1)) + 
  facet_wrap(~ name, ncol = 4) +
  labs(x = "Date", 
       y = "Deaths per 100,000 Population (Seven Day Rolling Average)", 
       title = "Average Death Rate from COVID-19: US States and Washington, DC", 
       subtitle = paste("COVID Tracking Project data as of", format(max(covus$date), "%A, %B %e, %Y")), 
       caption = "Kieran Healy @kjhealy / Data: https://www.covidtracking.com/") +
  theme_minimal()

Draw a graph of the weekly rolling average death rate, by state, with free y-axes in the panels

covus %>%
  filter(measure == "death", state %in% unique(state_pops$state)) %>%
  group_by(state) %>%
  mutate(
    deaths_daily = get_daily_count(count, date), 
    deaths7 = slider::slide_dbl(deaths_daily, mean, .before = 7, .after = 0, na.rm = TRUE)) %>%
  left_join(state_pops) %>%
  filter(date > lubridate::ymd("2020-03-15")) %>%
  ggplot(mapping = aes(x = date, y = (deaths7/pop)*1e5)) + 
  geom_line(size = 0.5) + 
  facet_wrap(~ name, ncol = 4, scales = "free_y") +
  labs(x = "Date", 
       y = "Deaths per 100,000 Population (Seven Day Rolling Average)", 
       title = "Average Death Rate from COVID-19: US States and Washington, DC. Free scales.", 
       subtitle = paste("COVID Tracking Project data as of", format(max(covus$date), "%A, %B %e, %Y")), 
       caption = "Kieran Healy @kjhealy / Data: https://www.covidtracking.com/") +
  theme_minimal()

Draw a graph of cumulative reported US deaths, aggregated to the national level

covus %>%
  filter(measure == "death") %>%
  group_by(date) %>%
  summarize(count = sum(count, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = count)) + 
  geom_line(size = 0.75) + 
  scale_x_date(date_breaks = "3 months", date_labels = "%b %Y" ) + 
  scale_y_continuous(labels = scales::comma, breaks = seq(0, 500000, 100000)) + 
  labs(title = "COVID-19 Cumulative Recorded Deaths in the United States",
       subtitle = paste("Data as of", format(max(covus$date), "%A, %B %e, %Y"), "Recorded counts underestimate total mortality."),
       x = "Date", y = "Count of Deaths", 
       caption = "Data: COVID Tracking Project, http://covidtracking.com | Graph: @kjhealy") + 
  theme_minimal()

State-Level and County-Level (Cumulative) Data from the New York Times

State-level table

nytcovstate

County-level table

nytcovcounty

Draw a log-linear graph of cumulative cases by county in selected states.

We can see that data for some counties is either not available or hasn't been correctly coded.

nytcovcounty %>%
  filter(state %in% c("New York", "North Carolina", "Texas", "Florida", "California", "Illinois")) %>%
  mutate(uniq_name = paste(county, state)) %>% # Can't use FIPS because of how the NYT bundled cities
  group_by(uniq_name) %>%
  mutate(days_elapsed = date - min(date)) %>%
  ggplot(aes(x = days_elapsed, y = cases + 1, group = uniq_name)) + 
  geom_line(size = 0.25, color = "gray20") + 
  scale_x_continuous() + 
  scale_y_log10(labels = scales::label_number_si()) + 
  guides(color = "none") + 
  facet_wrap(~ state, ncol = 2) + 
  labs(title = "COVID-19 Cumulative Recorded Cases by US County",
       subtitle = paste("New York is bundled into a single area in this data.\nData as of", format(max(nytcovcounty$date), "%A, %B %e, %Y")),
       x = "Days since first case", y = "Count of Cases (log 10 scale)", 
       caption = "Data: The New York Times | Graph: @kjhealy") + 
  theme_minimal()


kjhealy/covdata documentation built on Feb. 4, 2023, 12:52 p.m.