knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Introduction

Q. Do stay-at-home measures impact (flatten) the curve? This R script seeks to locate the point in which COVID-19 cases begins to flatten at the state- and county- level.

Methodology

  1. "Linearize" exponential rise of cases using natural log\ \
    y = A$e^{kt}$ → ln(y) = kt + ln(A)
    \
  2. Smooth natural log scale graph
  3. Calculate slope k between all the datepoints
  4. Locate breakpoints in k curve to find dates of greates slope change
  5. Calculate the "lag" between the SAH order date and the last breakpoint

Data

This analysis requires cases data from the covid19clark package, county-level data from our covid19interventions package, and state-level data scraped from NYT, sah.


Load data

# data load
library(covid19interventions)
# run if you don't have covid19clark
# devtools::install_github("agroimpacts/covid19clark", build_vignettes = TRUE)
library(covid19clark)
# covid interventions by county
data("county_interventions")
# covid cases
data("us_cases_daily")
# state SAH dates scraped from NY Times
data("sah")

Sample data

# interventions
head(county_interventions)
# covid cases by county
head(us_cases_daily$county)
# nytimes sah dates
head(sah)

Load packages

library(devtools)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(lubridate)
library(strucchange)
library(segmented)
library(scales)

Data prep

Cases need to be joined to state-level and county-level data

us_state <- us_cases_daily$state
us_county <- us_cases_daily$county

# state_join
tmptb <- sah %>% mutate(state = tolower(sah$state)) %>% 
  mutate(order_date = as.Date(order_date, format = "%m/%d/%Y")) %>% 
  filter(county == 'All') %>% select(state, order_date)
state_join <- us_state %>% select(state1, state2, date, cases, deaths) %>% 
  inner_join(tmptb, by = c("state1" = "state"))

# county_join
## drop words like "county" and "borough" from admin2
county_interventions$admin2 <- gsub("\\s*\\w*$", "", county_interventions$admin2)
county_interventions <- county_interventions %>% 
  mutate(acronym = tolower(county_interventions$acronym)) %>% 
  mutate(admin1 = tolower(county_interventions$admin1)) %>% 
  mutate(admin2 = tolower(county_interventions$admin2))
tmptb <- us_county %>% select(state1, county.x, date, cases, deaths)
county_join <- county_interventions %>%
  select(rank, acronym, admin1, admin2, SAH_County_Date, SAH_State_Date) %>% 
  inner_join(tmptb, by = c("admin1" = "state1", "admin2" = "county.x"))
## if SAH_County_Date is blank, fill with SAH_State_Date column
county_join$SAH_County_Date <- coalesce(county_join$SAH_County_Date,
                                        county_join$SAH_State_Date)

# function to raise case if needed
firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

# sample
head(state_join)
head(county_join)

State-Level Analysis

Select the state of interest

# vector of state acronyms to loop
statename <- state_join %>% distinct(state1) %>% select(state1)

# placeholder because loop hasn't been figured out yet
statename <- statename %>% slice(1)
statename

# isolate variables of interest
state_tb <- state_join %>% select(state1, date, cases, order_date) %>% 
  filter(state1 == statename[[1]])

# convert date to julian day
state_tb$jday <- yday(state_tb$date)

Calculate k

Calculate k value in y = A$e^{kt}$

# linearize cases curve using natural log
state_tb$cases_log <- log(state_tb$cases)
state_tb$k <- NA
state_tb$k[2:length(state_tb$k)] <- 
  diff(state_tb$cases_log)/diff(state_tb$jday)
state_tb <- state_tb %>% filter_all(all_vars(!is.infinite(.)))

# remove rows with any na values
state_tb_fit <- state_tb %>% drop_na()

# smooth cases_log curve
lw <- loess(cases_log ~ jday, state_tb_fit)
state_tb_fit$cases_logfit <- lw$fitted

## calculate k values from smoothed cases_log curve
state_tb_fit$kfit <- NA
state_tb_fit$kfit[2:length(state_tb_fit$kfit)] <- 
  diff(state_tb_fit$cases_logfit)/diff(state_tb_fit$jday)

# grab SAH order date
orderdate <- unique(state_tb$order_date)
orderjday <- yday(orderdate)

Plot 1

p1 <- ggplot(state_tb, aes(x=date, y=cases)) +
  geom_point() +
  scale_x_date(date_breaks = "4 day", labels=date_format("%b-%d")) +
  labs(title = "COVID-19 Cases",
     subtitle = firstup(statename[[1]]),
     x = "Date", y = "Cases") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p1

Plot 2

p2 <- ggplot(state_tb_fit, aes(x=date, y=cases_log)) + geom_point() +
  geom_line(aes(x=date, y=cases_logfit), color = "#CD5C5C", size=1) +
  scale_x_date(date_breaks = "4 day", labels=date_format("%b-%d")) +
  labs(title = "COVID-19 Cases in Natural Log Scale",
     subtitle = firstup(statename[[1]]),
     x = "Date", y = "Cases (log scale)") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p2

Locate breakpoints in cases data

# pull jday and kfit as numeric vectors
x <- state_tb_fit %>% pull(jday)
y <- state_tb_fit %>% pull(kfit)

# use intervention_breakpts function to find breakpoints within the data
output <- intervention_breakpts(x, y)

Investigate outputs of intervention_breakpts() function

## output 1: exported as a tibble
t <- output[[1]]

## output 2: linear fit to curve
my.lm <- output[[2]][[1]]
summary(my.lm)

## output 3: segments
my.seg <- output[[2]][[2]]
slope(my.seg)
summary(my.seg)

## output 4: modeled piecewise linear function
my.model <- output[[2]][[3]]

## output 5: estimated breakpoints in julian days
my.lines <- round(unname(output[[2]][[4]]))

Plot 3

# convert julian day to date
t_join <- t %>% left_join(my.model, by = c("jdates")) %>% 
  rename("k"="rate.x", "klinear"="rate.y")
t_join$date <- as.Date(t_join$jdates, origin = '2019-12-31')
my.lines_date <- as.Date(my.lines, origin = '2019-12-31')

# label vertical lines
vline1 <- data.frame(date = orderdate,
                    label = "SAH")
vline2 <- data.frame(date = my.lines_date,
                     label = "breakpoints")

# plot
p3 <- ggplot(t_join, aes(x=date, y=k)) + geom_point() +
  geom_line(aes(x=date, y=klinear)) +
  geom_vline(aes(xintercept=date, colour=label), data=vline1, size=1) +
  geom_vline(aes(xintercept=date, colour=label), data=vline2, size=1) +
  scale_x_date(date_breaks = "2 day", labels=date_format("%b-%d")) +
  scale_colour_manual(name = "Dates", values = c("breakpoints" = "grey",
                                               "SAH" = "#57B1C5")) +
  labs(title = "Breakpoints in k Curve",
     subtitle = firstup(statename[[1]]),
     caption = paste0("SAH Order Date = ", orderdate),
     x = "Date", y = "k") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p3

Calculate lag

flatpoint_jday <- my.lines[length(my.lines)]
flatpoint_date <- as.Date(flatpoint_jday, origin = '2019-12-31')
lagdiff <- flatpoint_jday - orderjday

Plot 4

# plot
df <- state_tb_fit %>% as.data.frame()

df_vline <- data.frame(dates = c(flatpoint_date, orderdate),
                       labels = c("flatten", "SAH"))

p4 <- ggplot(df, aes(x=date, y=cases_log)) + geom_point() +
  geom_line(aes(x=date, y=cases_logfit)) +
  geom_vline(aes(xintercept=date, colour=label), data=vline2, size=1) +
  geom_vline(aes(xintercept = dates, colour = labels), data = df_vline,
             linetype = 1, size = 1, show.legend = T) +
  scale_colour_manual(name = "Dates", values = c("breakpoints" = "grey",
                                                 "flatten" = "#E88147",
                                                 "SAH" = "#57B1C5")) +
  scale_x_date(date_breaks = "2 day", labels=date_format("%b-%d")) +
  labs(title = "Impact of SAH Order on Curve",
       subtitle = firstup(statename[[1]]),
       caption = paste0("Lag = ", lagdiff, " days"),
       x = "Date", y = "Cases (log scale)") +
  theme(axis.text.x = element_text(angle = 90))
p4

All Plots

grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)

County level Analysis

This replicates the same methodology as above but at the county-level

# vector of state acronyms to loop
countyname <- county_join %>% distinct(admin1, admin2)

# placeholder because loop hasn't been figured out yet
countyname <- countyname %>% slice(1)
countyname

# isolate variables of interest
county_tb <- county_join %>%
  select(admin1, admin2, date, cases, SAH_County_Date) %>% 
  filter(admin1 == countyname[[1]] & admin2 == countyname[[2]])

# convert date to julian day
county_tb$jday <- yday(county_tb$date)

# calculate k value in y = Ae^(kx)
## linearize cases curve using natural log
county_tb$cases_log <- log(county_tb$cases)
county_tb$k <- NA
county_tb$k[2:length(county_tb$k)] <- 
  diff(county_tb$cases_log)/diff(county_tb$jday)
county_tb <- county_tb %>% filter_all(all_vars(!is.infinite(.)))

## remove rows with any na values
county_tb_fit <- county_tb %>% drop_na()

## smooth cases_log
lw <- loess(cases_log ~ jday, county_tb_fit)
county_tb_fit$cases_logfit <- lw$fitted

## calculate k values from smoothed cases_log curve
county_tb_fit$kfit <- NA
county_tb_fit$kfit[2:length(county_tb_fit$kfit)] <- 
  diff(county_tb_fit$cases_logfit)/diff(county_tb_fit$jday)

# grab SAH order date
orderdate <- unique(county_tb$SAH_County_Date)
orderjday <- yday(orderdate)

# pull jday and kfit as numeric vectors
x <- county_tb_fit %>% pull(jday)
y <- county_tb_fit %>% pull(kfit)

# use intervention_breakpts function to find breakpoints within the data
output <- intervention_breakpts(x, y)

# outputs
t <- output[[1]]
my.lm <- output[[2]][[1]]
summary(my.lm)
my.seg <- output[[2]][[2]]
slope(my.seg)
summary(my.seg)
my.model <- output[[2]][[3]]
my.lines <- round(unname(output[[2]][[4]]))

# calculate lag
flatpoint_jday <- my.lines[length(my.lines)]
flatpoint_date <- as.Date(flatpoint_jday, origin = '2019-12-31')
lagdiff <- flatpoint_jday - orderjday

Plots

# plot 1
p1 <- ggplot(county_tb, aes(x=date, y=cases)) +
  geom_point() +
  scale_x_date(date_breaks = "4 day", labels=date_format("%b-%d")) +
  labs(title = "COVID-19 Cases",
     subtitle = paste0(firstup(countyname[[2]]), ", ",
                       firstup(countyname[[1]])),
     x = "Date", y = "Cases") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p1

# plot 2
p2 <- ggplot(county_tb_fit, aes(x=date, y=cases_log)) +
  geom_point() +
  geom_line(aes(x=date, y=cases_logfit), color = "#CD5C5C", size=1) +
  scale_x_date(date_breaks = "4 day", labels=date_format("%b-%d")) +
  labs(title = "COVID-19 Cases in Natural Log Scale",
     subtitle = paste0(firstup(countyname[[2]]), ", ",
                       firstup(countyname[[1]])),
     x = "Date", y = "Cases (log scale)") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p2

# plot 3
## convert julian day to date
t_join <- t %>% left_join(my.model, by = c("jdates")) %>% 
  rename("k"="rate.x", "klinear"="rate.y")
t_join$date <- as.Date(t_join$jdates, origin = '2019-12-31')
my.lines_date <- as.Date(my.lines, origin = '2019-12-31')

## label vertical lines
vline1 <- data.frame(date = orderdate,
                    label = "SAH")
vline2 <- data.frame(date = my.lines_date,
                     label = "breakpoints")

## plot
p3 <- ggplot(t_join, aes(x=date, y=k)) + geom_point() +
  geom_line(aes(x=date, y=klinear)) +
  geom_vline(aes(xintercept=date, colour=label), data=vline1, size=1) +
  geom_vline(aes(xintercept=date, colour=label), data=vline2, size=1) +
  scale_x_date(date_breaks = "2 day", labels=date_format("%b-%d")) +
  scale_colour_manual(name = "Dates", values = c("breakpoints" = "grey",
                                               "SAH" = "#57B1C5")) +
  labs(title = "Breakpoints in k Curve",
     subtitle = paste0(firstup(countyname[[2]]), ", ",
                       firstup(countyname[[1]])),
     caption = paste0("SAH Order Date = ", orderdate),
     x = "Date", y = "k") +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.title = element_blank())
p3

# plot 4
df <- county_tb_fit %>% as.data.frame()

df_vline <- data.frame(dates = c(flatpoint_date, orderdate),
                       labels = c("flatten", "SAH"))

p4 <- ggplot(county_tb_fit, aes(x=date, y=cases_log)) + geom_point() +
  geom_line(aes(x=date, y=cases_logfit)) +
  geom_vline(aes(xintercept=date, colour=label), data=vline2, size=1) +
  geom_vline(aes(xintercept = dates, colour = labels), data = df_vline,
             linetype = 1, size = 1, show.legend = T) +
  scale_colour_manual(name = "Dates", values = c("breakpoints" = "grey",
                                                 "flatten" = "#E88147",
                                                 "SAH" = "#57B1C5")) +
  scale_x_date(date_breaks = "2 day", labels=date_format("%b-%d")) +
  labs(title = "Impact of SAH Order on Curve",
       subtitle = paste0(firstup(countyname[[2]]), ", ",
                         firstup(countyname[[1]])),
       caption = paste0("Lag = ", lagdiff, " days"),
       x = "Date", y = "Cases (log scale)") +
  theme(axis.text.x = element_text(angle = 90))
p4

grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)

Back to top




agroimpacts/covid19interventions documentation built on May 6, 2020, 12:35 a.m.