Analysis/Validate/02-Prep-Data-Inputs-Validate.R

# ---------------------
# Prep data inputs for LEMMAABM
# Chris Hoover Feb 2021
# ---------------------

library(tidyverse)
library(lubridate)
library(data.table)

# Load par inputs to get timeframe ------------------
input_pars <- readRDS(here::here("data", "processed", "input_pars_validate.rds"))

t0        <- input_pars$time_pars$t0
t.end     <- input_pars$time_pars$t.end
ref_date  <- input_pars$time_pars$ref_date

# referece for safegraph which starts on Jan 1 2020
start.num <- as.numeric(t0 - as.Date("2019-12-31"))

# Don't use these agents as agents from previous sim will be used in sim, but need to know N for tests_pp below
agents <- readRDS(here::here("data/processed/SF_agents_processed.rds"))
  N <- nrow(agents)

# Testing data for sims ----------------------
# source(here::here("data", "get","COVID_CA_get_latest.R"))
load(here::here("data", "get", "got", "CA_SF_data2021-02-16.Rdata"))
# sf_test is observed testing completed in SF county
# Must contain columns date_num and tests_pp to convert to testing function in model
sf_test_smooth <- sf_test %>% 
  mutate(Date = as.Date(substr(specimen_collection_date, 1,10)),
         tests_pp = tests/N) %>% 
  padr::pad() %>% 
  mutate(date_num = as.numeric(Date-ref_date)) %>% 
  padr::fill_by_value(tests_pp, value = 0) %>% 
  mutate(tests_pp_7day_avg = zoo::rollmean(tests_pp, 7, na.pad = T, align = "center"),
         tests_pp = case_when(is.na(tests_pp_7day_avg) & Date < as.Date("2020-03-10") ~ 2/N,
                              is.na(tests_pp_7day_avg) & Date > as.Date("2020-03-10") ~ 5000/N,
                              !is.na(tests_pp_7day_avg) ~ tests_pp_7day_avg))

last_sf_test <- max(sf_test_smooth$Date)

tests_avail <- sf_test_smooth %>% 
  filter(date_num >= 0) %>% 
  dplyr::select(date_num, tests_pp)

# sf_test_smooth %>%  ggplot() + geom_line(aes(x = Date, y = tests_pp)) + geom_line(aes(x = Date, y = tests_pp_7day_avg), col = "red") + theme_classic()


# Safegraph data -------------------------
#San francisco stay at home by percent by ct derived from safegraph
sf_sfgrph_pcts20 <- readRDS(here::here("data", "processed","Safegraph", "sfgrph_devices_pct_home_cts_2020.rds"))
sf_sfgrph_pcts21 <- readRDS(here::here("data", "processed","Safegraph", "sfgrph_devices_pct_home_cts_2021.rds"))

sf_sfgrph_pcts <- rbind(sf_sfgrph_pcts20, sf_sfgrph_pcts21)

sf_sfgrph_pcts <- sf_sfgrph_pcts %>% filter(Date >= t0)

last_sfgrph <- max(sf_sfgrph_pcts$Date)

# sf_sfgrph_pcts %>% 
#   group_by(Date) %>% 
#   summarise(n_devices     = sum(device_count), 
#             n_home        = sum(completely_home_device_count), 
#             n_part_work   = sum(part_time_work_behavior_devices), 
#             n_full_work   = sum(full_time_work_behavior_devices), 
#             per_home      = n_home/n_devices, 
#             per_part_work = n_part_work/n_devices, 
#             per_full_work = n_full_work/n_devices) %>% 
#   ggplot() +
#   geom_line(aes(x = Date, y = per_home), col = "green") +
#   geom_line(aes(x = Date, y = per_part_work), col = "orange") +
#   geom_line(aes(x = Date, y = per_full_work), col = "red") +
#   theme_bw() +
#   labs(y = "Pct device behavior",
#        title = "Safegraph home/work metrics SF County")

sf_sfgrph_pct_home <- sf_sfgrph_pcts %>% 
  dplyr::select(CT, Date, pct_home) %>% 
  data.table::as.data.table()

#San Francisco ct mvmt list derived from sfgrph data
sf_ct_cdf_ls20 <- readRDS(here::here("data", "processed", "Safegraph", "safegraph_ct_mvmt_cdf_list_2020processed.rds"))
sf_ct_cdf_ls21 <- readRDS(here::here("data", "processed", "Safegraph", "safegraph_ct_mvmt_cdf_list_2021processed.rds"))
sf_ct_cdf_ls <- c(sf_ct_cdf_ls20, sf_ct_cdf_ls21)

sf_ct_cdf_ls <- sf_ct_cdf_ls[start.num:length(sf_ct_cdf_ls)] 

sf_ct_ids <- read_csv(here::here("data", "raw", "Census_2010_Tracts.csv")) %>% pull(GEOID10) %>% as.numeric()


# Visitors to SF county from other CA counties ------------
sf_visitors20 <- readRDS(here::here("data", "processed", "Safegraph", "SF_visitors_CTs_2020Processed.rds"))
sf_visitors21 <- readRDS(here::here("data", "processed", "Safegraph", "SF_visitors_CTs_2021Processed.rds"))

sf_visitors <- c(sf_visitors20, sf_visitors21)

sf_visitors <- sf_visitors[start.num:length(sf_visitors)]

# Vaccination data ------------------
# Data here but not downloadable yet https://data.sfgov.org/stories/s/a49y-jeyc
# Hack to get approximation of vaccines per day so far
vax_start <- as.Date("2020-12-15")
vax_last_obs <- as.Date("2021-02-01")
vax_last  <- t.end
vax_days <- as.numeric(vax_start-ref_date):as.numeric(vax_last-ref_date)

nvax_last_obs <- 4000
nvax_trend <- nvax_last_obs/as.numeric(vax_last_obs-vax_start)

nvax_last   <- round(nvax_last_obs+nvax_trend*as.numeric(vax_last-vax_last_obs)) # max vaccines per day assuming linear trend continues
nvax <- rep(NA_real_, length(vax_days))
nvax[1] <- 1
nvax[length(vax_days)] <- nvax_last
nvax <- round(zoo::na.approx(nvax))

vax_per_day <- data.frame(days = vax_days ,
                          vax  = nvax)

# Predict/impute into the future if simulation beyond present is desired
if(t.end > last_sf_test){
  # Determine number of days necessary to "impute" then assign average of same number of days in the past into the future  
  n_add <- as.numeric(t.end - last_sf_test)
  lookback <- last_sf_test-n_add-7
  avg_past <- sf_test_smooth %>% filter(Date > lookback) %>% pull(tests_pp) %>% mean()
  tests_pp_pad <- c(sf_test_smooth$tests_pp_7day_avg[which(sf_test_smooth$date_num >0)], 
                    rep(avg_past, n_add))
  dates_pad <- c(sf_test_smooth$date_num[which(sf_test_smooth$date_num >0)], 
                 max(sf_test_smooth$date_num, na.rm = T)+c(1:n_add))
  
  tests_avail <- data.frame(date_num = dates_pad,
                            tests_pp = tests_pp_pad)
}

if(t.end > last_sfgrph){
  #Need the simulation to run beyond when we have safegraph movement data to inform movement, so repeat older safegraph data to project in the future 
  last_sf_day <- wday(last_sfgrph)
  n_add <- as.numeric(t.end - last_sfgrph)
  lookback <- last_sfgrph-n_add
  lookback_day <- ifelse(last_sf_day == 7, 1, last_sf_day+1)
  lookback_week <- lookback-c(1:7)
  lookback_start <- lookback_week[which(wday(lookback_week) == lookback_day)]
  
  sf_sfgrph_pct_home <- rbindlist(list(sf_sfgrph_pct_home,
                                       sf_sfgrph_pct_home %>% 
                                         filter(Date >= lookback_start) %>% 
                                         mutate(Date = Date + 1 + (last_sfgrph-lookback_start))))
  
  sf_ct_cdf_ls <- c(sf_ct_cdf_ls, 
                    sf_ct_cdf_ls[(length(sf_ct_cdf_ls)-as.numeric(last_sfgrph-lookback_start)):length(sf_ct_cdf_ls)]) 
  
  sf_visitors <- c(sf_visitors,
                   sf_visitors[(length(sf_visitors)-as.numeric(last_sfgrph-lookback_start)):length(sf_visitors)])
  
  
}


# Final object to export
data_inputs                 <- list()
data_inputs$ct_cdf_list     <- sf_ct_cdf_ls
data_inputs$ct_ids          <- sf_ct_ids
data_inputs$stay_home_dt    <- sf_sfgrph_pct_home
data_inputs$visitors_list   <- sf_visitors
data_inputs$tests_avail     <- tests_avail
data_inputs$vax_per_day     <- vax_per_day

saveRDS(data_inputs, here::here("data", "processed", "data_inputs_validate.rds"))
cmhoove14/LEMMAABMv4 documentation built on Nov. 1, 2021, 10:23 p.m.