In khelmsmith/flm_NE_WNV: Functional Linear Modeling For Vector-borne Disease

knitr::opts_chunk$set(echo = TRUE)

Set up data for example

library(tidyverse)
library(flm.NE.WNV) 
library(MuMIn)

# Read in population data
pop = flm.NE.WNV::NE_county_pops  %>% 
  filter(year >= 2002) %>% 
  as.data.frame()

county_lookup <- pop %>% 
  filter(year == "2010") %>% 
  select(County, fips)%>% 
  as.data.frame()

#Human cases, simulated, based on predicted values from the fitted model. 
cases = flm.NE.WNV::sampledat %>% 
  select(County, year, cases)%>% 
  # # drop Arthur back to 0 to test new zero filling methods
  # mutate(cases = case_when(County == "Arthur" ~ 0,
  #                          TRUE ~ cases)) %>% 
  as.data.frame()

# Read in environmental data
NEdat = flm.NE.WNV::NEdat %>% 
  rename(tmean = temp) %>% 
  mutate(fips = sprintf("31%03d", cofips)) %>% 
  left_join(county_lookup, by = "fips") %>% 
  select(County, fips, year, month, tmean, ppt)%>% 
  as.data.frame()


# Read in spi
spi = flm.NE.WNV::spi %>% 
  mutate(County = trimws(County),
         County = gsub(" ", "", County)) %>% 
  left_join(county_lookup, by = "County") %>% 
  rename(spi = spi1) %>% 
  select(County, fips, year, month, spi)%>% 
  as.data.frame()

# Read in spei
spei = flm.NE.WNV::spei %>% 
  mutate(County = trimws(County),
         County = gsub(" ", "", County)) %>% 
  left_join(county_lookup, by = "County") %>% 
  rename(spei = spei1) %>% 
  select(County, fips, year, month, spei)%>% 
  as.data.frame()

# Specify the last date to use -- package data incomplete for 2019
target.date = "2016-02-01" 
#target.date = "2019-03-01" -- this should fail with an error
# Specify the first year of data to use
start.year = 2002

# Specify lag lengths to use
lag.lengths = c(12, 18) #c(12, 18, 24, 30, 36)

#For consistent results with simulated data
in.seed = 4872957 # set = NULL if you have actual case data

Some basic quality control on synthetic data

Our approach struggles if there are counties with no cases, so we eliminate those counties (two in the simulated data).

data <- assemble.data.lags(pop, cases, NEdat, spi, spei, 
                           target.date, start.year, in.seed, lag.lengths)
allLagsT <- data[[1]] # 2004 - 2017
allLags0 <- data[[2]] # 2018
# how many counties lack cases? Should be none.
allLagsT %>% group_by(County) %>% 
  summarize(total_cases = sum(cases)) %>% 
  filter(total_cases == 0)

allLagsT %>% group_by(County) %>% 
  summarize(total_cases = sum(cases)) %>% 
  filter(total_cases > 0)

# to identify counties to fill in
allunits <- unique(cases$County)
missingunits <- !(allunits %in% unique(allLagsT$County))
allunits[missingunits]

What does the state total look like plotted against time?

allLagsT %>% group_by(year) %>% 
  summarize(total_cases = sum(cases)) %>% 
  ggplot() + 
  geom_point(mapping = aes(x = year, y = total_cases))

Run the model

This chunk assembles data and runs models, using all combinations of lagged data for mean temperature, precipitation, SPI and SPEI.

Note: It takes about 5 minutes to run the models with a lag of 9. It takes 25 minutes to run all the models with the set of default lags.

flm.results = call.flm(pop, cases, NEdat, 
                       spi, spei, target.date, start.year, 
                       in.seed = in.seed, lag.lengths = 12,
                       fillzeros = TRUE,
                       nsim = 10,
                       predict_from = "best")

Overview of model chosen by AIC

top <- flm.results[[3]]$AICbest

mod <- summary(flm.results[[3]]$fittedModels[[top]])
mod

View predictions from best-fit model

flm.results[[1]][[top]]

dist_lag_terms <- extract_functional(flm.results[[3]]$fittedModels[[top]]) %>% 
  mutate(lcl = fit - 1.96*se,
         ucl = fit + 1.96*se)
ggplot(data = dist_lag_terms) + 
  geom_line(mapping = aes(x = x, y = fit)) +
  geom_ribbon(mapping = aes(x = x, ymin = lcl, ymax = ucl), alpha = 0.2)+
  facet_wrap(~label)

aictable <- model.sel(flm.results[[3]]$fittedModel)

pull out the "in sample" CRPS scores.

model.sims <- map(flm.results[[3]]$fittedModels, simulate, nsim = 100)
names(model.sims) <- 1:64
model.crps <- map(model.sims,
                  ~scoringRules::crps_sample(allLagsT$cases, .)) %>%
  map(~bind_cols(allLagsT[,c("County","year")], crps = .))

crps.table <- model.crps %>% 
  bind_rows(.id = "model") %>% 
  group_by(model) %>% 
  summarize(mean_crps = mean(crps),
            se_crps = sd(crps)/sqrt(n())) %>% 
  arrange(mean_crps) %>% 
  mutate(order = row_number())
ggplot(crps.table, mapping = aes(x = order, y = mean_crps)) +
  geom_point()

Are they related to the AIC scores?

crps.table$AICc <- flm.results[[3]]$AICfits[as.numeric(crps.table$model)]
ggplot(data = crps.table,
       mapping = aes(x = mean_crps, y = AICc)) + 
  geom_point()