In gcgibson/KCDETD: Non-parametric estimation of infectious disease

knitr::opts_chunk$set(echo = TRUE)

Here we introduce the KCDETD package for R. The KCDE method uses an estimate of the conditional distribution of a time-series given: - The last 4 observed points in the time series. - The value observed last season. - The epiweek used to forecast.

library(devtools)
library(dplyr)
#devtools::install_github("https://github.com/reichlab/cdcfluutils",force = T)
library(KCDETD)
library(tsfknn)
library(sarimaTD)


state_data <-read.csv("../data/state_data.csv")
unique_states <- unique(state_data$region)
    nsim <- 1000

fs_mat_kcde <- matrix(NA,nrow=100*length(unique_states),ncol=4)
fs_mat_sarima <- matrix(NA,nrow=100*length(unique_states),ncol=4)

mse_mat_kcde <- matrix(NA,nrow=100*length(unique_states),ncol=4)
mse_mat_sarima <- matrix(NA,nrow=100*length(unique_states),ncol=4)


ls_index <- 1
for (state in sample(unique_states,10)){
  ma_data <- state_data[state_data$region == state,]
  for (first_test_index in 200:220){
    kcde_fit <- fit_kcde(tail(ma_data$unweighted_ili[1:(first_test_index-1)],100),

                         ts_frequency = 52,
                         h=52,transformation = "none",num_nn=10)

    preds <- simulate.KCDE2(kcde_fit,nsim,newX = ma_data$unweighted_ili[1:(first_test_index-1)],ts_frequency = 52,
                                            h=4,seed = 1)



     sarima_fit <- fit_sarima(tail(ma_data$unweighted_ili[1:(first_test_index-1)],100),52,transformation = "box-cox",seasonal_difference = TRUE)

    sarima_pred <- 
  simulate(
    object = sarima_fit,
    nsim = 1000,
    seed = 1,
    newdata = ma_data$unweighted_ili[1:(first_test_index-1)],
    h = 4
  )


    truth <- ma_data[first_test_index:(first_test_index+3),]$unweighted_ili
    for (h in 1:4){
      fs_mat_kcde[ls_index,h] <- sum(round(preds[,ncol(preds) -4 + h],2) == round(truth[h],2))/1000
      fs_mat_sarima[ls_index,h] <- sum(round(sarima_pred[,ncol(sarima_pred) -4 + h],2) == round(truth[h],2))/1000
      mse_mat_kcde[ls_index,h] <- (mean(preds[,ncol(preds)-4+h],na.rm=T) - truth[h])^2
      mse_mat_sarima[ls_index,h] <- (mean(sarima_pred[,ncol(preds)-4+h],na.rm=T) - truth[h])^2

    }
    ls_index <- ls_index+1
  }
}  
#ggplot(data=data.frame(y=c(t(preds)),x=rep(1:ncol(preds),1000),group=rep(1:1000,each=ncol(preds))),aes(x=x,y=y,group=group)) + geom_line() + geom_line(data=ma_data[first_test_index:(first_test_index+3),],aes(x=ncol(preds)-4+ 1:4,y=unweighted_ili,group=1,col='truth'))

exp(mean(pmax(-10,log(fs_mat_sarima)),na.rm=T))
exp(mean(pmax(-10,log(fs_mat_kcde)),na.rm=T))
mean(mse_mat_kcde,na.rm=T)
mean(mse_mat_sarima,na.rm=T)