R/predict_wYr.r

Defines functions predict_wYr

Documented in predict_wYr

#' Predict data
#' 
#' @param fittedModel Model to predict from
#' @param allLagsT The assembled data set to use for training
#' @param allLagsO The assembled data set to use for out-of-sample prediction
#' @param unpredictedO out-of-sample data for counties with no cases in training data
#' @param fillzeros Logical. If TRUE fill in with zeros counties that have no cases ever.
#' @param allunits Character. Vector of county names in complete data set. Used 
#'   when fillzeros == TRUE
#' @param nsim Integer. Number of samples to draw from posterior distribution. Defaults to zero, which has the expected value of cases in predcases.
#' @name predict
#' @export
NULL

#' @describeIn predict Predict from models including year
predict_wYr = function(fittedModel, allLagsT, allLagsO, unpredictedO, fillzeros, allunits, nsim){
  
  # Plug in the model formula here
  modform <- formula(fittedModel)
  
  modsum <- summary(fittedModel)
  
  # extracting the fitted yr coefficients
  ptab <- as.data.frame(modsum$p.table)
  years<-  grepl("year", row.names(ptab))
  ptab <- ptab[years, 1, drop=FALSE]
  colnames(ptab) <- "yrcoef"
  ptab[nrow(ptab)+1,"yrcoef"] <- -sum(ptab$yrcoef)
  ptab$year <- factor(levels(allLagsT$year))
  contrasts(ptab$year) <- contrasts(allLagsT$year)
  
  allLagsT <- merge(allLagsT, ptab) 
  
  # predict the year coefficient with training data 
  
  # change tmean and SP(E)I variable lag lengths here to match model formula
  # yrcoefmodform <- update(modform, yrcoef~.-year-offset(log(pop100K)))
  # yrcoefmod <- gam(yrcoefmodform, data=allLagsT, family=gaussian())
  # do it the hard way, because we don't know what other terms are in the model
  # and update doesn't remove the offset.
  form <- Reduce(paste, deparse(fittedModel$formula[3]))
  form <- gsub('^.|.$', '', form)
  form <- trimws(form)  
  formterms <- unlist(strsplit(form, "+", fixed = TRUE))
  numterms <- length(formterms)
  termtab <- data.frame(matrix(unlist(formterms), nrow=numterms, byrow=TRUE),stringsAsFactors=FALSE)
  colnames(termtab) <- "terms"
  termtab <- termtab %>% mutate(keep = ifelse(grepl("s\\(", terms), "yes", "no"))
  termtab <- termtab[termtab$keep == "yes",]
  newterms <- toString(paste0(termtab$terms, sep = "+"))
  newterms <- gsub("\\+,","\\+", newterms)
  yrcoefmodform <- paste0("yrcoef ~ ",newterms ," County")
  yrcoefmodform <- as.formula(yrcoefmodform)
  
  yrcoefmod <- gam(yrcoefmodform, data=allLagsT, family=gaussian())
  
  # use the year coefficient to predict coyr for each county-year on training data
  new <- predict(yrcoefmod, newdata = allLagsT, type = "link", se = TRUE)
  allLagsT$coyr <- new$fit
  allLagsT$coyrse <- new$se.fit
  allLagsT$yrcoef <- NULL

  new <- predict(yrcoefmod, newdata = allLagsO, type = "link", se = TRUE)
  allLagsO$coyr <- new$fit
  allLagsO$coyrse <- new$se.fit
  allLagsO$yrcoef <- NULL
  
  #refit the model with training data
  predmod <- gam(update(modform, . ~ . - year + coyr), data=allLagsT, family=nb())
  # summary(mod2)

  # now predict using the new coyr variable instead of the year coefficient
  preds <- predict(predmod, newdata=allLagsO, type = "link", se=TRUE) 
  allLagsO$fit <- preds[[1]]
  allLagsO$se <- preds[[2]]
  
  if (nsim > 0){
    allLagsO$predcases <- simulate(predmod, newdata = allLagsO, nsim = nsim)
  } else {
    allLagsO$predcases <- exp(allLagsO$fit)
  }

  allLagsO <- allLagsO[,c("County", "year", "cases", "fit", "se", "predcases")]
  
  if (fillzeros){
    message("Filling in counties with no cases with zero predictions.")
    # allunits has all counties, including those with zeros.
    # unpredicted0 has data for counties with no cases in training set
    # extract counties in cases that are NOT in results$predictions
    # to identify counties to fill in
    missingunits <- !(allunits %in% unique(allLagsO$County))
    if (sum(missingunits) != nrow(unpredictedO)) stop("Bad missing units count in predict_wYr")
    if (sum(missingunits) > 0){
      unpredictedO$predcases = ifelse(nsim > 0, 
                                      matrix(0, nrow = length(missingunits), ncol = nsim),
                                      0)
      unpredictedO <- unpredictedO[,c("County", "year", "cases", "predcases")]
      allLagsO <- dplyr::bind_rows(allLagsO, unpredictedO)                               
    } else {
      message("No missing units found")
    }
  }
  
  return(allLagsO)
}
khelmsmith/flm_NE_WNV documentation built on June 24, 2022, 11:13 p.m.