R/NullModels.R


# Functions to implement different Null models for CDC West Nile Virus data

# A.C. Keyel
# Created on 2021-04-13
# Incorporated into dfmip on 2021-06-21

# Covered by package DESCRIPTION and NAMESPACE
#library(MASS)
#library(testthat)

# Index of Null Models (Unit Tests: Y/N)
# apply.always.absent (N)
# apply.mean.value (N)
# apply.stratified.mean.null (N)
# apply.historical.null (N)
# apply.prior.year.null (N)
# apply.random.incidence (N)
# apply.stratified.incidence (N)
# apply.negative.binomial (N)
# apply.ar1 (N* Yes but not automated)
# apply.uniform (Y)

#### Create an overview function to run the null models in aggregate

#' Apply Null Models for One Year
#'
#' Runs the requested null models for a single target year and creates a joint
#' output data frame and a timing data frame. NOTE: the always absent and
#' pooled mean value predictions are computed inline rather than through the
#' apply.always.absent.null and apply.mean.value.null functions - those were
#' created later and have not been edited into this code.
#'
#'@param train.data A data set containing a 'location' field with locations,
#'annual human cases in a 'count' field, and year in a 'year' field. Note that
#'field names must match EXACTLY. For the incidence models (PI, IN),
#'human population must also be included in a 'POP' field.
#'@param obs.data The observed data for the target year, with 'location' and
#''count' fields.
#'@param n.draws The number of probabilistic draws to make for each location.
#'@param target.year The year being forecast.
#'@param nulls.vec Which null models to run. The default of "ALL" runs all
#'models. Individual models can be requested with a vector of 2-letter
#'abbreviations:\tabular{ll}{
#' AA \tab Always Absent\cr
#' PV \tab Pooled Mean Value\cr
#' MV \tab Stratified Mean Value\cr
#' HN \tab Historical Null\cr
#' PN \tab Prior Year Null\cr
#' PI \tab Pooled Incidence\cr
#' IN \tab Stratified Incidence\cr
#' UN \tab Uniform Null\cr
#' NB \tab Negative Binomial\cr
#' AR \tab Autoregressive 1\cr }
#'
#'@param out.path The path to write the crps.df and time.df data objects.
#' If no output is desired, set this to NA (default).
#'@param in.seed The starting seed to ensure replicability of the random
#'processes
#'
#'@return A list with five elements: mean.crps.df, a data frame with the
#' average CRPS score for the target year for each null model; time.df,
#' the timing for each null model; raw.crps.df, the individual location
#' CRPS scores; pred.df, the individual draws for each location; and
#' sample.size, the number of training records used.
#'
#'@export apply.null.models.one.year
#'
apply.null.models.one.year = function(train.data, obs.data, n.draws, target.year,
                                      nulls.vec = "ALL", out.path = NA, in.seed = 20210622){

  if (length(nulls.vec) == 1 && nulls.vec[1] == "ALL"){
    nulls.vec = c("AA", "PV", "MV", "HN", "PN", "PI", "IN", "UN", "NB", "AR")
  }

  # Check for required fields
  check.fields(train.data, nulls.vec)

  set.seed(in.seed)
  draw.vec = sprintf("DRAW_%04.0f", seq(1,n.draws))

  years = sort(as.numeric(unique(train.data$year)))

  mean.crps.df = data.frame(model = "DROP", target.year = NA, CRPS = NA)
  time.df = data.frame(model = "DROP", target.year = NA, start.time = NA, end.time = NA)
  raw.crps.df = data.frame(model = "DROP", location = NA, count = NA, year = NA, CRPS = NA)
  pred.df = data.frame(model = "DROP", location = NA, count = NA, year = NA)
  for (i in 1:length(draw.vec)){
    this.entry = draw.vec[i]
    pred.df[[this.entry]] = NA
  }

  sample.size = nrow(train.data)

  # Make sure train.data field is numeric
  train.data$count = as.numeric(as.character(train.data$count))
  train.data$year = as.numeric(as.character(train.data$year))

  obs.data.count = obs.data$count
  obs.data.loc = obs.data$location
  n.records = length(obs.data.count)

  for (this.null in nulls.vec){
    null.start = Sys.time()
    out.data = null.engine(this.null, train.data, n.draws, n.records, target.year, draw.vec)
    null.end = Sys.time()
    time.df = rbind(time.df, c(this.null, target.year, null.start, null.end))
    out.data = matrix(out.data, ncol = n.draws)
    out.crps = scoringRules::crps_sample(as.numeric(obs.data.count), as.matrix(out.data))
    out.matrix = matrix(c(rep(this.null, n.records), obs.data.loc, obs.data.count, rep(target.year, n.records), out.crps),
                        ncol = 5)
    colnames(out.matrix) = c("model", "location", "count", "year", "CRPS")
    raw.crps.df = rbind(raw.crps.df, out.matrix)

    pred.matrix = matrix(c(rep(this.null, n.records), obs.data.loc, obs.data.count,
                           rep(target.year, n.records), out.data), ncol = 4 + n.draws)
    colnames(pred.matrix) = c('model', 'location', 'count', 'year', draw.vec)
    pred.df = rbind(pred.df, pred.matrix)
    mean.null.crps = mean(out.crps)
    mean.crps.df = rbind(mean.crps.df, c(this.null, target.year, mean.null.crps))
  }

  # Remove initialization rows
  mean.crps.df = mean.crps.df[2:nrow(mean.crps.df), ]
  time.df = time.df[2:nrow(time.df), ]
  mean.crps.df$CRPS = as.numeric(mean.crps.df$CRPS)
  raw.crps.df = raw.crps.df[2:nrow(raw.crps.df), ]
  pred.df = pred.df[2:nrow(pred.df), ]

  time.df$TIME.DIFF = as.numeric(time.df$end.time) - as.numeric(time.df$start.time)


  return(list(mean.crps.df, time.df, raw.crps.df, pred.df, sample.size))
}
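
# A minimal usage sketch (illustrative only; the toy data frames and values
# below are made up, and the example is wrapped in if (FALSE) so it does not
# run when the package is built):
if (FALSE) {
  toy.train = data.frame(location = rep(c("A", "B"), each = 3),
                         year = rep(2010:2012, 2),
                         count = c(0, 1, 2, 3, 4, 5))
  toy.obs = data.frame(location = c("A", "B"), count = c(1, 4))
  res = apply.null.models.one.year(toy.train, toy.obs, n.draws = 100,
                                   target.year = 2013, nulls.vec = c("AA", "MV"))
  res[[1]] # mean CRPS for the target year, by model
}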

#' Compute the null model results
#'
#' @param this.null an indicator for which null model to run: AA, PV, MV, HN, PN, PI, IN, UN, NB, AR
#' @param train.data The training data to use for parameterizing the null model
#' @param n.draws the number of draws from the resulting distribution to use
#' @param n.records The number of locations being predicted
#' @param year The focal forecast year
#' @param draw.vec A vector of column names for the probabilistic draws
#'
#' @return a matrix with one row for each location and one column for each draw
null.engine = function(this.null, train.data, n.draws, n.records, year, draw.vec){

  if (this.null == "AA"){
    out.data = rep(0, n.draws * n.records)
  }

  if (this.null == "PV"){
    mean.value = mean(train.data$count)
    out.data = rep(mean.value, n.draws * n.records)
  }

  if (this.null == "MV"){
    sm.result = apply.stratified.mean.null(train.data, year, n.draws, draw.vec)
    out.data.1 = sm.result[ ,3:ncol(sm.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "HN"){
    hn.result = apply.historical.null(train.data, year, n.draws, draw.vec)
    out.data.1 = hn.result[ ,3:ncol(hn.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "PN"){
    pn.result = apply.prior.year.null(train.data, year, n.draws, draw.vec)
    out.data.1 = pn.result[ ,3:ncol(pn.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "PI"){
    ri.result = apply.random.incidence.v2(train.data, year, n.draws, draw.vec)
    out.data.1 = ri.result[ ,3:ncol(ri.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "IN"){
    si.result = apply.stratified.incidence.v2(train.data, year, n.draws, draw.vec) # target.year
    out.data.1 = si.result[ ,3:ncol(si.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "UN"){
    un.result = apply.uniform(train.data, year, n.draws, draw.vec)
    out.data.1 = un.result[ ,3:ncol(un.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "NB"){
    nb.result = apply.negative.binomial(train.data, year, n.draws, draw.vec)
    out.data.1 = nb.result[ ,3:ncol(nb.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "AR"){
    ar.result = apply.ar1(train.data, year, n.draws, draw.vec)
    out.data.1 = ar.result[ ,3:ncol(ar.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  return(out.data)
}

#' Correct out data
#'
#' Correct the data format coming out of the apply.X functions to be compatible
#' with downstream code.
#' @param out.data.1 A data matrix with a column for every draw, and a row for every location of interest.
#' @param n.draws number of draws from distribution
#'
#' @return the same values as a plain numeric matrix (one column per draw), free of the list structure introduced by data frame subsetting.
#'
correct.out.data = function(out.data.1, n.draws){
  # Flatten the (possibly list-structured) data frame columns into a single
  # numeric vector, then reshape into a location x draw matrix
  out.data = c()
  for (col in 1:n.draws){
    out.data = c(out.data, out.data.1[[col]])
  }
  out.data = matrix(out.data, ncol = n.draws)

  return(out.data)
}
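
# A small illustration of what correct.out.data does (hypothetical toy input;
# wrapped in if (FALSE) so it does not run when the package is built):
if (FALSE) {
  df = data.frame(DRAW_0001 = c(1, 2), DRAW_0002 = c(3, 4))
  correct.out.data(df, n.draws = 2) # a plain 2 x 2 numeric matrix
}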

#' Apply Null Models
#'
#' Runs the null models as a group and creates a joint output data frame and a
#' timing data frame. NOTE: the always absent and mean value predictions were
#' NOT calculated using the apply.always.absent.null and apply.mean.value.null
#' functions - these were created later and have not been edited into the code.
#'
#'@param in.data A data set containing 'location' field with locations,
#'annual human cases in a 'count' field, and year in a 'year' field. Note that
#'field names must match EXACTLY. For Stratified Incidence and Random Incidence,
#'human population also needs to be included in a 'POP' field.
#'@param n.draws The number of probabilistic draws to make for each location.
#'@param nulls Which null model to run. Default of "ALL" runs all models.
#'An individual model can be run by passing its 2-letter abbreviation:\tabular{ll}{
#' HN \tab Historical Null\cr
#' NB \tab Negative Binomial\cr
#' AA \tab Always Absent\cr
#' SI \tab Stratified Incidence\cr
#' RI \tab Random Incidence\cr
#' AR \tab Autoregressive 1\cr
#' SM \tab Stratified Mean\cr
#' PN \tab Prior Year\cr
#' UN \tab Uniform Null\cr
#' MV \tab Mean Value\cr }
#'
#'@param out.path The path to write the crps.df and time.df data objects.
#' If no output is desired, set this to NA (default).
#'@param in.seed The starting seed to ensure replicability of the random
#'processes
#'
#'@return A list with four elements: crps.df, time.df, sample.size.vec, and
#' model.lists. crps.df is a data frame with the average CRPS score for each
#' null model and year, while time.df contains the timing for each null model.
#' sample.size.vec contains the sample sizes by year, while the model.lists
#' object contains the individual county CRPS scores by year.
#'
#'@export apply.null.models
#'
apply.null.models = function(in.data, n.draws, nulls = "ALL", out.path = NA, in.seed = 20210622){

  # Check for required fields
  check.fields(in.data, nulls)

  set.seed(in.seed)
  draw.vec = sprintf("DRAW_%04.0f", seq(1,n.draws))

  years = sort(as.numeric(unique(in.data$year)))

  crps.df = data.frame(model = NA, target.year = NA, CRPS = NA)
  time.df = data.frame(model = NA, target.year = NA, start.time = NA, end.time = NA, ELAPSED = NA)

  sample.size.vec = c()
  aa.lst = list()
  mn.lst = list()
  sm.lst = list()
  hn.lst = list()
  pn.lst = list()
  ri.lst = list()
  si.lst = list()
  nb.lst = list()
  ar.lst = list()
  un.lst = list()

  #for (i in 2:length(years)){ #
  for (i in 4:length(years)){ # Start with the 4th year, so at least 3 years of training data are available
    year = years[i]
    train.data = in.data[in.data$year < year, ]
    #year.in.data = in.data$count[in.data$year == year]
    year.in.data = in.data[in.data$year == year, c("count", "location")]

    # Remove counties from train.data with fewer than 3 years of data (this now varies by county, because we are excluding years prior to the arrival of WNV)
    test = table(train.data$location)
    test2 = test[test > 2]
    train.data = train.data[train.data$location %in% names(test2), ]
    year.in.data = year.in.data[year.in.data$location %in% names(test2), c("count")]

    # Make sure train.data field is numeric
    train.data$count = as.numeric(as.character(train.data$count))
    train.data$year = as.numeric(as.character(train.data$year))

    message(sprintf("year %s, %s counties", year, nrow(test2)))
    sample.size.vec = c(sample.size.vec, nrow(test2))

    if (nulls == "ALL" | nulls == "AA"){
      # Apply Always Absent Null
      aa.start = Sys.time()
      n.records = length(year.in.data)
      aa.data = rep(0, n.draws * n.records)
      aa.end = Sys.time()
      time.df = rbind(time.df, c("AA", year, aa.start, aa.end))
      aa.data = matrix(aa.data, ncol = n.draws)
      aa.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(aa.data))
      aa.lst = append(aa.lst, list(aa.crps))
      mean.aa.crps = mean(aa.crps)
      crps.df = rbind(crps.df, c("AA", year, mean.aa.crps))
    }

    if (nulls == "ALL" | nulls == "MV"){
      # Apply Mean Value Null
      mn.start = Sys.time()
      mean.value = mean(train.data$count)
      n.records = length(year.in.data)
      mn.data = rep(mean.value, n.draws * n.records)
      mn.end = Sys.time()
      time.df = rbind(time.df, c("MV", year, mn.start, mn.end))
      mn.data = matrix(mn.data, ncol = n.draws)

      mn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(mn.data))
      mn.lst = append(mn.lst, list(mn.crps))
      mean.mn.crps = mean(mn.crps)
      crps.df = rbind(crps.df, c("MV", year, mean.mn.crps))
    }

    if (nulls == "ALL" | nulls == "SM"){
      # Apply Stratified Mean Value Null
      sm.start = Sys.time()
      sm.result = apply.stratified.mean.null(train.data, year, n.draws, draw.vec)
      sm.end = Sys.time()
      time.df = rbind(time.df, c("SM", year, sm.start, sm.end))

      sm.data = sm.result[ ,3:ncol(sm.result)]
      sm.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(sm.data))
      sm.lst = append(sm.lst, list(sm.crps))
      mean.sm.crps = mean(sm.crps)
      crps.df = rbind(crps.df, c("SM", year, mean.sm.crps))
    }

    if (nulls == "ALL" | nulls == "HN"){
      # Apply historical null model
      hn.start = Sys.time()
      hn.result = apply.historical.null(train.data, year, n.draws, draw.vec)
      hn.end = Sys.time()
      time.df = rbind(time.df, c("HN", year, hn.start, hn.end))

      hn.data = hn.result[ ,3:ncol(hn.result)]
      hn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(hn.data))
      hn.lst = append(hn.lst, list(hn.crps))
      mean.hn.crps = mean(hn.crps)
      crps.df = rbind(crps.df, c("HN", year, mean.hn.crps))
    }

    if (nulls == "ALL" | nulls == "PN"){
      # Apply prior-year null model
      pn.start = Sys.time()
      pn.result = apply.prior.year.null(train.data, year, n.draws, draw.vec)
      pn.end = Sys.time()
      time.df = rbind(time.df, c("PN", year, pn.start, pn.end))

      pn.data = pn.result[ ,3:ncol(pn.result)]
      pn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(pn.data))
      pn.lst = append(pn.lst, list(pn.crps))
      mean.pn.crps = mean(pn.crps)
      crps.df = rbind(crps.df, c("PN", year, mean.pn.crps))
    }

    if (nulls == "ALL" | nulls == "RI"){
      # Apply random draws based on human population based on US-wide incidence
      ri.start = Sys.time()
      ri.result = apply.random.incidence(train.data, year, n.draws, draw.vec)
      ri.end = Sys.time()
      time.df = rbind(time.df, c("RI", year, ri.start, ri.end))

      ri.data = ri.result[ ,3:ncol(ri.result)]
      ri.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(ri.data))
      ri.lst = append(ri.lst, list(ri.crps))
      mean.ri.crps = mean(ri.crps)
      crps.df = rbind(crps.df, c("RI", year, mean.ri.crps))
    }

    if (nulls == "ALL" | nulls == "SI"){
      # Apply random draws based on human population based on stratified incidence
      si.start = Sys.time()
      si.result = apply.stratified.incidence(train.data, year, n.draws, draw.vec) # target.year
      si.end = Sys.time()
      time.df = rbind(time.df, c("SI", year, si.start, si.end))

      si.data = si.result[ ,3:ncol(si.result)]
      si.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(si.data))
      si.lst = append(si.lst, list(si.crps))
      mean.si.crps = mean(si.crps)
      crps.df = rbind(crps.df, c("SI", year, mean.si.crps))
    }

    if (nulls == "ALL" | nulls == "UN"){
      # Uniform Null model
      un.start = Sys.time()
      un.result = apply.uniform(train.data, year, n.draws, draw.vec)
      un.end = Sys.time()
      time.df = rbind(time.df, c("UN", year, un.start, un.end))

      un.data = un.result[ ,3:ncol(un.result)]
      un.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(un.data))
      un.lst = append(un.lst, list(un.crps))
      mean.un.crps = mean(un.crps)
      crps.df = rbind(crps.df, c("UN", year, mean.un.crps))
    }

    if (nulls == "ALL" | nulls == "NB"){
      # Do not apply if less than 3 years of training data
      # Apply negative binomial model
      nb.start = Sys.time()
      nb.result = apply.negative.binomial(train.data, year, n.draws, draw.vec)
      nb.end = Sys.time()
      time.df = rbind(time.df, c("NB", year, nb.start, nb.end))

      nb.data = nb.result[ ,3:ncol(nb.result)]
      nb.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(nb.data))
      nb.lst = append(nb.lst, list(nb.crps))
      mean.nb.crps = mean(nb.crps)
      crps.df = rbind(crps.df, c("NB", year, mean.nb.crps))
    }

    if (nulls == "ALL" | nulls == "AR"){
      # AR(1) Null model
      ar.start = Sys.time()
      ar.result = apply.ar1(train.data, year, n.draws, draw.vec)
      ar.end = Sys.time()
      time.df = rbind(time.df, c("AR", year, ar.start, ar.end))

      ar.data = ar.result[ ,3:ncol(ar.result)]
      ar.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(ar.data))
      ar.lst = append(ar.lst, list(ar.crps))
      mean.ar.crps = mean(ar.crps)
      crps.df = rbind(crps.df, c("AR", year, mean.ar.crps))
    }

    # Write the individual year's output
    if (!is.na(out.path)){
      if (nulls == "ALL" | nulls == "HN"){
        write.table(hn.result, file = sprintf("%s/historical_null/hn_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "MV"){
        write.table(mean.value, file = sprintf("%s/constant_nulls/mv_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = FALSE)}
      if (nulls == "ALL" | nulls == "SI"){
        write.table(si.result, file = sprintf("%s/stratified_incidence/si_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "RI"){
        write.table(ri.result, file = sprintf("%s/random_incidence/ri_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "UN"){
        write.table(un.result, file = sprintf("%s/uniform/un_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "NB"){
        write.table(nb.result, file = sprintf("%s/neg_binomial/nb_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "SM"){
        write.table(sm.result, file = sprintf("%s/constant_nulls/sm_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "PN"){
        write.table(pn.result, file = sprintf("%s/prior_null/pn_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "AR"){
        write.table(ar.result, file = sprintf("%s/autoregressive/ar_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
    } # End of if !is.na block

  } # End of loop over years

  # Remove initialization rows
  crps.df = crps.df[2:nrow(crps.df), ]
  time.df = time.df[2:nrow(time.df), ]
  crps.df$CRPS = as.numeric(crps.df$CRPS)

  time.df$TIME.DIFF = as.numeric(time.df$end.time) - as.numeric(time.df$start.time)

  if (!is.na(out.path)){
    #crps.df$CRPS = sapply(crps.df$CRPS, round, 3) # Do not round - this will result in ties, which will prevent the Wilcoxon test from calculating an exact score.
    write.table(crps.df, file = sprintf("%s/crps_df_%s.csv", out.path, n.draws),
                sep = ',', row.names = FALSE, col.names = TRUE)

    write.table(time.df, file = sprintf("%s/time_df_%s.csv", out.path, n.draws),
                sep = ',', row.names = FALSE, col.names = TRUE)
  }

  model.lists = list(AA = aa.lst, MV = mn.lst, SM = sm.lst, HN = hn.lst,
                     PN = pn.lst, RI = ri.lst, SI = si.lst,
                     NB = nb.lst, AR = ar.lst, UN = un.lst)

  return(list(crps.df, time.df, sample.size.vec, model.lists))
}
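
# A minimal usage sketch (wnv.data is a placeholder name for an input data
# frame with 'location', 'year', 'count', and - for the incidence models -
# 'POP' fields; wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  results = apply.null.models(wnv.data, n.draws = 1000, nulls = "HN")
  crps.df = results[[1]] # mean CRPS per model and year
  time.df = results[[2]] # timing per model and year
}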


#' Check Fields
#'
#' Check that input fields are correct in the in.data object. Stop execution
#' with an informative error if not.
#'
#' @param in.data An input data set, see apply.null.models documentation for details.
#' @param nulls The requested null model abbreviation(s), used to determine
#' whether the 'POP' field is required.
#'
check.fields = function(in.data, nulls){
  err.message = ""
  data.error = 0
  loc.error = 0
  year.error = 0
  count.error = 0
  pop.error = 0

  if (nrow(in.data) == 0){
    data.error = 1
    stop("Input data frame is empty!")
  }

  if (length(in.data$location) == 0){
    loc.error = 1
    err.message = sprintf("%sLocation field is mispecified. Needs to be exactly 'location'\n", err.message)
    }
  if (length(in.data$year) == 0){
    year.error = 1
    err.message = sprintf("%sYear field is misspecified. Needs to be exactly 'year'\n", err.message)
    }
  if (length(in.data$count) == 0){
    count.error = 1
    err.message = sprintf("%sCount field is misspecified. Needs to be exactly 'count'\n", err.message)
    }

  if (any(nulls %in% c("ALL", "RI", "SI", "PI", "IN"))){ # Incidence models require population data
    if (length(in.data$POP) == 0){
      pop.error = 1
      err.message = sprintf("%sPopulation field is misspecified. Needs to be exactly 'POP'\n", err.message)
    }
  }

  if (loc.error == 1 | year.error == 1 | count.error == 1 | pop.error == 1){

    stop(sprintf("One or more input errors detected:\n%s", err.message))
  }

}
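
# A small illustration of the combined error message (hypothetical toy input;
# wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  bad = data.frame(Location = "A", year = 2010, count = 0) # 'Location' is capitalized
  check.fields(bad, nulls = "HN")
  # Error: One or more input errors detected:
  # Location field is misspecified. Needs to be exactly 'location'
}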


#### Define functions to calculate each null model ####

#' Always Absent Null
#'
#' Make predictions of 0 for every location. Does not really warrant a stand-alone
#' function, but included to parallel the other functions.
#'
#' @param train.data Training data to be used to set up the always absent null.
#' Must have a location field with locations
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of draws per location. n.draws will not change the
#' outcome, as this model is deterministic, but it is used to set the structure of
#' the output matrix and make it parallel to that of the other functions.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#' @export apply.always.absent.null
#'
apply.always.absent.null = function(train.data, target.year, n.draws = 1, draw.vec = c("DRAW_0001")){

  locations = unique(train.data$location)
  n.records = length(unique(train.data$location))
  aa.data = rep(0, n.draws * n.records)
  aa.data = matrix(aa.data, ncol = n.draws)

  out.df = cbind(locations, rep(target.year, n.records), aa.data)
  colnames(out.df) = c("location", "target_year", draw.vec)
  out.df = as.data.frame(out.df)
  return(out.df)
}

#' Mean Value Null
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.mean.value.null
apply.mean.value.null = function(train.data, target.year, n.draws = 1, draw.vec = c("DRAW_0001")){

  mean.value = mean(train.data$count)
  locations = unique(train.data$location)
  n.records = length(locations)
  mn.data = rep(mean.value, n.draws * n.records)
  mn.data = matrix(mn.data, ncol = n.draws)

  out.df = cbind(locations, rep(target.year, n.records), mn.data)
  colnames(out.df) = c("location", "target_year", draw.vec)
  out.df = as.data.frame(out.df)

  return(out.df)
}

#' Stratified Mean Null Model
#'
#' Calculate the mean number of West Nile virus cases for each location for the
#' target year.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.mean.null
apply.stratified.mean.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }


  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]
    # Add in 0's to historical counts
    if (nrow(loc.subset) < n.years){
      zeros = n.years - nrow(loc.subset)
      mean.value = mean(c(loc.subset$count, rep(0, zeros)))
    }else{
      mean.value = mean(loc.subset$count)
    }

    distribution.samples = rep(mean.value, n.draws)

    out.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, out.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Historical Null
#'
#' Create a null model based on historical cases. This has the advantage of not
#' assuming a specific distribution, but the disadvantage of not interpolating
#' based on a distribution.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.historical.null
apply.historical.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }


  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]
    historical.counts = stats::aggregate(loc.subset$count, by = list(loc.subset$year), sum, na.rm = TRUE)
    historical.counts = historical.counts$x
    # Add in 0's to historical counts
    if (length(historical.counts) < n.years){
      zeros = n.years - length(historical.counts)
      historical.counts = c(historical.counts, rep(0, zeros))
    }

    # If historical counts is a single entry, the behavior of sample changes - so need to make sure it just draws from that one value, instead of from seq(1,value)
    if (length(historical.counts) == 1){
      historical.counts = rep(historical.counts, 2) # Make it two entries instead of one
    }

    #historical.counts = dplyr::count(human.data, vars = year)[ ,2] # Tibble wasn't playing well with sample
    distribution.samples = sample(historical.counts, n.draws, replace = TRUE) # Sample with replacement from the annual human case counts

    out.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, out.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}
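
# A small illustration of the base-R sampling gotcha handled above (wrapped
# in if (FALSE) so it does not run at build time): for a single numeric
# value x >= 1, sample(x, ...) draws from seq_len(x) rather than from x itself.
if (FALSE) {
  sample(5, 3, replace = TRUE)         # draws from 1:5, e.g. 2 4 1
  sample(rep(5, 2), 3, replace = TRUE) # always 5 5 5
}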

#' Apply Prior Year Null
#'
#' Use the prior year to predict the current year.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.prior.year.null
apply.prior.year.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    prior.year = train.data$count[train.data$location == location & train.data$year == (target.year - 1)]
    # NOTE: strictly speaking, the draws are not doing anything useful other than putting the model predictions into the same format as the other null models
    distribution.samples = sample(rep(prior.year, 2), n.draws, replace = TRUE)
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Uniform Random Incidence
#'
#' Uses population data for prior year
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.random.incidence
apply.random.incidence = function(train.data, target.year, n.draws, draw.vec){

  # Calculate US-wide incidence
  n.years = length(unique(train.data$year))
  cases.per.year = sum(train.data$count) / n.years
  total.population = sum(train.data$POP[train.data$year == (target.year - 1)]) # Pull population from previous year
  incidence = cases.per.year / total.population

  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    pop = train.data$POP[train.data$location == location & train.data$year == (target.year - 1)]

    sim.vec = c()
    for (i in 1:n.draws){
      random.numbers = runif(pop)
      sim.cases = length(random.numbers[random.numbers < incidence])
      sim.vec = c(sim.vec, sim.cases)
    }

    distribution.samples = sim.vec
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Stratified Incidence
#'
#' Calculate incidence by county, then take random draws from the incidence based
#' on county population.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.incidence
apply.stratified.incidence = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    cases.per.year = sum(loc.subset$count) / n.years
    loc.population = sum(loc.subset$POP[loc.subset$year == (target.year - 1)]) # Pull population from previous year
    incidence = cases.per.year / loc.population

    if (incidence == 0){
      distribution.samples = rep(0, n.draws)
    }else{
      sim.vec = c()
      for (i in 1:n.draws){
        random.numbers = runif(loc.population)
        sim.cases = length(random.numbers[random.numbers < incidence])
        sim.vec = c(sim.vec, sim.cases)
      }

      distribution.samples = sim.vec
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}
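
# The per-person loop above simulates loc.population independent
# Bernoulli(incidence) trials, which is exactly a Binomial(loc.population,
# incidence) draw; the .v2 functions below use rbinom() directly for the same
# distribution at far lower cost. A small sketch with made-up numbers
# (wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  pop = 50000; incidence = 2e-5
  sum(runif(pop) < incidence) # one simulated year, loop style
  rbinom(1, pop, incidence)   # one simulated year, single binomial draw
}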

#' Pooled Uniform Random Incidence (updated)
#'
#' Uses population data for prior year
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#' @param use.current Currently unused.
#'
#' @export apply.random.incidence.v2
apply.random.incidence.v2 = function(train.data, target.year, n.draws, draw.vec,
                                     use.current = 0){

  # Calculate US-wide incidence
  n.years = length(unique(train.data$year))
  cases.per.year = sum(train.data$count) / n.years
  total.population = sum(train.data$POP[train.data$year == (target.year - 1)]) # Pull population from previous year
  incidence = cases.per.year / total.population

  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    pop = train.data$POP[train.data$location == location & train.data$year == (target.year - 1)]

    distribution.samples = rbinom(n.draws, pop, incidence)
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Stratified Incidence (updated)
#'
#' Calculate incidence by county, then take random draws from the incidence based
#' on county population.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.incidence.v2
apply.stratified.incidence.v2 = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    cases.per.year = sum(loc.subset$count) / n.years
    loc.population = sum(loc.subset$POP[loc.subset$year == (target.year - 1)]) # Pull population from previous year
    incidence = cases.per.year / loc.population

    distribution.samples = rbinom(n.draws, loc.population, incidence)

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}


#' Apply a negative binomial model
#'
#' Fits a negative binomial based on historical cases. If there are not enough
#' historical cases to fit a model, the historical cases are duplicated to
#' enable a model fit.
>>>>>>> districts
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.negative.binomial
#'
apply.negative.binomial = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    # MASS::glm.nb requires a model formula (e.g., count ~ 1); MASS::fitdistr is used below to obtain a plain intercept-only negative binomial fit

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to fit the negative binomial at a higher level of stratification (e.g., state)
    }else{
        error.output = tryCatch({
          fit = suppressWarnings(MASS::fitdistr(loc.subset$count, 'negative binomial'))
          # Getting a warning: In densfun(x, parm[1], parm[2], ...) : NaNs produced
          # But I don't see any NaN values, and it produces model parameters that can be used to generate a distribution.

        },
        error=function(cond){
          message(sprintf("Negative Binomial optimization failed for %s. Doubled the input data to obtain a fit", location))
          fit = try.again(loc.subset, location)
          #fit = MASS::fitdistr(rep(loc.subset$count, 2), 'negative binomial')
          return(list("YES", fit))
        })
      # Check first if it is a list
      if (is.list(error.output)){
        if (length(error.output[[1]]) == 1){
          # Check if it was an error
          if (error.output[[1]] == "YES"){
            fit = error.output[[2]]
          }
        }
      }
      rm(error.output)

      distribution.samples = rnbinom(n.draws, size = fit$estimate[1], mu = fit$estimate[2])
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}


#' Try to fit a negative binomial, progressively modifying the data to obtain a fit
#'
#' @param loc.subset the input data set with cases in a 'count' field
#' @param location The location of the cases
#'
#' @return fit the fit object from a negative binomial model
#'
try.again = function(loc.subset, location){

  # Try just doubling the data
  is.error = 1
  try2 = tryCatch({
    fit = MASS::fitdistr(rep(loc.subset$count, 2), 'negative binomial')
    is.error = 0
  },
  error=function(cond){
    NULL # the handler cannot modify the enclosing scope; is.error stays 1
  })

  # 1 3 5 1 1 1 1 is a count sequence that can cause the fit to fail even after doubling
  # If that doesn't work, try resampling the doubled data, (up to 10 times)
  counter = 0
  while(is.error == 1){
    counter = counter + 1
    if (counter == 10){
      stop(sprintf("Could not find a fit or a work around for %s. Please remove from data set and try again with the rest of your data", location))
    }

    try2 = tryCatch({
      message(sprintf("Trying sampling the cases with replacement to allow a fit for %s", location))
      fit = MASS::fitdistr(sample(rep(loc.subset$count, 2), replace = TRUE), 'negative binomial')
      is.error = 0
    },
    error=function(cond){
      NULL # leave is.error at 1
    })

  }

  return(fit)
}
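
# A sketch of the failure mode try.again() works around (wrapped in
# if (FALSE) so it does not run at build time): low-variance count vectors
# can make the negative binomial optimizer diverge, and whether a given
# vector fails can depend on the optimizer's starting values.
if (FALSE) {
  x = c(1, 3, 5, 1, 1, 1, 1) # the problem sequence noted in the comment above
  fit = tryCatch(MASS::fitdistr(x, 'negative binomial'),
                 error = function(e) conditionMessage(e))
  fit # either a fitted model or the error message
}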

#' Apply Autoregressive 1 model
#'
#' Apply an Autoregressive 1 model assuming a normal distribution
#' (with negative values reclassified to be 0)
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.ar1
apply.ar1 = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  n.predictions = 1 # For now, just allow predictions one time step into the future.
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    # Clear any prior distribution.samples object to avoid accidental re-use
    # Note: assignments inside the error handler do NOT affect the enclosing
    # scope, but assignments in the tryCatch expression block do. See:
    # https://stackoverflow.com/questions/38482937/variable-scope-in-r-trycatch-block-is-necessary-to-change-local-variable-de/38483499
    if (exists('distribution.samples')){ rm(distribution.samples) } # This will give a warning if distribution.samples exists in the global scope but not the local scope!

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to fit the model at a higher level of stratification (e.g., state)
    }else{
      error.output = tryCatch({
        fit = stats::arima(loc.subset$count, order = c(1,0,0))
        next.prediction = stats::predict(fit, n.ahead = n.predictions)
        distribution.samples = rnorm(n.draws, mean = next.prediction$pred, sd = next.prediction$se)
        distribution.samples[distribution.samples < 0] = 0 # Change any negative predictions to be 0.
        # The last value of the expression block is returned by tryCatch as the default
      },
      error=function(cond){
        message(sprintf("AR model fitting failed for %s. Fit a normal distribution without an auto-regressive term (negative values reclassified to 0)", location))
        loc.mean = mean(loc.subset$count)
        loc.sd = sd(loc.subset$count)
        distribution.samples = rnorm(n.draws, mean = loc.mean, sd = loc.sd)
        distribution.samples[distribution.samples < 0] = 0 # Change any negative predictions to be 0.
        return(list("YES", distribution.samples))
      })
      # Check first if it is a list
      if (is.list(error.output)){
        if (length(error.output[[1]]) == 1){
          # Check if it was an error
          if (error.output[[1]] == "YES"){
            distribution.samples = error.output[[2]]
          }
        }
      }
      rm(error.output)
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)

}
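
# A minimal sketch of the AR(1) predictive step used above, with made-up toy
# counts (wrapped in if (FALSE) so it does not run at build time): fit an
# AR(1) with intercept, forecast one step ahead, then draw from a normal
# distribution with the forecast mean and standard error.
if (FALSE) {
  counts = c(0, 2, 1, 4, 3, 5, 2, 6)
  fit = stats::arima(counts, order = c(1, 0, 0))
  pred = stats::predict(fit, n.ahead = 1)
  draws = rnorm(100, mean = pred$pred, sd = pred$se)
  draws[draws < 0] = 0 # no negative case counts
}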

#' Uniform Null Model
#'
#' Apply a uniform distribution between 0 and the maximum number of cases observed
#' stratified by county.
#'
#' @param train.data Training data to be used to generate the null predictions
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the forecast for each location for the target year
#'
#' @export apply.uniform
apply.uniform = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to draw at a higher level of stratification (e.g., state)
    }else{
      max.cases = max(loc.subset$count, na.rm = TRUE)
      distribution.samples = runif(n.draws, 0, max.cases)
      #**# Open question: should cases be rounded to the nearest integer? There will never be fractional cases, but an intermediate value might score partial credit on either bound.
    }
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }


  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)

}