R/NullModels.R


# Functions to implement different Null models for CDC West Nile Virus data

# A.C. Keyel
# Created on 2021-04-13
# Incorporated into dfmip on 2021-06-21

# Covered by package DESCRIPTION and NAMESPACE
#library(MASS)
#library(testthat)

# Index of Null Models (Unit Tests: Y/N)
# apply.always.absent (N)
# apply.mean.value (N)
# apply.stratified.mean.null (N)
# apply.historical.null (N)
# apply.prior.year.null (N)
# apply.random.incidence (N)
# apply.stratified.incidence (N)
# apply.negative.binomial (N)
# apply.ar1 (N* Yes but not automated)
# apply.uniform (Y)

#### Create an overview function to run the null models in aggregate

#' Apply Null Models for One Year
#'
#' Runs the requested null models for a single target year and creates a joint
#' output data frame and a timing data frame. NOTE: the always absent and
#' pooled mean value predictions are computed inline rather than through the
#' apply.always.absent.null and apply.mean.value.null functions - those were
#' created later and have not been edited into this code.
#'
#'@param train.data A data set containing a 'location' field with locations,
#'annual human cases in a 'count' field, and year in a 'year' field. Note that
#'field names must match EXACTLY. For the incidence models (PI, IN),
#'human population must also be included in a 'POP' field.
#'@param obs.data The observed data for the target year, with 'location' and
#''count' fields.
#'@param n.draws The number of probabilistic draws to make for each location.
#'@param target.year The year being forecast.
#'@param nulls.vec Which null models to run. The default of "ALL" runs all
#'models. Individual models can be requested with a vector of 2-letter
#'abbreviations:\tabular{ll}{
#' AA \tab Always Absent\cr
#' PV \tab Pooled Mean Value\cr
#' MV \tab Stratified Mean Value\cr
#' HN \tab Historical Null\cr
#' PN \tab Prior Year Null\cr
#' PI \tab Pooled Incidence\cr
#' IN \tab Stratified Incidence\cr
#' UN \tab Uniform Null\cr
#' NB \tab Negative Binomial\cr
#' AR \tab Autoregressive 1\cr }
#'
#'@param out.path The path to write the crps.df and time.df data objects.
#' If no output is desired, set this to NA (default).
#'@param in.seed The starting seed to ensure replicability of the random
#'processes
#'
#'@return A list with five elements: mean.crps.df, a data frame with the
#' average CRPS score for the target year for each null model; time.df,
#' the timing for each null model; raw.crps.df, the individual location
#' CRPS scores; pred.df, the individual draws for each location; and
#' sample.size, the number of training records used.
#'
#'@export apply.null.models.one.year
#'
apply.null.models.one.year = function(train.data, obs.data, n.draws, target.year,
                                      nulls.vec = "ALL", out.path = NA, in.seed = 20210622){

  if (length(nulls.vec) == 1 && nulls.vec[1] == "ALL"){
    nulls.vec = c("AA", "PV", "MV", "HN", "PN", "PI", "IN", "UN", "NB", "AR")
  }

  # Check for required fields
  check.fields(train.data, nulls.vec)

  set.seed(in.seed)
  draw.vec = sprintf("DRAW_%04.0f", seq(1,n.draws))

  years = sort(as.numeric(unique(train.data$year)))

  mean.crps.df = data.frame(model = "DROP", target.year = NA, CRPS = NA)
  time.df = data.frame(model = "DROP", target.year = NA, start.time = NA, end.time = NA)
  raw.crps.df = data.frame(model = "DROP", location = NA, count = NA, year = NA, CRPS = NA)
  pred.df = data.frame(model = "DROP", location = NA, count = NA, year = NA)
  for (i in 1:length(draw.vec)){
    this.entry = draw.vec[i]
    pred.df[[this.entry]] = NA
  }

  sample.size = nrow(train.data)

  # Make sure train.data field is numeric
  train.data$count = as.numeric(as.character(train.data$count))
  train.data$year = as.numeric(as.character(train.data$year))

  obs.data.count = obs.data$count
  obs.data.loc = obs.data$location
  n.records = length(obs.data.count)

  for (this.null in nulls.vec){
    null.start = Sys.time()
    out.data = null.engine(this.null, train.data, n.draws, n.records, target.year, draw.vec)
    null.end = Sys.time()
    time.df = rbind(time.df, c(this.null, target.year, null.start, null.end))
    out.data = matrix(out.data, ncol = n.draws)
    out.crps = scoringRules::crps_sample(as.numeric(obs.data.count), as.matrix(out.data))
    out.matrix = matrix(c(rep(this.null, n.records), obs.data.loc, obs.data.count, rep(target.year, n.records), out.crps),
                        ncol = 5)
    colnames(out.matrix) = c("model", "location", "count", "year", "CRPS")
    raw.crps.df = rbind(raw.crps.df, out.matrix)

    pred.matrix = matrix(c(rep(this.null, n.records), obs.data.loc, obs.data.count,
                           rep(target.year, n.records), out.data), ncol = 4 + n.draws)
    colnames(pred.matrix) = c('model', 'location', 'count', 'year', draw.vec)
    pred.df = rbind(pred.df, pred.matrix)
    mean.null.crps = mean(out.crps)
    mean.crps.df = rbind(mean.crps.df, c(this.null, target.year, mean.null.crps))
  }

  # Remove initialization rows
  mean.crps.df = mean.crps.df[2:nrow(mean.crps.df), ]
  time.df = time.df[2:nrow(time.df), ]
  mean.crps.df$CRPS = as.numeric(mean.crps.df$CRPS)
  raw.crps.df = raw.crps.df[2:nrow(raw.crps.df), ]
  pred.df = pred.df[2:nrow(pred.df), ]

  time.df$TIME.DIFF = as.numeric(time.df$end.time) - as.numeric(time.df$start.time)


  return(list(mean.crps.df, time.df, raw.crps.df, pred.df, sample.size))
}
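
# A minimal usage sketch (illustrative only; the toy data frames and values
# below are made up, and the example is wrapped in if (FALSE) so it does not
# run when the package is built):
if (FALSE) {
  toy.train = data.frame(location = rep(c("A", "B"), each = 3),
                         year = rep(2010:2012, 2),
                         count = c(0, 1, 2, 3, 4, 5))
  toy.obs = data.frame(location = c("A", "B"), count = c(1, 4))
  res = apply.null.models.one.year(toy.train, toy.obs, n.draws = 100,
                                   target.year = 2013, nulls.vec = c("AA", "MV"))
  res[[1]] # mean CRPS for the target year, by model
}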

#' Compute the null model results
#'
#' @param this.null an indicator for which null model to run: AA, PV, MV, HN, PN, PI, IN, UN, NB, AR
#' @param train.data The training data to use for parameterizing the null model
#' @param n.draws the number of draws from the resulting distribution to use
#' @param n.records The number of locations being predicted
#' @param year The focal forecast year
#' @param draw.vec A vector of column names for the probabilistic draws
#'
#' @return a matrix with one row for each location and one column for each draw
null.engine = function(this.null, train.data, n.draws, n.records, year, draw.vec){

  if (this.null == "AA"){
    out.data = rep(0, n.draws * n.records)
  }

  if (this.null == "PV"){
    mean.value = mean(train.data$count)
    out.data = rep(mean.value, n.draws * n.records)
  }

  if (this.null == "MV"){
    sm.result = apply.stratified.mean.null(train.data, year, n.draws, draw.vec)
    out.data.1 = sm.result[ ,3:ncol(sm.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "HN"){
    hn.result = apply.historical.null(train.data, year, n.draws, draw.vec)
    out.data.1 = hn.result[ ,3:ncol(hn.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "PN"){
    pn.result = apply.prior.year.null(train.data, year, n.draws, draw.vec)
    out.data.1 = pn.result[ ,3:ncol(pn.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "PI"){
    ri.result = apply.random.incidence.v2(train.data, year, n.draws, draw.vec)
    out.data.1 = ri.result[ ,3:ncol(ri.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "IN"){
    si.result = apply.stratified.incidence.v2(train.data, year, n.draws, draw.vec) # target.year
    out.data.1 = si.result[ ,3:ncol(si.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "UN"){
    un.result = apply.uniform(train.data, year, n.draws, draw.vec)
    out.data.1 = un.result[ ,3:ncol(un.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "NB"){
    nb.result = apply.negative.binomial(train.data, year, n.draws, draw.vec)
    out.data.1 = nb.result[ ,3:ncol(nb.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  if (this.null == "AR"){
    ar.result = apply.ar1(train.data, year, n.draws, draw.vec)
    out.data.1 = ar.result[ ,3:ncol(ar.result)]
    out.data = correct.out.data(out.data.1, n.draws)
  }

  return(out.data)
}

#' Correct out data
#'
#' Correct the data format coming out of the apply.X functions to be compatible
#' with downstream code.
#' @param out.data.1 A data matrix with a column for every draw, and a row for every location of interest.
#' @param n.draws number of draws from distribution
#'
#' @return the same values as a plain numeric matrix (one column per draw), free of the list structure introduced by data frame subsetting.
#'
correct.out.data = function(out.data.1, n.draws){
  # Flatten the (possibly list-structured) data frame columns into a single
  # numeric vector, then reshape into a location x draw matrix
  out.data = c()
  for (col in 1:n.draws){
    out.data = c(out.data, out.data.1[[col]])
  }
  out.data = matrix(out.data, ncol = n.draws)

  return(out.data)
}
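
# A small illustration of what correct.out.data does (hypothetical toy input;
# wrapped in if (FALSE) so it does not run when the package is built):
if (FALSE) {
  df = data.frame(DRAW_0001 = c(1, 2), DRAW_0002 = c(3, 4))
  correct.out.data(df, n.draws = 2) # a plain 2 x 2 numeric matrix
}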

#' Apply Null Models
#'
#' Runs the null models as a group and creates a joint output data frame and a
#' timing data frame. NOTE: the always absent and mean value predictions were
#' NOT calculated using the apply.always.absent.null and apply.mean.value.null
#' functions - these were created later and have not been edited into the code.
#'
#'@param in.data A data set containing 'location' field with locations,
#'annual human cases in a 'count' field, and year in a 'year' field. Note that
#'field names must match EXACTLY. For Stratified Incidence and Random Incidence,
#'human population also needs to be included in a 'POP' field.
#'@param n.draws The number of probabilistic draws to make for each location.
#'@param nulls Which null model to run. Default of "ALL" runs all models.
#'An individual model can be run by passing its 2-letter abbreviation:\tabular{ll}{
#' HN \tab Historical Null\cr
#' NB \tab Negative Binomial\cr
#' AA \tab Always Absent\cr
#' SI \tab Stratified Incidence\cr
#' RI \tab Random Incidence\cr
#' AR \tab Autoregressive 1\cr
#' SM \tab Stratified Mean\cr
#' PN \tab Prior Year\cr
#' UN \tab Uniform Null\cr
#' MV \tab Mean Value\cr }
#'
#'@param out.path The path to write the crps.df and time.df data objects.
#' If no output is desired, set this to NA (default).
#'@param in.seed The starting seed to ensure replicability of the random
#'processes
#'
#'@return A list with four elements: crps.df, time.df, sample.size.vec, and
#' model.lists. crps.df is a data frame with the average CRPS score for each
#' null model and year, while time.df contains the timing for each null model.
#' sample.size.vec contains the sample sizes by year, while the model.lists
#' object contains the individual county CRPS scores by year.
#'
#'@export apply.null.models
#'
apply.null.models = function(in.data, n.draws, nulls = "ALL", out.path = NA, in.seed = 20210622){

  # Check for required fields
  check.fields(in.data, nulls)

  set.seed(in.seed)
  draw.vec = sprintf("DRAW_%04.0f", seq(1,n.draws))

  years = sort(as.numeric(unique(in.data$year)))

  crps.df = data.frame(model = NA, target.year = NA, CRPS = NA)
  time.df = data.frame(model = NA, target.year = NA, start.time = NA, end.time = NA, ELAPSED = NA)

  sample.size.vec = c()
  aa.lst = list()
  mn.lst = list()
  sm.lst = list()
  hn.lst = list()
  pn.lst = list()
  ri.lst = list()
  si.lst = list()
  nb.lst = list()
  ar.lst = list()
  un.lst = list()

  #for (i in 2:length(years)){ #
  for (i in 4:length(years)){ # Start with the 4th year, so at least 3 years of training data are available
    year = years[i]
    train.data = in.data[in.data$year < year, ]
    #year.in.data = in.data$count[in.data$year == year]
    year.in.data = in.data[in.data$year == year, c("count", "location")]

    # Remove counties from train.data with fewer than 3 years of data (this now varies by county, because we are excluding years prior to the arrival of WNV)
    test = table(train.data$location)
    test2 = test[test > 2]
    train.data = train.data[train.data$location %in% names(test2), ]
    year.in.data = year.in.data[year.in.data$location %in% names(test2), c("count")]

    # Make sure train.data field is numeric
    train.data$count = as.numeric(as.character(train.data$count))
    train.data$year = as.numeric(as.character(train.data$year))

    message(sprintf("year %s, %s counties", year, nrow(test2)))
    sample.size.vec = c(sample.size.vec, nrow(test2))

    if (nulls == "ALL" | nulls == "AA"){
      # Apply Always Absent Null
      aa.start = Sys.time()
      n.records = length(year.in.data)
      aa.data = rep(0, n.draws * n.records)
      aa.end = Sys.time()
      time.df = rbind(time.df, c("AA", year, aa.start, aa.end))
      aa.data = matrix(aa.data, ncol = n.draws)
      aa.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(aa.data))
      aa.lst = append(aa.lst, list(aa.crps))
      mean.aa.crps = mean(aa.crps)
      crps.df = rbind(crps.df, c("AA", year, mean.aa.crps))
    }

    if (nulls == "ALL" | nulls == "MV"){
      # Apply Mean Value Null
      mn.start = Sys.time()
      mean.value = mean(train.data$count)
      n.records = length(year.in.data)
      mn.data = rep(mean.value, n.draws * n.records)
      mn.end = Sys.time()
      time.df = rbind(time.df, c("MV", year, mn.start, mn.end))
      mn.data = matrix(mn.data, ncol = n.draws)

      mn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(mn.data))
      mn.lst = append(mn.lst, list(mn.crps))
      mean.mn.crps = mean(mn.crps)
      crps.df = rbind(crps.df, c("MV", year, mean.mn.crps))
    }

    if (nulls == "ALL" | nulls == "SM"){
      # Apply Stratified Mean Value Null
      sm.start = Sys.time()
      sm.result = apply.stratified.mean.null(train.data, year, n.draws, draw.vec)
      sm.end = Sys.time()
      time.df = rbind(time.df, c("SM", year, sm.start, sm.end))

      sm.data = sm.result[ ,3:ncol(sm.result)]
      sm.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(sm.data))
      sm.lst = append(sm.lst, list(sm.crps))
      mean.sm.crps = mean(sm.crps)
      crps.df = rbind(crps.df, c("SM", year, mean.sm.crps))
    }

    if (nulls == "ALL" | nulls == "HN"){
      # Apply historical null model
      hn.start = Sys.time()
      hn.result = apply.historical.null(train.data, year, n.draws, draw.vec)
      hn.end = Sys.time()
      time.df = rbind(time.df, c("HN", year, hn.start, hn.end))

      hn.data = hn.result[ ,3:ncol(hn.result)]
      hn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(hn.data))
      hn.lst = append(hn.lst, list(hn.crps))
      mean.hn.crps = mean(hn.crps)
      crps.df = rbind(crps.df, c("HN", year, mean.hn.crps))
    }

    if (nulls == "ALL" | nulls == "PN"){
      # Apply prior-year null model
      pn.start = Sys.time()
      pn.result = apply.prior.year.null(train.data, year, n.draws, draw.vec)
      pn.end = Sys.time()
      time.df = rbind(time.df, c("PN", year, pn.start, pn.end))

      pn.data = pn.result[ ,3:ncol(pn.result)]
      pn.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(pn.data))
      pn.lst = append(pn.lst, list(pn.crps))
      mean.pn.crps = mean(pn.crps)
      crps.df = rbind(crps.df, c("PN", year, mean.pn.crps))
    }

    if (nulls == "ALL" | nulls == "RI"){
      # Apply random draws based on human population based on US-wide incidence
      ri.start = Sys.time()
      ri.result = apply.random.incidence(train.data, year, n.draws, draw.vec)
      ri.end = Sys.time()
      time.df = rbind(time.df, c("RI", year, ri.start, ri.end))

      ri.data = ri.result[ ,3:ncol(ri.result)]
      ri.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(ri.data))
      ri.lst = append(ri.lst, list(ri.crps))
      mean.ri.crps = mean(ri.crps)
      crps.df = rbind(crps.df, c("RI", year, mean.ri.crps))
    }

    if (nulls == "ALL" | nulls == "SI"){
      # Apply random draws based on human population based on stratified incidence
      si.start = Sys.time()
      si.result = apply.stratified.incidence(train.data, year, n.draws, draw.vec) # target.year
      si.end = Sys.time()
      time.df = rbind(time.df, c("SI", year, si.start, si.end))

      si.data = si.result[ ,3:ncol(si.result)]
      si.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(si.data))
      si.lst = append(si.lst, list(si.crps))
      mean.si.crps = mean(si.crps)
      crps.df = rbind(crps.df, c("SI", year, mean.si.crps))
    }

    if (nulls == "ALL" | nulls == "UN"){
      # Uniform Null model
      un.start = Sys.time()
      un.result = apply.uniform(train.data, year, n.draws, draw.vec)
      un.end = Sys.time()
      time.df = rbind(time.df, c("UN", year, un.start, un.end))

      un.data = un.result[ ,3:ncol(un.result)]
      un.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(un.data))
      un.lst = append(un.lst, list(un.crps))
      mean.un.crps = mean(un.crps)
      crps.df = rbind(crps.df, c("UN", year, mean.un.crps))
    }

    if (nulls == "ALL" | nulls == "NB"){
      # Do not apply if less than 3 years of training data
      # Apply negative binomial model
      nb.start = Sys.time()
      nb.result = apply.negative.binomial(train.data, year, n.draws, draw.vec)
      nb.end = Sys.time()
      time.df = rbind(time.df, c("NB", year, nb.start, nb.end))

      nb.data = nb.result[ ,3:ncol(nb.result)]
      nb.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(nb.data))
      nb.lst = append(nb.lst, list(nb.crps))
      mean.nb.crps = mean(nb.crps)
      crps.df = rbind(crps.df, c("NB", year, mean.nb.crps))
    }

    if (nulls == "ALL" | nulls == "AR"){
      # AR(1) Null model
      ar.start = Sys.time()
      ar.result = apply.ar1(train.data, year, n.draws, draw.vec)
      ar.end = Sys.time()
      time.df = rbind(time.df, c("AR", year, ar.start, ar.end))

      ar.data = ar.result[ ,3:ncol(ar.result)]
      ar.crps = scoringRules::crps_sample(as.numeric(year.in.data), as.matrix(ar.data))
      ar.lst = append(ar.lst, list(ar.crps))
      mean.ar.crps = mean(ar.crps)
      crps.df = rbind(crps.df, c("AR", year, mean.ar.crps))
    }

    # Write the individual year's output
    if (!is.na(out.path)){
      if (nulls == "ALL" | nulls == "HN"){
        write.table(hn.result, file = sprintf("%s/historical_null/hn_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "MV"){
        write.table(mean.value, file = sprintf("%s/constant_nulls/mv_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = FALSE)}
      if (nulls == "ALL" | nulls == "SI"){
        write.table(si.result, file = sprintf("%s/stratified_incidence/si_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "RI"){
        write.table(ri.result, file = sprintf("%s/random_incidence/ri_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "UN"){
        write.table(un.result, file = sprintf("%s/uniform/un_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "NB"){
        write.table(nb.result, file = sprintf("%s/neg_binomial/nb_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "SM"){
        write.table(sm.result, file = sprintf("%s/constant_nulls/sm_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "PN"){
        write.table(pn.result, file = sprintf("%s/prior_null/pn_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
      if (nulls == "ALL" | nulls == "AR"){
        write.table(ar.result, file = sprintf("%s/autoregressive/ar_%s_%s.csv", out.path, year, n.draws),
                    sep = ',', row.names = FALSE, col.names = TRUE)}
    } # End of if !is.na block

  } # End of loop over years

  # Remove initialization rows
  crps.df = crps.df[2:nrow(crps.df), ]
  time.df = time.df[2:nrow(time.df), ]
  crps.df$CRPS = as.numeric(crps.df$CRPS)

  time.df$TIME.DIFF = as.numeric(time.df$end.time) - as.numeric(time.df$start.time)

  if (!is.na(out.path)){
    #crps.df$CRPS = sapply(crps.df$CRPS, round, 3) # Do not round - this will result in ties, which will prevent the Wilcoxon test from calculating an exact score.
    write.table(crps.df, file = sprintf("%s/crps_df_%s.csv", out.path, n.draws),
                sep = ',', row.names = FALSE, col.names = TRUE)

    write.table(time.df, file = sprintf("%s/time_df_%s.csv", out.path, n.draws),
                sep = ',', row.names = FALSE, col.names = TRUE)
  }

  model.lists = list(AA = aa.lst, MV = mn.lst, SM = sm.lst, HN = hn.lst,
                     PN = pn.lst, RI = ri.lst, SI = si.lst,
                     NB = nb.lst, AR = ar.lst, UN = un.lst)

  return(list(crps.df, time.df, sample.size.vec, model.lists))
}
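
# A minimal usage sketch (wnv.data is a placeholder name for an input data
# frame with 'location', 'year', 'count', and - for the incidence models -
# 'POP' fields; wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  results = apply.null.models(wnv.data, n.draws = 1000, nulls = "HN")
  crps.df = results[[1]] # mean CRPS per model and year
  time.df = results[[2]] # timing per model and year
}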


#' Check Fields
#'
#' Check that input fields are correct in the in.data object. Stop execution
#' with an informative error if not.
#'
#' @param in.data An input data set, see apply.null.models documentation for details.
#' @param nulls The requested null model abbreviation(s), used to determine
#' whether the 'POP' field is required.
#'
check.fields = function(in.data, nulls){
  err.message = ""
  data.error = 0
  loc.error = 0
  year.error = 0
  count.error = 0
  pop.error = 0

  if (nrow(in.data) == 0){
    data.error = 1
    stop("Input data frame is empty!")
  }

  if (length(in.data$location) == 0){
    loc.error = 1
    err.message = sprintf("%sLocation field is mispecified. Needs to be exactly 'location'\n", err.message)
    }
  if (length(in.data$year) == 0){
    year.error = 1
    err.message = sprintf("%sYear field is misspecified. Needs to be exactly 'year'\n", err.message)
    }
  if (length(in.data$count) == 0){
    count.error = 1
    err.message = sprintf("%sCount field is misspecified. Needs to be exactly 'count'\n", err.message)
    }

  if (any(nulls %in% c("ALL", "RI", "SI", "PI", "IN"))){ # Incidence models require population data
    if (length(in.data$POP) == 0){
      pop.error = 1
      err.message = sprintf("%sPopulation field is misspecified. Needs to be exactly 'POP'\n", err.message)
    }
  }

  if (loc.error == 1 | year.error == 1 | count.error == 1 | pop.error == 1){

    stop(sprintf("One or more input errors detected:\n%s", err.message))
  }

}
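
# A small illustration of the combined error message (hypothetical toy input;
# wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  bad = data.frame(Location = "A", year = 2010, count = 0) # 'Location' is capitalized
  check.fields(bad, nulls = "HN")
  # Error: One or more input errors detected:
  # Location field is misspecified. Needs to be exactly 'location'
}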


#### Define functions to calculate each null model ####

#' Always Absent Null
#'
#' Make predictions of 0 for every location. Does not really warrant a stand-alone
#' function, but included to parallel the other functions.
#'
#' @param train.data Training data to be used to set up the always absent null.
#' Must have a location field with locations
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of draws per location. n.draws will not change the
#' outcome, as this model is deterministic, but it is used to set the structure of
#' the output matrix and make it parallel to that of the other functions.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#' @export apply.always.absent.null
#'
apply.always.absent.null = function(train.data, target.year, n.draws = 1, draw.vec = c("DRAW_0001")){

  locations = unique(train.data$location)
  n.records = length(unique(train.data$location))
  aa.data = rep(0, n.draws * n.records)
  aa.data = matrix(aa.data, ncol = n.draws)

  out.df = cbind(locations, rep(target.year, n.records), aa.data)
  colnames(out.df) = c("location", "target_year", draw.vec)
  out.df = as.data.frame(out.df)
  return(out.df)
}

#' Mean Value Null
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.mean.value.null
apply.mean.value.null = function(train.data, target.year, n.draws = 1, draw.vec = c("DRAW_0001")){

  mean.value = mean(train.data$count)
  locations = unique(train.data$location)
  n.records = length(locations)
  mn.data = rep(mean.value, n.draws * n.records)
  mn.data = matrix(mn.data, ncol = n.draws)

  out.df = cbind(locations, rep(target.year, n.records), mn.data)
  colnames(out.df) = c("location", "target_year", draw.vec)
  out.df = as.data.frame(out.df)

  return(out.df)
}

#' Stratified Mean Null Model
#'
#' Calculate the mean number of West Nile virus cases for each location for the
#' target year.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.mean.null
apply.stratified.mean.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }


  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]
    # Add in 0's to historical counts
    if (nrow(loc.subset) < n.years){
      zeros = n.years - nrow(loc.subset)
      mean.value = mean(c(loc.subset$count, rep(0, zeros)))
    }else{
      mean.value = mean(loc.subset$count)
    }

    distribution.samples = rep(mean.value, n.draws)

    out.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, out.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Historical Null
#'
#' Create a null model based on historical cases. This has the advantage of not
#' assuming a specific distribution, but the disadvantage of not interpolating
#' based on a distribution.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.historical.null
apply.historical.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }


  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]
    historical.counts = stats::aggregate(loc.subset$count, by = list(loc.subset$year), sum, na.rm = TRUE)
    historical.counts = historical.counts$x
    # Add in 0's to historical counts
    if (length(historical.counts) < n.years){
      zeros = n.years - length(historical.counts)
      historical.counts = c(historical.counts, rep(0, zeros))
    }

    # If historical counts is a single entry, the behavior of sample changes - so need to make sure it just draws from that one value, instead of from seq(1,value)
    if (length(historical.counts) == 1){
      historical.counts = rep(historical.counts, 2) # Make it two entries instead of one
    }

    #historical.counts = dplyr::count(human.data, vars = year)[ ,2] # Tibble wasn't playing well with sample
    distribution.samples = sample(historical.counts, n.draws, replace = TRUE) # Sample with replacement from the annual human case counts

    out.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, out.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}
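
# A small illustration of the base-R sampling gotcha handled above (wrapped
# in if (FALSE) so it does not run at build time): for a single numeric
# value x >= 1, sample(x, ...) draws from seq_len(x) rather than from x itself.
if (FALSE) {
  sample(5, 3, replace = TRUE)         # draws from 1:5, e.g. 2 4 1
  sample(rep(5, 2), 3, replace = TRUE) # always 5 5 5
}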

#' Apply Prior Year Null
#'
#' Use the prior year to predict the current year.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.prior.year.null
apply.prior.year.null = function(train.data, target.year, n.draws, draw.vec){
  locations = unique(train.data$location)
  n.years = length(unique(train.data$year))

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    prior.year = train.data$count[train.data$location == location & train.data$year == (target.year - 1)]
    # NOTE: strictly speaking, the draws are not doing anything useful other than putting the model predictions into the same format as the other null models
    distribution.samples = sample(rep(prior.year, 2), n.draws, replace = TRUE)
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Uniform Random Incidence
#'
#' Uses population data for prior year
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.random.incidence
apply.random.incidence = function(train.data, target.year, n.draws, draw.vec){

  # Calculate US-wide incidence
  n.years = length(unique(train.data$year))
  cases.per.year = sum(train.data$count) / n.years
  total.population = sum(train.data$POP[train.data$year == (target.year - 1)]) # Pull population from previous year
  incidence = cases.per.year / total.population

  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    pop = train.data$POP[train.data$location == location & train.data$year == (target.year - 1)]

    sim.vec = c()
    for (i in 1:n.draws){
      random.numbers = runif(pop)
      sim.cases = length(random.numbers[random.numbers < incidence])
      sim.vec = c(sim.vec, sim.cases)
    }

    distribution.samples = sim.vec
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Stratified Incidence
#'
#' Calculate incidence by county, then take random draws from the incidence based
#' on county population.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.incidence
apply.stratified.incidence = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    cases.per.year = sum(loc.subset$count) / n.years
    loc.population = sum(loc.subset$POP[loc.subset$year == (target.year - 1)]) # Pull population from previous year
    incidence = cases.per.year / loc.population

    if (incidence == 0){
      distribution.samples = rep(0, n.draws)
    }else{
      sim.vec = c()
      for (i in 1:n.draws){
        random.numbers = runif(loc.population)
        sim.cases = length(random.numbers[random.numbers < incidence])
        sim.vec = c(sim.vec, sim.cases)
      }

      distribution.samples = sim.vec
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}
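
# The per-person loop above simulates loc.population independent
# Bernoulli(incidence) trials, which is exactly a Binomial(loc.population,
# incidence) draw; the .v2 functions below use rbinom() directly for the same
# distribution at far lower cost. A small sketch with made-up numbers
# (wrapped in if (FALSE) so it does not run at build time):
if (FALSE) {
  pop = 50000; incidence = 2e-5
  sum(runif(pop) < incidence) # one simulated year, loop style
  rbinom(1, pop, incidence)   # one simulated year, single binomial draw
}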

#' Pooled Uniform Random Incidence (updated)
#'
#' Uses population data for prior year
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#' @param use.current Currently unused.
#'
#' @export apply.random.incidence.v2
apply.random.incidence.v2 = function(train.data, target.year, n.draws, draw.vec,
                                     use.current = 0){

  # Calculate US-wide incidence
  n.years = length(unique(train.data$year))
  cases.per.year = sum(train.data$count) / n.years
  total.population = sum(train.data$POP[train.data$year == (target.year - 1)]) # Pull population from previous year
  incidence = cases.per.year / total.population

  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    pop = train.data$POP[train.data$location == location & train.data$year == (target.year - 1)]

    distribution.samples = rbinom(n.draws, pop, incidence)
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}

#' Apply Stratified Incidence (updated)
#'
#' Calculate incidence by county, then take random draws from the incidence based
#' on county population.
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.stratified.incidence.v2
apply.stratified.incidence.v2 = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    cases.per.year = sum(loc.subset$count) / n.years
    loc.population = sum(loc.subset$POP[loc.subset$year == (target.year - 1)]) # Pull population from previous year
    incidence = cases.per.year / loc.population

    distribution.samples = rbinom(n.draws, loc.population, incidence)

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}


#' Apply a negative binomial model
#'
#' Fits a negative binomial based on historical cases. If there are not enough
#' historical cases to fit a model, the historical cases are duplicated to
#' enable a model fit.
>>>>>>> districts
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.negative.binomial
#'
apply.negative.binomial = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    # MASS::glm.nb requires a model formula (e.g., count ~ 1); MASS::fitdistr is used below to obtain a plain intercept-only negative binomial fit

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to fit the negative binomial at a higher level of stratification (e.g., state)
    }else{
        error.output = tryCatch({
          fit = suppressWarnings(MASS::fitdistr(loc.subset$count, 'negative binomial'))
          # Getting a warning: In densfun(x, parm[1], parm[2], ...) : NaNs produced
          # But I don't see any NaN values, and it produces model parameters that can be used to generate a distribution.

        },
        error=function(cond){
          message(sprintf("Negative Binomial optimization failed for %s. Doubled the input data to obtain a fit", location))
          fit = try.again(loc.subset, location)
          #fit = MASS::fitdistr(rep(loc.subset$count, 2), 'negative binomial')
          return(list("YES", fit))
        })
      # Check first if it is a list
      if (is.list(error.output)){
        if (length(error.output[[1]]) == 1){
          # Check if it was an error
          if (error.output[[1]] == "YES"){
            fit = error.output[[2]]
          }
        }
      }
      rm(error.output)

      distribution.samples = rnbinom(n.draws, size = fit$estimate[1], mu = fit$estimate[2])
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)
}


#' Try to fit a negative binomial, progressively modifying the data to obtain a fit
#'
#' @param loc.subset the input data set with cases in a 'count' field
#' @param location The location of the cases
#'
#' @return fit the fit object from a negative binomial model
#'
try.again = function(loc.subset, location){

  # Try just doubling the data
  is.error = 1
  try2 = tryCatch({
    fit = MASS::fitdistr(rep(loc.subset$count, 2), 'negative binomial')
    is.error = 0
  },
  error=function(cond){
    NULL # the handler cannot modify the enclosing scope; is.error stays 1
  })

  # 1 3 5 1 1 1 1 is a count sequence that can cause the fit to fail even after doubling
  # If that doesn't work, try resampling the doubled data, (up to 10 times)
  counter = 0
  while(is.error == 1){
    counter = counter + 1
    if (counter == 10){
      stop(sprintf("Could not find a fit or a work around for %s. Please remove from data set and try again with the rest of your data", location))
    }

    try2 = tryCatch({
      message(sprintf("Trying sampling the cases with replacement to allow a fit for %s", location))
      fit = MASS::fitdistr(sample(rep(loc.subset$count, 2), replace = TRUE), 'negative binomial')
      is.error = 0
    },
    error=function(cond){
      NULL # leave is.error at 1
    })

  }

  return(fit)
}
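
# A sketch of the failure mode try.again() works around (wrapped in
# if (FALSE) so it does not run at build time): low-variance count vectors
# can make the negative binomial optimizer diverge, and whether a given
# vector fails can depend on the optimizer's starting values.
if (FALSE) {
  x = c(1, 3, 5, 1, 1, 1, 1) # the problem sequence noted in the comment above
  fit = tryCatch(MASS::fitdistr(x, 'negative binomial'),
                 error = function(e) conditionMessage(e))
  fit # either a fitted model or the error message
}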

#' Apply Autoregressive 1 model
#'
#' Apply an Autoregressive 1 model assuming a normal distribution
#' (with negative values reclassified to be 0)
#'
#' @param train.data Training data to be used to calculate the mean values.
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' Retained here only for consistency in formatting with the probabilistic methods.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the predictions for each location for the target year
#'
#' @export apply.ar1
apply.ar1 = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  n.predictions = 1 # For now, just allow predictions one time step into the future.
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    # Clear any prior distribution.samples object to avoid accidental re-use
    # Note: assignments inside the error handler do NOT affect the enclosing
    # scope, but assignments in the tryCatch expression block do. See:
    # https://stackoverflow.com/questions/38482937/variable-scope-in-r-trycatch-block-is-necessary-to-change-local-variable-de/38483499
    if (exists('distribution.samples')){ rm(distribution.samples) } # This will give a warning if distribution.samples exists in the global scope but not the local scope!

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to fit the model at a higher level of stratification (e.g., state)
    }else{
      error.output = tryCatch({
        fit = stats::arima(loc.subset$count, order = c(1,0,0))
        next.prediction = stats::predict(fit, n.ahead = n.predictions)
        distribution.samples = rnorm(n.draws, mean = next.prediction$pred, sd = next.prediction$se)
        distribution.samples[distribution.samples < 0] = 0 # Change any negative predictions to be 0.
        # The last value of the expression block is returned by tryCatch as the default
      },
      error=function(cond){
        message(sprintf("AR model fitting failed for %s. Fit a normal distribution without an auto-regressive term (negative values reclassified to 0)", location))
        loc.mean = mean(loc.subset$count)
        loc.sd = sd(loc.subset$count)
        distribution.samples = rnorm(n.draws, mean = loc.mean, sd = loc.sd)
        distribution.samples[distribution.samples < 0] = 0 # Change any negative predictions to be 0.
        return(list("YES", distribution.samples))
      })
      # Check first if it is a list
      if (is.list(error.output)){
        if (length(error.output[[1]]) == 1){
          # Check if it was an error
          if (error.output[[1]] == "YES"){
            distribution.samples = error.output[[2]]
          }
        }
      }
      rm(error.output)
    }

    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }

  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)

}
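
# A minimal sketch of the AR(1) predictive step used above, with made-up toy
# counts (wrapped in if (FALSE) so it does not run at build time): fit an
# AR(1) with intercept, forecast one step ahead, then draw from a normal
# distribution with the forecast mean and standard error.
if (FALSE) {
  counts = c(0, 2, 1, 4, 3, 5, 2, 6)
  fit = stats::arima(counts, order = c(1, 0, 0))
  pred = stats::predict(fit, n.ahead = 1)
  draws = rnorm(100, mean = pred$pred, sd = pred$se)
  draws[draws < 0] = 0 # no negative case counts
}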

#' Uniform Null Model
#'
#' Apply a uniform distribution between 0 and the maximum number of cases observed
#' stratified by county.
#'
#' @param train.data Training data to be used to generate the null predictions
#' Must have a location field with locations, and a count field with number of cases.
#' @param target.year The year for the prediction. Included in the output data
#' frame to indicate the target year, but otherwise unused
#' @param n.draws The number of probabilistic draws to include in the data matrix.
#' @param draw.vec A vector of column names for the probabilistic draws.
#'
#' @return out.df A data frame containing the forecast for each location for the target year
#'
#' @export apply.uniform
apply.uniform = function(train.data, target.year, n.draws, draw.vec){

  n.years = length(unique(train.data$year))
  locations = unique(train.data$location)

  out.df = data.frame(location = NA, target_year = NA)
  for (i in 1:length(draw.vec)){  out.df[[draw.vec[i]]] = NA  }

  for (j in 1:length(locations)){
    #for (j in 1:10){ # FOR TESTING PURPOSES
    location = locations[j]
    loc.subset = train.data[train.data$location == location, ]

    if (max(loc.subset$count) == 0){
      distribution.samples = rep(0, n.draws) # Using 0's if no cases in a county. Alternative would be to draw at a higher level of stratification (e.g., state)
    }else{
      max.cases = max(loc.subset$count, na.rm = TRUE)
      distribution.samples = runif(n.draws, 0, max.cases)
      #**# Open question: should cases be rounded to the nearest integer? There will never be fractional cases, but an intermediate value might score partial credit on either bound.
    }
    this.record = c(location, target.year, distribution.samples)
    out.df = rbind(out.df, this.record)
  }


  for (k in 3:ncol(out.df)){
    out.df[ ,k] = as.numeric(out.df[ ,k])
  }

  # Return out.df without the initialization row
  out.df = out.df[2:nrow(out.df), ]
  return(out.df)

}