R/rf1_main.R

# Code for building package (Not Run)
#devtools::document()
#devtools::check()

#' @importFrom caret RMSE
#' @importFrom psych pairs.panels
#' @importFrom randomForest randomForest
#' @importFrom quantregForest quantregForest
#' @importFrom grDevices dev.off tiff
#' @importFrom graphics barplot par plot segments text
#' @importFrom stats aggregate as.formula cor median na.exclude optimize predict qchisq runif uniroot rnorm
#' @importFrom utils read.csv write.table
#' @importFrom usethis use_data
#' @importFrom dplyr '%>%'
NULL

#' Run the rf1 model
#'
#' The RF1 model first fits a random forest, then excludes all variables with
#' importance scores below the mean importance (NOTE: This will be problematic
#' if all input variables are relevant). It then further pares down the variable
#' list using variance partitioning. It retains only variables that contribute
#' uniquely to explaining variation in the unmeasured year (via
#' leave-one-year-out cross-validation). The RF1 model uses the randomForest
#' package by Liaw & Wiener 2002, which implements the Random Forest method
#' developed by Breiman 2001. MLE calculations from MLE_IR.R were written by
#' Williams and Moffitt 2005 (reformatted by A. Keyel). Note that an arbitrary
#' starting seed is set to ensure that results are repeatable.
#'
#' Citations
#'    Breiman, L. 2001. Random forests. Machine Learning 45: 5-32
#'    Keyel, A.C. et al. 2019 PLOS ONE 14(6): e0217854. https://doi.org/10.1371/journal.pone.0217854
#'    Liaw & Wiener 2002. Classification and Regression by randomForest. R News 2: 18-22
#'    Williams, C. and C. Moffitt 2005. Estimation of pathogen prevalence in pooled samples
#'    using maximum likelihood methods and open source software. Journal of Aquatic Animal Health 17: 386-391
#'
#' @param forecast.targets A vector containing options of what to forecast.
#' 'annual.human.cases' generates human case predictions, while 'seasonal.mosquito.MLE'
#' generates seasonal mosquito infection rate (MLE) predictions
#' @param human.data Data on human cases of the disease. Must be formatted with
#'   two columns: location and date. The location column contains the spatial
#'   unit (typically county), while the date corresponds to the date of the
#'   onset of symptoms for the human case.
#' @param mosq.data Data on mosquito pools tested for the disease. Must be
#'   formatted with 4 columns: location (the spatial unit, e.g. county),
#'   col_date: the date the mosquitoes were tested, wnv_result: whether or not
#'   the pool was positive, pool_size: the number of mosquitoes tested in the
#'   pool. A fifth column, species, is optional and is not used by the code
#' @param weather.data Data on weather variables to be included in the analysis.
#'   See the read.weather.data function for details about data format. The
#'   read.weather.data function from ArboMAP is a useful way to process one or
#'   more data files downloaded via Google Earth Engine.
#' @param weekinquestion The focal week for the forecast. For the Random Forest
#'   model, this will be the last day used for making the forecast
#' @param rf1.inputs Inputs specific to the RF1 model, see
#'   \code{\link{rf1.inputs}}. If this model is not included, this should be set
#'   to 'none' or omitted from the function call #**# LINK TO AN OBJECT WITH
#'   MORE DETAILS
#' @param results.path The base path in which to place the modeling results.
#'   Some models will create sub-folders for model specific results
#' @param id.string An id to use for labeling the aggregations across all locations
#' @param break.type The temporal frequency to use for the data. The default is
#'   'seasonal' which breaks the environmental data into January, February,
#'   March; April, May, June; July, August, September; October, November,
#'   December. Other options may be supported in the future.
#' @param response.type Whether data should be treated as continuous (mosquito
#'   rates, number of cases) or binary (0 or 1).
#' @param quantile.model Whether (1) or not (0) to use a quantile random forest
#' for the final model output. All other calculations and model fitting use the
#' standard randomForest package.
#' @param n.draws The number of random realizations to draw for the RF1.distributions object
#' @param bins Bin break points for the CDC forecast challenge
#' @param use.testing.objects An indicator. If TRUE, the analysis will not run,
#' but will load previously saved outputs in order to expedite testing the formatting
#' of the code outputs.
#'
#' @return Four outputs are generated: The Results dataframe, the Distributions dataframe, the Bins dataframe, and the model object results
#'
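#' @examples
#' \dontrun{
#' # Illustrative sketch only: the file name, counties, years, and the
#' # my.weather.data object below are placeholders, not shipped example data.
#' my.inputs = list(list(), c(),         # files.to.add, merge.type.vec (none here)
#'                  c("ALBANY", "ERIE"), # analysis.counties
#'                  seq(2010, 2020),     # analysis.years
#'                  NA, NA, NA)          # user.drop.vars, mosq.model, human.model
#' out = rf1(forecast.targets = "annual.human.cases",
#'           human.data = "human_cases.csv", mosq.data = NA,
#'           weather.data = my.weather.data, weekinquestion = "2020-07-15",
#'           rf1.inputs = my.inputs, results.path = "results", id.string = "NY")
#' RF1.results = out[[1]]
#' }
#'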
#' @export rf1
rf1 = function(forecast.targets, human.data, mosq.data, weather.data,
               weekinquestion, rf1.inputs, results.path, id.string, break.type = "seasonal", response.type = "continuous",
               quantile.model = 1, n.draws = 1000, bins = c(0,seq(1,51,1),101,151,201,1000),
               use.testing.objects = FALSE){
  
  out = FormatDataForRF1(human.data, mosq.data, weekinquestion, weather.data, rf1.inputs, results.path, break.type)
  my.data = out[[1]]
  independent.vars = out[[2]]
  
  # Split the data set into the forecast year and the historical data
  forecast.year = as.numeric(substr(as.character(weekinquestion), 1, 4))
  
  forecast.data = my.data[my.data$year == forecast.year, ]
  historical.data = my.data[my.data$year < forecast.year, ] #**# This will prevent later years from informing hindcasts of earlier years

  message(sprintf("nrow(historical.data) = %s", nrow(historical.data)))
  #message(paste(colnames(my.data), collapse = ', '))
  message("Independent Vars:")
  message(paste(independent.vars, collapse = ', '))
  
  #message(sprintf("Forecast data rows: %s", nrow(forecast.data)))
  #message(forecast.year)
  if (nrow(forecast.data) == 0){ stop(sprintf("Forecast subset has no data for forecast year %s. Please ensure that all temporally-merged data sets reach the final year.", forecast.year))}
  #stop("Need to fix stuff, maybe")
  
    
  #**# How do I go from a MIR to positive location weeks? Isn't the MIR more useful?
  # First question: Could estimate number of trap nights, number of mosquitoes sampled, and then use the infection rate to get an estimate of the number of positive location-weeks.
  # For second question - does seem like a lot of work to get at a number that tells us what? In ArboMAP it is used to estimate human cases, but we do that directly.
  #**# So for now, maybe the mosquito results aren't relevant? Or phrased better - are not relevant to the specific comparison question being asked.
  annual.positive.location.weeks = NA

  # Create output objects
  RF1.results = data.frame(forecast.target = NA, location = NA, value = NA)
  RF1.distributions = data.frame(forecast.target = NA, location = NA)
  for (i in 1:n.draws){
    new.col = sprintf("DRAW%s", i)
    RF1.distributions[[new.col]] = NA}
  RF1.bins = data.frame(location = NA, target = 'Total WNV neuroinvasive disease cases',
                        type = 'bin', unit = 'cases', bin_start_inclusive = NA, bin_end_notinclusive = NA )
    
  #**# WHAT AM I DOING WITH INFECTION RATE? PREVIOUSLY I ESTIMATED IT OVER THE ENTIRE YEAR, BUT HERE, THE DATA MIGHT BE MISSING OR FOR PART OF THE YEAR.
  #**# I could use the forecast mosquito infection rate? Would that be more useful than just leaving it out?
  # NOTE: the mosquito-only analysis is not directly connected to the prediction of human cases #**# Is there a way to do that, if human cases are not directly estimated?
  if ('seasonal.mosquito.MLE' %in% forecast.targets){
    # First forecast mosquito infection rates
    mosq.model = rf1.inputs[[6]]
    
    # If the mosquito model needs to be estimated, estimate it
    if (is.na(mosq.model)){
      dep.var = "IR"
      # drop IR variable from independent.vars
      m.independent.vars = independent.vars[independent.vars != "IR"]
      mosquito.results.path = sprintf("%s/mosquitoes", results.path)
      m.label = "mosquitoes" #**# THINK ABOUT THIS AND WHETHER THIS SHOULD BE AN ENTRY in rf1.inputs
      
      # Load previously run data for some testing situations
      if (use.testing.objects == 1){
        mosquito.results = rf1::mosquito.results
      }else{
        mosquito.results = do.rf(historical.data, dep.var, m.independent.vars, mosquito.results.path,
                                 response.type = response.type, label = m.label, quantile.model = quantile.model) #do.spatial = 0, create.test.set = 0, create.ci.null = 0
      }
      mosq.model = mosquito.results[[1]]
      kept.vars = mosquito.results[[6]]
    }
    
    # Update RF1.results
    if (quantile.model == 1){
      data.subset = forecast.data[ ,kept.vars] #**# HOPE THIS WORKS
      predictions = predict(mosq.model, data.subset, what = c(0.5)) # Make a single prediction at the 50% quantile value
      dist.predictions = predict(mosq.model, data.subset, what = runif(n.draws)) # select random quantile values. #**# Should talk with someone if this is a legitimate way to sample a distribution
    }else{
      predictions = predict(mosq.model, newdata = forecast.data)
      dist.predictions = rep(predictions, n.draws)
      dist.predictions = matrix(dist.predictions, ncol = n.draws) #**# Confirm the order is correct on this. Should be - should fill by column first.
    }
    
    locations = forecast.data$location
    targets = rep('seasonal.mosquito.MLE', length(predictions))
    mosq.records = data.frame(forecast.target = targets, location = locations, value = predictions)
    RF1.results = rbind(RF1.results, mosq.records)

    # Add statewide result
    seasonal.mosquito.MLE = mean(predictions, na.rm = TRUE) #**# NOTE: perhaps locations should be weighted in this estimate
    statewide.record = c('seasonal.mosquito.MLE', sprintf("%s-STATEWIDE", id.string), seasonal.mosquito.MLE)
    RF1.results = rbind(RF1.results, statewide.record)
    
    # Update RF1.distributions & add statewide result
    mosq.distributions = cbind(targets, locations, dist.predictions)
    colnames(mosq.distributions) = c('forecast.target', 'location', sprintf("DRAW%s", seq(1,n.draws)))
    RF1.distributions = rbind(RF1.distributions, mosq.distributions)
    statewide.prediction = apply(dist.predictions, c(2), mean, na.rm = TRUE) # Should take average MLE across all counties. #**# Has some obvious statistical issues, but not sure how else to do it.
    statewide.distribution = c('seasonal.mosquito.MLE', sprintf("%s-STATEWIDE", id.string), statewide.prediction) 
    names(statewide.distribution) =  c('forecast.target', 'location', sprintf("DRAW%s", seq(1,n.draws)))
    RF1.distributions = rbind(RF1.distributions, statewide.distribution)
        
    # Update RF1.bins
    #**# NOT YET SCRIPTED
  }
  
  # Run the human analysis (or not)
  if ('annual.human.cases' %in% forecast.targets){
    # Second, forecast human cases
    human.model = rf1.inputs[[7]]
    
    # If the human model needs to be estimated, estimate it
    if (is.na(human.model)){
      dep.var = "Cases"
      human.results.path = sprintf("%s/humans", results.path)
      h.label = "humans"
      
      # Load previously run data for some testing situations
      if (use.testing.objects == 1){
        human.results = rf1::human.results
      }else{
        human.results = do.rf(historical.data, dep.var, independent.vars, human.results.path,
                              label = h.label, response.type = response.type,
                              quantile.model = quantile.model)
      }
      human.model = human.results[[1]]
      kept.vars = human.results[[6]]
    }
    
    
    if (quantile.model == 1){
      data.subset = forecast.data[ , kept.vars]
      predictions = predict(human.model, data.subset, what = 0.5)
      dist.predictions = predict(human.model, data.subset, what = runif(n.draws))
    }else{
      predictions = predict(human.model, newdata = forecast.data)
      dist.predictions = rep(predictions, n.draws)
      dist.predictions = matrix(dist.predictions, ncol = n.draws) #**# Confirm correct matrix formation
    }

    locations = forecast.data$location
    targets = rep('annual.human.cases', length(predictions))
    human.records = data.frame(forecast.target = targets, location = locations, value = predictions)
    RF1.results = rbind(RF1.results, human.records)
    
    statewide.cases = sum(predictions)
    statewide.record = c('annual.human.cases', sprintf("%s-STATEWIDE", id.string), statewide.cases)
    RF1.results = rbind(RF1.results, statewide.record)
    
    # Update forecast distributions
    human.distributions = cbind(targets, locations, dist.predictions)
    colnames(human.distributions) = c('forecast.target', 'location', sprintf("DRAW%s", seq(1,n.draws)))
    RF1.distributions = rbind(RF1.distributions, human.distributions)
    statewide.prediction = apply(dist.predictions, c(2), sum, na.rm = TRUE) # Sum across counties to get a statewide case count for each draw #**# Has some obvious statistical issues, but not sure how else to do it.
    statewide.distribution = c('annual.human.cases', sprintf("%s-STATEWIDE", id.string), statewide.prediction) 
    names(statewide.distribution) =  c('forecast.target', 'location', sprintf("DRAW%s", seq(1,n.draws)))
    RF1.distributions = rbind(RF1.distributions, statewide.distribution)

    # Update forecast bins
    #**# NOT SCRIPTED
    
  }
  
  # Remove the leading NA row
  RF1.results = RF1.results[2:nrow(RF1.results), ]
  RF1.distributions = RF1.distributions[2:nrow(RF1.distributions), ]
  #RF1.bins = RF1.bins[2:nrow(RF1.bins), ]

  # Check that fields come out as appropriate format
  RF1.results$value = as.numeric(as.character(RF1.results$value))
  for (i in 1:n.draws){
    RF1.distributions[ , (i + 2)] = as.numeric(as.character(RF1.distributions[ , (i + 2)]))
  }

  # Set up other outputs
  RF1.other = list()
  if ('annual.human.cases' %in% forecast.targets){
    RF1.other$human.results =  human.results
  }
  if ('seasonal.mosquito.MLE' %in% forecast.targets){
    RF1.other$mosquito.results = mosquito.results
  }
    
  return(list(RF1.results, RF1.distributions, RF1.bins, RF1.other = RF1.other))
}

#' rf1.inputs
#'
#' The model-specific inputs required to run the RF1 model
#'
#' @param files.to.add A list of file names of other data sources to be
#' included in the Random Forest model. If no additional data is to be added,
#' this should be an empty list (i.e. list()) or NA.
#' @param merge.type.vec A vector identifying how the files should be joined to
#' the mosquito and human data. Options are spatial_temporal where merges will
#' be performed on county and year (e.g., climate data); state_temporal where
#' merges will be performed on state and year (e.g. BBS data at state level),
#' and spatial, where merges will be performed on county only (e.g., landcover
#' and census data). This must have the same length as files.to.add, and if no
#' additional data is to be added, this should be an empty vector (i.e. c()) or NA.
#' @param analysis.counties A list of counties included in the analysis (this
#' is for ensuring that a county and year that does not have a human case is
#' treated as a 0)
#' @param analysis.years A list of years included in the analysis (this is for
#' ensuring that a county and year that does not have a human case is treated
#' as a 0 rather than as missing) #**# This is problematic if a specific county
#' or year is missing.
#' @param user.drop.vars A list of independent variables that should be
#' excluded from the analysis. If no variables should be excluded, this should
#' be an empty vector c() or NA.
#' @param mosq.model The Random Forest model to use for forecasting mosquito
#' infection rates. If this is to be fitted from the empirical data, this
#' should be set to NA
#' @param human.model The Random Forest model to use for forecasting human
#' cases. If this is to be fitted from the empirical data, this should be set
#' to NA.
#' @param no.data.exceptions #**# COMING SOON A list of county-years within the
#' range of analysis.years and analysis.counties that are missing data rather
#' than 0's
#'
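#' @examples
#' # A minimal sketch of the positional list structure that rf1() unpacks;
#' # the file name, counties, and years below are placeholders.
#' rf1.inputs = list(
#'   list("landcover.csv"),   # files.to.add
#'   c("spatial"),            # merge.type.vec
#'   c("ALBANY", "ERIE"),     # analysis.counties
#'   seq(2010, 2020),         # analysis.years
#'   NA,                      # user.drop.vars
#'   NA,                      # mosq.model (NA: fit from the data)
#'   NA)                      # human.model (NA: fit from the data)
#'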
#' @name rf1.inputs
NULL
#**# NOTE: DUPLICATED IN DOCUMENTATION IN DFMIP. NEED TO FIGURE OUT HOW TO COORDINATE


#' Random Forest 1 Model Outputs
#'
#' @description other.outputs$rf1 gives a list of 2 objects. The first is the
#' mosquito results, the second is the human results. Each of these objects is a
#' list of objects. The first, $MODEL, is the Random Forest model object, which
#' can be used to generate novel predictions. The second, $TEMPORAL.ACCURACY, is
#' a summary of the cross-validation accuracy, with $OVERALL giving the overall
#' accuracy; other accuracy results include $RMSE, $SPEARMAN, $N, and $ERROR.DF.
#' The third is $SPATIAL.ACCURACY, which performs the same cross-validation,
#' except on spatial units instead of on years. This has the same structure as
#' $TEMPORAL.ACCURACY. The fourth is $TEMPORAL.NULL, which contains two null
#' model results for comparison ($NULL.MEAN and $NULL.CI) each with the same
#' structure as $TEMPORAL.ACCURACY. The fifth is $SPATIAL.NULL, which contains
#' null model results for spatial structure, following the same structure as
#' $TEMPORAL.NULL. The sixth object is $RETAINED.VARS, which includes a list of
#' independent variables that were retained in the final model. (NOTE: These
#' can also be extracted from the $MODEL object)
#'
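#' @examples
#' \dontrun{
#' # Illustrative access pattern based on the structure described above
#' # ('other.outputs' is assumed to come from the surrounding dfmip workflow):
#' human.results = other.outputs$rf1[[2]]   # second element: the human results
#' human.results$MODEL                      # the fitted random forest model
#' human.results$TEMPORAL.ACCURACY$OVERALL  # overall cross-validation accuracy
#' human.results$RETAINED.VARS              # variables kept in the final model
#' }
#'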
#' @name rf1.outputs
NULL
#**# Link to documentation from main RF1 description
#**# NOTE: DUPLICATED IN DOCUMENTATION IN DFMIP. NEED TO FIGURE OUT HOW TO
#**# USE A LINK TO JUST HAVE ONE COPY OF DOCUMENTATION
#**# SWITCH DESCRIPTION TO USE A TABLE/LIST FORMAT

#' A random forest based disease forecasting approach
#'
#'@description The do.rf function runs the random forest portion of the code
#'
#'@details The majority of the code was written in wnv_hlpr.R; January - June 2018.
#'Transferred to this file & modified beginning in March 2019. Code adapted to
#'use common inputs with the ArboMAP code of Davis & Wimberly
#' 
#' @param trap.data A data frame containing human cases, mosquito infection rate (if applicable),
#' and independent variables for prediction of human cases and/or mosquito infection rates
#' @param dep.var The field name containing the dependent variable to analyze
#' @param independent.vars The field names of the independent variables to include in the analysis.
#'   Variables not listed here but present in the data frame will be excluded from the analysis.
#' @param results.path The base path in which to place the modeling results.
#'   Some models will create sub-folders for model specific results
#' @param do.spatial Whether or not to do spatial cross-validation (can be time-consuming)
#' @param create.ci.null Whether or not to generate null model results
#' @param label A label for the analysis run
#' @param response.type Whether data should be treated as continuous (mosquito
#'   rates, number of cases) or binary (0 or 1).
#' @param exploratory Whether to identify the best model (exploratory = TRUE),
#'   or whether to run the random forest with all input independent variables (exploratory = FALSE)
#' @param input.seed A seed for the random number generator to ensure that the results are repeatable
#' @param temporal.field The field containing the temporal units
#' @param spatial.field The field containing the spatial units
#' @param quantile.model Whether (1) or not (0) to use a quantile random forest
#'   for the final model output. All other calculations and model fitting use the
#'   standard randomForest package.
#' @param display.messages Whether or not update messages should be output
#' @param make.panel.plot Whether (1) or not (0) to write a pairs panel plot of the
#'   independent variables (for checking collinearity) to the results path
#' 
#' @return A list containing:\tabular{ll}{
#' MODEL \tab the random forest prediction model\cr
#' TEMPORAL.ACCURACY \tab  an accuracy assessment using leave one year out cross-validation\cr
#' SPATIAL.ACCURACY \tab an accuracy assessment based on leave one location out cross-validation\cr
#' TEMPORAL.NULL \tab Null model results based on estimates across time\cr
#' SPATIAL.NULL \tab Null model results based on estimates across space\cr
#' RETAINED.VARS \tab Variables retained in the final prediction model\cr} 
#' 
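#' @examples
#' \dontrun{
#' # Hypothetical call: 'my.data' and the variable names are placeholders and
#' # must match columns produced by the data-formatting step (including a
#' # location_year field, which is used internally as a merge ID).
#' fit = do.rf(trap.data = my.data, dep.var = "Cases",
#'             independent.vars = c("TMEANC_1", "TMEANC_2", "PR_1", "PR_2"),
#'             results.path = "results", label = "humans")
#' fit$MODEL                        # final random forest (quantregForest by default)
#' fit$TEMPORAL.ACCURACY$OVERALL    # leave-one-year-out accuracy summary
#' fit$RETAINED.VARS                # variables retained after variance partitioning
#' }
#'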
#' @export do.rf
#'
do.rf = function(trap.data, dep.var, independent.vars, results.path, do.spatial = 0,
                 create.ci.null = 0, label = "", response.type = "continuous", exploratory = TRUE,
                 input.seed = 20180830, temporal.field = "year", spatial.field = "location",
                 quantile.model = 1, display.messages = 1, make.panel.plot = 1){
  
  #require(psych)
  
  response.types = c("continuous", "binary")
  if (!response.type %in% response.types){ stop(sprintf("Response type must be %s, and is case sensitive", paste(response.types, collapse = ", "))) }
  
  # Reset the seed to ensure that the random elements in the code are repeatable
  set.seed(input.seed)
  
  message(nrow(trap.data))
  message(dep.var)
  message(names(trap.data))
  
  # Subset data to just non-NA values
  trap.data = trap.data[!is.na(trap.data[[dep.var]]), ]
  
  # Add MERGE_ID #**# FIX THIS TO BE MORE GENERAL / MORE ROBUST
  trap.data$MERGE_ID = trap.data$location_year
  
  #**# Also formally output these somewhere  Report after test data set is removed
  # Report sample size
  N = nrow(trap.data)
  NO.IR.N = nrow(trap.data[is.na(trap.data$IR), ]) # Get number of NA values for Infection Rate
  IR.not.zero = nrow(trap.data[!is.na(trap.data$IR) & trap.data$IR > 0, ]) # Exclude NA infection rates so they are not counted here as well
  WNV.N = IR.not.zero + NO.IR.N
  if (display.messages == 1){
    message(sprintf("Total N: %s\nWNV+ N: %s\nNo IR N: %s", N, WNV.N, NO.IR.N))
    message("These numbers are not valid for the human case analyses")
  }
  
  
  # Set up the dependent variable
  if (response.type == "continuous"){ trap.data[[dep.var]] = as.numeric(as.character(trap.data[[dep.var]])) }
  if (response.type == "binary"){ trap.data[[dep.var]] = as.factor(trap.data[[dep.var]]) }
  
  message(nrow(trap.data))
  message(paste(independent.vars, collapse = ', '))
  
  ### EXAMINE PREDICTORS FOR COLLINEARITY
  # Run for Everything #**# Put someplace that makes more sense
  if (make.panel.plot == 1){
    correlation.path = sprintf("%s/ModelResults/correlations", results.path)
    dir.create(correlation.path, showWarnings = FALSE, recursive = TRUE)
    tiff(filename = sprintf("%s/pairspanel_%s.tif", correlation.path, label), height = 3000, width = 3000)
    message(paste(independent.vars, collapse = ', '))
    try.output = try(psych::pairs.panels(trap.data[ , independent.vars]))
    if (inherits(try.output, "try-error")){ # pairs.panels failed, most likely due to non-numeric variables
      m1 = "One or more independent variables is character and not numeric. Please correct this. "
      m2 = "Independent variables are: "
      stop(sprintf("%s%s%s. Actual error message was:\n %s", m1, m2, paste(independent.vars, collapse = ", "), try.output[1]))
    }
    dev.off()
  }
  
  if (exploratory == TRUE){
    ### RUN MODEL FOR ALL PREDICTOR VARIABLES
    f = as.formula(paste(dep.var, ' ~ ', paste(independent.vars, collapse = "+")))
    
    best.m = get.best.m(f, independent.vars, trap.data, response.type)
    #best.m = 3 #**#
    
    # Regenerate best model using more trees
    if (display.messages == 1){  message("Creating the model with the best m using 5000 trees")  }
    rf.model = randomForest(f, data = trap.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    
    important.stuff = sort(rf.model$importance[,1], decreasing = TRUE)
    
    # Show plot of mean importance, with a line indicating the mean importance cut-off
    importance.path = sprintf("%s/ModelResults/importance", results.path)
    dir.create(importance.path, showWarnings = FALSE, recursive = TRUE)
    tiff(filename = sprintf("%s/importance_%s.tif", importance.path, label))
    plot(seq(1,length(important.stuff)), important.stuff)
    segments(0,mean(important.stuff), length(independent.vars), mean(important.stuff))
    dev.off()
    
    # Keep any variables above the mean importance
    best.vars = names(important.stuff)[important.stuff > mean(important.stuff)]
    
    if(display.messages == 1){  message("Creating a refined model that only includes variables of high importance")  }
    f2 = as.formula(paste(dep.var, ' ~ ', paste(best.vars, collapse = "+")))
    best.m = get.best.m(f2, best.vars, trap.data, response.type)# Recreate best.m variable for the subset. This should remove one of the warnings R was giving me
    #best.m = 3 #**#
    rf.model2 = randomForest(f2, data = trap.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    
    ## Refine variable set based on variance partitioning results
    # Calculate a preliminary temporal accuracy to provide a baseline
    temporal.accuracy.prelim = systematic.validation(trap.data, dep.var, f2, best.m, "year", response.type, display.messages)
    temporal.accuracy.R2 = temporal.accuracy.prelim$OVERALL[["R2.sasha"]]
    correlation.threshold = "" # 0.5 #**# not using this aspect initially
    drop.thresholds = c(0, 0.001, 0.002, 0.005, 0.01) #**# Try a vector of thresholds. Use the threshold that corresponds to the fewest variables with no more than a 5% decrease in prediction accuracy
    
    # Run variance partitioning
    kept.vars = do.variance.partitioning(trap.data, dep.var, best.vars, best.m, temporal.accuracy.R2, drop.thresholds,
                                         correlation.threshold, response.type, do.spatial, results.path,
                                         spatial.field, temporal.field, label, temporal.field, display.messages)
    
    kept.vars = as.character(kept.vars) # Ensure they are not treated as factors
    # Re-run the model with just those that are kept after the variance partitioning process
    f3 = as.formula(paste(dep.var, ' ~ ', paste(kept.vars, collapse = "+")))
    best.m = get.best.m(f3, kept.vars, trap.data, response.type)# Recreate best.m variable for the subset. This should remove one of the warnings R was giving me
    #best.m = 3 #**#
    if (quantile.model == 1){
      y = trap.data[[dep.var]]
      #message(y[1:5])
      #message(paste(kept.vars, collapse = ', '))
      x.df = trap.data[ , kept.vars]
      # if only one variable is left, R will drop the matrix formatting and make it a vector. SOOO annoying. But can only apply this step if kept.vars = 1, because otherwise R was corrupting the data inexplicably
      if (length(kept.vars) == 1){
        x.df = matrix(x.df, ncol = length(kept.vars)) 
      }
      message(length(y))
      message(length(x.df[ ,1]))
      message(length(kept.vars))
      message(ncol(x.df))
      
      #message(head(x.df))
      rf.model3 = quantregForest(x.df, y, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    }else{
      rf.model3 = randomForest(f3, data = trap.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    }
    
  }
  
  # Just run the model for the input independent variables
  if (exploratory == FALSE){
    f3 = as.formula(paste(dep.var, ' ~ ', paste(independent.vars, collapse = "+")))
    best.m = get.best.m(f3, independent.vars, trap.data, response.type) # Create best.m for the full input variable set
    kept.vars = as.character(independent.vars) # For results output
    
    if (quantile.model == 1){
      y = trap.data[[dep.var]]
      x.df = trap.data[ , kept.vars]
      if (length(kept.vars) == 1){ x.df = matrix(x.df, ncol = 1) } # if only one variable is left, R drops to a vector; restore matrix formatting
      rf.model3 = quantregForest(x.df, y, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    }else{
      rf.model3 = randomForest(f3, data = trap.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = best.m, ntree = 5000)
    }
  }
  
  ## Temporal validation step
  if (display.messages == 1){  message("Calculating temporal accuracy statistics")  }
  temporal.accuracy = systematic.validation(trap.data, dep.var, f3, best.m, temporal.field, response.type, display.messages) # formerly "TEMPORAL" instead of being temporal.field
  
  if (do.spatial == 1){
    ## Spatial validation step
    if (display.messages == 1){ message("Calculating spatial accuracy statistics") }
    spatial.accuracy = systematic.validation(trap.data, dep.var, f3, best.m, spatial.field, response.type, display.messages) #**# formerly SPATIAL instead of location
  }else{
    if (display.messages == 1){message("Skipping spatial accuracy assessment (may take > 17 hours for some analyses)")}
    spatial.accuracy = NA
  }
  
  if (display.messages == 1){message("Creating null models")}
  temporal.null = null.validation(trap.data, dep.var, temporal.field, create.ci.null, display.messages)
  spatial.null = null.validation(trap.data, dep.var, spatial.field, create.ci.null, display.messages)
  
  if (quantile.model == 0){
    # Write overall model observations and predictions to file #**# But these will be predictions from the same data used to generate the model. Yes. But the accuracy was assessed separately, and those will be the accuracy metrics reported.
    if (display.messages == 1){message("Writing predictions")}
    write.predictions(trap.data, dep.var, rf.model3, results.path, spatial.field, temporal.field, label)
  }
  
  #**# MAKE OUTPUTS #**# CLEAN THIS UP
  #make.maps(model.results, spatial.resolution, temporal.resolution, results.path)
  #plot.residuals(model.results, spatial.resolution, temporal.resolution)
  if (do.spatial == 1){
    if(display.messages == 1){ message("Creating barplots")}
    spatial.temporal.barplots(temporal.accuracy, spatial.accuracy, spatial.field, temporal.field, results.path, label)
  }
  
  return(list(MODEL = rf.model3, TEMPORAL.ACCURACY = temporal.accuracy,
              SPATIAL.ACCURACY = spatial.accuracy, TEMPORAL.NULL = temporal.null,
              SPATIAL.NULL = spatial.null, RETAINED.VARS = kept.vars))
}


#' Convert data in the format for ArboMAP to a format that can be used by the
#' RF1 model
#'
#' @param human.data human data from the main input, but read into memory
#' @param mosq.data mosquito data from main input, but read into memory
#' @param weekinquestion #**# ADD
#' @param weather.data non-temporal data from main input
#' @param rf1.inputs see \code{\link{rf1.inputs}}
#' @param results.path #**# ADD
#' @param break.type #**# ADD
#'
#' @noRd
#'
FormatDataForRF1 = function(human.data, mosq.data, weekinquestion, weather.data, rf1.inputs, results.path, break.type){

  # If it is NA or any other single entry, do not process the mosquito data
  if (length(mosq.data) != 1){
    # Need to convert mosquito data into mosquito infection rates (uses MLE code, which is not mine. Need to wait for a response from Moffitt - perhaps a phone call if there is no email response.)
    temporal.resolution = "annual"

    #Switch to a location_year naming convention & add year field from col_date field
    mosq.data$year = mapply(substr, mosq.data$col_date, nchar(as.character(mosq.data$col_date)) - 3, nchar(as.character(mosq.data$col_date)))
    mosq.data$year = as.numeric(mosq.data$year)
    mosq.data = district.to.location(mosq.data, "mosq.data")
    
    md.data = calculate.MLE.v2(mosq.data, temporal.resolution)
  }
  
  # Need to convert human data to cases per county per year, and add 0's for county years without cases
  all.locations = as.character(rf1.inputs[[3]]) # Get list of counties
  all.years = as.numeric(as.character(rf1.inputs[[4]])) # Get list of all years of analysis
  # Ensure human data is in data frame format, and not just a file name
  if (typeof(human.data) == "character"){  human.data = read.csv(human.data)  }
  
  hd.data = convert.human.data(human.data, all.locations, all.years)
  #message(max(hd.data$year, na.rm = TRUE))
  #message(weekinquestion)
  #message(break.type)
  breaks = assign.breaks(weekinquestion, break.type)
  #message(paste(breaks, collapse = ", "))
  
  # Need to aggregate the climate data by season & merge with static.data
  # Only process if weather.data is not NA
  if (length(weather.data) != 1){
    weather.data = district.to.location(weather.data)
    env.data = convert.env.data(weather.data, all.locations, breaks) #**# Could save this for faster re-use. How should I do that?
    #**# Watch for problem where location_year contains lower-case entries, while district has been converted to upper case for merging purposes.
  }else{
    #**# Watch input field names. These will need to be standardized, and the standardized names listed in the documentation
    # Otherwise, initialize an empty env.data with only the location_year, county, and year fields
    in.counties = rep(all.locations, length(all.years))
    in.years = sort(rep(all.years, length(all.locations)))
    env.data = data.frame(location = in.counties, year = in.years, location_year = sprintf("%s_%s", in.counties, in.years))
  }
  
  #message(max(env.data$year))
  
  env.data = add.rf1.inputs(env.data, rf1.inputs, breaks)
  #message(max(env.data$year))
  
  if (length(unique(env.data$location)) != length(all.locations)){
    m1 = "Not all records were joined for all environmental variables."
    m2 = sprintf("There should have been %s locations, but the environmental data only has %s unique locations",
                 length(all.locations), length(unique(env.data$location)))
    stop(sprintf("%s %s", m1, m2))
  }
  
  if (length(mosq.data) == 1){  my.data = hd.data
  }else{
    # Merge on county.year. Keep only records that have mosquito data
    my.data = merge(md.data, hd.data, by = "location_year", all.x = TRUE) #, all = TRUE
    my.data = cleanup.garbage(my.data)
  }
  
  # Merge to environmental data.
  my.data = merge(my.data, env.data, by = "location_year") # , all = TRUE
  my.data = cleanup.garbage(my.data)
  
  #message(max(my.data$year))
  
  # Pull independent variable names from the climate data. Add those from the mosquito data
  independent.vars = colnames(my.data)
  
  #**# maybe independent.vars should be an input? That might be easier, and give the user more control. But how will the user know what fields there are?
  
  # Remove non-analysis variables #**# This needs an upgrade to allow a user input and user control!
  user.drop.vars = rf1.inputs[[5]]
  drop.vars = c(user.drop.vars, "location_year", "GROUP", "CI.lower", "CI.upper", "Cases", "YEAR", "year", "breaks", "location") #**# I have a names problem! Should really clean up the name usage!
  
  for (drop.var in drop.vars){
    if (drop.var %in% independent.vars){
      independent.vars = independent.vars[independent.vars != drop.var]
    }
  }
  
  # Ensure variables come out as numeric, unless they are specified as factors
  for (var in (independent.vars)){
    # If it is not a factor, ensure it is numeric
    if (!is.factor(my.data[[var]])){
      my.data[[var]] = as.numeric(as.character(my.data[[var]]))
    }
  }
  
  #message(max(my.data$year))
  
  out = list(my.data, independent.vars)
  return(out)
}

#' Convert human data
#'
#' Simple little function to aggregate human case data by year and county
#'
#' @noRd
#'
convert.human.data = function(hd, all.locations, all.years){
  #**# The steps creating the year were already done in forecast_NYCT.R and could be required as part of the input.
  hd$year = mapply(substr, hd$date, nchar(as.character(hd$date)) - 3, nchar(as.character(hd$date)))
  hd$year = as.numeric(hd$year)
  # Check if data sets are input with district and district_year instead of location and location_year
  hd = district.to.location(hd)
  hd$location = as.character(hd$location)
  #hd$location_year = sprintf("%s_%s", hd$location, hd$year)
  hd$count = 1 # One case per entry
  hd.data = aggregate(hd$count, by = list(hd$location_year), "sum")
  colnames(hd.data) = c("location_year", "Cases")
  hd.data$location = sapply(hd.data$location_year, splitter, "_", 1, 1)
  hd.data$year =  sapply(hd.data$location_year, splitter, "_", 2, 0)
  
  # Make sure there is a record for every year and location included in the data set
  for (location in all.locations){
    for (year in all.years){
      location_year = sprintf("%s_%s", location, year)
      if (!location_year %in% hd.data$location_year){
        # Add 0 cases for missing location_years
        #test.type(c(location_year, 0, location, year), 'L1480')
        hd.data = rbind(hd.data, c(location_year, 0, location, year))
      }
    }
  }
  
  # The rbind with character vectors coerces the Cases field to text, so convert it back to numeric
  hd.data$Cases = as.numeric(hd.data$Cases)
  
  return(hd.data)
}

#' Assign Breaks
#'
#' @noRd
#'
assign.breaks.old = function(weekinquestion, break.type){
  # Get the date of the forecast. Only include breaks prior to forecast.doy
  forecast.year=  as.numeric(substr(as.character(weekinquestion), 1, 4))
  forecast.month = as.numeric(substr(as.character(weekinquestion), 6,7))
  forecast.day = as.numeric(substr(as.character(weekinquestion), 9,10))
  forecast.doy = get.DOY(forecast.year, forecast.month, forecast.day)
  
  # Assign all breaks
  if (break.type == "seasonal"){
    # Breaks on 3/31, 6/30, 9/30, 12/31
    breaks = c(get.DOY(forecast.year,3,31), get.DOY(forecast.year, 6, 30), get.DOY(forecast.year,9,30), get.DOY(forecast.year, 12, 31))
  }
  
  if (break.type != "seasonal"){ stop(sprintf("%s needs to be scripted out", break.type)) }
  
  # Drop any breaks that go past the forecast.doy - these cannot be used to make a prediction as they have not been observed yet!
  breaks = breaks[breaks < forecast.doy] # Assume we have data up to the day before the forecast.doy #**# May not be a good assumption
  
  return(breaks)
}

#' Assign Breaks
#'
#' Identify break points according to day of year, and drop any that exceed
#' the weekinquestion variable (if you want the entire year, select December 33)
#' Does not correct for leap years, but these will be corrected for in later steps,
#' such as in the convert.env.data function (correcting here made it difficult
#' to correct appropriately in other parts of the code)
#'
#' @param weekinquestion The week to determine the last full break to include.
#' Must be in YYYY-MM-DD format
#' @param break.type The type of breaks to use. Options are seasonal or monthly
#'
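#' @examples
#' # For a mid-July forecast date, only the season breaks already observed are
#' # returned (days of year for 3/31 and 6/30 in a non-leap year):
#' assign.breaks("2021-07-15", "seasonal")  # expected: 90 181
#'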
#' @export
assign.breaks = function(weekinquestion, break.type){
  # Get the date of the forecast. Only include breaks prior to forecast.doy
  #forecast.year=  as.numeric(substr(as.character(weekinquestion), 1, 4))
  forecast.year = 2001 # Use a constant non-leap year
  forecast.month = as.numeric(substr(as.character(weekinquestion), 6,7))
  forecast.day = as.numeric(substr(as.character(weekinquestion), 9,10))
  forecast.doy = get.DOY(2001, forecast.month, forecast.day) # Always assume not a leap-year. Leap years will be adjusted for in another step
  
  # Assign all breaks
  if (break.type == "seasonal"){
    # Breaks on 3/31, 6/30, 9/30, 12/31
    breaks = c(get.DOY(forecast.year,3,31), get.DOY(forecast.year, 6, 30), get.DOY(forecast.year,9,30), get.DOY(forecast.year, 12, 31))
  }
  
  if (break.type == 'monthly'){
    # Month lengths for a non-leap year (leap years are adjusted in a later step)
    month.lengths = c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    # Breaks fall on the last day of each month (cumulative day of year)
    breaks = cumsum(month.lengths)
  }
  
  if (break.type != "seasonal" & break.type != 'monthly'){ stop(sprintf("%s needs to be scripted out", break.type)) }
  
  # Drop any breaks that go past the forecast.doy - these cannot be used to make a prediction as they have not been observed yet!
  breaks = breaks[breaks < forecast.doy] # Assume we have data up to the day before the forecast.doy #**# May not be a good assumption
  
  return(breaks)
}


#' Convert environmental data to the seasonal format used by the RF1 model
#'
#' @param weather.data Data on weather variables to be included in the analysis.
#'   See the read.weather.data function for details about data format. The
#'   read.weather.data function from ArboMAP is a useful way to process one or
#'   more data files downloaded via Google Earth Engine.
#' @param all.locations A list of counties included in the analysis (this
#' is for ensuring that a county and year that does not have a human case is
#' treated as a 0)
#' @param season.breaks The break points to use for calculating summaries. These should be in the format of day of year.
#' For example, c(31, 59) would create monthly summaries for January and February. The assign.breaks function
#' can assist with generating the break points automatically in the correct format.
#' @param leap.correct 0: no leap year corrections are made and all years use the same day-of-year breaks
#' (day 366 may be lost unless the last break is > 366). 1: breaks on or after day 59 are shifted by one day in leap years
#'
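#' @examples
#' \dontrun{
#' # Minimal synthetic sketch (real inputs normally come from read.weather.data);
#' # the column names below must match the hard-coded weather variables.
#' wd = expand.grid(location = "ALBANY", year = c(2016, 2017), doy = 1:181)
#' wd$tminc = rnorm(nrow(wd)); wd$tmeanc = rnorm(nrow(wd)); wd$tmaxc = rnorm(nrow(wd))
#' wd$pr = runif(nrow(wd)); wd$rmean = runif(nrow(wd)); wd$vpd = runif(nrow(wd))
#' breaks = assign.breaks("2017-07-15", "seasonal")  # c(90, 181)
#' env.data = convert.env.data(wd, all.locations = "ALBANY", season.breaks = breaks)
#' head(env.data)  # one row per location_year, with columns like TMINC_1, TMINC_2, ...
#' }
#'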
#'@export
convert.env.data = function(weather.data, all.locations, season.breaks, leap.correct = 1){
  
  #Save processing time by restricting to just locations of interest
  weather.data = weather.data[weather.data$location %in% all.locations, ]
  
  if (nrow(weather.data) == 0){ stop("No location data returned. Does weather data have a location field? Does the all.locations variable use the same name convention?") }
  
  # Separate into leap years and non-leap years
  weather.data$LEAP = sapply(weather.data$year, is.leap)
  leap.data = weather.data[weather.data$LEAP == 1, ]
  nonleap.data = weather.data[weather.data$LEAP == 0, ]

  # If correcting, add 1 to all the break points for the leap years
  leap.breaks = season.breaks
  if (leap.correct == 1){ leap.breaks[leap.breaks >= 59] = season.breaks[season.breaks >= 59] + 1 }
  
  # Processed separately, but the leap correction is only applied if leap.correct is set to 1
  leap.out = process.env.data(leap.data, leap.breaks)
  nonleap.out = process.env.data(nonleap.data, season.breaks)
  
  # Rejoin the two data sets
  env.data = rbind(leap.out, nonleap.out)
  # Resort in order by year
  env.data$location_year = as.character(env.data$location_year) # Somehow this was turning into a factor (!)
  env.data = env.data[order(env.data$location_year), ]
  
  env.data$location = sapply(env.data$location_year, splitter, "_", 1, 1)
  env.data$year = sapply(env.data$location_year, splitter, "_", 2, 0)
  
  #env.data$location = toupper(env.data$location) #**# Is this going to break things badly? YES, given the changes
  
  return(env.data)
}

process.env.data = function(weather.data, season.breaks){
  # These are column names, but the dplyr syntax makes them flag up as undefined global variables
  # So I define them here to make R happy, even though these are not actually used anywhere in the following
  # code, as the entries below refer to column names in a data object
  pr = rmean = tmaxc = tmeanc = tminc = vpd = wbreaks = NA
  
  # Drop weather data beyond the last break
  last.break = season.breaks[length(season.breaks)]
  weather.data = weather.data[weather.data$doy <= last.break, ]
  
  # Add grouping factor based on day of year
  weather.data$wbreaks = sapply(weather.data$doy, assign.groups, season.breaks)
  weather.data$location_year = sprintf("%s_%s", weather.data$location, weather.data$year)
  
  # This looks like a job for dplyr; https://datacarpentry.org/dc_zurich/R-ecology/04-dplyr
  #**# This is hard-coded to specific variables. Can dplyr take a more general input?
  env.data.pre = weather.data %>%
    dplyr::group_by(location_year, wbreaks) %>%
    dplyr::summarize(TMINC = mean(tminc), TMEANC = mean(tmeanc), TMAXC = mean(tmaxc), PR = mean(pr),
                     RMEAN = mean(rmean), VPD = mean(vpd))
  
  
  #**# May be a better way to do this - this is not optimized for speed
  # Reformat env.data to have a separate column per break
  env.data = data.frame(location_year = unique(env.data.pre$location_year))
  
  # Add variables to the env.data data frame
  vars = c("TMINC", "TMEANC", "TMAXC", "PR", "RMEAN", "VPD") #**# HARD CODED, BECAUSE dplyr step is hard coded
  for (var in vars){
    for (b in unique(env.data.pre$wbreaks)){
      this.var = sprintf("%s_%s", var, b)
      env.data[[this.var]] = rep(NA, nrow(env.data))
      
      # Subset data frame to just this break
      env.data.subset = env.data.pre[env.data.pre$wbreaks == b, ]
      # Loop through subset and extract break values for this variable
      for (i in 1:nrow(env.data.subset)){
        location_year = env.data.subset$location_year[i]
        this.value = env.data.subset[[var]][env.data.subset$location_year == location_year]
        env.data[[this.var]][env.data$location_year == location_year] = this.value
      }
    }
  }
  
  return(env.data)
}


#' Assign break grouping for aggregating weather data
#'
#' @noRd
#'
assign.groups = function(doy, breaks){
  
  out.group = 1
  
  # Increment the out.group until the appropriate break period is found
  # Starts at 1
  for (b in breaks){
    if (doy > b){
      out.group = out.group + 1
    }
  }
  
  if (out.group > length(breaks)){
    stop("An input day of year (doy) exceeded the last break")
  }
  
  # Simple manual version
  # Use of get.DOY function adjusts based on whether the year is a leap year
  #if (doy <= breaks[1]){  out.group = 1  }
  #if (doy > breaks[1] & doy <= breaks[2]){ out.group = 2  }
  #if (doy > breaks[2] & doy <= breaks[3]){ out.group = 3  }
  #if (doy > breaks[3]){ out.group = 4 }
  
  return(out.group)
}
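
# Example (illustrative): with the seasonal breaks c(90, 181, 273, 365),
# assign.groups(100, c(90, 181, 273, 365)) returns 2, i.e. the April-June period.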

#' Add data layers specific for the Random Forest model
#'
#' Modified from the add.other function in wnv_hlpr.R
#'
#' @param env.data The environmental data object
#' @param files.to.add A list containing the full paths for .csv files or the data objects to merge into the env.data object
#' @param merge.type.vec A vector containing the merge type for each of the files to be added
#'
#' @noRd
#'
add.rf1.inputs = function(env.data, rf1.inputs, breaks){
  
  # unpack rf1.inputs
  files.to.add = rf1.inputs[[1]]
  merge.type.vec = rf1.inputs[[2]]

  # env.data will be coming in with location_year, location, and year fields. Goal is to use consistent terminology
  
  # If other inputs are present, merge them in
  if (length(files.to.add) > 0){
    
      # Check that files.to.add has same length as merge.type
    if (length(files.to.add) != length(merge.type.vec)){  stop("files.to.add must have the same number of elements as merge.type.vec")  }
    
    for (i in 1:length(files.to.add)){
      
      # Add covariate information
      this.file = files.to.add[[i]]
      
      run.file = 1
      # Check if it is NA # More complicated because this.file could be a data set with length > 1, and is.na() just checks first element and makes R cranky
      if (length(this.file) == 1){
        if (is.na(this.file)){ run.file = 0 }
      }
      
      if(run.file == 1){ 
        
        this.merge = merge.type.vec[i]
        
        env.data = add.data(env.data, this.file, this.merge, breaks)
        env.data = cleanup.garbage(env.data) #**# This step may need modification in the context of the changes made to dfmip
      }
    }
    #}
  }
  
  # If rf1.inputs are NULL, do nothing, and just return the env.data object
  
  return(env.data)
}

#' Cleanup Garbage
#'
#' Taken from wnv_hlpr.R, rs.data is env.data object, but with the name used in wnv_hlpr.R
#'
#' Quick function to restore original field names after the merge
#' 
#' @noRd
#' 
cleanup.garbage = function(rs.data){
  # Clean up merge garbage to restore original variable names
  if (length(rs.data$year) == 0){
    rs.data$year = rs.data$year.x
    rs.data$year.x = NULL
    rs.data$year.y = NULL
  }
  
  #**# Using year as the temporal field for now
  #if (length(rs.data$TEMPORAL) == 0){
  #  rs.data$TEMPORAL = rs.data$TEMPORAL.x
  #  rs.data$TEMPORAL.x = NULL
  #  rs.data$TEMPORAL.y = NULL
  #}
  
  if (length(rs.data$location) == 0){
    rs.data$location = rs.data$location.x
    rs.data$location.x = NULL
    rs.data$location.y = NULL
  }
  
  return(rs.data)
}

#' Function to add data to the env.data object
#'
#' Modified from wnv_hlpr.
#' 
#' @param rs.data env.data, the object to have data appended to
#' @param this.file the file containing the data to append
#' @param this.merge the structure of the data for the merge - i.e. "spatial_temporal" for spatial and temporal,
#' "spatial" for spatial only or "state_temporal" for statewide data that varies by year (i.e. the BBS data)
#' temporal only would be theoretically possible, but has not been scripted.
#' For now, it assumes the spatial is county, and temporal is year. Need to generalize this to spatial and temporal.
#'
#' @noRd
#'
add.data = function(rs.data, this.file, this.merge, breaks){

  n.breaks = length(breaks)
  
  # If a file, load the file. If an R data object, merge the data object
  if (typeof(this.file) == 'character'){
    these.data = read.csv(this.file)
  }else{ these.data = this.file }
  
  # Merge by location and year
  if (this.merge == "spatial_temporal"){
    
    if (length(these.data$location_year) == 0){
      #**# ideally the stop will be in the external loop, and will give information about which data set is missing the required field
      #**# or we can add an rf1.inputs check tool that just checks this stuff right off the bat in the model
      stop("location_year field is missing from one or more of the spatail_temporal merge input data sets")
    }
    
    # # Format for my climate and anomaly data was variable_break. As this is being done for a specific forecast day, we need to drop any that exceed the forecast day
    # # Drop any fields that have a numeric ending beyond the last break
    # these.names = colnames(these.data)
    # keep.vec = c()
    # #**# This is a tricky split, as the variable names might have variable numbers of "_", but if we ever move past seasons to months, this will require a double-digit extraction
    # 
    # if (n.breaks < 10){
    #   # For now, do the simple approach and assume a single-digit extraction
    #   for (name in these.names){
    #     keep = FALSE
    #     # Just get last digit
    #     season = as.numeric(as.character(substr(name,length(name) - 1, length(name))))
    #     
    #     # Only keep if this variable corresponds to a season (i.e. ends in a number)
    #     if (season %in% seq(1,9)){
    #       if (season < n.breaks){
    #         keep = TRUE
    #       }
    #     }
    #     keep.vec = c(keep.vec, keep)
    #   }
    # }
    # 
    # if (max(breaks) > 9){ stop("Need to upgrade the add.data function to handle more than 9 breaks")  }
    # 
    # keep.vars = these.names[keep.vec] #NOTE: 0 and 1 do not seem to work here.
    
    #these.data$county_year = these.data$COUNTY_YEAR
    rs.data = merge(rs.data, these.data, by = "location_year")
  }
  
  # Merge by unit and year. Needs updating, but currently only BBS would be applicable here, and the county-scale merge could be done separately
  if (this.merge == "state_temporal"){
    stop("Need to add state as an input to the rf1 function")
    # Merge by State and year
    if (length(rs.data$year) > 0){ rs.data$STATE_YEAR = sprintf("%s_%s", rs.data$STATE, rs.data$year) }
    rs.data = merge(rs.data, these.data, by = "STATE_YEAR") #, all.x = TRUE)
  }
  
  # Merge by location only
  if (this.merge == "spatial"){
    
    # check that location field is present
    if (length(these.data$location) == 0){
      #**# ideally the stop will be in the external loop, and will give information about which data set is missing the required field
      #**# or we can add an rf1.inputs check tool that just checks this stuff right off the bat in the model
      stop("location_year field is missing from one or more of the spatial_temporal merge input data sets")
    }
    
    
    # Merge by spatial unit - no temporal resolution
    #these.data$location = toupper(these.data$SPATIAL) # Ensure data is upper-case for proper join
    #these.data$SPATIAL = NULL # Remove this field, otherwise it will later get converted to a SPATIAL.x if more than one gets merged. Can always re-assign from location if it is needed later
    
    rs.data = merge(rs.data, these.data, by = "location")
  }
  
  #message(sprintf("Joined. Max year is %s. Max data year is %s", max(rs.data$year), max(these.data$year)))
  
  return(rs.data)
}




#### HELPER FUNCTIONS FOR do.rf FUNCTION CALL ####
#### MODIFIED (OR NOT) FROM wnv_hlpr.R ####


#' Function to identify the m value that produces the best results
#'
#' @noRd
#'
get.best.m = function(f, independent.vars, trap.data, response.type){
  best.r2 = 0
  if (response.type == "binary"){ best.r2 = Inf } # For binary responses this tracks the lowest out-of-bag error rate, so start at Inf
  best.m = 0
  # Try different values of "m" # With 37 iterations of 5000 trees, this is going slow.
  # Demo with 500 trees - probably too few.
  # Getting m of 3-4. Seems about right
  for (m in 1:length(independent.vars)){
    #message(sprintf("Trying m = %s", m))
    rf.model = randomForest(f, data = trap.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = TRUE, mtry = m, ntree = 1000)
    
    if (response.type == "continuous"){
      if(median(rf.model$rsq) > best.r2){
        best.m = m
        best.r2 = median(rf.model$rsq)
      }
    }
    if (response.type == "binary"){
      #message("R2 used for choice of m is actually the out-of-box error rate, not the R2")
      if(median(rf.model$err.rate[ ,1]) < best.r2){
        best.m = m
        best.r2 = median(rf.model$err.rate[ ,1])
      }
      
    }
    
  }
  return(best.m)
}

#' Systematic Validation
#'
#' @noRd
#'
systematic.validation = function(trap.data, dep.var, f2, best.m, drop.field, response.type = "continuous", display.messages = 1){
  #require(caret)
  
  # Get list of unique values from the field to have one unit sequentially dropped
  drop.vals = unique(trap.data[[drop.field]])
  #message(paste(names(trap.data), collapse = ", "))
  #message(drop.field)
  #message(drop.vals)
  
  # Create a data frame to hold the results
  errors = data.frame(observed = NA, predicted = NA, unit = NA, MERGE_ID = NA)
  rmse.errors = c()
  spearmans = c()
  samples = c()
  to.exclude = c()
  
  for (drop.val in drop.vals){
    # Subset the data to exclude the field in question
    sub.data = trap.data[trap.data[[drop.field]] != drop.val, ]
    this.sample = nrow(sub.data)
    samples = c(samples, this.sample)
    
    # Check to make sure there is variation to explain with a random forest
    if (length(unique(sub.data[[dep.var]])) > 1){
      # Fit the random forest
      this.result = randomForest(f2, data = sub.data, na.action = na.exclude, stringsAsFactors = TRUE, importance = FALSE, mtry = best.m, ntree = 5000)
      
      # Get validation subset
      validation.data = trap.data[trap.data[[drop.field]] == drop.val, ]
      outcomes = predict(this.result, newdata = validation.data)
      
      # Get RMSE for this unit
      this.rmse = caret::RMSE(validation.data[[dep.var]], outcomes, na.rm = TRUE)
      rmse.errors = c(rmse.errors, this.rmse)
      
      # Get scaled RMSE for this unit #**# Don't need this for every unit - just do once overall
      #scaled.rmse = this.rmse / (mean(validation.data[[dep.var]]))
      
      if (response.type == "continuous"){
        # Get Spearman correlations for this unit
        this.spearman = suppressWarnings(cor(validation.data[[dep.var]], outcomes, method = "spearman", use = "complete.obs"))
        spearmans = c(spearmans, this.spearman)
      }
      if (response.type == "binary"){
        spearmans = NA
      }
      
      # Add results to data frame
      stuff = cbind(validation.data[[dep.var]], outcomes, rep(as.character(drop.val), length(outcomes)), validation.data$MERGE_ID)
      colnames(stuff) = c("observed", "predicted", "unit", "MERGE_ID")
      #test.type(stuff, 'L1991')
      errors = rbind(errors, stuff)
    }else{
      if(display.messages == 1){message(sprintf("%s had no variation in the dependent variable %s", drop.val, dep.var))}
      to.exclude = c(to.exclude, drop.val)
    }
    
  }
  
  # Exclude any variables where the Random Forest could not be tested due to lack of variation
  for (ex in to.exclude){
    if (ex %in% drop.vals){
      drop.vals = drop.vals[drop.vals != ex]
    }
  }
  
  # Drop NAs in first row added during data frame initialization
  errors = errors[2:nrow(errors), ]
  errors$observed = as.numeric(as.character(errors$observed))
  errors$predicted = as.numeric(as.character(errors$predicted))
  overall.rmse = caret::RMSE(errors$observed, errors$predicted, na.rm = TRUE)
  # From an earlier run: an overall RMSE of 0.0039 corresponds to an average error of 3.9 cases per 1000,
  # while the average infection rate was 3.0 cases per 1000 - so on average the predictions were substantially off.
  min.error = min(abs(errors$observed - errors$predicted))
  max.error = max(abs(errors$observed - errors$predicted))
  #0.0231 (!) That is a HUGE mistake!
  
  custom.r2 = estimate.custom.r2(errors$observed, errors$predicted)
  #0.26 (!) # Better than I expected!
  
  # get median rmse of those calculated for each unit
  median.rmse = median(rmse.errors)
  # The median is 0.0029. So better than the mean overall RMSE.
  # Are the years with the most WNV the ones with the worst predictions?
  
  # Divide the root mean squared error by the mean value in the observed dataset. How big is the typical error relative to the mean?
  # One citation said a good rule of thumb is aim for an RMSE that is 10% of the target value or less
  scaled.rmse = overall.rmse / mean(errors$observed)
  
  #errors.by.year = cbind(as.numeric(as.character(drop.vals)), rmse.errors)
  #errors.by.year = errors.by.year[order(errors.by.year[ ,2], decreasing = TRUE), ]
  
  if (response.type == "continuous"){
    # Look at rank correlation ? See how well it does at ranking the magnitude of WNV?
    spearman = cor(errors[,c(1,2)], method = "spearman")[1,2]
    # In an earlier run the Spearman correlation was 0.58: the model ranks how bad a year is reasonably well,
    # but the correlation is only moderate, and that was at the county level.
    pearson = cor(errors[,c(1,2)], method = "pearson")[1,2]
  }
  
  if (response.type == "binary"){
    spearman = NA
    pearson = NA
  }
  
  overall.results = c(overall.rmse, median.rmse, scaled.rmse, custom.r2, spearman, pearson, min.error, max.error)
  #message(overall.rmse)
  #message(median.rmse)
  #message(scaled.rmse)
  #message(custom.r2)
  #message("break")
  #message(spearman)
  #message(pearson)
  #message(min.error)
  #message(max.error)
  names(overall.results) = c("Overall.RMSE", "Median.RMSE", "Scaled.RMSE", "R2.sasha", "Spearman", "Pearson", "Min.Error", "Max.Error")
  #**# 2012 was a bad year - totally underpredicted the WNV infection rates
  #**# So... this is limited as an early warning system (!)
  
  # Add unit information to the rmse.errors & spearman correlations
  names(rmse.errors) = as.character(drop.vals)
  if (response.type == "continuous"){ names(spearmans) = as.character(drop.vals) }
  names(samples) = as.character(drop.vals)
  
  #hist(temporal.accuracy$SPEARMAN, breaks = seq(-1,1,0.1))
  
  return(list(OVERALL = overall.results, RMSE = rmse.errors, SPEARMAN = spearmans, N = samples, ERROR.DF = errors))
}
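
# Minimal sketch of a leave-one-year-out validation call (not run). The trap.data object,
# the dependent variable 'IR', and the predictors are hypothetical placeholders; trap.data
# is assumed to contain the dependent variable, the predictors, and TEMPORAL and MERGE_ID columns.
# f2 = as.formula("IR ~ tmean + precip")
# temporal.accuracy = systematic.validation(trap.data, dep.var = "IR", f2 = f2,
#                                           best.m = 2, drop.field = "TEMPORAL")
# temporal.accuracy$OVERALL # named vector: Overall.RMSE, Median.RMSE, ..., Max.Error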


#' Similar to systematic, but for the production of the null comparisons
#' 
#' @noRd
#' 
null.validation = function(trap.data, dep.var, drop.field, create.ci.null, display.messages){
  #require(caret)
  
  # Add a message to warn users that some functionality has been turned off - to ensure it is intentional
  if (create.ci.null != 1 & display.messages == 1){ message("Not creating a Null model based on the confidence intervals of the original data")}
  
  # Get list of unique values from the field to have one unit sequentially dropped
  drop.vals = unique(trap.data[[drop.field]])
  
  # Create a data frame to hold the results
  errors = data.frame(observed = NA, mean.predicted = NA, ci.predicted = NA, unit = NA, MERGE_ID = NA)
  mean.rmse.errors = c()
  mean.spearmans = c()
  
  if (create.ci.null == 1){
    ci.rmse.errors = c()
    ci.spearmans = c()
  }
  samples = c()
  
  for (drop.val in drop.vals){
    # Subset the data to exclude the field in question
    sub.data = trap.data[trap.data[[drop.field]] != drop.val, ]
    this.sample = nrow(sub.data)
    samples = c(samples, this.sample)
    
    # Fit the random forest
    mean.result = mean(sub.data[[dep.var]])
    
    # Get validation subset
    validation.data = trap.data[trap.data[[drop.field]] == drop.val, ]
    mean.outcomes = rep(mean.result, nrow(validation.data))
    
    # Get RMSE for this unit
    this.mean.rmse = caret::RMSE(validation.data[[dep.var]], mean.outcomes)
    mean.rmse.errors = c(mean.rmse.errors, this.mean.rmse)
    
    if (create.ci.null == 1){
      # Repeat for CI Null
      ci.outcomes = mapply(runif, 1, validation.data$CI.lower, validation.data$CI.upper)
      this.ci.rmse = caret::RMSE(validation.data[[dep.var]], ci.outcomes)
      ci.rmse.errors = c(ci.rmse.errors, this.ci.rmse)
      
      # Get Spearman correlations for this unit
      # correlations for the mean values are meaningless! they're all the same predictions!
      #this.mean.spearman = cor(validation.data[[dep.var]], mean.outcomes)
      #mean.spearmans = c(mean.spearmans, this.mean.spearman)
      
      this.ci.spearman = cor(validation.data[[dep.var]], ci.outcomes, method = "spearman")
      ci.spearmans = c(ci.spearmans, this.ci.spearman)
    }else{ ci.outcomes = rep(NA, length(mean.outcomes)) }
    
    # Add results to data frame
    stuff = cbind(validation.data[[dep.var]], mean.outcomes, ci.outcomes, rep(as.character(drop.val), length(mean.outcomes)), validation.data$MERGE_ID)
    colnames(stuff) = c("observed", "mean.predicted", "ci.predicted", "unit", "MERGE_ID")
    #test.type(stuff, 'L2128')
    errors = rbind(errors, stuff)
  }
  
  # Drop NAs in first row added during data frame initialization
  errors = errors[2:nrow(errors), ]
  errors$observed = as.numeric(as.character(errors$observed))
  errors$mean.predicted = as.numeric(as.character(errors$mean.predicted))
  
  mean.overall.rmse = RMSE(errors$observed, errors$mean.predicted)
  
  # Scale by mean to put into the context of the data
  mean.scaled.rmse = mean.overall.rmse / mean(errors$observed)
  
  mean.min.error = min(abs(errors$observed - errors$mean.predicted))
  mean.max.error = max(abs(errors$observed - errors$mean.predicted))
  mean.custom.r2 = estimate.custom.r2(errors$observed, errors$mean.predicted)
  
  # Look at median of the individual RMSE's
  mean.median.rmse = median(mean.rmse.errors)
  
  mean.overall.results = c(mean.overall.rmse, mean.median.rmse, mean.scaled.rmse, mean.custom.r2, NA, NA, mean.min.error, mean.max.error)
  names(mean.overall.results) = c("Overall.RMSE", "Median.RMSE", "Scaled.RMSE", "R2.sasha", "Spearman", "Pearson", "Min.Error", "Max.Error")
  
  # Add unit information to the rmse.errors & spearman correlations
  names(mean.rmse.errors) = as.character(drop.vals)
  #names(mean.spearmans) = as.character(drop.vals)
  names(samples) = as.character(drop.vals)
  
  mean.results = list(OVERALL = mean.overall.results, RMSE = mean.rmse.errors, SPEARMAN = NA, N = samples, ERROR.DF = errors)
  #hist(temporal.accuracy$SPEARMAN, breaks = seq(-1,1,0.1))
  
  # Repeat process for CI results, if applicable
  ci.results = NA
  if (create.ci.null == 1){
    errors$ci.predicted = as.numeric(as.character(errors$ci.predicted))
    ci.overall.rmse = caret::RMSE(errors$observed, errors$ci.predicted)
    ci.scaled.rmse = ci.overall.rmse / mean(errors$observed)
    
    ci.min.error = min(abs(errors$observed - errors$ci.predicted))
    ci.max.error = max(abs(errors$observed - errors$ci.predicted))
    ci.custom.r2 = estimate.custom.r2(errors$observed, errors$ci.predicted)
    
    ci.median.rmse = median(ci.rmse.errors)
    # Look at rank correlation ? See how well it does at ranking the magnitude of WNV?
    #mean.spearman = cor(errors[,c(1,2)], method = "spearman")[1,2]
    #mean.pearson = cor(errors[,c(1,2)], method = "pearson")[1,2]
    ci.spearman = cor(errors[,c(1,3)], method = 'spearman')[1,2]
    ci.pearson = cor(errors[,c(1,3)], method = 'pearson')[1,2]
    
    ci.overall.results = c(ci.overall.rmse, ci.median.rmse, ci.scaled.rmse, ci.custom.r2, ci.spearman, ci.pearson, ci.min.error, ci.max.error)
    names(ci.overall.results) = c("Overall.RMSE", "Median.RMSE", "Scaled.RMSE", "R2.sasha", "Spearman", "Pearson", "Min.Error", "Max.Error")
    
    names(ci.rmse.errors) = as.character(drop.vals)
    names(ci.spearmans) = as.character(drop.vals)
    
    ci.results = list(OVERALL = ci.overall.results, RMSE = ci.rmse.errors, SPEARMAN = ci.spearmans, N = samples, ERROR.DF = errors)
    
  }
  
  return(list(NULL.MEAN = mean.results, NULL.CI = ci.results))
}

#' Calculate R2 estimate of variation explained
#'
#' This R2 is not bounded below by zero, because the model is not fit to the data being evaluated.
#' The model is applied to a novel data set, and consequently it can fail to explain even
#' as much variation as the data mean; in those cases a negative R2 is obtained.
#'
#' A model with a negative R2 may still be better than no knowledge at all, but it is unlikely to be
#' very useful (how bad it is depends on how variable the data are around the mean). The basic assumption
#' is that there is variability around the mean, and that a useful model should explain that variation
#' better than a static mean does.
#'
#' @noRd
#'
estimate.custom.r2 = function(observed, predicted){
  
  deviation = observed - predicted
  
  squared.deviation = deviation ^ 2
  
  # Calculate prediction R2 (this is not PRESS, which is based on leaving one out. This is an entirely new validation data set, and so the R2 could be negative)
  sum.squared.deviation = sum(squared.deviation)
  
  data.mean = mean(observed, na.rm = TRUE)
  deviation.from.mean = data.mean - observed
  deviation.from.mean.squared = deviation.from.mean ^ 2
  sum.dfm.squared = sum(deviation.from.mean.squared)
  
  # Scale the error remaining after the model by the error obtained from just using the mean
  # Subtract this from one.
  R2.pred = 1 - (sum.squared.deviation / sum.dfm.squared)
  
  return(R2.pred)
}
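
# Hand-checked illustration of the custom R2 (not run); the observed and predicted values
# below are made up purely for demonstration.
# observed = c(1, 2, 3, 4)
# predicted = c(1.1, 1.9, 3.2, 3.8)
# estimate.custom.r2(observed, predicted)
# # Sum of squared deviations = 0.10; sum of squared deviations from the mean (2.5) = 5.0,
# # so R2.pred = 1 - 0.10 / 5.0 = 0.98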

#' Determine the contribution of each variable to the total variation explained
#' 
#' @noRd
#' 
do.variance.partitioning = function(trap.data, dep.var, best.vars, best.m, temporal.accuracy.R2,
                                    drop.thresholds, correlation.threshold, response.type,
                                    do.spatial, results.path,
                                    spatial.resolution, temporal.resolution, this.label,
                                    temporal.field, display.messages){
  
  if (display.messages == 1){
    message("Running variance partitioning")
    message(sprintf("Baseline R2 is %.3f", temporal.accuracy.R2))
  }
  
  # Check if a spatial partitioning is desired - if so, throw a warning
  if (do.spatial == 1){ warning("Variance partitioning was performed for temporal cross-validation, NOT spatial cross-validation. The code could easily be adapted to support spatial partitioning, but currently it does not.") }
  
  if (correlation.threshold != ""){
    # Identify correlations among variables
    cor.matrix = cor(trap.data[ ,best.vars])
    corr.variation = c()
    n.corr = c()
  }
  
  unique.variation = c()
  
  # Run variance partitioning leaving each variable out in sequence
  for (this.var in best.vars){
    if (display.messages == 1){message(sprintf("Processing %s", this.var))}
    remaining.vars = best.vars[best.vars != this.var] # Drop the variable in question
    out.R2 = run.partitioning(trap.data, dep.var, remaining.vars, best.m, temporal.accuracy.R2,
                              response.type, temporal.field, display.messages)
    unique.variation = c(unique.variation, out.R2)
    
    if (correlation.threshold != ""){
      # Run variance partitioning leaving out correlated suites of variables (greater than a threshold)
      cor.row = cor.matrix[this.var, ]
      cor.index = sapply(cor.row, simple.threshold, correlation.threshold)
      #cor.index = matrix(cor.index, ncol = 1)
      remaining.vars2 = best.vars[!cor.index]
      out.cor.R2 = run.partitioning(trap.data, dep.var, remaining.vars2, best.m, temporal.accuracy.R2,
                                    response.type, temporal.field, display.messages)
      corr.variation = c(corr.variation, out.cor.R2)
      n.corr = c(n.corr, sum(cor.index)) # This works because TRUE evaluates to 1 and FALSE evaluates to 0
    }
  }
  
  out.df = data.frame(VARIABLES = best.vars, UNIQUE.R2 = unique.variation)
  
  if (correlation.threshold != ""){
    out.df$CORR.R2 = corr.variation
    out.df$N.CORRELATED = n.corr
  }
  
  best.threshold = NA
  best.R2.diff = 0
  kept.vars = best.vars
  
  for (threshold in drop.thresholds){
    # Try dropping all variables that contribute less than 0.1% uniquely to the overall result #**# 0.1% is also arbitrary
    remaining.vars3 = out.df[out.df$UNIQUE.R2 > threshold, 1] # 1 indicates the first column
    
    if (length(remaining.vars3) == 0){
      if(display.messages == 1){message(sprintf("No variables remaining at threshold %s", threshold))}
    }else{
      this.R2.diff = run.partitioning(trap.data, dep.var, remaining.vars3, best.m, temporal.accuracy.R2,
                                      response.type, temporal.field, display.messages)
      if(display.messages == 1){message(sprintf("R2 difference for threshold %s is %.3f", threshold, this.R2.diff))}
      
      # Right now, this JUST accepts models that constitute an improvement.
      # We likely want to add a penalty term that favors simpler models with similar explanatory power.
      if (this.R2.diff < best.R2.diff){
        best.R2.diff = this.R2.diff
        best.threshold = threshold
        kept.vars = remaining.vars3
      }
    }
  }
  
  # Add a threshold column to the data set for reference as to what was used to identify the final model variables
  out.df$best.threshold = best.threshold
  
  # Write variance partitioning results to an output file
  out.path = sprintf("%s/ModelResults/VariancePartitions", results.path)
  dir.create(out.path, showWarnings = FALSE, recursive = TRUE) # showWarnings = FALSE suppresses a warning if the path already exists
  out.file = sprintf("%s/varpar_%s_%s_%s.csv", out.path, spatial.resolution, temporal.resolution, this.label)
  write.table(out.df, out.file, sep = ',', row.names = FALSE, col.names = TRUE)
  
  return(kept.vars)
}
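
# Sketch of a variance-partitioning call (not run); every object and variable name below is a
# hypothetical placeholder for internal objects built earlier in the rf1 workflow. trap.data is
# assumed to contain the dependent variable, the candidate predictors, and TEMPORAL and MERGE_ID columns.
# kept.vars = do.variance.partitioning(trap.data, dep.var = "IR",
#                                      best.vars = c("tmean", "precip", "rh"),
#                                      best.m = 2, temporal.accuracy.R2 = 0.4,
#                                      drop.thresholds = c(0, 0.001, 0.01),
#                                      correlation.threshold = 0.75, response.type = "continuous",
#                                      do.spatial = 0, results.path = "results",
#                                      spatial.resolution = "county", temporal.resolution = "annual",
#                                      this.label = "mosq", temporal.field = "TEMPORAL",
#                                      display.messages = 1)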

#' Calculate the variance partitioning run
#' 
#' @noRd
#' 
run.partitioning = function(trap.data, dep.var, remaining.vars, best.m, temporal.accuracy.R2, response.type,
                            temporal.field = "TEMPORAL", display.messages = 1){
  
  my.f = as.formula(paste(dep.var, ' ~ ', paste(remaining.vars, collapse = "+")))
  
  # Redefine best.m if it is no longer applicable when the variable set is reduced
  if (best.m > length(remaining.vars)){ best.m = length(remaining.vars) }
  
  temporal.accuracy = systematic.validation(trap.data, dep.var, my.f, best.m, temporal.field, response.type, display.messages)
  
  # Calculate difference between the main run with all variables and this run
  R2.difference = temporal.accuracy.R2 - temporal.accuracy$OVERALL[["R2.sasha"]]
  
  return(R2.difference)
}

#' A simple function for using a threshold
#' 
#' @noRd
#' 
simple.threshold = function(x, threshold){
  y = NA
  if (x >= threshold){ y = TRUE }
  if (x < threshold){ y = FALSE }
  return(y)
}

#' Write model predictions
#'
#' @noRd
#'
write.predictions = function(trap.data, dep.var, rf.model, results.path, spatial.resolution, temporal.resolution, label){
  # Code used to patch the human analyses prior to addition of dep.var
  # DiseasePath = "C:/Users/ak697777/University at Albany - SUNY/Elison Timm, Oliver - CCEID/RESULTS/WNV_statistical/ModelResults"
  # spatial.resolution = "county"; temporal.resolution = "annual"; analysis.label = label = "human" #"human_subset"
  # trap.data = read.csv(sprintf("%s/%s_%s_%s.csv", DiseasePath, spatial.resolution, temporal.resolution, analysis.label))
  # load(sprintf("C:/Users/ak697777/University at Albany - SUNY/Elison Timm, Oliver - CCEID/RESULTS/WNV_statistical/ModelRuns/Results_%s_%s_%s.RData", spatial.resolution, temporal.resolution, analysis.label))
  # rf.model = model.results$MODEL
  trap.data$OBSERVED = trap.data[[dep.var]]
  trap.data$PREDICTED = predict(rf.model) #**# More commands needed?
  #output = trap.data[ , c("OBSERVED", "PREDICTED", "SPATIAL", "TEMPORAL")] #**# Do we need more than this? Yes for the visualization
  
  out.file = sprintf("%s/ModelResults/%s_%s_%s.csv", results.path, spatial.resolution, temporal.resolution, label)
  
  write.table(trap.data, file = out.file, sep = ',', row.names = FALSE)
  
}

#' Create spatial/temporal barplots
#' 
#' @noRd
#' 
spatial.temporal.barplots = function(temporal.accuracy, spatial.accuracy, spatial.resolution, temporal.resolution, results.path, label){
  
  outfile = sprintf("%s/RMSE_Spearman_%s_%s%s.tif", results.path, spatial.resolution, temporal.resolution, label)
  tiff(filename = outfile, compression = c("lzw"), height = 2400, width = 2400, res = 300)
  
  par(mfrow = c(2,2))
  par(mar = c(8,6,2,0))
  rmse.y.lim = c(0, 10)
  sp.y.lim = c(-1,1)
  
  x.axis = "s"
  x.offset = 0.8
  x.adj1 = x.adj2 = 0
  if (spatial.resolution == "point"){
    x.axis = "n"
    x.offset = 0.8
    x.adj1 = 20
    x.adj2 = 4.2
  }
  
  
  barplot(sort(temporal.accuracy$RMSE * 1000), ylab = "RMSE\n(# infected per 1000 mosquitoes)" , las = 2, ylim = rmse.y.lim, xpd = NA) #main = "RMSE"
  text(par("usr")[1] + x.offset, 9.8, "a.", cex = 1, xpd = NA)
  barplot(sort(spatial.accuracy$RMSE * 1000), ylab = 'RMSE\n(# infected per 1000 mosquitoes)', las = 2, ylim = rmse.y.lim, xaxt = x.axis, xpd = NA) #main = "RMSE"
  text(par("usr")[1] + x.offset + x.adj1, 9.8, "b.", cex = 1, xpd = NA)
  barplot(sort(temporal.accuracy$SPEARMAN), ylab = "Spearman coefficient", las = 2, ylim = sp.y.lim)
  text(par("usr")[1] + x.offset, 0.95, "c.", cex = 1, xpd = NA)
  barplot(sort(spatial.accuracy$SPEARMAN), ylab = "Spearman coefficient", las = 2, ylim = sp.y.lim, xaxt = x.axis)
  text(par("usr")[1] + x.offset + x.adj2, 0.95, "d.", cex = 1, xpd = NA)
  
  dev.off()
}

#' Calculate MLE for a specified subset of the data using R
#'
#' Modified from wnv_hlpr.R
#'
#' @param md Data on mosquito pools tested for the disease. Must be formatted with 4 columns: location (the spatial unit, e.g. county), col_date: the date the mosquitoes were tested, wnv_result: whether or not the pool was positive, pool_size: the number of mosquitoes tested in the pool. A fifth column species is optional but is not used by the code
#' @param temporal.resolution Must be set to 'annual' for now. Ideally support for finer-scale resolution will be added.
#'
#' @return Mosquito data compiled to give estimated mosquito infection rates by spatial unit and temporal resolution
#'
#' @export calculate.MLE.v2
calculate.MLE.v2 = function(md, temporal.resolution = "annual"){
  
  # Remove any NA rows from the calculations, report their presence to the user
  virus.nas = sum(is.na(md$wnv_result))
  pool.nas = sum(is.na(md$pool_size))
  md = md[!is.na(md$wnv_result), ]
  md = md[!is.na(md$pool_size), ]
  
  if (virus.nas != 0 | pool.nas != 0){
    warning(sprintf("NAs present in the input mosquito data: %s for wnv_result and %s for pool_size. These NA records have been removed from further analysis", virus.nas, pool.nas))
  }
  
  # Check that expected column names are present, if not, give an informative error
  expected.names = c('location', "col_date", "wnv_result", "pool_size") #'species' is not actually required at this point
  is.error = 0
  missing.vec = c()
  for (e.name in expected.names){
    if (!e.name %in% names(md)){
      is.error = 1
      missing.vec = c(missing.vec, e.name)
    }
  }
  if (is.error == 1){
    m = "One or more required fields is missing. Inputs are case sensitive. Missing fields are:"
    
    stop(sprintf("%s %s. Input fields were: %s", m, paste(missing.vec, collapse = ", "), paste(names(md), collapse = ', ')))
  }
  
  # Call dprev function from MLE_IR.R
  if (!exists("dprev")){ stop(" Load the dprev function from Williams & MOffit 2005 into computer memory. It is required for the calculation of MLE's") }
  # Make sure mosq.data is a data frame and not a file string
  if (typeof(md) == "character"){  md = read.csv(md)  }
  
  if (length(unique(md$species)) > 1){ warning(sprintf("Pooling across %s species", length(unique(md$species))))  }
  if (temporal.resolution != "annual"){ stop("Need to code other temporal resolution options")  }
  
  #**# THIS IS SPECIFIC TO THE FORMAT MIKE & JUSTIN USED FOR ARBOMAP - REVISIT TO SEE IF A MORE GENERAL APPROACH IS DESIRABLE
  # Get year from date
  md$year = mapply(substr, md$col_date, nchar(as.character(md$col_date)) - 3, nchar(as.character(md$col_date)))
  md$year = as.numeric(md$year)
  
  # Add VIRUS and ABUND fields #**# CONSIDER WHETHER THESE ARE THE FIELD NAMES YOU REALLY WANT TO USE
  
  if (length(md$wnv_result) == 0){
    stop("A field with WNV result must be present, and must be labeled 'wnv_result'")
  }
  
  md$VIRUS = md$wnv_result
  md$wnv_result = NULL # Prevent an overabundance of redundant fields
  
  # Check that VIRUS field has only positives, negatives, and NA values
  md.positives = md$VIRUS[md$VIRUS == 1] 
  md.negatives = md$VIRUS[md$VIRUS == 0]
  #md.nas = md$VIRUS[is.na(md$VIRUS)]
  md.check = length(md.positives) + length(md.negatives) # + length(md.nas)
  #message(md.check)
  #message(length(md.positives))
  #message(length(md.negatives))
  #message(length(md.nas))
  #message(nrow(md))
  if (md.check != nrow(md)){
    m1 = "The VIRUS field (wnv_result) can only contain 0, 1, or NA values."
    m2 = sprintf("Entered values were: %s", paste(unique(md$VIRUS), collapse = ', '))
    m3 = sprintf("There were %s positive pools, %s negative pools, and %s pools total")
    m4 = sprintf("Thus %s pools are missing, as there should be %s pools", (nrow(md) - md.check), nrow(md))
    stop(sprintf("%s %s %s %s", m1, m2, m3, m4))
  }
  
  if (length(md$pool_size) == 0){
    stop("A field with mosquito pool sizes must be present, and must be labeled 'pool_size'")
  }
  
  md$ABUND = md$pool_size
  md$pool_size = NULL # Prevent redundant fields
  
  # Drop any missing abundance field #**# Perhaps want to do this PRIOR to this function to ensure quality data inputs?
  md = md[!is.na(md$ABUND), ]
  
  # Aggregate by spatial field and year
  md$location_year = sprintf("%s_%s", md$location, md$year)
  
  # Get group ID's
  group.ids = unique(md$location_year)
  
  # Create a new data frame with group ID information
  md.data = data.frame(GROUP = group.ids, CI.lower = NA, CI.upper = NA, IR = NA,
                       abundance = NA, density = NA, location = NA, year = NA)
  
  # Loop through each group and calculate MLE
  for (i in 1:length(group.ids)){
    group.id = group.ids[i]
    # Subset to just this group.id
    group.data = md[md$location_year == group.id, ]
    location = as.character(group.data$location[1]) # Should all be the same, just use first value
    this.year = group.data$year[1] # Should all be same, just use first value #**# Modify if temporal resolution is not annual
    
    # Get number of positive and negative pools for this group
    positives.index = group.data$VIRUS == 1
    negatives.index = group.data$VIRUS == 0
    #Explicitly finding negatives to exclude any NA values (i.e. from NY)
    
    # Get positive and negative pools including pool sizes
    positives = group.data$ABUND[positives.index]
    negatives = group.data$ABUND[negatives.index]
    abundance = sum(positives) + sum(negatives) # Get total number of mosquitoes tested. In CT, this is true abundance, in NY, pool size is capped, so this will underestimate abundance
    density = abundance / (length(positives) + length(negatives)) # Get number of mosquitoes divided by number of pools tested
    
    # If no pools, assign a value of -999 to distinguish from undefined IR's
    if (length(positives) == 0 & length(negatives) == 0){
      warning(sprintf("No pools for group %s. Not sure how that happened. Values of -999 assigned", group.id))
      IR = -999
      CI.upper = -999
      CI.lower = -999
      abundance = 0
      density = 0
    }else{
      # Proceed with checking for missing positives or negatives alone
      # If no positive pools, assign an IR of 0 and CI's of 0 to 0
      if (length(positives) == 0){
        IR = 0
        CI.lower = 0
        CI.upper = 0
      }
      
      # If no negative pools, assign an IR of NA, and CI's of NA
      if (length(negatives) == 0){
        IR = NA
        CI.lower = NA
        CI.upper = NA
      }
      
      if (length(positives) != 0 & length(negatives) != 0){
        # Calculate the infection rate (IR)
        output = dprev(positives, negatives, disp = 'n')
        CI.lower = output[1]
        CI.upper = output[3]
        IR = output[2]
      }
    } # END OF ELSE STATEMENT
    
    # update the data frame
    md.data$CI.lower[i] = CI.lower
    md.data$CI.upper[i] = CI.upper
    md.data$IR[i] = IR
    md.data$abundance[i] = abundance
    md.data$density[i] = density
    
    # Update the location entry
    md.data$location[i] = location
    
    # Ensure there is a clearly delineated year field. Adjust as appropriate for other temporal resolutions
    if (temporal.resolution == "annual") { md.data$year[i] = this.year }
  } # END OF LOOP OVER GROUPS
  
  md.data$location_year = sprintf("%s_%s", md.data$location, md.data$year)
  
  return(md.data)
}
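
# Minimal example of the expected mosquito data format for calculate.MLE.v2 (not run). The
# records below are fabricated for illustration, and the dprev function from Williams &
# Moffitt 2005 (MLE_IR.R) must already be loaded into memory for the call to succeed.
# md = data.frame(location = c("Albany", "Albany", "Albany"),
#                 col_date = c("06/15/2018", "07/01/2018", "07/15/2018"),
#                 wnv_result = c(0, 1, 0),
#                 pool_size = c(50, 42, 35))
# mle.df = calculate.MLE.v2(md, temporal.resolution = "annual")
# # Returns one row per location_year with CI.lower, IR, CI.upper, abundance, density,
# # location, and year fields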

#**# The below functions are redefined in dfmip. Need to think about the flow of code and whether an extra package is needed.

#' Get Day of Year, given year, month, and day
#'
#' COPIED FROM wnv_hlpr.R
#'
#' @param year Input year
#' @param month Input month
#' @param day Input day
#'
#' @return Return ordinal day of the year
#'
get.DOY = function(year, month, day){
  
  # Determine if it is a leap year
  days = get.days(year)
  
  # Sum up days from months
  Jan = 31
  Feb = 28
  Mar = 31
  Apr = 30
  May = 31
  Jun = 30
  Jul = 31
  Aug = 31
  Sep = 30
  Oct = 31
  Nov = 30
  Dec = 31
  
  if (days == 366){  Feb = 29  }
  months = c(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)
  
  month.days = 0
  if (month != 1){
    month.days = sum(months[1:(month - 1)])
  }
  
  # Add days from days
  doy = month.days + day
  
  return(doy)
}
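
# Quick checks for get.DOY (not run); the values are easy to verify by hand.
# get.DOY(2021, 3, 1) # 60 (31 + 28 + 1, non-leap year)
# get.DOY(2020, 3, 1) # 61 (31 + 29 + 1, leap year)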

#' Function to get number of days in a year (copied from wnv_hlpr.R)
#'
#' Does not work for years 1900 and earlier, or for years 2100 and later (i.e. it does not handle the century exception years such as 1900 and 2100)
#'
#' @param year The year to examine
#'
#' @return The number of days in the year (365 for non-leap years, 366 for a leap year)
#'
get.days = function(year){
  
  # Check range - the simple %% 4 rule does not handle century years such as 1900 or 2100
  if (year <= 1900 | year >= 2100){ stop("Not designed for years <= 1900 or >= 2100")}
  
  # Days are 365 unless it is a leap year
  days = 365
  if (year %% 4 == 0){
    days = 366
  }
  return(days)
}
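
# Quick checks for get.days (not run).
# get.days(2024) # 366
# get.days(2023) # 365
# get.days(2100) # error: outside the supported range (century years are not handled)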


#' Function to identify whether a year is a leap year or not
#' 
#' Takes a year, returns a 1 if it is a leap year, based on the result of the get.days function
#' 
#' @param year The year to check if it is a leap-year or not
#' 
#' @return 1 for leap year, 0 for non-leap year
#' 
is.leap = function(year){
  days = get.days(year)
  leap = 0
  if (days == 366){ leap = 1  }
  return(leap)
}
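
# Quick checks for is.leap (not run).
# is.leap(2020) # 1
# is.leap(2021) # 0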

#' Split strings using strsplit in a way that can easily be used within sapply
#'
#' @param string The string to be split
#' @param delimiter What to use to split the string
#' @param position Which part of the string to retain. Only one piece can be retained
#' To retain multiple pieces, do a separate function call for each piece
#' @param as.string 0 Result will be numeric, 1 Result will be string
#'
splitter = function(string, delimiter, position, as.string = 0){
  parts = strsplit(string, delimiter)[[1]]
  out = parts[position]
  
  if (as.string == 0){
    out = as.numeric(as.character(out)) # Make sure it is in number format and drop any leading 0's
  }
  if (as.string == 1){  out = as.character(out)  }
  
  return(out)
}
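
# Illustration of splitter (not run); the strings are hypothetical location_year identifiers.
# splitter("albany_2019", "_", 2)                # 2019 (numeric)
# splitter("albany_2019", "_", 1, as.string = 1) # "albany"
# sapply(c("albany_2018", "greene_2019"), splitter, "_", 2) # 2018 2019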


#' District to location
#' 
#' Renames a 'district' field to 'location' (and 'district_year' to 'location_year').
#' Could be generalized further. A copy of this function also exists in dfmip.
#' 
#' @noRd
district.to.location = function(in.data, data.label, old.name = 'district', new.name = 'location'){
  
  fields = colnames(in.data)
  new.name.regex = sprintf("\\b%s\\b", new.name)
  old.name.regex = sprintf("\\b%s\\b", old.name)
  new.pos = grep(new.name.regex, fields)
  old.pos = grep(old.name.regex, fields)
  
  if (length(new.pos) == 0){
    # If the old name is present and the new name is absent, rename the field
    if (length(old.pos) == 1){
      colnames(in.data)[old.pos] = new.name
      warning(sprintf("%s field missing. Substituting values from %s field", new.name, old.name))
    }
    if (length(old.pos) == 0){
      stop(sprintf("Required 'location' field is missing. Field names are %s", paste(fields, collapse = ', ')))
    }
    if (length(old.pos) > 1){
      stop(sprintf("More than one field returned for %s. Probably an error with the regular expressions. Ideally just include a %s field", old.name, new.name))
    }
  }
  
  new.name_year = sprintf("%s_year", new.name)
  new.name_year.regex = sprintf("\\b%s\\b", new.name_year)
  new.name_year.pos = grep(new.name_year.regex, fields)
  
  if (length(new.name_year.pos) == 0){
    # Search for variables associated with the old year
    old.name_year = sprintf("%s_year", old.name)
    old.name_year.regex = sprintf("\\b%s\\b", old.name_year)
    old.name_year.pos = grep(old.name_year.regex, fields)

    # This check must precede the next, otherwise the new.name_year field will exist!
    if (length(old.name_year.pos) == 0){
      # If the field is not there, generate the location_year field from the location and year fields
      in.data[[new.name_year]] = sprintf("%s_%s", in.data[[new.name]], in.data[["year"]])
      
      # Check that this was successful
      if (length(in.data[[new.name_year]]) == 0){
        stop(sprintf("Something went wrong with generation of the %s_year field for data set %s", new.name, data.label))
      }
    }
    
    if (length(old.name_year.pos) == 1){
      warning(sprintf("%s field missing. Subsituting values from %s field", new.name_year, old.name_year))
      colnames(in.data)[old.name_year.pos] = new.name_year
    }
    
    if (length(old.name_year.pos) > 1){
      stop(sprintf("More than one field returned for %s. Probably an error with the regular expressions. Ideally just include a %s field", old.name_year, new.name_year))
    }
  }
 
  return(in.data) 
}
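
# Minimal sketch of district.to.location (not run); the data frame is fabricated for illustration.
# in.data = data.frame(district = c("Albany", "Greene"), year = c(2018, 2018), cases = c(3, 1))
# out.data = district.to.location(in.data, data.label = "example human data")
# # Warns that 'location' was filled from 'district' and adds a 'location_year' field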