COVIDNearTerm: Near Term Prediction of COVID-Related Population Metrics

Documented in simulateUnivariateAR

#' Builds an autoregressive model and uses it to predict
#'
#' A wrapper function for buildAR and predictAR.
#'
#'
#' @param vec A vector of numeric data
#' @param x A vector of numeric data, used to predict vec. Must be the same size as vec and indexed to vec
#' @param x_lag An integer used to specify the number of observations x should be shifted. Vec will also be truncated on top by x_lag
#' @param wsize Number of prior observations to use for averaging, default is 14
#' @param method Type of weighting to use, default is equal
#' @param pdays Number of days into the future to make predictions, default is 28
#' @param nsim  Number of simulations, default is 100
#' @param skip Number of input values to skip, default is 0
#' @param seed Seed for random number generator
#' @param output_type Type of output, default is all
#' @param rhat_method Method for calculating rhat, if "none", rhat = 1 and has no effect
#' @param lambda Shrinkage parameter, if not specified, default is a grid search from 0 to 1 by 0.05. A value of 0 produces no shrinkage. If an array is specified, all values in the array
#' are evaluated and the optimal lambda is chosen based on residual sum of squares. Values should be between 0 and 1 inclusive.
#' @param alpha Alpha parameter, if not specified, default is 0 which produces no scaling. If an array is specified, all values in the array
#' are evaluated and the optimal alpha is chosen based on residual sum of squares. Values should be >=0.
#' @param rolling_mean rolling mean window for x. Must be an integer less than length of x and at least 1. Default is 1
#' @param debug TRUE returns buildAR objects in addition to standard output

#' @return A data frame containing the specified output statistics for each sim
#'
#' @import dplyr
#' @import purrr
#' @import reshape2
#' @import lubridate
#' @importFrom zoo rollmean
#'
#' @export
#'
#'
simulateUnivariateAR <- function(vec,
                                 x = NULL,
                                 x_lag = 0,
                                 wsize = 14,
                                 method = c("equal", "unweighted", "triangle"),
                                 pdays = 28,
                                 nsim = 100,
                                 skip  = 0,
                                 seed = NULL,
                                 output_type = "all",
                                 rhat_method = c("none", "geometric", "arithmetic"),
                                 lambda = seq(0, 1, 0.05),
                                 alpha = 0,
                                 rolling_mean = 1,
                                 debug = FALSE){

  if(length(rhat_method) > 1) {
    rhat_method <- rhat_method[1]
  }

  if(is.null(x)) {
    stop("x cannot be NULL")

  }

  if(length(x) != length(vec)) {
    stop(paste0("x length (", length(x), ") should be the same as vec length (", length(vec), ")") )
  }

  if(rolling_mean < 1 | rolling_mean > (length(x) - x_lag) ) {
    stop(paste0("rolling mean must be at least 1 and no more than the length of x - x_lag. specified rolling mean is ", rolling_mean) )
  }

  if( round(rolling_mean, 0) != rolling_mean | is.na(rolling_mean) ) {
    stop(paste0("specified rolling_mean, ", rolling_mean, ", should be an integer") )
  }

  if( round(x_lag, 0) != x_lag | is.na(rolling_mean) ) {
    stop(paste0("specified x_lag, ", x_lag, ", should be an integer") )
  }

  if( !is.numeric(lambda) ) {
    stop("lambda must be numeric")
  }

  if( any(lambda > 1 ) |  any(lambda < 0 ) ) {
    stop("lambda must be between 0 and 1 inclusive")
  }

  if( !is.numeric(alpha) ) {
    if( alpha != "c") {
      stop("if alpha is not 'c' it must be numeric")
    }
  } else {
    if( any(alpha < 0 ) ) {
      stop("alpha must be >= 0")
    }
  }

  #########################################################
  # chop x_lag entries from the bottom of x
  # chop x_lag entries from the top of y
  #########################################################
  if( x_lag > 0 ) {
    cat("shifting x by", x_lag, "\n")
    x <- head(x, -x_lag)
    cat("truncating vec by", x_lag, "\n")
    vec <- tail(vec, -x_lag)
  }


  #########################################################
  # transform x using rolling mean
  #########################################################
  ### generate rolling means for first k values

  if(rolling_mean != 1){
    first_means <- map(1:(rolling_mean - 1), function(.k, .x) {

      zoo::rollmean(x = .x[1:.k], k = .k, align = "right" )

    }, x) %>% unlist

    last_means <- zoo::rollmean(x = x, k = rolling_mean, align = "right" )

    x <- c(first_means, last_means)
  }
  # x_rolling <- c(first_means, last_means)

  #########################################################
  # build training models
  #########################################################
  build_ar_object_y <- buildAR(vec = vec, x = vec, wsize, method = method, seed = seed, rhat_method = rhat_method, lambda = lambda)
  build_ar_object_x <- buildAR(vec = x, x = x, wsize, method = method, seed = seed, rhat_method = rhat_method, lambda = lambda)
  # build_ar_object_x_rolling <- buildAR(vec = x_rolling, x = x_rolling, wsize, method = method, seed = seed, rhat_method = rhat_method, lambda = lambda)

  ###Fit lowess for relationship between data and error so error is proportional to magnitude
  errors <- build_ar_object_y$errors
  dat <- build_ar_object_y$vec[-1]
  ###Skip the first skip features in the vector
  if( skip > 0 ) {
    dat <- dat[-(1:skip)]
    errors <- errors[-(1:skip)]
    diff_phis <- diff_phis[-(1:(skip-1))]
  }

  ###Fit lowess for relationship between data and error so error is proportional to magnitude
  lowess_fit <- lowess(dat, errors^2)

  #########################################################
  # determine alpha to use
  #########################################################

  y_obs <-  build_ar_object_y$vec
  y_phis <- build_ar_object_y$phi_s
  x_phis <- build_ar_object_x$phi_s
  # x__rolling_phis <- build_ar_object_x_rolling$phi_s

  x_phis_standardized <- ( x_phis - mean( x_phis ) ) * ( sd( y_phis ) / sd( x_phis ) ) + mean( y_phis )

  grid_search <- FALSE

  if( length(alpha) > 1) {

    alpha_grid <- alpha
    grid_search <- TRUE

  } else {
    if(  alpha == "c" ) {

      # C <- 1 / mean( x_phis_standardized - y_phis )
      C <- 1/50
      alpha_grid = seq(0, C, length.out = 50)
      grid_search <- TRUE

    }
  }

  if( grid_search ){

    grid_alphas <- map_dfr(alpha_grid, function(.alpha, .y_phis, .x_phis_standardized, .y_obs) {

      .alpha_pred <- .y_phis * head(.y_obs, -1) +
        .alpha * ( .x_phis_standardized - y_phis ) * head(.y_obs, -1)

      .ssr <- sum( (.alpha_pred - .y_obs[-1] ) ^ 2 )

      data.frame(alpha = .alpha,
                 SSR_alpha = .ssr)

    }, y_phis, x_phis_standardized, y_obs) %>%
      arrange(SSR_alpha)

    alpha <- grid_alphas$alpha[1]
  }

  #########################################################
  # get predicted x and y values and corresponding phis
  #########################################################

  ar_out_y <- predictAR(buildAR_obj = build_ar_object_y,
                        pdays = pdays,
                        nsim = nsim,
                        skip = skip,
                        output_type = "predictions",
                        debug = TRUE)

  y_debug <- ar_out_y$predict_debug_object %>%
    filter(!is.na(predicted)) %>%
    select(sim, day, phi_s, predicted, current_vec)

  predict_y <- split(y_debug, y_debug$sim) %>%
    map2_dfr(unique(y_debug$sim),  function(.debug_sim, .sim) {

      data.frame(sim = .sim,
                 day = .debug_sim$day,
                 y_t1 = c(.debug_sim$current_vec[1], head(.debug_sim$predicted, -1)),
                 phi_y = .debug_sim$phi_s)


    })

  ar_out_x <- predictAR(buildAR_obj = build_ar_object_x,
                        pdays = pdays,
                        nsim = nsim,
                        skip = skip,
                        output_type = "predictions",
                        debug = TRUE)

  # ar_out_x_rolling <- predictAR(buildAR_obj = build_ar_object_x_rolling,
  #                       pdays = pdays,
  #                       nsim = nsim,
  #                       skip = skip,
  #                       output_type = "predictions",
  #                       debug = TRUE)

  x_debug <- ar_out_x$predict_debug_object %>%
    filter(!is.na(predicted)) %>%
    select(sim, day, phi_s, predicted, current_vec)

  # x_rolling_debug <- ar_out_x$predict_debug_object %>%
  #   filter(!is.na(predicted)) %>%
  #   select(sim, day, phi_s, predicted, current_vec)

  predict_x <- split(x_debug, x_debug$sim) %>%
    map2_dfr(unique(x_debug$sim), function(.debug_sim, .sim) {

      data.frame(sim = .sim,
                 day = .debug_sim$day,
                 phi_p = .debug_sim$phi_s)


    })

  # predict_x_rolling <- split(x_debug, x_debug$sim) %>%
  #   map2_dfr(unique(x_debug$sim), function(.debug_sim, .sim) {
  #
  #     data.frame(sim = .sim,
  #                day = .debug_sim$day,
  #                phi_p = .debug_sim$phi_s)
  #
  #
  #   })

  predict_data_sub <- predict_y %>%
    merge(y = predict_x,
          by = c("sim", "day") ) %>%
    arrange(sim, day)


  predict_data <- split(predict_data_sub, predict_data_sub$sim) %>%
    map2_dfr(unique(predict_data_sub$sim), function(.predict_sim, .sim, .alpha) {

      .predict_sim  %>%
        mutate(sim = .sim,
               phi_p_standardized = ( phi_p - mean( phi_p ) ) * ( sd( phi_y ) / sd( phi_p ) ) + mean( phi_y ),
               f_phi = phi_p_standardized - phi_y,
               value = phi_y * y_t1 + .alpha * f_phi * y_t1)
     }, alpha)


  predict_data <- pmap_dfr(predict_data, function(...) {
    current <- tibble(...)

    current$error <- addError(xpred = current$value, lowess_obj = lowess_fit)
    current
  }) %>%
    mutate(predicted = value + error)


  output <- predict_data %>%
    select(sim, day, predicted) %>%
    reshape2::dcast(sim ~ day, value.var = "predicted") %>%
    select(-sim)

  if(output_type == "max") {
    return_stat <- data.frame(max = apply(output, 1, max) )
    # return_stat$rep <- 1:nsim
  }
  if(output_type=="mean") {
    return_stat <- data.frame(mean = apply(output, 1, mean) )
    # return_stat$rep <- 1:nsim
  }
  if(output_type == "all" ) {
    return_stat <- data.frame( t( apply(output, 1, summary) ) )
    names( return_stat ) <- gsub("\\.|X", "", names( return_stat ) )
    names( return_stat ) <- gsub("1st", "First", names( return_stat ) )
    names( return_stat ) <- gsub("3rd", "Third", names( return_stat ) )
    # return_stat$rep <- 1:nsim

  }
  if(output_type == "predictions" ) {
    return_stat <- data.frame( t( output ) )
    return_stat$t <- 1:pdays
    return_stat <- return_stat %>% select(t, everything())
  }

  if( debug ) {

    return_object <- list("return_stat" = return_stat)

    return_object[["build_object_y"]] <- build_ar_object_y
    return_object[["build_object_x"]] <- build_ar_object_x
    return_object[["predict_debug_object_y"]] <- ar_out_y
    return_object[["predict_debug_object_x"]] <- ar_out_x
    return_object[["predict_info_matrix"]] <-  as.data.frame(predict_data)


    if(grid_search) {
      return_object[["alpha_grid"]] <- grid_alphas
    }

    return_object[["alpha_used"]] <- alpha

    class(return_object) <- "simulateAR"

  } else {
    return_object <- return_stat
  }
  return( return_object )
}