R/knn_forecast_boot_intervals.R
In knnwtsim: K Nearest Neighbor Forecasting with a Tailored Similarity Metric

Documented in knn.forecast.boot.intervals

#' KNN Forecast Bootstrap Prediction Intervals
#'
#' A function for forecasting using KNN regression with prediction intervals. The approach is based on the description of
#' "Prediction intervals from bootstrapped residuals" from chapter 5.5 of Hyndman R, Athanasopoulos G (2021) \url{https://otexts.com/fpp3/prediction-intervals.html#prediction-intervals-from-bootstrapped-residuals},
#' modified as needed for use with KNN regression. The algorithm starts by calculating a pool of forecast errors to later
#' sample from. If there are \code{n} points prior to the first observation indicated in \code{f.index.in} then there will be \code{n - k.in} errors generated by one-step ahead forecasts
#' starting with the point of the response series at the index \code{k.in + 1}. The first \code{k.in} points cannot be estimated because
#' a minimum of \code{k.in} eligible neighbors would be needed. The optional \code{burn.in} argument can be used to increase the number
#' of points from the start of the series that need to be available as neighbors before calculating errors for the pool. Next, \code{B}
#' possible paths the series could take are simulated using the pool of errors. Each path is simulated by calling \code{knn.forecast()}, estimating the first point in \code{f.index.in}, adding a sampled forecast error, then adding
#' this value to the end of the series. This process is then repeated for the next point in \code{f.index.in} until all have been estimated. The final output
#' interval estimates are calculated for each point in \code{f.index.in} by taking the appropriate percentiles of the corresponding simulations of that point.
#' The mean and medians are also calculated from these simulations. One important implication of this behavior is that the mean forecast output from this function can
#' differ from the point forecast produced by \code{knn.forecast()} alone.
#'
#'
#'
#' @param Sim.Mat.in numeric and symmetric matrix of similarities (recommend use of \code{S_w}, see \code{SwMatrixCalc()}).
#' @param f.index.in numeric vector indicating the indices of \code{Sim.Mat.in} and \code{y.in} which correspond to the time order of the points to be forecast.
#' @param k.in integer value indicating the the number of nearest neighbors to be considered in forecasting, must be \code{>= 1}.
#' @param y.in numeric vector of the response series to be forecast.
#' @param burn.in integer value which indicates how many points at the start of the series to set aside as eligible neighbors before calculating forecast errors to be re-sampled.
#' @param B integer value representing the number of bootstrap replications, this will be the number of forecasts simulated and used to calculate outputs, must be \code{>= 1}.
#' @param return.simulations logical value indicating whether to return all simulated forecasts.
#' @param level numeric value over the range (0,1) indicating the confidence level for the prediction intervals.
#'
#' @return  list of the following components:
#' \describe{
#'           \item{lb}{numeric vector of the same length as \code{f.index.in}, with the estimated lower bound of the prediction interval.}
#'           \item{ub}{numeric vector of the same length as \code{f.index.in}, with the estimated upper bound of the prediction interval.}
#'           \item{mean}{numeric vector of the same length as \code{f.index.in}, with the mean of the \code{B} simulated paths for each forecasted point.}
#'           \item{median}{numeric vector of the same length as \code{f.index.in}, with the median of the \code{B} simulated paths for each forecasted point.}
#'           \item{simulated.paths}{numeric matrix where each of the \code{B} rows contains a simulated path for the points in \code{f.index.in}, only returned if \code{return.simulations = TRUE}.}
#' }
#' @export
#'
#' @seealso
#' * [knn.forecast()] for the function called to perform knn regression.
#' * [SwMatrixCalc()] for the function to calculate a matrix with the recommended similarity measure.
#' * Hyndman R, Athanasopoulos G (2021),"Forecasting: Principles and Practice, 3rd ed", Chapter 5.5, \url{https://otexts.com/fpp3/prediction-intervals.html#prediction-intervals-from-bootstrapped-residuals}.
#'   For background on the algorithm this function is based on.
#' @md
#'
#' @examples
#' data("simulation_master_list")
#' series.index <- 15
#' ex.series <- simulation_master_list[[series.index]]$series.lin.coef.chng.x
#'
#' # Weights pre tuned by random search. In alpha, beta, gamma order
#' pre.tuned.wts <- c(0.2148058, 0.2899638, 0.4952303)
#' pre.tuned.k <- 5
#'
#' df <- data.frame(ex.series)
#' # Generate vector of time orders
#' df$t <- c(1:nrow(df))
#'
#' # Generate vector of periods
#' nperiods <- simulation_master_list[[series.index]]$seasonal.periods
#' df$p <- rep(1:nperiods, length.out = nrow(df))
#'
#' # Pull corresponding exogenous predictor(s)
#' X <- as.matrix(simulation_master_list[[series.index]]$x.chng)
#'
#'
#' # Calculate the weighted similarity matrix using Sw
#' Sw.ex <- SwMatrixCalc(
#'   t.in = df$t,
#'   p.in = df$p, nPeriods.in = nperiods,
#'   X.in = X,
#'   weights = pre.tuned.wts
#' )
#'
#' n <- length(ex.series)
#' # Index we want to forecast
#' f.index <- c((n - 5 + 1):length(ex.series))
#'
#' interval.forecast <- knn.forecast.boot.intervals(
#'   Sim.Mat.in = Sw.ex,
#'   f.index.in = f.index,
#'   y.in = ex.series,
#'   k.in = pre.tuned.k
#' )
knn.forecast.boot.intervals <- function(Sim.Mat.in,
                                        f.index.in,
                                        k.in,
                                        y.in,
                                        burn.in = NULL,
                                        B = 200,
                                        return.simulations = FALSE,
                                        level = .95) {



  # bad argument type checks
  if (!(is.vector(y.in, mode = "numeric")) |
    !(is.vector(f.index.in, mode = "numeric"))) {
    stop("y.in and f.index.in should be numeric vectors")
  }

  if (!((is.matrix(Sim.Mat.in) & is.numeric(Sim.Mat.in)))) {
    stop("Sim.Mat.in should be a numeric matrix")
  } else if (!(isSymmetric.matrix(Sim.Mat.in))) {
    stop("Sim.Mat.in should be a symmetric matrix")
  }

  if ((!(is.vector(k.in, mode = "numeric"))) |
    (!(identical(length(k.in), 1L)))) {
    stop("k.in should be an integer with length 1L")
  } else if (!(identical((k.in %% 1), 0))) {
    warning("k.in should be an integer, argument will be
            floored to nearest whole number")
    k.in <- floor(k.in)
  }


  if (!identical(burn.in, NULL)) {
    if (((!(is.vector(burn.in, mode = "numeric"))) |
      (!(identical(length(burn.in), 1L))))) {
      stop("burn.in should be an integer with length 1L, or NULL")
    } else if (!(identical((burn.in %% 1), 0))) {
      warning("burn.in should be an integer, argument will be floored to
              nearest whole number")
      burn.in <- floor(burn.in)
    }
  }

  if ((!(is.vector(B, mode = "numeric"))) | (!(identical(length(B), 1L)))) {
    stop("B should be an integer with length 1L")
  } else if (!(identical((B %% 1), 0))) {
    warning("B should be an integer, argument will be
            floored to nearest whole number")
    B <- floor(B)
  }

  if ((!(is.logical(return.simulations))) |
    (!(identical(length(return.simulations), 1L)))) {
    stop("return.simulations should be a logical with length 1L")
  }

  if ((!(is.vector(level, mode = "numeric"))) |
    (!(identical(length(level), 1L)))) {
    stop("level should be a numeric value with length 1L")
  } else if (!((level > 0) & (level < 1))) {
    stop("level should be between 0 and 1")
  }

  # conflicting sizes checks
  if (max(f.index.in) < nrow(Sim.Mat.in)) {
    warning("Sim.Mat.in row count is greater than the maximum value of
            f.index.in, rows and columns at indices greater than maximum value
            of f.index.in will be removed")
    remove.indices <- c((max(f.index.in) + 1):nrow(Sim.Mat.in))
    Sim.Mat.in <- Sim.Mat.in[-(remove.indices), -(remove.indices)]
  }

  if (max(f.index.in) > nrow(Sim.Mat.in)) {
    stop("f.index.in contains higher values than row count of Sim.Mat.in")
  }

  if (min(f.index.in) > (length(y.in) + 1)) {
    stop("f.index.in should start at an index <= (length(y.in) + 1)")
  }

  if (k.in > nrow(Sim.Mat.in[-(f.index.in), ])) {
    stop("k.in is larger than the number of eligible neighbors")
  }

  if ((k.in < 1) | (B < 1)) {
    stop("integer arguments k.in and B cannot be < 1")
  }

  if (!is.null(burn.in)) {
    if (burn.in < k.in) {
      stop("burn.in is less than k.in, at least k.in points are needed
           for forecasting")
    }
  }


  # burn.in defaults to k.in
  if (is.null(burn.in)) {
    burn.in <- k.in
  }

  # set up sim mat and response series to gather residuals
  Sim.Mat.noval <- Sim.Mat.in[-(f.index.in), -(f.index.in)]
  y.noval <- y.in[-(c(min(f.index.in):length(y.in)))]

  # gather pool of residuals to sample from
  gather.res.index <- c((burn.in + 1):length(y.noval))
  res.len <- length(gather.res.index)
  et <- numeric(length = res.len)
  test.frcst <- numeric(length = res.len)


  for (t in gather.res.index) {
    Sim.Mat.noval.t <- Sim.Mat.noval[1:t, 1:t]
    y.t <- y.noval[1:t]
    y.t.act <- y.noval[t]

    res.est.frcst.t <- knn.forecast(
      Sim.Mat.in = Sim.Mat.noval.t,
      f.index.in = t,
      k.in = k.in,
      y.in = y.t
    )

    et.t <- y.t.act - res.est.frcst.t

    et[t - burn.in] <- et.t
    test.frcst[t - burn.in] <- res.est.frcst.t
  }

  h <- length(f.index.in)
  boot.paths <- matrix(NA, nrow = B, ncol = h)

  # create B simulated forecasts
  for (b in c(1:B)) {

    # this loop produces the bth simulated path
    for (i in c(1:h)) {
      one.hop.point <- f.index.in[i]
      one.hop.res <- base::sample(et, size = 1, replace = TRUE)

      # assign a 0 vector for the new forecast if first iteration, then append
      # previously estimated points in later iterations
      if (i == 1) {
        path <- numeric(length = h)
        y.sim <- y.noval
      } else {
        y.sim <- c(y.sim, path[i - 1])
      }

      # removing uneeded columns in the sim matrix to avoid warnings
      if (i != h) {
        remove.index <- f.index.in[(i + 1):h]
        Sim.Mat.one.hop <- Sim.Mat.in[-(remove.index), -(remove.index)]
      } else {
        Sim.Mat.one.hop <- Sim.Mat.in
      }

      # produce one step ahead forecast and store result for next iteration
      one.hop.frcst <- knn.forecast(
        Sim.Mat.in = Sim.Mat.one.hop,
        f.index.in = one.hop.point,
        k.in = k.in,
        y.in = y.sim
      )

      one.hop.frsct.w.res <- one.hop.frcst + one.hop.res

      path[i] <- one.hop.frsct.w.res
    }
    boot.paths[b, ] <- path
  }

  # use simulated paths for estimates
  ub.boot <- apply(boot.paths,
    MARGIN = 2, FUN = stats::quantile,
    probs = (level + 1) / 2
  )

  lb.boot <- apply(boot.paths,
    MARGIN = 2, FUN = stats::quantile,
    probs = (1 - level) / 2
  )

  mean.boot <- apply(boot.paths, MARGIN = 2, FUN = mean)
  median.boot <- apply(boot.paths, MARGIN = 2, FUN = stats::median)

  return.list <- list(
    lb = lb.boot,
    ub = ub.boot,
    mean = mean.boot,
    median = median.boot
  )

  if (return.simulations == TRUE) {
    simulations <- list(simulated.paths = boot.paths)
    return.list <- append(return.list, simulations)
  }

  return(return.list)
}