R/knn_forecast.R
In knnp: Time Series Prediction using K-Nearest Neighbors Algorithm (Parallel)

Documented in knn_forecast

#' Predicts next value of the time series using k-nearest neighbors algorithm.
#'
#' @param y A time series or a trained kNN model generated by the
#' knn_param_search_function. In case that a model is provided the rest of
#' parameters will be ignored and all of them will be taken from the model.
#' @param k Number of neighbors.
#' @param d Length of each of the 'elements'.
#' @param distance Type of metric to evaluate the distance between points. Many
#' metrics are supported: euclidean, manhattan, dynamic time warping, camberra
#' and others. For more information about the supported metrics check the
#' values that 'method' argument of function parDist (from parallelDist
#' package) can take as this is the function used to calculate the distances.
#' Link to package info: https://cran.r-project.org/web/packages/parallelDist
#' Some of the values that this argument can take are "euclidean", "manhattan",
#' "dtw", "camberra", "chord".
#' @param weight Type of weight to be used at the time of calculating the
#' predicted value with a weighted mean. Three supported: proportional,
#' average, linear.
#' \describe{
#'   \item{proportional}{the weight assigned to each neighbor is inversely
#'   proportional to its distance}
#'   \item{average}{all neighbors are assigned with the same weight}
#'   \item{linear}{nearest neighbor is assigned with weight k, second closest
#'   neighbor with weight k-1, and so on until the least nearest neighbor which
#'   is assigned with a weight of 1.}
#' }
#' @param v Variable to be predicted if given multivariate time series.
#' @param threads Number of threads to be used when parallelizing, default is 1
#' @param h Temporal horizon of the prediction (only value 1 is implemented).
#' This parameter is present only for compatibility with the forecast package.
#' @return The predicted value.
#' @examples
#' knn_forecast(AirPassengers, 5, 2)
#' knn_forecast(LakeHuron, 3, 6)
#' @export
knn_forecast <- function(y, k, d, distance = "euclidean", weight =
                           "proportional", v = 1, threads = 1, h = 1) {

  # Default number of threads to be used
  if (is.null(threads)) {
    cores <- parallel::detectCores(logical = FALSE)
    threads <- ifelse(cores == 1, cores, cores - 1)
  }

  forec <- list()
  class(forec) <- "forecast"
  forec$method <- "k-Nearest Neighbors for unknown observations"

  if (any(class(y) == "kNN")) {
    forec$model <- y

    k <- y$opt_k
    d <- y$opt_d
    distance <- y$distance
    weight <- y$weight
    threads <- threads

    y <- y$x
  }
  else {
    model <- list()
    class(model) <- "kNN"
    model$method <- "k-Nearest Neighbors"
    model$k <- k
    model$d <- d
    model$distance <- distance
    model$weight <- weight
    forec$model <- model
  }

  if (any(is.na(y))) {
    stop("There are NAs values in the time series")
  }

  if (any(is.nan(y))) {
    stop("There are NaNs values in the time series")
  }

  if (all(weight != c("proportional", "average", "linear"))) {
    stop(paste0("Weight metric '", weight, "' unrecognized."))
  }

  # Initialization of variables to be used
  n <- NROW(y)
  forec$x <- y

  if (any(class(y) == "ts")) {
    if (!requireNamespace("tseries", quietly = TRUE)) {
      stop("Package 'tseries' needed for this function to work with ts objects.
           Please install it.", call. = FALSE)
    }

    if (NCOL(y) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ", NCOL(y), " variables."))
    }

    sta <- stats::time(y)[n]
    freq <- stats::frequency(y)
    res_type <- "ts"

    y <- matrix(sapply(y, as.double), ncol = NCOL(y))
  }
  else if (any(class(y) == "tbl_ts")) {
    if (!requireNamespace("tsibble", quietly = TRUE)) {
      stop(paste0("Package 'tsibble' needed for this function to work with ",
                  "tsibble objects. Please install it."), call. = FALSE)
    }

    if (length(tsibble::measured_vars(y)) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ",
                  length(tsibble::measured_vars(y)), " variables."))
    }

    resul <- utils::tail(tsibble::append_row(y), 1)

    res_type <- "tsibble"

    y <- matrix(sapply(y[tsibble::measured_vars(y)], as.double), ncol =
                  length(tsibble::measures(y)))
  }
  else {
    res_type <- "undef"

    if (NCOL(y) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ", NCOL(y), " variables."))
    }

    y <- matrix(sapply(y, as.double), ncol = NCOL(y))
  }

  # Get 'elements' matrices (one per variable)
  distances <- plyr::alply(y, 2, function(y_col)
    knn_elements(matrix(y_col, ncol = 1), d))

  # For each of the elements matrices, obtain the distances between
  # the most recent 'element' and the rest of the 'elements'.
  # This results in a list of distances vectors
  distances <- plyr::llply(distances, function(elements_matrix)
    parallelDist::parDist(elements_matrix, distance,
                          threads = threads)[1:(n - d)])

  # Combine all distances vectors by aggregating them
  distances <- Reduce("+", distances)

  # Get the indexes of the k nearest 'elements', these are called neighbors
  k_nn <- which(distances <= sort.int(distances, partial = k)[k],
                 arr.ind = TRUE)
  # We sort them so the closer neighbor is at the first position
  k_nn <- utils::head(k_nn[sort.int(distances[k_nn], index.return = TRUE,
                             decreasing = FALSE)$ix], k)

  # Calculate the weights for the future computation of the weighted mean
  weights <- switch(weight,
                    proportional = 1 / (distances[k_nn] +
                                          .Machine$double.xmin * 1e150),
                    average = rep.int(1, k),
                    linear = k:1
  )

  # Calculate the predicted value
  forec$neighbors <- n - k_nn
  prediction <- stats::weighted.mean(y[n - k_nn + 1, v], weights)

  if (res_type == "ts") {
    forec$mean <- utils::tail(stats::ts(c(1, prediction), start = sta,
                                        frequency = freq), 1)
    forec$fitted <- stats::ts(start = sta, frequency = freq)
  }
  else if (res_type == "tsibble") {
    forec$fitted <- resul
    resul[tsibble::measured_vars(resul)[v]] <- prediction
    forec$mean <- resul
  }
  else {
    forec$mean <- prediction
    forec$fitted <- NA
  }

  forec$lower <- NA
  forec$upper <- NA

  forec$residuals <- utils::tail(y[, v], 1) - prediction

  forec$distances <- rev(distances)

  forec
}