R/knn_past.R
In knnp: Time Series Prediction using K-Nearest Neighbors Algorithm (Parallel)

Documented in knn_past

#' Predicts values of the time series using k-nearest neighbors algorithm.
#' Values corresponding to instants from initial + 1 to the last one are
#' predicted. The first value predicted, which corresponds to instant
#' initial + 1, is calculated using instants from 1 to instant initial;
#' the second value predicted, which corresponds to instant initial + 2, is
#' predicted using instants from 1 to instant initial + 1; and so on until the
#' last value, which corresponds to instant n (length of the time series),
#' is predicted using instants from 1 to instant n - 1.
#'
#' @param y A time series or a trained kNN model generated by the
#' knn_param_search_function. In case that a model is provided the rest of
#' parameters will be ignored and all of them will be taken from the model.
#' @param k Number of neighbors.
#' @param d Length of each of the 'elements'.
#' @param initial Variable that determines the limit of the known past for the
#' first instant predicted.
#' @param distance Type of metric to evaluate the distance between points. Many
#' metrics are supported: euclidean, manhattan, dynamic time warping, camberra
#' and others. For more information about supported metrics check the values
#' that 'method' argument of function parDist (from parallelDist package)
#' can take as this is the function used to calculate the distances. Link to
#' the package info: https://cran.r-project.org/web/packages/parallelDist
#' Some of the values that this argument can take are "euclidean", "manhattan",
#' "dtw", "camberra", "chord".
#' @param weight Type of weight to be used at the time of calculating the
#' predicted value with a weighted mean. Three supported: proportional,
#' average, linear.
#' \describe{
#'   \item{proportional}{the weight assigned to each neighbor is inversely
#'   proportional to its distance}
#'   \item{average}{all neighbors are assigned with the same weight}
#'   \item{linear}{nearest neighbor is assigned with weight k, second closest
#'   neighbor with weight k-1, and so on until the least nearest neighbor
#'   which is assigned with a weight of 1.}
#' }
#' @param v Variable to be predicted if given multivariate time series.
#' @param threads Number of threads to be used when parallelizing, default is 1
#' @return The predicted value.
#' @examples
#' knn_past(AirPassengers, 5, 2)
#' knn_past(LakeHuron, 3, 6)
#' @export
knn_past <- function(y, k, d, initial = NULL, distance = "euclidean", weight =
                       "proportional", v = 1, threads = 1) {

  forec <- list()
  class(forec) <- "forecast"
  forec$method <- "k-Nearest Neighbors over known observations"

  if (any(class(y) == "kNN")) {
    forec$model <- y

    forec$model$optim_call <- forec$model$call
    forec$model$call <- NULL

    k <- y$opt_k
    d <- y$opt_d
    distance <- y$distance
    weight <- y$weight
    threads <- threads

    y <- y$x
  }
  else {
    model <- list()
    class(model) <- "kNN"
    model$method <- "k-Nearest Neighbors"
    model$k <- k
    model$d <- d
    model$distance <- distance
    model$weight <- weight
    forec$model <- model
  }

  if (any(is.na(y))) {
    stop("There are NAs values in the time series")
  }

  if (any(is.nan(y))) {
    stop("There are NaNs values in the time series")
  }

  if (all(weight != c("proportional", "average", "linear"))) {
    stop(paste0("Weight metric '", weight, "' unrecognized."))
  }

  # Default number of threads to be used
  if (is.null(threads)) {
    cores <- parallel::detectCores(logical = FALSE)
    threads <- ifelse(cores == 1, cores, cores - 1)
  }

  # Initialization of variables to be used
  n <- NROW(y)
  initial <- ifelse(is.null(initial), initial <- floor(n * 0.7), initial)

  forec$x <- y

  if (any(class(y) == "ts")) {
    if (!requireNamespace("tseries", quietly = TRUE)) {
      stop("Package 'tseries' needed for this function to work with ts objects.
           Please install it.", call. = FALSE)
    }

    if (NCOL(y) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ", NCOL(y), " variables."))
    }

    sta <- stats::time(y)[initial + 1]
    freq <- stats::frequency(y)
    res_type <- "ts"

    y <- matrix(sapply(y, as.double), ncol = NCOL(y))
  }
  else if (any(class(y) == "tbl_ts")) {
    if (!requireNamespace("tsibble", quietly = TRUE)) {
      stop(paste0("Package 'tsibble' needed for this function to work with ",
                  "tsibble objects. Please install it."), call. = FALSE)
    }

    if (length(tsibble::measured_vars(y)) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ",
                  length(tsibble::measured_vars(y)), " variables."))
    }

    resul <- utils::tail(y, (n - initial))

    resul[tsibble::measured_vars(resul)] <- NA

    res_type <- "tsibble"

    y <- matrix(sapply(y[tsibble::measured_vars(y)], as.double), ncol =
                  length(tsibble::measures(y)))

  }
  else {
    res_type <- "undef"

    if (NCOL(y) < v) {
      stop(paste0("Index of variable off limits: v = ", v,
                  " but given time series has ", NCOL(y), " variables."))
    }

    y <- matrix(sapply(y, as.double), ncol = NCOL(y))
  }

  predictions <- array(dim = n - initial)
  neighbors <- matrix(nrow = k, ncol = n - initial)

  # Get 'elements' matrices (one per variable)
  distances <- plyr::alply(y, 2, function(y_col)
    knn_elements(matrix(y_col, ncol = 1), d))

  # For each of the elements matrices, calculate the distances between
  # every 'element'. This results in a list of triangular matrices.
  distances <- plyr::llply(distances, function(elements_matrix)
    parallelDist::parDist(elements_matrix, distance, threads = threads))

  # Combine all distances matrices by aggregating them
  distances <- Reduce("+", distances)

  distances_size <- attr(distances, "Size")

  prediction_index <- length(predictions)
  for (j in 2:(n - initial + 1)) {
    # Get column needed from the distances matrix
    initial_index <- distances_size * (j - 1) - j * (j - 1) / 2 + 1
    distances_col <- distances[initial_index:(initial_index + n - d - j)]

    # Get the indexes of the k nearest 'elements', these are called neighbors
    k_nn <- which(distances_col <= sort.int(distances_col, partial = k)[k],
                   arr.ind = TRUE)
    # We sort them so the closer neighbor is at the first position
    k_nn <- utils::head(k_nn[sort.int(distances_col[k_nn], index.return = TRUE,
                               decreasing = FALSE)$ix], k)

    # Calculate the weights for the future computation of the weighted mean
    weights <- switch(weight,
                      proportional = 1 / (distances_col[k_nn] +
                                            .Machine$double.xmin * 1e150),
                      average = rep.int(1, k),
                      linear = k:1
    )

    neighbors[, prediction_index] <- (n + 2 - j - k_nn) - 1

    # Calculate the predicted value
    predictions[prediction_index] <-
        stats::weighted.mean(y[n + 2 - j - k_nn, v], weights)
    prediction_index <- prediction_index - 1
  }
  if (res_type == "ts") {
    forec$fitted <- stats::ts(predictions, start = sta, frequency = freq)
    forec$mean <- stats::ts(start = sta, frequency = freq)
  }
  else if (res_type == "tsibble") {
    forec$mean <- resul
    resul[tsibble::measured_vars(resul)[v]] <- predictions
    forec$fitted <- resul
  }
  else {
    forec$fitted <- predictions
    forec$mean <- rep(NA, length(predictions))
  }

  forec$lower <- rep(NA, length(predictions))
  forec$upper <- rep(NA, length(predictions))

  forec$residuals <- utils::tail(y[, v], length(predictions)) - predictions

  forec$neighbors <- neighbors
  forec$initial <- initial

  return(forec)
}