Nothing
#' Predicts values of the time series using k-nearest neighbors algorithm.
#' Values corresponding to instants from initial + 1 to the last one are
#' predicted. The first value predicted, which corresponds to instant
#' initial + 1, is calculated using instants from 1 to instant initial;
#' the second value predicted, which corresponds to instant initial + 2, is
#' predicted using instants from 1 to instant initial + 1; and so on until the
#' last value, which corresponds to instant n (length of the time series),
#' is predicted using instants from 1 to instant n - 1.
#'
#' @param y A time series or a trained kNN model generated by the
#' knn_param_search_function. In case that a model is provided the rest of
#' parameters will be ignored and all of them will be taken from the model.
#' @param k Number of neighbors.
#' @param d Length of each of the 'elements'.
#' @param initial Variable that determines the limit of the known past for the
#' first instant predicted.
#' @param distance Type of metric to evaluate the distance between points. Many
#' metrics are supported: euclidean, manhattan, dynamic time warping, camberra
#' and others. For more information about supported metrics check the values
#' that 'method' argument of function parDist (from parallelDist package)
#' can take as this is the function used to calculate the distances. Link to
#' the package info: https://cran.r-project.org/web/packages/parallelDist
#' Some of the values that this argument can take are "euclidean", "manhattan",
#' "dtw", "camberra", "chord".
#' @param weight Type of weight to be used at the time of calculating the
#' predicted value with a weighted mean. Three supported: proportional,
#' average, linear.
#' \describe{
#' \item{proportional}{the weight assigned to each neighbor is inversely
#' proportional to its distance}
#' \item{average}{all neighbors are assigned with the same weight}
#' \item{linear}{nearest neighbor is assigned with weight k, second closest
#' neighbor with weight k-1, and so on until the least nearest neighbor
#' which is assigned with a weight of 1.}
#' }
#' @param v Variable to be predicted if given multivariate time series.
#' @param threads Number of threads to be used when parallelizing, default is 1
#' @return The predicted value.
#' @examples
#' knn_past(AirPassengers, 5, 2)
#' knn_past(LakeHuron, 3, 6)
#' @export
knn_past <- function(y, k, d, initial = NULL, distance = "euclidean", weight =
"proportional", v = 1, threads = 1) {
forec <- list()
class(forec) <- "forecast"
forec$method <- "k-Nearest Neighbors over known observations"
if (any(class(y) == "kNN")) {
forec$model <- y
forec$model$optim_call <- forec$model$call
forec$model$call <- NULL
k <- y$opt_k
d <- y$opt_d
distance <- y$distance
weight <- y$weight
threads <- threads
y <- y$x
}
else {
model <- list()
class(model) <- "kNN"
model$method <- "k-Nearest Neighbors"
model$k <- k
model$d <- d
model$distance <- distance
model$weight <- weight
forec$model <- model
}
if (any(is.na(y))) {
stop("There are NAs values in the time series")
}
if (any(is.nan(y))) {
stop("There are NaNs values in the time series")
}
if (all(weight != c("proportional", "average", "linear"))) {
stop(paste0("Weight metric '", weight, "' unrecognized."))
}
# Default number of threads to be used
if (is.null(threads)) {
cores <- parallel::detectCores(logical = FALSE)
threads <- ifelse(cores == 1, cores, cores - 1)
}
# Initialization of variables to be used
n <- NROW(y)
initial <- ifelse(is.null(initial), initial <- floor(n * 0.7), initial)
forec$x <- y
if (any(class(y) == "ts")) {
if (!requireNamespace("tseries", quietly = TRUE)) {
stop("Package 'tseries' needed for this function to work with ts objects.
Please install it.", call. = FALSE)
}
if (NCOL(y) < v) {
stop(paste0("Index of variable off limits: v = ", v,
" but given time series has ", NCOL(y), " variables."))
}
sta <- stats::time(y)[initial + 1]
freq <- stats::frequency(y)
res_type <- "ts"
y <- matrix(sapply(y, as.double), ncol = NCOL(y))
}
else if (any(class(y) == "tbl_ts")) {
if (!requireNamespace("tsibble", quietly = TRUE)) {
stop(paste0("Package 'tsibble' needed for this function to work with ",
"tsibble objects. Please install it."), call. = FALSE)
}
if (length(tsibble::measured_vars(y)) < v) {
stop(paste0("Index of variable off limits: v = ", v,
" but given time series has ",
length(tsibble::measured_vars(y)), " variables."))
}
resul <- utils::tail(y, (n - initial))
resul[tsibble::measured_vars(resul)] <- NA
res_type <- "tsibble"
y <- matrix(sapply(y[tsibble::measured_vars(y)], as.double), ncol =
length(tsibble::measures(y)))
}
else {
res_type <- "undef"
if (NCOL(y) < v) {
stop(paste0("Index of variable off limits: v = ", v,
" but given time series has ", NCOL(y), " variables."))
}
y <- matrix(sapply(y, as.double), ncol = NCOL(y))
}
predictions <- array(dim = n - initial)
neighbors <- matrix(nrow = k, ncol = n - initial)
# Get 'elements' matrices (one per variable)
distances <- plyr::alply(y, 2, function(y_col)
knn_elements(matrix(y_col, ncol = 1), d))
# For each of the elements matrices, calculate the distances between
# every 'element'. This results in a list of triangular matrices.
distances <- plyr::llply(distances, function(elements_matrix)
parallelDist::parDist(elements_matrix, distance, threads = threads))
# Combine all distances matrices by aggregating them
distances <- Reduce("+", distances)
distances_size <- attr(distances, "Size")
prediction_index <- length(predictions)
for (j in 2:(n - initial + 1)) {
# Get column needed from the distances matrix
initial_index <- distances_size * (j - 1) - j * (j - 1) / 2 + 1
distances_col <- distances[initial_index:(initial_index + n - d - j)]
# Get the indexes of the k nearest 'elements', these are called neighbors
k_nn <- which(distances_col <= sort.int(distances_col, partial = k)[k],
arr.ind = TRUE)
# We sort them so the closer neighbor is at the first position
k_nn <- utils::head(k_nn[sort.int(distances_col[k_nn], index.return = TRUE,
decreasing = FALSE)$ix], k)
# Calculate the weights for the future computation of the weighted mean
weights <- switch(weight,
proportional = 1 / (distances_col[k_nn] +
.Machine$double.xmin * 1e150),
average = rep.int(1, k),
linear = k:1
)
neighbors[, prediction_index] <- (n + 2 - j - k_nn) - 1
# Calculate the predicted value
predictions[prediction_index] <-
stats::weighted.mean(y[n + 2 - j - k_nn, v], weights)
prediction_index <- prediction_index - 1
}
if (res_type == "ts") {
forec$fitted <- stats::ts(predictions, start = sta, frequency = freq)
forec$mean <- stats::ts(start = sta, frequency = freq)
}
else if (res_type == "tsibble") {
forec$mean <- resul
resul[tsibble::measured_vars(resul)[v]] <- predictions
forec$fitted <- resul
}
else {
forec$fitted <- predictions
forec$mean <- rep(NA, length(predictions))
}
forec$lower <- rep(NA, length(predictions))
forec$upper <- rep(NA, length(predictions))
forec$residuals <- utils::tail(y[, v], length(predictions)) - predictions
forec$neighbors <- neighbors
forec$initial <- initial
return(forec)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.