#' Forecasting Engine API
#' @description Function which enables the user to select different forecasting algorithms ranging from
#' traditional time series models (i.e. ARIMA, ETS, STL) to machine learning methods (i.e. LSTM, AutoML).
#' @param mts_data A univariate or multivariate 'ts', 'mts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param xreg_data A univariate or multivariate 'ts', 'mts' or 'xts' object, optional external regressors
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param model_names A list or vector of strings representing the model names to be used
#' @param models_args A list, optional arguments to passed to the models
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param nb_threads An integer, the number of threads to use for the automl_h2o model selection process
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc(AirPassengers,
#' fc_horizon = 12)
#'
#' fc <- generate_fc(AirPassengers,
#' fc_horizon = 6,
#' model_names = c("arima", "ets",
#' "lstm_keras",
#' "automl_h2o"))
#' fc <- generate_fc(AirPassengers,
#' fc_horizon = 6,
#' model_names = c("ets", "snaive",
#' "stl", "nnetar"),
#' model_args = list(ets_arg = list(model = "ZZA",
#' opt.crit = "amse",
#' upper = c(0.3, 0.2,
#' 0.2, 0.98),
#' stl_arg = list(s.window = "periodic"))))
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc(AirPassengers,
#' model_names = "arima",
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc(AirPassengers,
#' model_names = "tbats",
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc <- function(mts_data, fc_horizon = 12,
xreg_data = NULL,
backtesting_opt = list(use_bt = FALSE,
nb_iters = 1,
method = c("rolling",
"moving"),
sample_size = c("expanding",
"fixed")),
model_names = c("arima", "ets", "tbats", "bsts",
"snaive", "nnetar", "stl",
"automl_h2o"),
prepro_fct = NULL,
models_args = NULL,
data_dir = NULL,
time_id = base::Sys.time(),
nb_threads = 1,
...) {
`%>%` <- magrittr::`%>%`
`%do%` <- foreach::`%do%`
`%dopar%` <- foreach::`%dopar%`
model_output <- ini_model_output()
mts_data_xts <- check_data_sv_as_xts(mts_data, default_colname = "time_series")
xreg_data_xts <- check_data_sv_as_xts(xreg_data, default_colname = "feature")
if (!base::is.null(xreg_data_xts)) {
keys_in_col <- base::colnames(xreg_data_xts) %>% stringr::str_detect("__")
print(base::paste("Info about specified regressors: \n",
"Number of total features: ",
base::ncol(xreg_data_xts), "\n",
"Number of shared features (colnames w/o '__'): ",
base::sum(!keys_in_col), "\n",
"Number of ts specific features (ts_name + '__' + feature_name): ",
base::sum(keys_in_col),
sep = ""))
}
fc_horizon <- check_fc_horizon(fc_horizon)
model_names <- check_model_names(model_names)
models_args <- check_models_args(models_args, model_names)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
nb_threads <- check_nb_cores(nb_threads)
time_id <- check_time_id(time_id)
ind_seq <- base::seq(base::ncol(mts_data_xts))
foreach::foreach(ind = ind_seq) %do% {
model_names_parall_proc <- model_names
ts_data_xts <- univariate_xts(mts_data_xts, ind)
ts_colname <- base::colnames(ts_data_xts)
for (model_name in model_names_parall_proc) {
base::eval(base::parse(text = base::paste("model_output$", ts_colname, "$",
model_name, " <- ",
"generate_fc_", model_name, "(",
"ts_data = ts_data_xts, ",
"xreg_data = xreg_data_xts, ",
"fc_horizon = fc_horizon, ",
"backtesting_opt = backtesting_opt, ",
"data_dir = data_dir, ",
"prepro_fct = prepro_fct, ",
"time_id = time_id, ",
"nb_threads = nb_threads, ",
model_name, "_arg = models_args$",
model_name, "_arg)",
sep = "")))
}
}
return(model_output)
}
#' ARIMA Model
#' @description Function to apply the \code{\link[forecast]{auto.arima}} function from the \code{forecast} package
#' on time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param xreg_data A univariate or multivariate 'ts', 'mts' or 'xts' object, optional external regressors
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param arima_arg A list, optional arguments to pass to the \code{\link[forecast]{auto.arima}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_arima(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_arima(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_arima(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_arima <- function(ts_data,
fc_horizon = 12,
xreg_data = NULL,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
arima_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
xreg_xts <- check_data_sv_as_xts(xreg_data)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "arima"
print_model_name(model_name)
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt) %>%
add_features(xreg_xts)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
bt_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]][, base::colnames(ts_data_xts)]
x_test <- sample_split[["test"]][, base::colnames(ts_data_xts)]
if (!is.null(xreg_xts)) {
xreg_names <-
ts_contiguous_data %>%
base::colnames(.)[!base::colnames(.) %in% base::colnames(ts_data_xts)]
xreg_train <- sample_split[["train"]][, xreg_names]
xreg_test <- sample_split[["test"]][, xreg_names]
arima_arg$xreg <- xreg_train
} else {
xreg_test <- NULL
}
if (valid_md_arima(x_train)) {
md <- base::do.call(forecast::auto.arima,
c(base::list(x_train),
arima_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md,
h = fc_horizon,
xreg = xreg_test)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
model_args = arima_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Exponential Smoothing Model
#' @description Function to apply the \code{\link[forecast]{ets}} function from the \code{forecast} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param ets_arg A list, optional arguments to pass to the \code{\link[forecast]{ets}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_ets(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_ets(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_ets(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_ets <- function(ts_data,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
ets_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
if (!base::is.list(ets_arg) & !base::is.null(ets_arg)) {
stop("Model arguments must be of type list!")
}
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "ets"
print_model_name(model_name)
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
bt_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]]
x_test <- sample_split[["test"]]
if (valid_md_ets(x_train)) {
md <- base::do.call(forecast::ets,
c(base::list(x_train),
ets_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md, h = fc_horizon)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
model_args = ets_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' TBATS Model
#' @description Function to apply the \code{\link[forecast]{tbats}} function from the \code{forecast} package on
#' time series data. The \code{\link[forecast]{tbats}} function requires 'ts' objects. Consequently, 'xts' objects will
#' be converted to 'ts' objects.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param tbats_arg A list, optional arguments to pass to the \code{\link[forecast]{tbats}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_tbats(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_tbats(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_tbats(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_tbats <- function(ts_data,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
tbats_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "tbats"
print_model_name(model_name)
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
backtesting_opt = backtesting_opt)
# TBATS requires 'ts' objects
x_train <-
sample_split[["train"]] %>%
{
dates <-
zoo::index(.) %>%
lubridate::as_date()
data <-
stats::ts(., start = c(lubridate::year(dates[1]),
lubridate::month(dates[1])),
frequency = stats::frequency(.))
data
}
x_test <-
sample_split[["test"]] %>%
{
dates <-
zoo::index(.) %>%
lubridate::as_date()
data <-
stats::ts(., start = c(lubridate::year(dates[1]),
lubridate::month(dates[1])),
frequency = stats::frequency(x_train))
data
}
if (valid_md_tbats(x_train)) {
md <- base::do.call(forecast::tbats, c(base::list(x_train), tbats_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md, h = fc_horizon)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
period_iter = period_iter,
time_id = time_id,
model_args = tbats_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Neural Network
#' @description Function to apply the \code{\link[forecast]{nnetar}} function from the \code{forecast} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param xreg_data A univariate or multivariate 'ts', 'mts' or 'xts' object, optional external regressors
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param nnetar_arg A list, optional arguments to pass to the \code{\link[forecast]{nnetar}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future periods
#' fc <- generate_fc_nnetar(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_nnetar(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_nnetar(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_nnetar <- function(ts_data,
fc_horizon = 12,
xreg_data = NULL,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
nnetar_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
xreg_xts <- check_data_sv_as_xts(xreg_data)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "nnetar"
print_model_name(model_name)
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt) %>%
add_features(xreg_xts)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]][, base::colnames(ts_data_xts)]
x_test <- sample_split[["test"]][, base::colnames(ts_data_xts)]
if (!base::is.null(xreg_xts)) {
xreg_names <- base::colnames(ts_contiguous_data)[!base::colnames(ts_contiguous_data) %in%
base::colnames(ts_data_xts)]
xreg_train <- sample_split[["train"]][, xreg_names]
xreg_test <- sample_split[["test"]][, xreg_names]
nnetar_arg$reg <- xreg_train
} else {
xreg_test <- NULL
}
if (valid_md_nnetar(x_train)) {
md <- base::do.call(forecast::nnetar, c(base::list(x_train), nnetar_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md, h = fc_horizon, xreg = xreg_test)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
exclude_PI = TRUE,
model_args = nnetar_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Season-Trend Decomposition with Loess Model
#' @description Function to apply the \code{\link[stats]{stl}} function from the \code{stats} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param stl_arg A list, optional arguments to pass to the \code{\link[stats]{stl}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_stl(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_stl(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_stl(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_stl <- function(ts_data,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
stl_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "stl"
print_model_name(model_name)
if (!"s.window" %in% names(stl_arg)) {
stl_arg = base::list(s.window = "periodic")
}
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]]
x_test <- sample_split[["test"]]
if (valid_md_stl(x_train)) {
md <- base::do.call(stats::stl,
c(base::list(x_train),
stl_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md, h = fc_horizon)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
model_args = stl_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Seasonal Naive Model
#' @description Function to apply the \code{\link[forecast]{snaive}} function from the \code{forecast} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param snaive_arg A list, optional arguments to pass to the \code{\link[forecast]{snaive}} function
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_snaive(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_snaive(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_snaive(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_snaive <- function(ts_data,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
snaive_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
md <- fc <- NULL
model_name <- "snaive"
print_model_name(model_name)
ts_contiguous_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct) %>%
add_placeholders(fc_horizon,
backtesting_opt)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_contiguous_data,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]]
x_test <- sample_split[["test"]]
if (valid_md_snaive(x_train, fc_horizon)) {
md <- base::do.call(forecast::snaive,
c(base::list(x_train),
snaive_arg))
} else {
return(model_output)
}
fc <- forecast::forecast(md, h = fc_horizon)
results <- save_fc_forecast(forecast_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
period_iter = period_iter,
time_id = time_id,
model_args = snaive_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Bayesian Structural Time Series Model
#' @description Function to apply the \code{\link[bsts]{bsts}} function from the \code{bsts} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param bsts_arg A list, optional arguments to pass to the \code{\link[bsts]{bsts}} function
#' @param data_transf_method A string, the data transformation method to be passed to the function.
#' (available options: 'diff', 'log', 'sqrt')
#'
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_bsts(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_bsts(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_bsts(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_bsts <- function(ts_data,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
data_transf_method = "diff",
bsts_arg = NULL,
time_id = base::Sys.time(),
...){
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
ss <- base::list()
md <- fc <- NULL
model_name <- "bsts"
print_model_name(model_name)
if (stats::frequency(ts_data_xts) <= 1) {
message(base::paste("For 'bsts': as the data frequency is lower or equal to 1, the value of the ",
"'seasonal' argument must be set to FALSE and the value of the 'linear_trend' ",
"argument must be set to TRUE.",
sep = ""))
bsts_arg$seasonal <- FALSE
bsts_arg$linear_trend <- TRUE
}
if (base::is.null(bsts_arg)) {
bsts_arg <- base::list(linear_trend = TRUE,
seasonal = TRUE,
niter = 1000,
ping = 0,
family = "gaussian",
seed = 1234)
} else {
if ("linear_trend" %in% base::names(bsts_arg)) {
if (!bsts_arg$linear_trend %in% c(TRUE, FALSE)) {
message("The value of the 'linear_trend' argument of the bsts model is invalid, using default (TRUE)")
bsts_arg$linear_trend <- TRUE
}
} else {
message("The 'linear_trend' was not defined, using TRUE as default")
bsts_arg$linear_trend <- TRUE
}
if ("seasonal" %in% base::names(bsts_arg)) {
if (!bsts_arg$seasonal %in% c(TRUE, FALSE)) {
message("The value of the 'seasonal' argument of the bsts model is invalid, using TRUE as default")
bsts_arg$seasonal <- TRUE
}
} else {
message("The 'seasonal' argument was not defined, using TRUE as default")
bsts_arg$seasonal <- TRUE
}
if ("niter" %in% base::names(bsts_arg)) {
if (!base::is.numeric(bsts_arg$niter)) {
message("The value of the 'niter' argument of the bsts model is invalid, setting the argument to 1000")
bsts_arg$niter <- 1000
} else if (bsts_arg$niter%%1 != 0) {
message("The value of the 'niter' argument of the bsts model is not integer, setting the argument to 1000")
bsts_arg$niter <- 1000
}
} else {
message("The 'niter' argument was not defined, setting the argument to 1000")
bsts_arg$niter <- 1000
}
if ("ping" %in% base::names(bsts_arg)) {
if (!base::is.numeric(bsts_arg$ping)) {
message("The value of the 'ping' argument of the bsts model is invalid, setting the argument to 100")
bsts_arg$ping <- 100
} else if (bsts_arg$ping%%1 != 0) {
message("The value of the 'ping' argument of the bsts model is not integer, setting the argument to 100")
bsts_arg$ping <- 1000
}
} else {
message("The 'ping' argument was not defined, setting the argument to 100")
bsts_arg$ping <- 100
}
if ("seed" %in% base::names(bsts_arg)) {
if (!base::is.numeric(bsts_arg$seed)) {
message("The value of the 'seed' argument of the bsts model is invalid, setting the argument to 1234")
bsts_arg$seed <- 1234
} else if (bsts_arg$seed%%1 != 0) {
message("The value of the 'seed' argument of the bsts model is not integer, setting the argument to 1234")
bsts_arg$seed <- 1234
}
} else {
message("The 'seed' argument was not defined, setting the argument to 1234")
bsts_arg$seed <- 1234
}
if ("family" %in% base::names(bsts_arg)) {
if (!bsts_arg$family %in% c("gaussian", "logit",
"poisson", "student")) {
message("The value of the 'family' argument of the bsts model is invalid, using 'gaussian' as default")
bsts_arg$family <- "gaussian"
}
} else {
message("The value of the 'family' argument is missing, using 'gaussian' as default")
bsts_arg$family <- "gaussian"
}
}
if (bsts_arg$linear_trend) {
ss <- bsts::AddLocalLinearTrend(ss, ts_data_xts)
}
if (bsts_arg$seasonal) {
ss <- bsts::AddSeasonal(ss, ts_data_xts, nseasons = stats::frequency(ts_data_xts))
}
ts_preprocessed_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct)
nb_diffs <- forecast::ndiffs(ts_preprocessed_data)
if (nb_diffs > 0) {
ts_transformed_data <-
ts_preprocessed_data %>%
transform_data(., transf_method = data_transf_method) %>%
timeSeries::na.contiguous() %>%
add_placeholders(fc_horizon,
backtesting_opt)
} else {
ts_transformed_data <-
ts_preprocessed_data %>%
add_placeholders(fc_horizon,
backtesting_opt)
}
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_transformed_data,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
backtesting_opt = backtesting_opt)
x_train <- sample_split[["train"]]
x_test <- sample_split[["test"]]
if (valid_md_bsts(x_train)) {
md <- bsts::bsts(x_train,
state.specification = ss,
niter = bsts_arg$niter,
ping = bsts_arg$ping,
seed = bsts_arg$seed,
family = bsts_arg$family)
} else {
return(model_output)
}
fc <- stats::predict(md, horizon = fc_horizon,
quantiles = c(0.025, 0.975))
fc$mean <- fc$mean %>% xts::as.xts(order.by = zoo::index(x_test))
fc$median <- fc$median %>% xts::as.xts(order.by = zoo::index(x_test))
fc$interval <- t(fc$interval) %>% xts::as.xts(order.by = zoo::index(x_test))
if (nb_diffs > 0) {
fc$mean <-
fc$mean %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE)
fc$median <-
fc$median %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE)
fc$interval[,1] <-
fc$interval[, 1] %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE)
fc$interval[, 2] <-
fc$interval[, 2] %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE)
}
results <- save_fc_bsts(bsts_obj = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
period_iter = period_iter,
time_id = time_id,
model_args = bsts_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Long-Short Term Memory Network
#' @description Function to apply lstm networks (\code{\link[keras]{layer_lstm}}) from the \code{keras} package on
#' time series data. In order to use the LSTM model, Python and Tensorflow (version <= 1.14) must be installed on the machine.
#' Please check the README file for more information.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param xreg_data A univariate or multivariate 'ts', 'mts' or 'xts' object, optional external regressors
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param lstm_keras_arg A list, optional arguments to pass to the lstm network
#' @param data_transf_method A string, the data transformation method to be passed to the function.
#' (available options: 'diff', 'log', 'sqrt')
#'
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_lstm_keras(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_lstm_keras(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_lstm_keras(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_lstm_keras <- function(ts_data,
fc_horizon = 12,
xreg_data = NULL,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
data_transf_method = "diff",
lstm_keras_arg = NULL,
time_id = base::Sys.time(),
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
xreg_xts <- check_data_sv_as_xts(xreg_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
model_output <- ini_model_output()
model_name <- "lstm_keras"
print_model_name(model_name)
model_name <- check_model_names(model_name)
if (base::length(model_name) == 0) {
stop("LSTM Model not found! Please check model requirements in README file.")
}
all_time_features <-
timetk::tk_get_timeseries_signature(ts_data_xts %>%
zoo::index() %>%
lubridate::as_date()) %>%
base::colnames()
if (base::is.null(lstm_keras_arg)) {
lstm_keras_arg = base::list(valid_set_size = stats::frequency(ts_data_xts),
stateful = FALSE,
nb_stacked_layers = 0,
lag_setting = stats::frequency(ts_data_xts) ,
loss = "mean_absolute_error",
lr = 0.001,
momentum = 0.1,
dropout = 0.3,
recurrent_dropout = 0.2,
nb_units = 50,
nb_epochs = 70,
nb_timesteps = stats::frequency(ts_data_xts),
batch_size = 1,
optimizer_type = "adam",
patience = 20,
verbose = TRUE,
seed = 1234,
time_features = c("month", "year"))
} else {
if ("valid_set_size" %in% base::names(lstm_keras_arg)) {
if (!base::is.numeric(lstm_keras_arg$valid_set_size)) {
message("The 'valid_set_size' argument is invalid, setting to default: frequency of the data")
lstm_keras_arg$valid_set_size <- stats::frequency(ts_data_xts)
}
} else {
message("The 'valid_set_size' argument is missing, setting to default: frequency of the data")
lstm_keras_arg$valid_set_size <- stats::frequency(ts_data_xts)
}
if ("stateful" %in% base::names(lstm_keras_arg)) {
if (!base::is.logical(lstm_keras_arg$stateful)) {
message("The value of the 'stateful' argument is invalid, using FALSE as default.")
lstm_keras_arg$stateful <- FALSE
}
} else {
message("The value of the 'stateful' argument is missing, using FALSE as default.")
lstm_keras_arg$stateful <- FALSE
}
if ("nb_stacked_layers" %in% base::names(lstm_keras_arg)) {
if (!base::is.numeric(lstm_keras_arg$nb_stacked_layers)) {
message("The value of the number of stacked layers is invalid, using 0 as default")
lstm_keras_arg$nb_stacked_layers <- 0
}
} else {
message("The value of the number of stacked layers is missing, using 0 as default")
lstm_keras_arg$nb_stacked_layers <- 0
}
if (!"lag_setting" %in% base::names(lstm_keras_arg)) {
message("The number of lags is missing, using the frequency of the data as default")
lstm_keras_arg$lag_setting <- stats::frequency(ts_data_xts)
}
if (!"loss" %in% base::names(lstm_keras_arg)) {
message("The value of the 'loss' argument is missing, using 'mean_absolute_error' as default")
lstm_keras_arg$loss <- "mean_absolute_error"
}
if (!"lr" %in% base::names(lstm_keras_arg)) {
message("The value of the learning rate is missing, using 0.001 as default")
lstm_keras_arg$lr <- 0.001
}
if (!"momentum" %in% base::names(lstm_keras_arg)) {
message("The value of the momentum is missing, using 0.05 as default")
lstm_keras_arg$momentum <- 0.1
}
if (!"dropout" %in% base::names(lstm_keras_arg)) {
message("The value of the dropout rate is missing, using 0.2 as default")
lstm_keras_arg$dropout <- 0.3
}
if (!"recurrent_dropout" %in% base::names(lstm_keras_arg)) {
message("The value of the recurrent dropout rate is missing, using 0.2 as default")
lstm_keras_arg$recurrent_dropout <- 0.2
}
if (!"nb_units" %in% base::names(lstm_keras_arg)) {
message("The value of the number of units in the LSTM cell is missing, using 100 as default")
lstm_keras_arg$nb_units <- 50
}
if (!"nb_epochs" %in% base::names(lstm_keras_arg)) {
message("The value of the number of epochs is missing, using 50 as default")
lstm_keras_arg$nb_epochs <- 70
}
if (!"nb_timesteps" %in% base::names(lstm_keras_arg)) {
message("The value of the number of time steps is missing, using the frequency of the data as default")
lstm_keras_arg$nb_timesteps <- stats::frequency(ts_data_xts)
}
if (!"batch_size" %in% base::names(lstm_keras_arg)) {
message("The value of the batch size is missing, using 1 as default")
lstm_keras_arg$batch_size <- 1
}
if (!"optimizer_type" %in% base::names(lstm_keras_arg)) {
message("The type of optimizer to apply is missing, using 'adam' as default")
lstm_keras_arg$optimizer_type <- "adam"
}
if (!"patience" %in% base::names(lstm_keras_arg)) {
message("The value of the loss function is missing, using 10 as default")
lstm_keras_arg$patience <- 20
}
if (!"verbose" %in% base::names(lstm_keras_arg)) {
message("The value of verbose is missing, using TRUE as default")
lstm_keras_arg$verbose <- TRUE
}
if ("time_features" %in% base::names(lstm_keras_arg)) {
if (sum(!lstm_keras_arg$time_features %in% all_time_features) > 0) {
message(base::paste("The value of time features to select is invalid, setting to default: ",
"c('month', 'year'). All available options are: c('",
base::paste(all_time_features, collapse = "', '"), "')",
sep = ""))
lstm_keras_arg$time_features <- c("month", "year")
}
} else {
message(base::paste("The value of time features to select is missing, setting to default: ",
"c('month', 'year'). All available options are: c('",
base::paste(all_time_features, collapse = "', '"), "')",
sep = ""))
lstm_keras_arg$time_features <- c("month", "year")
}
if ("seed" %in% base::names(lstm_keras_arg)) {
if (!base::is.numeric(lstm_keras_arg$seed)) {
message("The value of seed is invalid, using 1234 as default")
lstm_keras_arg$seed <- 1234
}
} else {
message("The value of seed is missing, using 1234 as default")
lstm_keras_arg$seed <- 1234
}
}
if (is.null(lstm_keras_arg$seed)) {
base::set.seed(lstm_keras_arg$seed)
seed <- base::as.integer(stats::runif(1, min = 1, max = 9999))
} else {
seed <- lstm_keras_arg$seed
}
keras::use_session_with_seed(seed,
disable_gpu = TRUE,
disable_parallel_cpu = TRUE)
callbacks <- base::list(keras::callback_early_stopping(patience = lstm_keras_arg$patience))
ts_preprocessed_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct)
nb_diffs <- forecast::ndiffs(ts_preprocessed_data)
if (nb_diffs > 0) {
ts_transformed_data <-
ts_preprocessed_data %>%
transform_data(., transf_method = data_transf_method)
} else {
ts_transformed_data <-
ts_preprocessed_data
}
ts_features <-
ts_transformed_data %>%
timeSeries::na.contiguous() %>%
add_placeholders(fc_horizon,
backtesting_opt) %>%
add_features(xreg_xts)
ts_name <- base::colnames(ts_data_xts)
ts_freq <- stats::frequency(ts_data_xts)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_features,
fc_horizon = fc_horizon,
nb_iter = bt_iter,
valid_set_size = lstm_keras_arg$valid_set_size,
backtesting_opt = backtesting_opt)
if (!valid_md_lstm_keras(sample_split[["train"]],
lstm_keras_arg)) {
return(model_output)
}
ts_train <-
sample_split[["train"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["train"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Training") %>%
timetk::tk_augment_timeseries_signature()
ts_valid <-
sample_split[["valid"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["valid"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Validation") %>%
timetk::tk_augment_timeseries_signature()
ts_test <-
sample_split[["test"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["test"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Test") %>%
timetk::tk_augment_timeseries_signature()
ts_data <-
dplyr::bind_rows(ts_train, ts_valid, ts_test) %>%
dplyr::select(base::list(colnames(ts_features),
lstm_keras_arg$time_features,
"key") %>%
base::unlist())
normalization_step <- normalize_data(ts_data)
normalized_data <- normalization_step$data
mean_history <- normalization_step$scale_arg_ls[[base::colnames(ts_data_xts)]][, 'mean_history']
scale_history <- normalization_step[["scale_arg_ls"]][[base::colnames(ts_data_xts)]][, 'scale_history']
data_with_tsteps <- add_timesteps(data_df = normalized_data,
fc_horizon = fc_horizon,
valid_set_size = lstm_keras_arg$valid_set_size,
tsteps = lstm_keras_arg$nb_timesteps,
lag_setting = lstm_keras_arg$lag_setting,
backtesting_opt = backtesting_opt)
nb_features <-
base::colnames(data_with_tsteps)[!base::colnames(data_with_tsteps) %in% c("key", "y")] %>%
base::length() / lstm_keras_arg$nb_timesteps
y_train_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Training") %>%
dplyr::select(y) %>%
base::as.matrix() %>%
reshape_Y()
y_valid_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Validation") %>%
dplyr::select(y) %>%
base::as.matrix() %>%
reshape_Y()
y_test_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Test") %>%
dplyr::select(y) %>%
base::as.matrix() %>%
reshape_Y()
x_train_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Training") %>%
dplyr::select(-y, -key) %>%
base::as.matrix() %>%
reshape_X(.,
tsteps = lstm_keras_arg$nb_timesteps,
nb_features = nb_features)
x_valid_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Validation") %>%
dplyr::select(-y, -key) %>%
base::as.matrix() %>%
reshape_X(.,
tsteps = lstm_keras_arg$nb_timesteps,
nb_features = nb_features)
x_test_tensor <-
data_with_tsteps %>%
dplyr::filter(key == "Test") %>%
dplyr::select(-y, -key) %>%
base::as.matrix() %>%
reshape_X(.,
tsteps = lstm_keras_arg$nb_timesteps,
nb_features = nb_features)
model <- keras::keras_model_sequential()
if (lstm_keras_arg$nb_stacked_layers > 0) {
model %>%
keras::layer_lstm(
units = lstm_keras_arg$nb_units,
batch_input_shape = c(lstm_keras_arg$batch_size,
lstm_keras_arg$nb_timesteps,
nb_features),
dropout = lstm_keras_arg$dropout,
recurrent_dropout = lstm_keras_arg$recurrent_dropout,
return_sequences = TRUE,
stateful=lstm_keras_arg$stateful)
i <- lstm_keras_arg$nb_stacked_layers
while (i > 1) {
model %>%
keras::layer_lstm(
units = lstm_keras_arg$nb_units,
dropout = lstm_keras_arg$dropout,
recurrent_dropout = lstm_keras_arg$recurrent_dropout,
return_sequences = TRUE,
stateful = lstm_keras_arg$stateful)
i <- i - 1
}
model %>%
keras::layer_lstm(
units = lstm_keras_arg$nb_units,
dropout = lstm_keras_arg$dropout,
recurrent_dropout = lstm_keras_arg$recurrent_dropout,
return_sequences = FALSE,
stateful = lstm_keras_arg$stateful) %>%
keras::layer_dense(units = 1)
} else {
model %>%
keras::layer_lstm(
units = lstm_keras_arg$nb_units,
batch_input_shape = c(lstm_keras_arg$batch_size,
lstm_keras_arg$nb_timesteps,
nb_features),
dropout = lstm_keras_arg$dropout,
recurrent_dropout = lstm_keras_arg$recurrent_dropout,
return_sequences = FALSE,
stateful=lstm_keras_arg$stateful) %>%
keras::layer_dense(units = 1)
}
model %>%
keras::compile(
loss = lstm_keras_arg$loss,
optimizer = lstm_keras_arg$optimizer_type,
metrics = list(lstm_keras_arg$loss))
if (length(x_valid_tensor) == 0) {
lstm_validation_data <- NULL
} else {
lstm_validation_data <- list(x_valid_tensor, y_valid_tensor)
}
if (!lstm_keras_arg$stateful) {
model_fit <-
model %>%
keras::fit(x = x_train_tensor,
y = y_train_tensor,
callbacks = callbacks,
validation_data = lstm_validation_data,
batch_size = lstm_keras_arg$batch_size,
epochs = lstm_keras_arg$nb_epochs,
shuffle = FALSE,
verbose = lstm_keras_arg$verbose)
}else{
for (epoch_iter in 1:lstm_keras_arg$nb_epochs) {
model_fit <-
model %>% keras::fit(x = x_train_tensor,
y = y_train_tensor,
validation_data = lstm_validation_data,
callbacks = callbacks,
batch_size = lstm_keras_arg$batch_size,
epochs = 1,
shuffle = FALSE,
verbose = lstm_keras_arg$verbose)
model %>% keras::reset_states()
base::cat("Epoch: ", epoch_iter)
}
}
new_x_test_data <- x_test_tensor
pred_list <- NULL
for (i in 1:fc_horizon) {
# generate a one-step ahead forecast
prediction <- model %>%
stats::predict(new_x_test_data[i, ,] %>%
array(data = .,
dim = c(1,
lstm_keras_arg$nb_timesteps,
nb_features)),
batch_size = lstm_keras_arg$batch_size) %>%
.[, 1]
pred_list <- c(pred_list, prediction)
if (i < fc_horizon) {
first_tstep <- base::max((lstm_keras_arg$nb_timesteps - i + 1), 0)
new_x_test_data[i + 1,
first_tstep:lstm_keras_arg$nb_timesteps,
1] <-
utils::tail(pred_list, lstm_keras_arg$nb_timesteps)
}
}
fc_transformed <-
pred_list %>%
base::matrix(., nrow = fc_horizon, ncol = 1) %>%
{
. * scale_history + mean_history
} %>%
xts::as.xts(order.by = zoo::index(sample_split[["test"]]))
if (nb_diffs > 0) {
fc <-
fc_transformed %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE) %>%
as.data.frame()
} else {
fc <-
fc_transformed %>%
as.data.frame()
}
results <- save_fc_ml(data_df = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir = data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
model_args = lstm_keras_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
return(model_output)
}
#' Automated Machine Learning
#' @description Function to apply the \code{\link[h2o]{h2o.automl}} function from the \code{h2o} package on
#' time series data.
#' @param ts_data A univariate 'ts' or 'xts' object
#' @param fc_horizon An integer, the forecasting horizon (i.e. the number of periods to forecast)
#' @param xreg_data A univariate or multivariate 'ts', 'mts' or 'xts' object, optional external regressors
#' @param backtesting_opt A list, options which define the backtesting approach:
#'
#' use_bt - A boolean, to determine whether forecasts should be generated on future dates (default) or on past values. Generating
#' forecasts on past dates allows to measure past forecast accuracy and to monitor a statistical model's ability to learn
#' signals from the data.
#'
#' nb_iters - An integer, to determine the number of forecasting operations to apply (When no backtesting is selected, then only
#' one forecasting exercise is performed)
#'
#' method - A string, to determine whether to apply a 'rolling' (default) or a 'moving' forecasting window. When 'rolling' is selected,
#' after each forecasting exercise, the forecasting interval increments by one period and drops the last period to include it in
#' the new training sample. When 'moving' is selected, the forecasting interval increments by its size rather than one period.
#'
#' sample_size - A string, to determine whether the training set size should be 'expanding' (default) or 'fixed'.
#' When 'expanding' is selected, then after each forecasting operation, the periods dropped from the forecasting interval will
#' be added to the training set. When 'fixed' is selected, then adding new periods to the training set will require dropping as
#' many last periods to keep the set's size constant.
#'
#' @param data_dir A string, directory to which results can be saved as text files
#' @param prepro_fct A function, a preprocessing function which handles missing values in the data.
#' The default preprocessing function selects the largest interval of non-missing values and then attributes the
#' most recent dates to those values. Other data handling functions can be applied (e.g. timeSeries::na.contiguous,
#' imputeTS::na.mean, custom-developed...).
#'
#' @param automl_h2o_arg A list, optional arguments to pass to the \code{\link[h2o]{h2o.automl}} function
#' @param data_transf_method A string, the data transformation method to be passed to the function.
#' (available options: 'diff', 'log', 'sqrt')
#'
#' @param time_id A POSIXct, timestamp created with \code{\link[base]{Sys.time}} which is then appended to the results
#' @param nb_threads A numeric, number of threads to use in parallel computed model selection process
#' @param ... Additional arguments to be passed to the function
#' @examples
#' \dontrun{
#' library(datasets)
#'
#' # Generate forecasts on future dates
#' fc <- generate_fc_automl_h2o(AirPassengers,
#' fc_horizon = 12)
#'
#' # Generate forecasts on past dates to analyze performance
#' fc <- generate_fc_automl_h2o(AirPassengers,
#' fc_horizon = 12,
#' backtesting_opt = list(use_bt = TRUE))
#'
#' # Generate forecasts on past dates with multiple iterations and a rolling window
#' fc <- generate_fc_automl_h2o(AirPassengers,
#' fc_horizon = 6,
#' backtesting_opt = list(use_bt = TRUE,
#' nb_iters = 6))
#' }
#' @return A 'tsForecastR' object
#' @export
generate_fc_automl_h2o <- function(ts_data,
xreg_data = NULL,
fc_horizon = 12,
backtesting_opt = NULL,
data_dir = NULL,
prepro_fct = NULL,
data_transf_method = "diff",
automl_h2o_arg = NULL,
time_id = base::Sys.time(),
nb_threads = 1,
...) {
`%>%` <- magrittr::`%>%`
ts_data_xts <- check_data_sv_as_xts(ts_data)
xreg_xts <- check_data_sv_as_xts(xreg_data)
fc_horizon <- check_fc_horizon(fc_horizon)
backtesting_opt <- check_backtesting_opt(backtesting_opt)
data_dir <- check_data_dir(data_dir)
prepro_fct <- check_preprocess_fct(prepro_fct)
time_id <- check_time_id(time_id)
nb_threads <- check_nb_cores(nb_threads)
model_output <- ini_model_output()
model_name <- "automl_h2o"
print_model_name(model_name)
h2o::h2o.init(port = 54321, nthreads = nb_threads)
all_time_features <-
timetk::tk_get_timeseries_signature(ts_data_xts %>%
zoo::index() %>%
lubridate::as_date()) %>%
colnames()
if (base::is.null(automl_h2o_arg)) {
automl_h2o_arg <-
base::list(max_models = 5,
max_runtime_secs = 3600,
max_runtime_secs_per_model = 30,
stopping_metric = "MAE",
seed = 1234,
algos_to_exclude = "StackedEnsemble",
time_features = all_time_features,
valid_set_size = stats::frequency(ts_data_xts),
test_set_size = stats::frequency(ts_data_xts))
} else {
if (!base::is.list(automl_h2o_arg)) {
message(base::paste("The model arguments must be passed as a list! ",
" Setting to defaults: ",
"list(max_models = 5, max_runtime_secs = 3600, ",
"max_runtime_secs_per_model = 30, ",
"stopping_metric = 'MAE', seed = 1234, ",
"algos_to_exclude = NULL, ",
"valid_set_size = frequency(ts_data), ",
"test_set_size = frequency(ts_data))",
sep = ""))
automl_h2o_arg <-
base::list(max_models = 5,
max_runtime_secs = 3600,
max_runtime_secs_per_model = 30,
stopping_metric = "MAE",
seed = 1234,
algos_to_exclude = "StackedEnsemble",
time_features = all_time_features,
valid_set_size = stats::frequency(ts_data_xts),
test_set_size = stats::frequency(ts_data_xts))
}
if (!base::is.null(automl_h2o_arg$max_models)) {
if (!base::is.numeric(automl_h2o_arg$max_models)) {
message("The value of the 'max_models' argument is invalid. Setting to default: 10")
automl_h2o_arg$max_models <- 5
}
} else {
message("The value of the 'max_models' argument is missing. Setting to default: 10")
automl_h2o_arg$max_models <- 5
}
if ("max_runtime_secs" %in% base::names(automl_h2o_arg)) {
if (!base::is.numeric(automl_h2o_arg$max_runtime_secs)) {
message("The value of the 'max_runtime_secs' argument is invalid. Setting to default: 3600")
automl_h2o_arg$max_runtime_secs <- 3600
}
} else {
message("The value of the 'max_runtime_secs' argument is missing. Setting to default: 3600")
automl_h2o_arg$max_runtime_secs <- 3600
}
if ("max_runtime_secs" %in% base::names(automl_h2o_arg)) {
if (!is.numeric(automl_h2o_arg$max_runtime_secs_per_model)) {
message(base::paste("The value of the 'max_runtime_secs_per_model' argument is invalid. ",
"Setting to default: 0 (to disable)",
sep = ""))
automl_h2o_arg$max_runtime_secs_per_model <- 30
}
} else {
message(base::paste("The value of the 'max_runtime_secs_per_model' argument is missing. ",
"Setting to default: 0 (to disable)",
sep = ""))
automl_h2o_arg$max_runtime_secs_per_model <- 30
}
if ("stopping_metric" %in% base::names(automl_h2o_arg)) {
if (!automl_h2o_arg$stopping_metric %in% c("AUTO", "deviance", "logloss",
"MSE", "RMSE", "MAE", "RMSLE")) {
message("The value of the 'stopping_metric' argument is invalid. Setting to default: 'MAE'")
automl_h2o_arg$stopping_metric <- 'MAE'
}
} else {
message("The value of the 'stopping_metric' argument is missing. Setting to default: 'MAE'")
automl_h2o_arg$stopping_metric <- "MAE"
}
if (!base::is.null(automl_h2o_arg$seed)) {
if (!base::is.numeric(automl_h2o_arg$seed)) {
message("The value of the 'seed' argument is invalid. Setting to default: 1234")
automl_h2o_arg$seed <- 1234
}
}
if (!base::is.null(automl_h2o_arg$algos_to_exclude)) {
if (!base::is.character(automl_h2o_arg$algos_to_exclude)) {
message("The value of the 'exclude_algos' argument is invalid. Setting to default: 'StackedEnsemble'")
automl_h2o_arg$algos_to_exclude <- "StackedEnsemble"
}
} else {
message("The value of the 'exclude_algos' argument is missing Setting to default: 'StackedEnsemble'")
automl_h2o_arg$algos_to_exclude <- "StackedEnsemble"
}
if ("time_features" %in% base::names(automl_h2o_arg)) {
if (base::sum(!automl_h2o_arg$time_features %in% all_time_features) > 0) {
message(base::paste("These values of the 'time_features' argument are invalid: ",
automl_h2o_arg$time_features %>%
.[!. %in% all_time_features],
". ",
"Setting to default: ",
"c('",
base::paste(all_time_features, collapse = "', '"),
"')",
sep = ""))
automl_h2o_arg$time_features <- all_time_features
}
} else {
message(base::paste("The value of the 'time_features' argument is missing. ",
"Setting to default: c('",
paste(all_time_features, collapse = "', '"),
"')",
sep = ""))
automl_h2o_arg$time_features <- all_time_features
}
if ("valid_set_size" %in% base::names(automl_h2o_arg)) {
if (!base::is.numeric(automl_h2o_arg$valid_set_size)) {
message(base::paste("The value of the 'valid_set_size' argument is invalid. ",
"Setting to default: frequency(ts_data)",
sep = ""))
automl_h2o_arg$valid_set_size <- stats::frequency(ts_data_xts)
}
} else {
automl_h2o_arg$valid_set_size <- stats::frequency(ts_data_xts)
}
if ("test_set_size" %in% base::names(automl_h2o_arg)) {
if (!base::is.numeric(automl_h2o_arg$test_set_size)) {
message(base::paste("The value of the 'test_set_size' argument is invalid. ",
"Setting to default: frequency(ts_data)",
sep = ""))
automl_h2o_arg$test_set_size <- stats::frequency(ts_data_xts)
}
} else {
automl_h2o_arg$test_set_size <- stats::frequency(ts_data_xts)
}
}
ts_preprocessed_data <-
preprocess_custom_fct(ts_data_xts,
prepro_fct)
nb_diffs <- forecast::ndiffs(ts_preprocessed_data)
if (nb_diffs > 0) {
ts_transformed_data <-
ts_preprocessed_data %>%
transform_data(., transf_method = data_transf_method)
} else {
ts_transformed_data <-
ts_preprocessed_data
}
ts_features <-
ts_transformed_data %>%
timeSeries::na.contiguous() %>%
add_placeholders(fc_horizon,
backtesting_opt) %>%
add_features(xreg_xts)
for (bt_iter in 1:backtesting_opt$nb_iters) {
period_iter <- base::paste("period_", bt_iter, sep = "")
sample_split <- split_train_test_set(ts_features,
fc_horizon,
bt_iter,
automl_h2o_arg$valid_set_size,
automl_h2o_arg$test_set_size,
backtesting_opt)
if (!valid_md_autml_h2o(sample_split[["train"]],
automl_h2o_arg)) {
return(model_output)
}
ts_train <-
sample_split[["train"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["train"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Training") %>%
timetk::tk_augment_timeseries_signature()
ts_valid <-
sample_split[["valid"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["valid"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Validation") %>%
timetk::tk_augment_timeseries_signature()
ts_tmp_test <-
sample_split[["tmp_test"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["tmp_test"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Tmp_Test") %>%
timetk::tk_augment_timeseries_signature()
ts_test <-
sample_split[["test"]] %>%
tibble::as_tibble() %>%
dplyr::mutate("index" = zoo::index(sample_split[["test"]]) %>% lubridate::as_date()) %>%
dplyr::mutate("key" = "Test") %>%
timetk::tk_augment_timeseries_signature()
ts_data <-
dplyr::bind_rows(ts_train, ts_valid, ts_tmp_test, ts_test) %>%
dplyr::select(base::list(colnames(ts_features),
automl_h2o_arg$time_features,
"key") %>%
base::unlist())
ts_cleaned <- ts_data[!colnames(ts_data) %in% c("diff", "index")]
train_h2o <-
ts_cleaned %>%
dplyr::filter(key == "Training") %>%
dplyr::mutate_if(base::is.ordered, ~base::as.character(.)
%>% base::as.factor()) %>%
h2o::as.h2o()
valid_h2o <-
ts_cleaned %>%
dplyr::filter(key == "Validation") %>%
dplyr::mutate_if(base::is.ordered, ~base::as.character(.)
%>% base::as.factor()) %>%
h2o::as.h2o()
tmp_test_h2o <-
ts_cleaned %>%
dplyr::filter(key == "Tmp_Test") %>%
dplyr::mutate_if(base::is.ordered, ~base::as.character(.)
%>% base::as.factor()) %>%
h2o::as.h2o()
test_h2o <-
ts_cleaned %>%
dplyr::filter(key == "Test") %>%
dplyr::mutate_if(base::is.ordered, ~base::as.character(.)
%>% base::as.factor()) %>%
h2o::as.h2o()
y <- base::colnames(ts_data_xts)
x <- dplyr::setdiff(base::names(train_h2o), y)
automl_models_h2o <- h2o::h2o.automl(x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = tmp_test_h2o,
max_models = automl_h2o_arg$max_models,
max_runtime_secs = automl_h2o_arg$max_runtime_secs,
max_runtime_secs_per_model = automl_h2o_arg$max_runtime_secs_per_model,
stopping_metric = automl_h2o_arg$stopping_metric,
seed = automl_h2o_arg$seed,
exclude_algos = automl_h2o_arg$algos_to_exclude)
h2o_model <- automl_models_h2o@leader
pred_h2o <-
h2o::h2o.predict(h2o_model, test_h2o) %>%
base::as.data.frame() %>%
xts::as.xts(order.by = zoo::index(sample_split[["test"]]))
if (nb_diffs > 0) {
fc <-
pred_h2o %>%
transform_data(ts_preprocessed_data, .,
transf_method = data_transf_method,
apply_transform = FALSE) %>%
base::as.data.frame()
} else {
fc <-
pred_h2o %>%
base::as.data.frame()
}
results <- save_fc_ml(data_df = fc,
sample_split = sample_split,
raw_data = ts_data_xts,
data_dir= data_dir,
model_name = model_name,
time_id = time_id,
period_iter = period_iter,
model_args = automl_h2o_arg)
base::eval(base::parse(text = base::paste("model_output$",
period_iter,
"$fc <- results",
sep = "")))
}
h2o::h2o.removeAll()
return(model_output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.