knitr::opts_chunk$set(fig.width = 7.15, fig.height = 4, warning = FALSE, message = FALSE)
knitr::opts_knit$set(fig.width = 7.15, fig.height = 4, warning = FALSE, message = FALSE)

Purpose

The purpose of this vignette is to provide an overview of direct multi-step-ahead forecasting with multiple time series in forecastML. The benefits to modeling multiple time series in one go with a single model or ensemble of models include (a) modeling simplicity, (b) potentially more robust results from pooling data across time series, and (c) solving the cold-start problem when few data points are available for a given time series.

Setup

To forecast with multiple/grouped/hierarchical time series in forecastML, your data need the following characteristics:

Example - Direct Forecasting

To illustrate forecasting with multiple time series, we'll use the data_buoy dataset that comes with the package. This dataset consists of daily sensor measurements of several environmental conditions collected by 14 buoys in Lake Michigan from 2012 through 2018. The data were obtained from NOAA's National Buoy Data Center available at https://www.ndbc.noaa.gov/ using the rnoaa package.

Load Packages and Data

data_buoy_gaps consists of:

library(forecastML)
library(dplyr)
library(DT)
library(ggplot2)
library(xgboost)

data("data_buoy_gaps", package = "forecastML")

DT::datatable(head(data_buoy_gaps), options = list(scrollX = TRUE))

forecastML::fill_gaps

data <- forecastML::fill_gaps(data_buoy_gaps, date_col = 1, frequency = '1 day', 
                              groups = 'buoy_id', static_features = c('lat', 'lon'))

print(list(paste0("The original dataset with gaps in data collection is ", nrow(data_buoy_gaps), " rows."), 
      paste0("The modified dataset with no gaps in data collection from fill_gaps() is ", nrow(data), " rows.")))

Dynamic Features

data$day <- lubridate::mday(data$date)
data$year <- lubridate::year(data$date)

Plot Wind Speed Outcome

p <- ggplot(data, aes(x = date, y = wind_spd, color = ordered(buoy_id), group = year))
p <- p + geom_line()
p <- p + facet_wrap(~ ordered(buoy_id), scales = "fixed")
p <- p + theme_bw() + theme(
  legend.position = "none"
) + xlab(NULL)
p

Model Training with Nested CV

data$buoy_id <- as.numeric(factor(data$buoy_id))

Training dataset - forecastML::create_lagged_df

outcome_col <- 1  # The column position of our 'wind_spd' outcome (after removing the 'date' column).

horizons <- c(1, 7, 30)  # Forecast 1, 1:7, and 1:30 days into the future.

lookback <- c(1:30, 360:370)  # Features from 1 to 30 days in the past and annually.

dates <- data$date  # Grouped time series forecasting requires dates.
data$date <- NULL  # Dates, however, don't need to be in the input data.

frequency <- "1 day"  # A string that works in base::seq(..., by = "frequency").

dynamic_features <- c("day", "year")  # Features that change through time but which will not be lagged.

groups <- "buoy_id"  # 1 forecast for each group or buoy.

static_features <- c("lat", "lon")  # Features that do not change through time.
type <- "train"  # Create a model-training dataset.

data_train <- forecastML::create_lagged_df(data, type = type, outcome_col = outcome_col,
                                           horizons = horizons, lookback = lookback,
                                           dates = dates, frequency = frequency,
                                           dynamic_features = dynamic_features,
                                           groups = groups, static_features = static_features, 
                                           use_future = FALSE)

DT::datatable(head(data_train$horizon_1), options = list(scrollX = TRUE))


p <- plot(data_train)  # plot.lagged_df() returns a ggplot object.
p <- p + geom_tile(NULL)  # Remove the gray border for a cleaner plot.
p

CV setup - forecastML::create_windows

windows <- forecastML::create_windows(data_train, window_length = 365, skip = 730,
                                      include_partial_window = FALSE)

p <- plot(windows, data_train) + theme(legend.position = "none")
p


p <- plot(windows, data_train, group_filter = "buoy_id == 1") + 
  theme(legend.position = "none")
p

User-defined modeling function

Any data transformations, hyperparameter tuning, or inner loop cross-validation procedures should take place within this function, with the limitation that it ultimately needs to return() a model suitable for the user-defined predict() function; a list can be returned to capture meta-data such as pre-processing pipelines or hyperparameter results.

# The value of outcome_col can also be set in train_model() with train_model(outcome_col = 1).
model_function <- function(data, outcome_col = 1) {

  # xgboost cannot handle missing outcomes data.
  data <- data[!is.na(data[, outcome_col]), ]

  indices <- 1:nrow(data)

  set.seed(224)
  train_indices <- sample(1:nrow(data), ceiling(nrow(data) * .8), replace = FALSE)
  test_indices <- indices[!(indices %in% train_indices)]

  data_train <- xgboost::xgb.DMatrix(data = as.matrix(data[train_indices, 
                                                           -(outcome_col), drop = FALSE]),
                                     label = as.matrix(data[train_indices, 
                                                            outcome_col, drop = FALSE]))

  data_test <- xgboost::xgb.DMatrix(data = as.matrix(data[test_indices, 
                                                          -(outcome_col), drop = FALSE]),
                                    label = as.matrix(data[test_indices, 
                                                           outcome_col, drop = FALSE]))

  params <- list("objective" = "reg:linear")
  watchlist <- list(train = data_train, test = data_test)

  set.seed(224)
  model <- xgboost::xgb.train(data = data_train, params = params, 
                              max.depth = 8, nthread = 2, nrounds = 30,
                              metrics = "rmse", verbose = 0, 
                              early_stopping_rounds = 5, 
                              watchlist = watchlist)

  return(model)
}

Model training - forecastML::train_model

#future::plan(future::multiprocess)  # Multi-core or multi-session parallel training.

model_results_cv <- forecastML::train_model(lagged_df = data_train,
                                            windows = windows,
                                            model_name = "xgboost",
                                            model_function = model_function, 
                                            use_future = FALSE)
summary(model_results_cv$horizon_1$window_1$model)

Forecasting with Nested Models

User-defined prediction function

The following user-defined prediction function is needed for each model:

# If 'model' is passed as a named list, the prediction model would be accessed with model$model or model["model"].
prediction_function <- function(model, data_features) {
  x <- xgboost::xgb.DMatrix(data = as.matrix(data_features))
  data_pred <- data.frame("y_pred" = predict(model, x),
                          "y_pred_lower" = predict(model, x) - 2,  # Optional; in practice, forecast bounds are not hard coded.
                          "y_pred_upper" = predict(model, x) + 2)  # Optional; in practice, forecast bounds are not hard coded.
  return(data_pred)
}

Historical model fit

data_pred_cv <- predict(model_results_cv, prediction_function = list(prediction_function), data = data_train)
plot(data_pred_cv) + theme(legend.position = "none")


plot(data_pred_cv, facet = group ~ model, group_filter = "buoy_id %in% c(1, 2, 3)", windows = 1) 


plot(data_pred_cv, facet = group ~ horizon, group_filter = "buoy_id %in% c(1, 2, 3)", windows = 1) 


Historical prediction error - forecastML::return_error

data_error <- forecastML::return_error(data_pred_cv)

plot(data_error, type = "window", group_filter = "buoy_id %in% c(1, 2, 3)", metric = "mae")
plot(data_error, type = "horizon", group_filter = "buoy_id %in% c(1, 2, 3)", metric = "mae")
plot(data_error, type = "global", group_filter = "buoy_id %in% c(1, 2, 3)", metric = "mae")

Forecasting with multiple models from nested CV

type <- "forecast"  # Create a forecasting dataset for our predict() function.

data_forecast <- forecastML::create_lagged_df(data, type = type, outcome_col = outcome_col,
                                              horizons = horizons, lookback = lookback,
                                              dates = dates, frequency = frequency,
                                              dynamic_features = dynamic_features,
                                              groups = groups, static_features = static_features, 
                                              use_future = FALSE)

DT::datatable(head(data_forecast$horizon_1), options = list(scrollX = TRUE))

Dynamic features and forecasting

for (i in seq_along(data_forecast)) {
  data_forecast[[i]]$day <- lubridate::mday(data_forecast[[i]]$index)  # When dates are given, the 'index` is date-based.
  data_forecast[[i]]$year <- lubridate::year(data_forecast[[i]]$index)
}

Forecast

data_forecasts <- predict(model_results_cv, prediction_function = list(prediction_function), data = data_forecast)
plot(data_forecasts)
plot(data_forecasts, facet = group ~ ., group_filter = "buoy_id %in% 1:3")

Model Training with All Data

windows <- forecastML::create_windows(data_train, window_length = 0)

p <- plot(windows, data_train) + theme(legend.position = "none")
p
# Un-comment the code below and set 'use_future' to TRUE.
#future::plan(future::multiprocess)

model_results_no_cv <- forecastML::train_model(lagged_df = data_train, 
                                               windows = windows,
                                               model_name = "xgboost",
                                               model_function = model_function,
                                               use_future = FALSE)
data_forecasts <- predict(model_results_no_cv, prediction_function = list(prediction_function), data = data_forecast)

DT::datatable(head(data_forecasts), options = list(scrollX = TRUE))

Forecast Combination - forecastML::combine_forecasts

data_combined <- forecastML::combine_forecasts(data_forecasts)

# Plot a background dataset of actuals using the most recent data.
data_actual <- data[dates >= as.Date("2018-11-01"), ]
actual_indices <- dates[dates >= as.Date("2018-11-01")]

# Plot all final forecasts plus historical data.
plot(data_combined, data_actual = data_actual, actual_indices = actual_indices)


plot(data_combined, data_actual = data_actual, actual_indices = actual_indices, 
     facet = group ~ ., group_filter = "buoy_id %in% c(1, 11, 12)")


# Plot final forecasts for a single buoy plus historical data.
plot(data_combined, data_actual = data_actual, actual_indices = actual_indices,
     group_filter = "buoy_id == 10")



nredell/forecastML documentation built on June 14, 2020, 5:12 p.m.