knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(mlr3temporal)
library(mlr3)
library(ggplot2)
library(forecast)
library(anytime)

Load and prepare data

In this vignette we use the example of forecasting atmospheric data from Munich (muc) and New York City (nyc). We are interested in forecasting three quantities: "TAVG", "TMIN" and "TMAX", the average, minimum and maximum temperature in a given year (the data are yearly summaries).

We first obtain the data from the NOAA (National Centers for Environmental Information, National Oceanic and Atmospheric Administration) website.

# Columns kept from each downloaded file
columns = c("DATE", "PRCP", "TAVG", "TMIN", "TMAX")

# Download the data for both stations and subset the columns in one step
muc = read.csv("https://www.ncei.noaa.gov/data/global-summary-of-the-year/access/GM000004199.csv")[, columns]
nyc = read.csv("https://www.ncei.noaa.gov/data/global-summary-of-the-year/access/USW00094728.csv")[, columns]

The data contain a few missing values and some missing years. We have to deal with them before forecasting.

# Add missing years.
#
# Drops rows in which every column is NA, appends one row (all other columns
# NA) for each year missing between the first and last observed year,
# converts the date column to POSIXct and returns the rows in chronological
# order.
#
# @param data A data.frame or data.table with one row per year.
# @param date Name of the column in `data` that holds the year (e.g. 2001).
# @return A data.table sorted by the `date` column (rows with NA date last).
fill_missing = function(data, date) {
  # Filter all-NA rows. Work on a plain data.frame here so the row filter is
  # evaluated with base semantics, then switch to a (fresh) data.table.
  data = as.data.frame(data)
  has_value = rowSums(!is.na(data)) > 0
  data = as.data.table(data[has_value, , drop = FALSE])

  # Add all missing dates from start to end of time-series.
  # Skipped when no year is observed: min()/max() would be infinite.
  dates = data[[date]]
  if (any(!is.na(dates))) {
    span = seq(from = min(dates, na.rm = TRUE), to = max(dates, na.rm = TRUE))
    missing = setdiff(span, dates)
    if (length(missing) > 0) {
      dt_miss = data.table(missing)
      setnames(dt_miss, date)
      # fill = TRUE pads every column other than `date` with NA, so the
      # function no longer depends on a fixed set of column names.
      data = rbind(data, dt_miss, fill = TRUE)
    }
  }

  # Anchor every year at January 1st. Parsing with format = "%Y" alone makes
  # strptime fill in the *current* month and day, so the result would change
  # with the day the script is run (and be NA on Feb 29 for non-leap years).
  # sprintf() is used because it returns character(0) for zero-length input.
  stamps = as.POSIXct(sprintf("%s-01-01", data[[date]]), format = "%Y-%m-%d")
  set(data, j = date, value = stamps)

  # Chronological order; rows with an NA date go last, as with order().
  setorderv(data, date, na.last = TRUE)
  data
}

# Complete both station series; the year is stored in the DATE column
muc = fill_missing(data = muc, date = "DATE")
nyc = fill_missing(data = nyc, date = "DATE")

Generate Task

We can now supply this data.table to create a new TaskRegrForecast.

# Build a forecasting task with id "nyc" on the NYC data. Three columns
# (TAVG, TMIN, TMAX) are passed as targets and DATE as the date column.
task = TaskRegrForecast$new(
  id = "nyc",
  backend = nyc,
  target = c("TAVG", "TMIN", "TMAX"),
  date_col = "DATE"
)
# Print the task.
task$print()

Learner

# Rows of the task used for fitting
var_train_rows = 1:140

# Fit the VAR forecast learner on those rows and show the fitted model
learner = LearnerRegrForecastVAR$new()
learner$train(task, row_ids = var_train_rows)
learner$model

Predict

# Rows of the task to predict with the trained learner
var_predict_rows = 141:150
p = learner$predict(task, row_ids = var_predict_rows)

# Predicted values
p$response

Rolling Window CV

# Set up the "RollingWindowCV" resampling and instantiate it on the task.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
# NOTE(review): fixed_window = FALSE presumably means the training window
# grows with each fold - confirm against the resampling's documentation.
rr = rsmp("RollingWindowCV", fixed_window = FALSE)
rr$instantiate(task)

# Run the resampling and keep the fitted models. The result is stored under
# its own name so that it does not shadow the `resample()` function.
rr_result = resample(task, learner, rr, store_models = TRUE)

# Predictions of the first two resampling iterations
rr_result$predictions()[1:2]

Plotting

# Plot the current task and add a title to the resulting plot.
autoplot(task) + ggtitle("NYC - Yearly Climate Data")

# Replace `task` with a single-target (TAVG) forecasting task on the Munich
# data, again using DATE as the date column.
# NOTE(review): the backend is wrapped in ts_dt() here, whereas the NYC task
# received its data.table directly - ts_dt() is not defined in this file;
# confirm what it does to `muc`.
task = TaskRegrForecast$new(
  id = "muc",
  backend = ts_dt(muc),
  target = "TAVG",
  date_col = "DATE"
)

# Plot the Munich task and add a title.
autoplot(task) + ggtitle("MUC - Yearly Climate Data")

# Row split: the first 85 rows are used for fitting, rows 86 to 136 for
# prediction
arima_train_rows = 1:85
arima_predict_rows = 86:136

# Fit the auto-ARIMA forecast learner and show the fitted model
learner = LearnerRegrForecastAutoArima$new()
learner$train(task, row_ids = arima_train_rows)
learner$model

# Predict the remaining rows and show the standard errors of the prediction
p = learner$predict(task, row_ids = arima_predict_rows)
p$se


# Residual diagnostics of the fitted model
checkresiduals(learner$model)

# Forecast with the PRCP values of rows 131 to 136 as external regressors
# and plot the result
prcp_xreg = as.matrix(task$data(cols = "PRCP", rows = 131:136))
model_forecast = forecast(learner$model, xreg = prcp_xreg)
autoplot(model_forecast) + ylab("value")


mlr-org/mlr3forecasting documentation built on June 29, 2023, 11:57 p.m.