ifwtrends: Evaluating Google Trends data for economic forecasts

Documented in forecast_m

###
# Takes a list of vintages r_list, generated with roll and a monthly target variable dat
# to evaluate a forecast.
# Google Data should be log-transformed
# currently it uses a RIDGE-Regression but other models such as OLS, LASSO or
# PCA are available (change the relevant lines of code)

#' Monthly forecast evaluation of vintages
#'
#' @description
#'
#' `r lifecycle::badge('experimental')`
#'
#' [forecast_m()] uses monthly vintages generated by [roll()]
#' to predict the value of another time series for the next month.
#' Currently, this function couldn't be tested yet.
#'
#' @param r_list A Google Trends time series in form of a tibble (or tsibble)
#' which is used to predict the next month in data.
#' @param data A tibble with some data you
#' want to predict for the next month
#' that is somewhat correlated with the data in r_list.
#' @param fd A logical value to indicate if you want to use the first differences
#' in r_list (true) or normal values (false).
#'
#' @section Methods:
#' Google Data should be log-transformed
#' currently it uses a RIDGE-Regression
#' but other models such as OLS, LASSO or
#' PCA are available.
#'
#' @return Returns a list with the the predicted data and the used model
#' results.
#'
#' @importFrom dplyr across
#' @importFrom dplyr first
#' @importFrom dplyr filter
#' @importFrom dplyr group_by
#' @importFrom dplyr last
#' @importFrom dplyr mutate
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr summarise
#' @importFrom lubridate floor_date
#' @importFrom magrittr %>%
#' @importFrom glmnet cv.glmnet
#' @importFrom glmnet glmnet
#' @importFrom stats predict
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom tidyselect everything
#' @export
forecast_m <- function(r_list, data, fd = T) {

  # get the number of lags from the suffix
  number_of_lags <- sub(".*_", "", colnames(r_list[[1]])[length(colnames(r_list[[1]]))])

  # Compute lags
  r_raw <- lapply(r_list, function(x) {
    y <- as_tibble(x)

    if (number_of_lags >= 1) y <- mutate(y, lag_1 = lag(lag_0, 1))
    if (number_of_lags >= 2) y <- mutate(y, lag_2 = lag(lag_0, 2))
    if (number_of_lags >= 3) y <- mutate(y, lag_3 = lag(lag_0, 3))
    if (number_of_lags == 4) y <- mutate(y, lag_4 = lag(lag_0, 4))
    y <- ungroup(y)

    return(y)
  })

  # Compute first differences if fd=T
  if (fd) {
    r_raw <- lapply(r_raw, function(x) {
      mutate(
        x,
        across(
          .cols = -c(1, 2), function(y) c(0, diff(y, 1))
        ),
        .keep = "all"
      )
    })
  }

  # Join with datastream data.
  # Remove months, where Google did some changes to the Google Trends data.
  # Remove NAs.
  r_raw <- lapply(r_raw, function(x) {
    left_join(x, data[1:nrow(x), ], by = "time") %>%
      select(time, id, data = value, everything()) %>%
      filter(time != as.Date("2011-01-01")) %>% # omit structural breaks
      filter(time != as.Date("2016-01-01")) %>%
      mutate(across(everything(), function(y) replace(y, y == -Inf, NA_real_))) %>%
      mutate(across(everything(), function(y) replace(y, y == Inf, NA_real_)))
  })


  # use PCA
  # r_factors <- lapply(r_raw, function(x){
  #   pc <- as_tibble(prcomp(x[-c(1,2)])$x)
  #   bind_cols(x[c(1,2)], pc[,1:min(20, length(r_raw[[1]])-2)]) %>% #number of PCs
  #     drop_na()
  # })
  # set r <- r_factors to use PCA-Model

  r <- r_raw

  # Impute NAs with 0
  r <- lapply(r, function(x) {
    x[is.na(x)] <- 0
    return(x)
  })


  # Function to estimate the model via RIDGE regression
  build_model <- function(series) {
    y <- as.matrix(series[1])
    x <- as.matrix(series[-c(1)])

    # Cross validation
    # cv <- cv.glmnet(x, y, alpha = 0)

    # Alternative arguments for glmnet():
    # OLS: lambda= 0
    # LASSO: alpha = 1
    model <- glmnet(x = x, y = y, alpha = 0, lambda = 0) # cv$lambda.min)

    return(model)
  }

  # Trends Data to forecast with previous estimated model
  covariats <- lapply(r, function(x) data.matrix(x[-(1:3)]))

  # estimate model, deselect everything besides variables (columns 1, 2)
  models <- lapply(data.matrix(r), function(x) build_model(x[-(1:2)]))

  # forecast
  pred_values <- mapply(predict, object = models, newx = covariats)

  # select last value in each vintage as forecast
  # for relevant quarter
  last_values <- sapply(pred_values, last)

  forec <- tibble(
    time = seq.Date(max(first(r)$time), max(last(r)$time), by = "month"),
    value = last_values
  ) %>%
    mutate(time = floor_date(time, "month")) %>%
    rename(index = value)

  last_model <- last(models)
  return(list(
    # returns forcasted values and
    forec = forec,
    # model estimated with contemporary data
    last_model = last_model
  ))
}