R/counterfactual_model.R

#' Full counterfactual simulation run
#'
#' Chains detrending, training of the selected model, prediction and retrending
#' for ease of use. See the documentation of the individual functions for details.
#' @param split_data List of two named dataframes called train and apply
#' @param params A list of parameters that define the following:
#' \describe{
#'   \item{meteo_variables}{A character vector specifying the names of the
#'   meteorological variables used as inputs.}
#'   \item{model}{A list of hyperparameters for training the chosen model. Name of this list
#'   and its parameters depend on the chosen models. See [ubair::run_dynamic_regression()],
#'   [ubair::run_lightgbm()], [ubair::run_rf()] and [ubair::run_fnn()] functions for details}
#'   }
#' @param detrending_function String which defines the type of trend to remove.
#' Options are "linear", "quadratic", "exponential", "spline", "none". See
#' [ubair::detrend()] and [ubair::retrend_predictions()] for details.
#' @param model_type String to decide which model to use. Current options are random
#' forest ("rf"), gradient boosted decision trees ("lightgbm"), dynamic regression
#' ("dynamic_regression") and feedforward neural network ("fnn").
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param log_transform If TRUE, uses log transformation during detrending and
#' retrending. For details see [ubair::detrend()] documentation
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with the retrended predictions (`prediction`), the trained
#' model (`model`) and SHAP importance values (`importance`; `NULL` unless
#' `calc_shaps = TRUE`)
#' @examples
#' data(mock_env_data)
#' split_data <- list(
#'   train = mock_env_data[1:80, ],
#'   apply = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' res <- run_counterfactual(split_data, params, detrending_function = "linear")
#' prediction <- res$prediction
#' random_forest_model <- res$model
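#' # The same pipeline with another model type, e.g. gradient boosting
#' # (a sketch; assumes load_params() provides the default lightgbm
#' # hyperparameters, as in the run_lightgbm example):
#' \donttest{
#' res_gbm <- run_counterfactual(split_data, params,
#'   detrending_function = "linear",
#'   model_type = "lightgbm"
#' )
#' }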
#' @export
run_counterfactual <- function(split_data,
                               params,
                               detrending_function = "none",
                               model_type = "rf",
                               alpha = 0.9,
                               log_transform = FALSE,
                               calc_shaps = FALSE) {
  variables <- c("day_julian", "weekday", "hour", params$meteo_variables)
  detrended_list <- detrend(
    split_data,
    mode = detrending_function,
    log_transform = log_transform
  )
  trend <- detrended_list$model
  detrended_train <- detrended_list$train
  detrended_apply <- detrended_list$apply

  if (model_type == "rf") {
    detrended_train <- detrended_train %>% select(value, !!variables)
    res <- run_rf(
      train = detrended_train,
      test = detrended_apply,
      model_params = params$random_forest,
      alpha = alpha,
      calc_shaps = calc_shaps
    )
  } else if (model_type == "lightgbm") {
    detrended_train <- detrended_train %>%
      select(value, dplyr::any_of(variables))
    res <- run_lightgbm(
      train = detrended_train,
      test = detrended_apply,
      model_params = params$lightgbm,
      alpha = alpha,
      calc_shaps = calc_shaps
    )
  } else if (model_type == "dynamic_regression") {
    res <- run_dynamic_regression(
      train = detrended_train,
      test = detrended_apply,
      params = params,
      alpha = alpha,
      calc_shaps = calc_shaps
    )
  } else if (model_type == "fnn") {
    res <- suppressMessages(run_fnn(
      train = detrended_train,
      test = detrended_apply,
      params = params,
      calc_shaps = calc_shaps
    ))
  } else {
    stop("Wrong model_type. Select one of 'rf', 'lightgbm', 'dynamic_regression', 'fnn'.")
  }
  retrended_predictions <- retrend_predictions(res$dt_predictions,
    trend,
    log_transform = log_transform
  )
  list(prediction = retrended_predictions, model = res$model, importance = res$importance)
}

#' Run the dynamic regression model
#'
#' This function trains a dynamic regression model with Fourier-transformed temporal features
#' and meteorological variables as external regressors on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. This is referred to as a dynamic regression model in
#' [Forecasting: Principles and Practice, Chapter 10 - Dynamic regression models](https://otexts.com/fpp3/dynamic.html).
#'
#' Note: Use this function directly only with your own data pipeline;
#' otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param params List of hyperparameters for the dynamic regression call. Only
#' `dynamic_regression$ntrain` is used: the number of most recent data points to
#' train on. Default is 8760, i.e. one year of hourly data.
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with data frame of predictions (`dt_predictions`), the fitted
#' model (`model`) and SHAP importance values (`importance`; `NULL` unless
#' `calc_shaps = TRUE`)
#' @importFrom forecast BoxCox.lambda auto.arima forecast
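#' @examples
#' # A minimal sketch on the package's mock data, mirroring the other examples;
#' # it assumes mock_env_data provides date, value, the temporal features and
#' # the meteorological variables configured in load_params().
#' \donttest{
#' data(mock_env_data)
#' res <- run_dynamic_regression(
#'   train = mock_env_data[1:80, ],
#'   test = mock_env_data[81:100, ],
#'   params = load_params(),
#'   alpha = 0.9,
#'   calc_shaps = FALSE
#' )
#' prediction <- res$dt_predictions
#' }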
#' @export
run_dynamic_regression <- function(train,
                                   test,
                                   params,
                                   alpha,
                                   calc_shaps) {
  train <- train %>% dplyr::filter(date < test$date[1])
  # 24 * 365 = 8760 (1 year of training data)
  ntrain <- ifelse(is.null(params$dynamic_regression$ntrain), 8760, params$dynamic_regression$ntrain)
  train <- utils::tail(train, ntrain)
  message(paste(
    "Using data for dynamic regression training from", min(train$date),
    "to", max(train$date), "- an overly long training series can lead to",
    "worse performance. Adjust this via the dynamic_regression$ntrain",
    "hyperparameter."
  ))
  train_transformed <- .transform_input(train, params)
  test_transformed <- .transform_input(test, params)
  scale_result <- scale_data(
    train_data = train_transformed,
    apply_data = test_transformed
  )
  xreg <- scale_result$train %>%
    dplyr::select(-value) %>%
    as.matrix()
  xreg_pred <- scale_result$apply %>%
    dplyr::select(-value) %>%
    as.matrix()
  y <- scale_result$train$value %>% stats::ts()
  model <- forecast::auto.arima(y,
    d = 0, xreg = xreg,
    seasonal = FALSE,
    trace = FALSE,
    allowdrift = TRUE,
    allowmean = TRUE,
    lambda = NULL,
    biasadj = TRUE
  )
  pred <- forecast::forecast(
    object = model,
    xreg = xreg_pred,
    level = alpha,
    lambda = NULL,
    biasadj = TRUE
  )
  test$prediction <- pred$mean %>% as.numeric()
  test$prediction_lower <- pred$lower[, 1] %>% as.numeric()
  test$prediction_upper <- pred$upper[, 1] %>% as.numeric()
  dt_predictions <- rescale_predictions(
    scale_result = scale_result,
    dt_predictions = test
  )
  if (calc_shaps) {
    rlang::check_installed(c("fastshap", "shapviz"),
      reason = "calculate shap values for dynamic regression"
    )
    shap_vals <- fastshap::explain(
      model,
      X = xreg,
      nsim = 100,
      newdata = xreg_pred,
      pred_wrapper = function(object, newdata) {
        fc_prediction <- forecast::forecast(
          object = object,
          xreg = newdata,
          lambda = NULL,
          biasadj = TRUE
        )
        return(fc_prediction$mean %>% as.numeric())
      },
      shap_only = FALSE
    )
    shp <- shapviz::shapviz(shap_vals)
  } else {
    shp <- NULL
  }
  list(dt_predictions = dt_predictions, model = model, importance = shp)
}

#' Run random forest model with ranger
#'
#' This function trains a random forest model (ranger) on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and temporal features.
#'
#' Note: Use this function directly only with your own data pipeline;
#' otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param model_params list of hyperparameters to use in ranger call. See \code{\link[ranger:ranger]{ranger:ranger()}} for options.
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with data frame of predictions (`dt_predictions`), the trained
#' model (`model`) and SHAP importance values (`importance`; `NULL` unless
#' `calc_shaps = TRUE`)
#' @import ranger
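#' @examples
#' # A minimal sketch on the package's mock data; it assumes mock_env_data is a
#' # data.table containing value, the temporal features and the meteorological
#' # variables configured in load_params().
#' data(mock_env_data)
#' params <- load_params()
#' variables <- c("day_julian", "weekday", "hour", params$meteo_variables)
#' train <- dplyr::select(mock_env_data[1:80, ], value, dplyr::any_of(variables))
#' res <- run_rf(
#'   train = train,
#'   test = mock_env_data[81:100, ],
#'   model_params = params$random_forest,
#'   alpha = 0.9,
#'   calc_shaps = FALSE
#' )
#' prediction <- res$dt_predictions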
#' @export
run_rf <- function(train, test, model_params, alpha, calc_shaps) {
  function_parameters <- list(
    y = train$value,
    x = train %>% select(-value),
    importance = "none",
    splitrule = "variance",
    keep.inbag = TRUE,
    quantreg = TRUE
  )
  function_parameters <- append(function_parameters, model_params)
  model <- do.call(ranger::ranger, args = function_parameters)
  quantiles <- data.table::as.data.table(predict(model,
    test,
    type = "quantiles",
    quantiles = c((1 - alpha) / 2, (1 + alpha) / 2)
  )$predictions)
  if (calc_shaps) {
    rlang::check_installed(c("treeshap", "shapviz"),
      reason = "calculate shap values for random forest"
    )
    shap_subset <- test[sample(nrow(test), min(nrow(test), 1000)), ] %>% select(-value)
    unified <- treeshap::ranger.unify(model, data.matrix(train %>% select(-value)))
    treeshap_vals <- treeshap::treeshap(unified, data.matrix(shap_subset), verbose = FALSE)
    shp <- shapviz::shapviz(treeshap_vals, X_pred = data.matrix(shap_subset), X = shap_subset)
  } else {
    shp <- NULL
  }
  test[, "prediction" := predict(model, test)$predictions]
  test[, c("prediction_lower", "prediction_upper") := quantiles]
  list(dt_predictions = test, model = model, importance = shp)
}

#' Run gradient boosting model with lightgbm
#'
#' This function trains a gradient boosting model (lightgbm) on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and temporal features.
#'
#' Note: Use this function directly only with your own data pipeline;
#' otherwise use [ubair::run_counterfactual()] to call this function.
#' @param train Dataframe of train data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param test Dataframe of test data as returned by the [ubair::split_data_counterfactual()]
#' function.
#' @param model_params list of hyperparameters to use in lgb.train call.
#' See \code{\link[lightgbm:lgb.train]{lightgbm:lgb.train()}} params argument for details.
#' @param alpha Confidence level of the prediction interval between 0 and 1.
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return List with data frame of predictions (`dt_predictions`), the trained
#' model (`model`) and SHAP importance values (`importance`; `NULL` unless
#' `calc_shaps = TRUE`)
#' @import lightgbm
#' @examples
#' \donttest{
#' data(mock_env_data)
#' params <- load_params()
#' res <- run_lightgbm(
#'   train = mock_env_data[1:80, ],
#'   test = mock_env_data[81:100, ],
#'   model_params = params$lightgbm,
#'   alpha = 0.9,
#'   calc_shaps = FALSE
#' )
#' prediction <- res$dt_predictions
#' model <- res$model
#' }
#' @export
run_lightgbm <- function(train, test, model_params, alpha, calc_shaps) {
  dtrain <- lgb.Dataset(
    data = data.matrix(train %>% select(-value)),
    label = train$value
  )
  model_mean <- lgb.train(
    params = model_params[names(model_params) != "nrounds"],
    data = dtrain,
    nrounds = model_params$nrounds
  )
  params_lower <- append(
    model_params[names(model_params) != "nrounds"],
    list(
      "objective" = "quantile",
      "alpha" = (1 - alpha) / 2,
      "verbosity" = 0
    )
  )
  model_lower <- lgb.train(
    params = params_lower,
    data = dtrain,
    nrounds = model_params$nrounds
  )
  params_upper <- append(
    model_params[names(model_params) != "nrounds"],
    list(
      "objective" = "quantile",
      "alpha" = (1 + alpha) / 2,
      "verbosity" = 0
    )
  )
  model_upper <- lgb.train(
    params = params_upper,
    data = dtrain,
    nrounds = model_params$nrounds
  )
  dapply <- test %>% select(!!colnames(train))
  dapply <- data.matrix(dapply %>% select(-value))
  dt_predictions <- test
  dt_predictions$prediction <- predict(model_mean, dapply)
  dt_predictions$prediction_lower <- predict(model_lower, dapply)
  dt_predictions$prediction_upper <- predict(model_upper, dapply)
  if (calc_shaps) {
    rlang::check_installed(c("shapviz"),
      reason = "calculate shap values for lightgbm"
    )
    shp <- shapviz::shapviz(model_mean, X_pred = dapply, X = dapply)
  } else {
    shp <- NULL
  }
  list(dt_predictions = dt_predictions, model = model_mean, importance = shp)
}

#' Train a Feedforward Neural Network (FNN) in a Counterfactual Scenario.
#'
#' Trains a feedforward neural network (FNN) model on the
#' specified training dataset and makes predictions on the test dataset in a
#' counterfactual scenario. The model uses meteorological variables and
#' sin/cosine-transformed features. Scales the data before training and rescales
#' predictions, as the model does not converge with unscaled data.
#'
#' This function provides flexibility for users with their own data pipelines
#' or workflows. For a simplified pipeline, consider using
#' \code{\link[ubair:run_counterfactual]{run_counterfactual()}}.
#'
#' Experiment with hyperparameters such as \code{learning_rate},
#' \code{batchsize}, \code{hidden_layers}, and \code{num_epochs} to improve
#' performance.
#'
#' Warning: Using many or large hidden layers in combination with a high number
#' of epochs can lead to long training times.
#'
#' @param train A data frame or tibble containing the training dataset,
#' including the target variable (`value`)
#' and meteorological variables specified in `params$meteo_variables`.
#' @param test A data frame or tibble containing the test dataset on which
#' predictions will be made,
#' using the same meteorological variables as in the training dataset.
#' @param params A list of parameters that define the following:
#' \describe{
#'   \item{meteo_variables}{A character vector specifying the names of the
#'   meteorological variables used as inputs.}
#'   \item{fnn}{A list of hyperparameters for training the feedforward neural
#'   network, including:
#'     \itemize{
#'       \item \code{activation_fun}: The activation function for the hidden
#'       layers (e.g., "sigmoid", "tanh").
#'       \item \code{momentum}: The momentum factor for training.
#'       \item \code{learningrate_scale}: Factor for adjusting learning rate.
#'       \item \code{output_fun}: The activation function for the output layer
#'       \item \code{batchsize}: The size of the batches during training.
#'       \item \code{hidden_dropout}: Dropout rate for the hidden layers to
#'       prevent overfitting.
#'       \item \code{visible_dropout}: Dropout rate for the input layer.
#'       \item \code{hidden_layers}: A vector specifying the number of neurons
#'       in each hidden layer.
#'       \item \code{num_epochs}: Number of epochs (iterations) for training.
#'       \item \code{learning_rate}: Initial learning rate.
#'     }
#'   }
#' }
#' @param calc_shaps Boolean value. If TRUE, calculate SHAP values for the
#' method used and format them so they can be visualised with
#' \code{\link[shapviz:sv_importance]{shapviz:sv_importance()}} and
#' \code{\link[shapviz:sv_dependence]{shapviz:sv_dependence()}}.
#' The SHAP values are generated for a subset (or all, depending on the size of the dataset) of the
#' test data.
#' @return A list with three elements:
#' \describe{
#'   \item{\code{dt_predictions}}{A data frame containing the test data along
#' with the predicted values:
#'     \describe{
#'       \item{\code{prediction}}{The predicted values from the FNN model.}
#'       \item{\code{prediction_lower}}{The same predicted values, as no
#'       quantile model is available yet for FNN.}
#'       \item{\code{prediction_upper}}{The same predicted values, as no
#'       quantile model is available yet for FNN.}
#'     }
#'   }
#'   \item{\code{model}}{The trained FNN model object from the
#'   \code{deepnet::nn.train()} function.}
#'   \item{\code{importance}}{SHAP importance values (if
#'   \code{calc_shaps = TRUE}). Otherwise, `NULL`.}
#' }
#' @examples
#' data(mock_env_data)
#' params <- load_params()
#' res <- run_fnn(
#'   train = mock_env_data[1:80, ],
#'   test = mock_env_data[81:100, ], params,
#'   calc_shaps = FALSE
#' )
#' @export
run_fnn <- function(train, test, params, calc_shaps) {
  rlang::check_installed(c("deepnet"), reason = "to run a fnn")

  train_transformed <- .transform_input(train, params)
  test_transformed <- .transform_input(test, params)

  scale_result <- scale_data(
    train_data = train_transformed,
    apply_data = test_transformed
  )

  train_matrix <- scale_result$train %>%
    dplyr::select(-value) %>%
    as.matrix()
  test_matrix <- scale_result$apply %>%
    dplyr::select(-value) %>%
    as.matrix()
  target_vector <- scale_result$train$value

  fnn_function_parameter <- append(
    list(x = train_matrix, y = target_vector),
    params$fnn
  )
  model <- do.call(deepnet::nn.train, args = fnn_function_parameter)

  prediction <- deepnet::nn.predict(model, test_matrix)
  # no quantile model yet for feedforward neural network
  test <- test %>%
    mutate(
      prediction = prediction,
      prediction_lower = prediction,
      prediction_upper = prediction
    )

  test <- rescale_predictions(
    scale_result = scale_result,
    dt_predictions = test
  )
  if (calc_shaps) {
    rlang::check_installed(c("fastshap", "shapviz"),
      reason = "calculate shap values for fnn"
    )
    shap_vals <- fastshap::explain(
      model,
      X = train_matrix,
      nsim = 50,
      newdata = test_matrix,
      pred_wrapper = function(object, newdata) {
        deepnet::nn.predict(
          nn = object,
          x = newdata
        ) %>% as.numeric()
      },
      shap_only = FALSE
    )
    shp <- shapviz::shapviz(shap_vals)
  } else {
    shp <- NULL
  }
  list(dt_predictions = test, model = model, importance = shp)
}

#' Make Fourier features out of hour, weekday and day of the year
#'
#' @return Numeric matrix of sin and cos transformed temporal features of dimension
#' n x 6.
#' @noRd
.make_fourier_features <- function(df) {
  df$weekday <- df$weekday %>% as.numeric()
  fourier_list <- list(
    sin((24 * (df$day_julian) + df$hour) / (24 * 365) * 2 * pi),
    cos((24 * (df$day_julian) + df$hour) / (24 * 365) * 2 * pi),
    sin((24 * (df$weekday - 1) + df$hour) / (24 * 7) * 2 * pi),
    cos((24 * (df$weekday - 1) + df$hour) / (24 * 7) * 2 * pi),
    sin(df$hour / 24 * 2 * pi),
    cos(df$hour / 24 * 2 * pi)
  )
  res_matrix <- matrix(unlist(fourier_list), ncol = 6, byrow = FALSE)
  colnames(res_matrix) <- c(
    "sin_year", "cos_year", "sin_week", "cos_week",
    "sin_hour", "cos_hour"
  )
  res_matrix
}
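
# Illustrative check (not run): hour 6 lies a quarter of the way through the
# day, so it maps to (sin_hour, cos_hour) = (1, 0):
# .make_fourier_features(data.frame(day_julian = 0, weekday = 1, hour = 6))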

#' Transform input data to cyclic features for FNN/dynamic regression
#'
#' @return Data frame combining the selected meteorological variables (wind
#' transformed to U and V components when WIG and WIR are present), the Fourier
#' features and the value column.
#' @noRd
.transform_input <- function(input_data, params) {
  input_meteo <- input_data %>% select(!!params$meteo_variables)
  if ("WIG" %in% params$meteo_variables && "WIR" %in% params$meteo_variables) {
    input_meteo <- .make_wind_vectors(input_meteo) %>% select(-WIG, -WIR)
  }
  fourier_features <- .make_fourier_features(input_data)
  transformed <- cbind(input_meteo, fourier_features, input_data[, "value"])
  transformed
}
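
# Illustrative sketch (not run): with the default parameters the result holds
# the meteorological columns (wind as WIND_U/WIND_V when WIG and WIR are
# configured), the six Fourier features and value, e.g.
# str(.transform_input(mock_env_data, load_params()))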

#' Creates wind vectors from direction and speed
#'
#' Takes a dataframe with columns WIG (wind speed) and
#' WIR (wind direction in degrees, 0 to 360) and creates wind vectors with U
#' and V components.
#' @return Data frame of the same format as the input with additional columns
#' "WIND_U" and "WIND_V".
#' @noRd
.make_wind_vectors <- function(dt_prepared) {
  dt_prepared <- dt_prepared %>% mutate(
    "WIND_U" = sin(pi * WIR / 180) * WIG,
    "WIND_V" = cos(pi * WIR / 180) * WIG
  )
  dt_prepared
}
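
# Illustrative check (not run): wind speed 10 from 90 degrees gives
# WIND_U = sin(pi / 2) * 10 = 10 and WIND_V = cos(pi / 2) * 10 ~ 0:
# .make_wind_vectors(data.frame(WIG = 10, WIR = 90))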
