R/data_preprocessing.R
In ubair: Effects of External Conditions on Air Quality

Documented in detrend retrend_predictions scale_data

#' Removes trend from data
#'
#' Takes a list of train and application data as prepared by
#' [ubair::split_data_counterfactual()]
#' and removes a polynomial, exponential or cubic spline spline trend function.
#' Trend is obtained only from train data. Use as part of preprocessing before
#' training a model based on decision trees, i.e. random forest and lightgbm.
#' For the other methods it may be helpful but they are generally able to
#' deal with trends themselves. Therefore we recommend to try out different
#' versions and guide decisisions using the model evaluation metrics from
#' [ubair::calc_performance_metrics()].
#'
#' Apply [ubair::retrend_predictions()] to predictions to return to the
#' original data units.
#'
#' @param split_data List of two named dataframes called train and apply
#' @param mode String which defines type of trend is present. Options are
#' "linear", "quadratic", "exponential", "spline", "none".
#' "none" returns original data
#' @param num_splines Defines the number of cubic splines if `mode="spline"`.
#' Choose num_splines=1 for cubic polynomial trend. If `mode!="spline"`, this
#' parameter is ignored
#' @param log_transform If `TRUE`, use a log-transformation before detrending
#' to ensure positivity of all predictions in the rest of the pipeline.
#' A exp transformation is necessary during retrending to return to the solution
#' space. Use only in combination with `log_transform` parameter in
#' [ubair::retrend_predictions()]
#' @return List of 3 elements. 2 dataframes: detrended train, apply and the
#' trend function
#' @examples
#' data(mock_env_data)
#' split_data <- list(
#'   train = mock_env_data[1:80, ],
#'   apply = mock_env_data[81:100, ]
#' )
#' detrended_list <- detrend(split_data, mode = "linear")
#' detrended_train <- detrended_list$train
#' detrended_apply <- detrended_list$apply
#' trend <- detrended_list$model
#' @export
#' @importFrom stats lm predict spline
#' @import splines
detrend <- function(split_data,
                    mode = "linear",
                    num_splines = 5,
                    log_transform = FALSE) {
  dt_train_new <- data.table::copy(split_data$train)
  dt_apply_new <- data.table::copy(split_data$apply)
  stopifnot("log_transform needs to be boolean, i.e. either TRUE or FALSE" = class(log_transform) == "logical")
  if (log_transform) {
    dt_train_new$value <- log(dt_train_new$value)
    dt_apply_new$value <- log(dt_apply_new$value)
  }
  if (mode == "linear") {
    trend <- lm(value ~ date_unix, data = dt_train_new)
  } else if (mode == "quadratic") {
    trend <- lm(value ~ date_unix + I(date_unix^2), data = dt_train_new)
  } else if (mode == "exponential") {
    trend <- lm(value ~ log(date_unix), data = dt_train_new)
  } else if (mode == "spline") {
    stopifnot("Set num_splines larger or equal to 1" = num_splines >= 1)
    knots <- seq(
      from = min(dt_train_new$date_unix),
      to = max(dt_train_new$date_unix),
      length.out = num_splines + 1
    )
    knots <- knots[2:(length(knots) - 1)]
    if (length(knots) <= 2) { # If 2 or less knots, just fit cubic polynomial
      trend <- lm(value ~ date_unix + I(date_unix^2) + I(date_unix^3),
        data = dt_train_new
      )
    } else { # Else use splines
      trend <- lm(value ~ bs(date_unix, knots = knots), data = dt_train_new)
    }
  } else if (mode == "none") {
    trend <- lm(value ~ 1 - 1, data = dt_train_new)
  } else {
    stop("mode needs to be any of the following strings: 'linear',
         'quadratic', 'exponential', 'spline', 'none'")
  }
  trend_train_values <- predict(trend, newdata = dt_train_new)
  dt_train_new$value <- dt_train_new$value - trend_train_values
  trend_apply_values <- predict(trend, newdata = dt_apply_new)
  dt_apply_new$value <- dt_apply_new$value - trend_apply_values
  return(list(train = dt_train_new, apply = dt_apply_new, model = trend))
}

#' Restors the trend in the prediction
#'
#' Takes a dataframe of predictions as returned by any of
#' the 'run_model' functions and restores a trend which was previously
#' removed via [ubair::detrend()]. This is necessary for the predictions
#' and the true values to have the same units. The function is basically
#' the inverse function to [ubair::detrend()] and should only be used in
#' combination with it.
#'
#' @param dt_predictions Dataframe of predictions with columns `value`,
#' `prediction`, `prediction_lower`, `prediction_upper`
#' @param trend lm object generated by [ubair::detrend()]
#' @param log_transform Returns values to solution space, if they have been
#' log transformed during detrending. Use only in combination with `log_transform`
#' parameter in detrend function.
#' @return Retrended dataframe with same structure as `dt_predictions`
#' which is returned by any of the run_model() functions.
#' @examples
#' \donttest{
#' data(mock_env_data)
#' split_data <- list(
#'   train = mock_env_data[1:80, ],
#'   apply = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' detrended_list <- detrend(split_data,
#'   mode = "linear"
#' )
#' trend <- detrended_list$model
#' detrended_train <- detrended_list$train
#' detrended_apply <- detrended_list$apply
#' result <- run_lightgbm(
#'   train = detrended_train,
#'   test = detrended_apply,
#'   model_params = params$lightgbm,
#'   alpha = 0.9,
#'   calc_shaps = FALSE
#' )
#' retrended_predictions <- retrend_predictions(result$dt_predictions, trend)
#' }
#' @export
retrend_predictions <- function(dt_predictions, trend, log_transform = FALSE) {
  stopifnot("log_transform needs to be boolean, i.e. TRUE or FALSE" = class(log_transform) == "logical")
  stopifnot("trend object needs to be a linear model of class 'lm'" = class(trend) == "lm")
  stopifnot(
    "Not all of 'value', 'prediction', 'prediction_lower', 'prediction_upper' are columns in dt_predictions" =
      all(c("value", "prediction", "prediction_lower", "prediction_upper")
      %in% colnames(dt_predictions))
  )
  trend_value <- predict(trend, newdata = dt_predictions)
  if (log_transform) {
    dt_predictions[,
      c("value", "prediction", "prediction_lower", "prediction_upper") := lapply(.SD, \(x) exp(x + trend_value)),
      .SDcols = c("value", "prediction", "prediction_lower", "prediction_upper")
    ]
  } else {
    dt_predictions[,
      c("value", "prediction", "prediction_lower", "prediction_upper") := lapply(.SD, \(x) x + trend_value),
      .SDcols = c("value", "prediction", "prediction_lower", "prediction_upper")
    ]
  }

  dt_predictions
}


#' Standardize Training and Application Data
#'
#' This function standardizes numeric columns of the `train_data` and applies
#' the same scaling (mean and standard deviation) to the corresponding columns
#' in `apply_data`. It returns the standardized data along with the scaling
#' parameters (means and standard deviations). This is particularly important
#' for neural network approaches as they tend to be numerically unstable and
#' deteriorate otherwise.
#'
#' @param train_data A data frame containing the training dataset to be
#' standardized. It must contain numeric columns.
#' @param apply_data A data frame  containing the dataset to which the scaling
#' from `train_data` will be applied.
#'
#' @return A list containing the following elements:
#' \item{train}{The standardized training data.}
#' \item{apply}{The `apply_data` scaled using the means and standard deviations
#' from the `train_data`.}
#' \item{means}{The means of the numeric columns in `train_data`.}
#' \item{sds}{The standard deviations of the numeric columns in `train_data`.}
#' @export
#' @importFrom dplyr mutate across where
#' @examples
#' data(mock_env_data)
#' detrended_list <- list(
#'   train = mock_env_data[1:80, ],
#'   apply = mock_env_data[81:100, ]
#' )
#' scale_result <- scale_data(
#'   train_data = detrended_list$train,
#'   apply_data = detrended_list$apply
#' )
#' scaled_train <- scale_result$train
#' scaled_apply <- scale_result$apply
scale_data <- function(train_data,
                       apply_data) {
  means <- attr(
    scale(train_data %>% select(where(is.numeric))),
    "scaled:center"
  )
  sds <- attr(
    scale(train_data %>% select(where(is.numeric))),
    "scaled:scale"
  )

  train_data <- train_data %>%
    dplyr::mutate_if(is.numeric, ~ as.numeric(scale(.)))

  apply_data <- apply_data %>%
    mutate(across(
      names(sds),
      ~ as.numeric(scale(.,
        center = means[dplyr::cur_column()],
        scale = sds[dplyr::cur_column()]
      ))
    ))

  list(
    train = train_data,
    apply = apply_data,
    means = means,
    sds = sds
  )
}


#' Rescale predictions to original scale.
#'
#' This function rescales the predicted values (`prediction`, `prediction_lower`,
#' `prediction_upper`). The scaling is reversed using the means and
#' standard deviations that were saved from the training data. It is the inverse
#' function to [ubair::scale_data()] and should be used only in combination.
#'
#' @param scale_result A list object returned by [ubair::scale_data()],
#' containing the means and standard deviations used for scaling.
#' @param dt_predictions A data frame containing the predictions,
#' including columns `prediction`, `prediction_lower`, `prediction_upper`.
#'
#' @return A data frame with the predictions and numeric columns rescaled back
#' to their original scale.
#'
#' @export
#' @examples
#' \donttest{
#' data(mock_env_data)
#' scale_res <- scale_data(
#'   train_data = mock_env_data[1:80, ],
#'   apply_data = mock_env_data[81:100, ]
#' )
#' params <- load_params()
#' res <- run_lightgbm(
#'   train = scale_res$train, test = scale_res$apply,
#'   params$lightgbm, alpha = 0.9, calc_shaps = FALSE
#' )
#' dt_predictions <- res$dt_predictions
#' rescaled_predictions <- rescale_predictions(scale_res, dt_predictions)
#' }
rescale_predictions <- function(scale_result, dt_predictions) {
  means <- scale_result$means
  sds <- scale_result$sds
  rescaled_predictions <- dt_predictions %>%
    mutate(
      prediction = prediction * sds["value"] + means["value"],
      prediction_lower = prediction_lower * sds["value"] + means["value"],
      prediction_upper = prediction_upper * sds["value"] + means["value"]
    )
  return(rescaled_predictions)
}