R/modelling.R

Defines functions .cast_to_wide .extract_components split_data_counterfactual .wday_monday .add_date_variables prepare_data_for_modelling

Documented in prepare_data_for_modelling split_data_counterfactual

#' Prepare Data for Training a model
#'
#' Prepares environmental data by filtering for relevant components,
#' converting the data to a wide format, and adding temporal features. Should be
#' called before
#' \code{\link[ubair:split_data_counterfactual]{split_data_counterfactual()}}
#'
#' @param env_data A data table in long format.
#' Must include the following columns:
#' \describe{
#'   \item{Station}{Station identifier for the data.}
#'   \item{Komponente}{The environmental component being measured
#'        (e.g., temperature, NO2).}
#'   \item{Wert}{The measured value of the component.}
#'   \item{date}{Timestamp as `POSIXct` object in `YYYY-MM-DD HH:MM:SS` format.}
#'   \item{Komponente_txt}{A textual description of the component.}
#' }
#' @param params A list of modelling parameters loaded from `params.yaml`.
#' Must include:
#' \describe{
#'   \item{meteo_variables}{A vector of meteorological variable names.}
#'   \item{target}{The name of the target variable.}
#' }
#' @return A `data.table` in wide format, with columns:
#' `date`, one column per component, and temporal features
#' like `date_unix`, `day_julian`, `weekday`, and `hour`.
#' @examples
#' env_data <- data.table::data.table(
#'   Station = c("StationA", "StationA", "StationA"),
#'   Komponente = c("NO2", "TMP", "NO2"),
#'   Wert = c(50, 20, 40),
#'   date = as.POSIXct(c("2023-01-01 10:00:00", "2023-01-01 11:00:00", "2023-01-02 12:00:00"))
#' )
#' params <- list(meteo_variables = c("TMP"), target = "NO2")
#' prepared_data <- prepare_data_for_modelling(env_data, params)
#' print(prepared_data)
#'
#' @export
prepare_data_for_modelling <- function(env_data, params) {
  components <- c(params$meteo_variables, params$target)
  dt_filtered <- .extract_components(env_data, components)
  dt_wide <- .cast_to_wide(dt_filtered)

  if (!params$target %in% names(dt_wide)) {
    warning(sprintf("Target '%s' is not present in the data. Make sure it exists
                    and you have set the correct target name", params$target))
    stop("Exiting function due to missing target data.")
  }
  dt_prepared <- dt_wide %>%
    dplyr::rename(value = params$target) %>%
    .add_date_variables(replace = TRUE) %>%
    dplyr::filter(!is.na(value))
  dt_prepared
}

#' Turn date feature into temporal features date_unix, day_julian, weekday and
#' hour
#'
#' @param df Data.table with column date formatted as date-time object
#' @param replace Boolean which determines whether to replace existing temporal variables
#' @return A data.table with all relevant temporal features for modelling
#' @noRd
.add_date_variables <- function(df, replace) {
  names <- names(df)
  if (replace) {
    df$date_unix <- as.numeric(df$date)
    df$day_julian <- lubridate::yday(df$date)
    df$weekday <- .wday_monday(df$date, as.factor = TRUE)
    df$hour <- lubridate::hour(df$date)
  } else {
    if (!"date_unix" %in% names) {
      df$date_unix <- as.numeric(df$date)
    }
    if (!"day_julian" %in% names) {
      df$day_julian <- lubridate::yday(df$date)
    }
    if (!"weekday" %in% names) {
      df$weekday <- .wday_monday(df$date, as.factor = TRUE)
    }
    if (!"hour" %in% names) {
      df$hour <- lubridate::hour(df$date)
    }
  }
  return(df)
}

#' Reformat lubridate weekdays into weekdays with monday as day 1
#'
#' @param x Vector of date-time objects
#' @param as.factor Boolean that determines whether to return a factor or a numeric vector
#' @noRd
.wday_monday <- function(x, as.factor = FALSE) {
  x <- lubridate::wday(x)
  x <- x - 1
  x <- ifelse(x == 0, 7, x)
  if (as.factor) {
    x <- factor(x, levels = 1:7, ordered = TRUE)
  }
  return(x)
}


#' Split Data into Training and Application Datasets
#'
#' Splits prepared data into training and application datasets based on
#' specified date ranges for a business-as-usual scenario. Data before
#' `application_start` and after `application_end` is used as training data,
#' while data within the date range is used for application.
#'
#' @param dt_prepared The prepared data table.
#' @param application_start The start date(date object) for the application
#' period of the business-as-usual simulation. This coincides with the start of
#' the reference window.
#' Can be created by e.g. lubridate::ymd("20191201")
#' @param application_end The end date(date object)  for the application period
#' of the business-as-usual simulation. This coincides with the end of
#' the effect window.
#' Can be created by e.g. lubridate::ymd("20191201")
#' @return A list with two elements:
#' \describe{
#'   \item{train}{Data outside the application period.}
#'   \item{apply}{Data within the application period.}
#' }
#' @examples
#' dt_prepared <- data.table::data.table(
#'   date = as.Date(c("2023-01-01", "2023-01-05", "2023-01-10")),
#'   value = c(50, 60, 70)
#' )
#' result <- split_data_counterfactual(
#'   dt_prepared,
#'   application_start = as.Date("2023-01-03"),
#'   application_end = as.Date("2023-01-08")
#' )
#' print(result$train)
#' print(result$apply)
#' @export
split_data_counterfactual <- function(dt_prepared,
                                      application_start,
                                      application_end) {
  stopifnot(
    inherits(application_start, "Date"),
    inherits(application_end, "Date")
  )
  stopifnot(application_start <= application_end)
  dt_train <- dt_prepared[date < application_start | date > application_end]
  dt_apply <- dt_prepared[date >= application_start & date <= application_end]
  list(train = dt_train, apply = dt_apply)
}


#' Extract Components for Modelling
#' Stop with error message if any selected meteo variable/component is not
#' contained in the data.
#'
#' @param env_data Daily aggregated data table.
#' @param components Vector of component names to extract.
#' @return A data.table filtered by the specified components.
#' @noRd
.extract_components <- function(env_data, components) {
  if (!all(components %in% env_data$Komponente)) {
    missing_components <- components[!components %in% env_data$Komponente]
    stop(paste(
      "Data does not contain all selected variables:", missing_components,
      "\n Check data and meteo_variables/params.yaml."
    ))
  }
  env_data[Komponente %in% components, list(Komponente, Wert, date)]
}

#' @param dt_filtered Filtered data.table.
#' @return A wide-format data.table.
#' @noRd
#' @examples
#' dt_filtered <- data.table::data.table(
#'   date = as.POSIXct(c("2023-01-01", "2023-01-01", "2023-01-02")),
#'   Komponente = c("NO2", "TMP", "NO2"),
#'   Wert = c(50, 20, 40)
#' )
#' wide_data <- .cast_to_wide(dt_filtered)
#' print(wide_data)
.cast_to_wide <- function(dt_filtered) {
  data.table::dcast(dt_filtered,
    formula = date ~ Komponente,
    value.var = "Wert"
  )
}

Try the ubair package in your browser

Any scripts or data that you put into this service are public.

ubair documentation built on April 12, 2025, 2:12 a.m.