R/data_loading.R
In ubair: Effects of External Conditions on Air Quality

Documented in copy_default_params load_params load_uba_data_from_dir

#' Load Parameters from YAML File
#'
#' Reads a YAML file containing model parameters, including station settings,
#' variables, and configurations for various models. If no file path is
#' provided, the function defaults to loading `params.yaml` from the package's
#' `extdata` directory.
#'
#' @param filepath Character. Path to the YAML file. If `NULL`, the function
#' will attempt to load the default `params.yaml` provided in the package.
#' @return A list containing the parameters loaded from the YAML file.
#' @details
#' The YAML file should define parameters in a structured format, such as:
#'
#' ```yaml
#' target: 'NO2'
#'
#' lightgbm:
#'   nrounds: 200
#'   eta: 0.03
#'   num_leaves: 32
#'
#' dynamic_regression:
#'   ntrain: 8760
#'
#' random_forest:
#'   num.trees: 300
#'   max.depth: 10
#'
#' meteo_variables:
#'   - GLO
#'   - TMP
#' ```
#' @examples
#' params <- load_params()
#' @export
#' @importFrom yaml read_yaml
load_params <- function(filepath = NULL) {
  if (is.null(filepath)) {
    filepath <- system.file("extdata", "params.yaml", package = "ubair")
  }
  if (file.exists(filepath)) {
    params <- yaml::read_yaml(filepath)
    return(params)
  } else {
    stop("YAML file not found at the specified path.")
  }
}

#' Copy Default Parameters File
#'
#' Copies the default `params.yaml` file, included with the package, to a
#' specified destination directory. This is useful for initializing parameter
#' files for custom edits.
#'
#' @param dest_dir Character. The path to the directory where the `params.yaml`
#' file will be copied.
#' @return Nothing is returned. A message is displayed upon successful copying.
#' @details
#' The `params.yaml` file contains default model parameters for various
#' configurations such as LightGBM, dynamic regression, and others. See the
#' \code{\link[ubair:load_params]{load_params()}}` documentation for an example of the file's structure.
#'
#' @examples
#' copy_default_params(tempdir())
#' @export
copy_default_params <- function(dest_dir) {
  if (!dir.exists(dest_dir)) {
    stop("Destination directory does not exist.")
  }
  file.copy(system.file("extdata", "params.yaml", package = "ubair"),
    file.path(dest_dir, "params.yaml"),
    overwrite = TRUE
  )
  message("Default params.yaml copied to ", normalizePath(dest_dir))
}


#' Load UBA Data from Directory
#'
#' This function loads data from CSV files in the specified directory. It supports two formats:
#'
#' 1. "inv": Files must contain the following columns:
#'    - `Station`, `Komponente`, `Datum`, `Uhrzeit`, `Wert`.
#' 2. "24Spalten": Files must contain:
#'    - `Station`, `Komponente`, `Datum`, and columns `Wert01`, ..., `Wert24`.
#'
#' File names should include "inv" or "24Spalten" to indicate their format. The function scans
#' recursively for `.csv` files in subdirectories and combines the data into a single `data.table`
#' in long format.
#' Files that are not in the exected format will be ignored.
#'
#' @param data_dir Character. Path to the directory containing `.csv` files.
#' @return A `data.table` containing the loaded data in long format. Returns an error if no valid
#' files are found or the resulting dataset is empty.
#' @export
#' @importFrom lubridate ymd_hm ymd_h
#' @importFrom tidyr gather
#' @importFrom dplyr mutate select  %>%
load_uba_data_from_dir <- function(data_dir) {
  if (!dir.exists(data_dir)) {
    stop(paste("Directory does not exist:", data_dir))
  }
  all_files <- list.files(data_dir,
    pattern = "\\.csv$",
    recursive = TRUE,
    full.names = TRUE
  )

  list_data_parts <- lapply(unique(dirname(all_files)), function(dir) {
    .load_data(dir)
  })

  combined_data <- data.table::rbindlist(list_data_parts, fill = TRUE)
  if (nrow(combined_data) == 0) {
    stop(paste(
      "The resulting data is empty after loading all files from",
      data_dir
    ))
  }

  return(combined_data)
}

#' Load data for a Specific Directory
#'
#' @param data_dir Character. Path to the directory containing `.csv` files.
#' @return A `data.table` with the loaded data for the directory. Returns an
#' empty `data.table` if no files match the expected format.
#' @noRd
.load_data <- function(data_dir) {
  data_files <- list.files(data_dir, pattern = "\\.csv$")
  list_data <- lapply(data_files, function(file) {
    .load_data_file(data_dir, file)
  })
  data.table::rbindlist(list_data, fill = TRUE)
}

#' Load data from a specific file
#'
#' Loads data from a file in "inv" or "24Spalten" format. Unsupported formats
#' return an empty `data.table`.
#'
#' @param data_dir Character. Base directory containing the file.
#' @param file Character. Name of the file to load.
#' @return `data.table` containing loaded data or empty data.table for
#' unsupported formats.
#' @noRd
.load_data_file <- function(data_dir, file) {
  file_path <- file.path(data_dir, file)
  if (grepl("inv", file)) {
    .load_inv_file(file_path, file)
  } else if (grepl("24Spalten", file)) {
    .load_spalten_file(file_path, file)
  } else {
    data.table::data.table()
  }
}

#' Load data from an 'inv' file
#'
#' @param file_path Full path to the 'inv' file.
#' @param file The filename being loaded.
#' @return A data.table with the loaded 'inv' data.
#' @noRd
.load_inv_file <- function(file_path, file) {
  data.table::fread(file_path,
    quote = "'",
    na.strings = c(
      "-999", "-888", "-777", "-666", "555", "-555", "-333",
      "-111", "555.00000000"
    )
  ) %>%
    mutate(
      date = ymd_hm(paste(Datum, Uhrzeit)),
      Komponente_txt = Komponente,
      Komponente = substring(sub("\\_.*", "", file), 7)
    )
}

#' Load data from a '24Spalten' file
#'
#' @param file_path Full path to the 'Spalten' file.
#' @param file The filename being loaded.
#' @return `data.table` containing the processed data from the '24Spalten' file.
#' @noRd
.load_spalten_file <- function(file_path, file) {
  data.table::fread(file_path,
    quote = "'",
    na.strings = c(
      "-999", "-888", "-777", "-666", "555", "-555", "-333",
      "-111", "555.00000000"
    )
  ) %>%
    tidyr::gather("time", "Wert", Wert01:Wert24) %>%
    mutate(
      Uhrzeit = substring(time, 5),
      date = ymd_h(paste(Datum, Uhrzeit)),
      Komponente_txt = Komponente,
      Komponente = substring(sub("\\_.*", "", file), 8)
    ) %>%
    dplyr::select(-c(Nachweisgrenze, Lieferung))
}