R/read_swissdata.R
In tstools: A Time Series Toolbox for Official Statistics

Documented in read_swissdata

#' Read data generated by the Swissdata project
#'
#' Read data from swissdata compliant .csv files and
#' turn them into a list of time series.
#'
#' @param path character full path to dataset.
#' @param key_columns character vector specifying all columns that should be
#' part of the key. Defaults to the dim.order specified by swissdata.
#' @param filter function A function that is applied to the raw data.table
#' after it is read (and after aggregates and the series key were added).
#' It receives the data.table and must return the rows to keep. Useful for
#' filtering out undesired data.
#' @param aggregates list A list of aggregate specifications. Each element is
#' itself a list with the entries \code{dimensions} (character vector of
#' dimension columns over which to aggregate) and, optionally, \code{fcn}
#' (the aggregation function or its name, e.g. sum, mean etc.).
#' \code{fcn} defaults to sum.
#' @param keep_last_freq_only in case there is a frequency change in a time
#' series, should only the part of the series be returned that has the same
#' frequency as the last observation. This is useful when data start out
#' crappy and then stabilize after a while. If TRUE, only that last part of
#' the series is returned. Defaults to FALSE.
#' @return The result of \code{long_to_ts}, i.e. the time series built from
#' the (optionally aggregated and filtered) data, one per series key.
#' @details
#' The order of dimensions in key_columns determines their order in the key
#' The resulting ts_key will be of the form <swissdata-set-name>.<instance of key_columns[1]>...
#'
#' Rows created through \code{aggregates} carry the value "total" in every
#' dimension that was aggregated over and are appended to the raw data before
#' the keys are built.
#' @examples
#' ds_location <- system.file("example_data/ch.seco.css.csv", package = "tstools")
#' tslist <- read_swissdata(ds_location, "idx_type")
#' tsplot(tslist[1])
#' @importFrom data.table fread
#' @export
read_swissdata <- function(path, key_columns = NULL, filter = NULL,
                           aggregates = NULL,
                           keep_last_freq_only = FALSE) {
  # Only strip a literal, trailing ".csv". An unescaped/unanchored pattern
  # would also eat e.g. "ycsv" out of a directory called "mycsv".
  csv_ext <- "\\.csv$"
  dataset <- sub(csv_ext, "", basename(path))

  if (is.null(key_columns)) {
    # Metadata is looked up next to the csv, using the path without extension
    meta <- .read_swissdata_meta_unknown_format(sub(csv_ext, "", path))

    if (length(meta) == 0) {
      # Alternatively: Take them as they come in the csv?
      stop("Neither JSON nor YAML metadata found and key_columns not specified. Cannot proceed!")
    }

    key_columns <- meta$dim.order
  }

  # Read all columns as character to preserve things like
  # 00, 012 etc. in dims
  raw <- fread(path, colClasses = "character")
  raw[, value := as.numeric(value)]

  if (!is.null(aggregates)) {
    raw_names <- names(raw)
    dims <- setdiff(raw_names, c("date", "value"))
    totals <- lapply(aggregates, function(agg) {
      # Plain if/else instead of ifelse(): works for function names as well
      # as for function objects.
      aggregate_fcn <- if (is.null(agg$fcn)) "sum" else agg$fcn
      aggdim <- agg$dimensions

      unknown_dims <- setdiff(aggdim, dims)
      if (length(aggdim) == 0 || length(unknown_dims) > 0) {
        stop(
          "Each element of aggregates needs a 'dimensions' entry naming ",
          "dimension columns of the data. Offending dimensions: ",
          if (length(aggdim) == 0) "<none given>" else toString(unknown_dims)
        )
      }

      # Aggregate over aggdim, i.e. group by all remaining dims and date
      total <- raw[, list(value = do.call(aggregate_fcn, list(value))),
        by = c(setdiff(dims, aggdim), "date")
      ]
      # Mark the collapsed dimension(s) and restore the original column order
      # so the result can be stacked below the raw data.
      total[, (aggdim) := "total"]
      total[, raw_names, with = FALSE]
    })
    raw <- rbindlist(c(list(raw), totals))
  }

  if (is.character(key_columns)) {
    missing_keys <- setdiff(key_columns, names(raw))
    if (length(missing_keys) > 0) {
      stop(
        "key_columns not found in ", basename(path), ": ",
        toString(missing_keys)
      )
    }
  }

  # Key: <dataset>.<key_columns[1]>.<key_columns[2]>...
  raw[, series := do.call(
    paste,
    c(dataset, .SD, sep = ".")
  ),
  .SDcols = key_columns
  ]

  if (!is.null(filter)) {
    raw <- filter(raw)
  }

  long_to_ts(raw[, list(series, date, value)],
    keep_last_freq_only = keep_last_freq_only
  )
}