#' Read IAMC-style .csv or .xlsx files, or object from rds file.
#'
#' Reads IAMC-style .csv or .xlsx files or object from rds file into a quitte data frame.
#'
#' @md
#' @param file Path of IAMC-style .csv, .xlsx, or rds file or vector of paths to read.
#' @param sep Column separator, defaults to ";" in read_mif_header().
#' @param quote Quote characters, empty by default.
#' @param na.strings Entries to interpret as NA; defaults to
#' `c("UNDF", "NA", "N/A", "n_a")`
#' @param convert.periods If `TRUE`, periods are converted to
#' [`POSIXct`][base::DateTimeClasses]. If `FALSE` (the default), periods
#' are numerical.
#' @param check.duplicates If `TRUE` a duplicates check will be performed on the
#' data. For time- and memory-critical applications this can be switched
#' off.
#' @param factors Return columns as factors (`TRUE`, the default) or not.
#' @param drop.na Should `NA` values be dropped from the `quitte`?
#' @param comment A character which at line start signifies the optional comment
#' header with metadata at the head of `file`. The comment header, if
#' present, is returned as a `comment_header` attribute. If multiple files
#' are read, the `comment_header` attribute is a list of comment headers with
#' file paths as names.
#' @param filter.function A function used to filter data during read. See
#' Details.
#' @param chunk_size Number of lines to read at a time. Defaults to 200000.
#' (REMIND .mif files have between 55000 and 105000 lines for H12 and EU21
#' regional settings, respectively.)
#'
#' @details
#' In order to process large data sets, like IIASA data base snapshots,
#' `read.quitte()` reads provided files in chunks of `chunk_size` lines
#' (not for Excel files), and applies `filter.function()` to the chunks. This
#' allows for filtering data piece-by-piece, without exceeding available memory.
#' `filter.function` is a function taking one argument, a quitte data frame of
#' the read chunk, and is expected to return a data frame. Usually it should
#' simply contain all the filters usually applied after all the data is read in.
#' Suppose there is a file `big_IIASA_snapshot.csv`, from which only data for
#' the REMIND and MESSAGE models between the years 2020 to 2050 is of interest.
#' Normally, this data would be processed as
#' ```
#' read.quitte(file = 'big_IIASA_snapshot.csv') %>%
#' filter(grepl('^(REMIND|MESSAGE)', .data$model),
#' between(.data$period, 2020, 2060))
#' ```
#' If however `big_IIASA_snapshot.csv` is too large to be read in completely,
#' it can be read using
#' ```
#' read.quitte(file = 'big_IIASA_snapshot.csv',
#' filter.function = function(x) {
#' x %>%
#' filter(grepl('^(REMIND|MESSAGE)', .data$model),
#' between(.data$period, 2020, 2060))
#' })
#' ```
#'
#' @return A quitte data frame.
#'
#' @author Michaja Pehl
#'
#' @examples
#' \dontrun{
#' read.quitte(c("some/data/file.mif", "some/other/data/file.mif"))
#' read.quitte("some/data/file.csv", sep = ",", quote = '"')
#' }
#'
#' @importFrom dplyr as_tibble bind_rows distinct first last mutate tibble
#' relocate select
#' @importFrom forcats as_factor
#' @importFrom magrittr %>%
#' @importFrom rlang .data is_empty
#' @importFrom readr DataFrameCallback problems read_delim_chunked read_lines
#' @importFrom readxl read_excel excel_sheets
#' @importFrom tidyr pivot_longer
#' @importFrom tidyselect all_of
#' @importFrom utils read.table
#'
#' @export
read.quitte <- function(file,
sep = NULL,
quote = "",
na.strings = c("UNDF", "NA", "N/A", "n_a"),
convert.periods = FALSE,
check.duplicates = TRUE,
factors = TRUE,
drop.na = FALSE,
comment = '#',
filter.function = identity,
chunk_size = 200000L) {
if (!length(file))
stop('\'file\' is empty.')
if (!( is.function(filter.function)
&& 1 == length(formals(filter.function))))
stop('`filter.function` must be a function taking only one argument.')
.read.quitte <- function(f, sep, quote, na.strings, convert.periods,
drop.na, comment, filter.function, chunk_size) {
default.columns <- c("model", "scenario", "region", "variable", "unit")
if (grepl("\\.xlsx?$", f)) {
data <- read_excel(path = f, sheet = if ('data' %in% excel_sheets(f)) 'data' else 1, guess_max = 21474836)
if (! any(grepl("^[0-9]{4}$", colnames(data)))) {
warning("File ", f, " contains no data, returning empty data.frame.")
return(as.quitte(NULL))
}
data <- pivot_longer(data, matches("^[0-9]{4}$"), names_to = 'period', values_drop_na = drop.na)
missing.default.columns <- default.columns[! default.columns %in% tolower(colnames(data))]
if (length(missing.default.columns) > 0) {
warning(f, " misses default columns: ", paste(missing.default.columns, collapse = ", "))
}
return(filter.function(as.quitte(data)))
}
if (grepl("\\.rds$", f)) {
return(filter.function(as.quitte(readRDS(f))))
}
# Check the header for correct names, periods all in one block and no
# additional columns after the periods
# read header ----
# read the header, comment_header, and useless.last.column from f and
# assign them to the environment
foo <- read_mif_header(f, sep, comment)
header <- foo$header
comment_header <- foo$comment_header
useless.last.column <- foo$useless.last.column
if (is.null(sep)) sep <- foo$sep
# FIXME: relax to handle other than 4-digit periods
period.columns <- grep("^[A-Za-z]?[0-9]{4}$", header)
if (!all(header[1:5] %in% default.columns))
stop("missing default columns in header of file ", f, ": ",
paste(setdiff(default.columns, header[1:5]), collapse = ", "))
if (length(period.columns) == 0) {
warning("No column name found that could be understood as a 4-digit year in file ", f,
". Returning empty data.frame.")
return(as.quitte(NULL))
}
if (last(period.columns) != length(header))
stop("unallowed extra columns in header of file ", f)
if (!all(seq_range(range(period.columns)) == period.columns))
stop("period columns not all in one block in header of file ", f)
periods <- header[period.columns]
colClasses <- paste(c(rep('c', period.columns[1] - 1),
rep('n', length(period.columns)),
ifelse(useless.last.column, '-', '')),
collapse = '')
# read data ----
# the callback function accepts a chunk of data, `x`, pivots the periods
# to long format (dropping NAs if required), converts the periods to
# integer or POSIXct values as required, and applies the
# `filter.function`.
chunk_callback <- DataFrameCallback$new(
(function(FilterF, convert.periods, drop.na) {
function(x, pos) {
if ('problems' %in% names(attributes(x))) {
p <- problems(x)
}
else {
p <- NULL
}
x <- x %>%
relocate(all_of(default.columns)) %>%
# convert to long format
pivot_longer(all_of(periods), names_to = 'period',
values_drop_na = drop.na) %>%
# convert periods
mutate(period = gsub('^[A-Za-z]?', '', .data$period),
period = if (convert.periods) {
ISOyear(.data$period)
} else {
as.integer(as.character(.data$period))
}) %>%
# apply filter
FilterF()
if (!is.null(p))
attr(x, 'problems') <- p
return(x)
}
})(filter.function, convert.periods, drop.na)
)
data <- suppressWarnings(
read_delim_chunked(
file = f, callback = chunk_callback, delim = sep,
chunk_size = chunk_size, quote = quote, col_names = header,
col_types = colClasses, na = na.strings, comment = comment,
trim_ws = TRUE, skip = length(comment_header) + 1)
)
# catch any parsing problems
data_problems <- if (nrow(problems(data))) {
problems(data) %>%
mutate(file = f)
}
# re-attach parsing problems
attr(data, 'problems') <- data_problems
attr(data, 'comment_header') <- comment_header
return(data)
}
# read quitte for all supplied files ----
quitte <- tibble()
quitte_problems <- tibble()
comment_header <- list()
for (f in file) {
data <- .read.quitte(f, sep, quote, na.strings, convert.periods,
drop.na, comment, filter.function, chunk_size)
quitte <- bind_rows(quitte, data)
quitte_problems <- bind_rows(quitte_problems, attr(data, 'problems'))
comment_header <- c(comment_header,
setNames(list(attr(data, 'comment_header')), f))
}
if (factors) {
quitte <- quitte %>%
# preserve order of scenarios for order of .mif files
mutate(scenario = as_factor(.data$scenario)) %>%
factor.data.frame()
}
# check for duplicate entries, ignoring values
if (check.duplicates) {
reportDuplicates(quitte, action = "warn")
}
# apply quitte "class" attribute
class(quitte) <- c('quitte', class(quitte))
# pass parsing problems along and issue warning
attr(quitte, 'problems') <- if (nrow(quitte_problems)) {
quitte_problems
}
if (nrow(quitte_problems)) {
if (interactive()) {
warning('One or more parsing issues, call `readr::problems()` for ',
'details.',
call. = FALSE)
} else {
warning.length <- getOption('warning.length')
options(warning.length = 8170)
warning('One or more parsing issues:\n',
quitte_problems %>%
as.data.frame() %>%
format() %>%
capture.output() %>%
paste(collapse = '\n'))
options(warning.length = warning.length)
}
}
# pass comment_header along as attribute, if any
if (!is_empty(unlist(comment_header))) {
if (1 == length(file)) {
comment_header <- unlist(comment_header, use.names = FALSE)
}
attr(quitte, 'comment_header') <- comment_header
}
return(quitte)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.