# utils.R
#' find_header
#'
#' Read a file line-by-line to find how far its header extends, and return it.
#'
#' @param fqfn Fully qualified filename, character
#' @return The header as a character vector.
#' @note Headers are defined as consecutive lines beginning with "#" at the top of a file.
#' @author Alexey Shiklomanov
find_header <- function(fqfn) {
  con <- file(fqfn, "r")
  on.exit(close(con))
  header <- character()
  repeat {
    line <- readLines(con, n = 1)
    # Stop at end of file (readLines returns character(0)) or at the first non-comment line
    if(length(line) == 0 || !grepl("^#", line)) break
    header <- c(header, line)
  }
  header
}
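# Illustrative usage (a self-contained sketch; the temporary file below is
# invented for demonstration):
#   fn <- tempfile(fileext = ".csv")
#   writeLines(c("# File: demo.csv", "# Units: none", "x,y", "1,2"), fn)
#   find_header(fn)  # c("# File: demo.csv", "# Units: none")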
#' load_csv_files
#'
#' Load one or more internal (i.e. included with the package) csv or csv.gz data files.
#'
#' @param filenames Character vector of filenames to load
#' @param optionals Logical vector, specifying whether corresponding file is optional
#' @param quiet Logical - suppress messages?
#' @param ... Any other parameter to pass to \code{readr::read_csv}
#' @details The data frames read in are marked as inputs, not ones that have
#' been computed, via \code{\link{add_flags}}. Optional files that are not found
#' are returned as \code{NA} in the list.
#' @return A list of data frames (tibbles).
#' @importFrom magrittr "%>%"
#' @importFrom assertthat assert_that
load_csv_files <- function(filenames, optionals, quiet = FALSE, ...) {
  assert_that(is.character(filenames))
  assert_that(is.logical(optionals))
  assert_that(is.logical(quiet))
  assert_that(length(filenames) == length(optionals))

  # Remove duplicates
  dupes <- duplicated(filenames)
  optionals <- optionals[!dupes]
  filenames <- filenames[!dupes]

  filedata <- list()
  for(fnum in seq_along(filenames)) {
    f <- filenames[fnum]
    if(!quiet) cat("Loading", f, "...\n")
    fqfn <- find_csv_file(f, optionals[fnum], quiet = quiet)
    if(is.null(fqfn)) {
      assert_that(optionals[fnum])  # a NULL is only allowed for optional files
      filedata[[f]] <- missing_data()
      if(!quiet) message("Note: optional input ", f, " not found")
      next
    }

    # Read the file header and extract the column type info from it
    assert_that(file.exists(fqfn))
    header <- find_header(fqfn)
    col_types <- extract_header_info(header, label = "Column types:", fqfn, required = TRUE)

    readr::read_csv(fqfn, comment = COMMENT_CHAR, col_types = col_types, ...) %>%
      parse_csv_header(fqfn, header) %>%
      add_comments(paste("Read from", gsub("^.*extdata", "extdata", fqfn))) %>%
      add_flags(FLAG_INPUT_DATA) ->
      filedata[[f]]

    # Title might have been filled in, or not
    if(is.null(get_title(filedata[[f]]))) {
      filedata[[f]] <- add_title(filedata[[f]], f)
    }
  }
  filedata
}
#' extract_header_info
#'
#' Extract information from CSV headers.
#'
#' @param header_lines Character vector holding raw header lines
#' @param label Label to search for - character
#' @param filename Filename (for error reporting purposes)
#' @param required Is this label required? (logical)
#' @param multiline Can this label hold multi-line information? (logical)
#' @details CSV files can have headers, commented lines of the form "# Title: xxxx",
#' "# Source: xxxx", etc. Extract this information if present. Note that empty headers
#' are not allowed. This function is called by \code{\link{parse_csv_header}}.
#' @return Extracted label information, as a character vector
extract_header_info <- function(header_lines, label, filename, required = FALSE, multiline = FALSE) {
  . <- NULL  # silence notes on package check
  assert_that(is.character(header_lines))
  assert_that(is.character(label))
  assert_that(is.character(filename))
  assert_that(is.logical(required))
  assert_that(is.logical(multiline))

  label_regex <- paste0("^", COMMENT_CHAR, "\\s*", trimws(label))
  label_line <- grep(label_regex, header_lines)
  if(length(label_line) > 1) {
    stop("Header label ", label, " appears on >1 line in ", basename(filename))
  } else if(length(label_line) == 1) {
    if(multiline) {
      # Multiline comments may end with the last comment line before data...
      comment_end1 <- max(grep(paste0("^", COMMENT_CHAR), header_lines))
      # ...or at the next label ("xxx:")...
      all_label_lines <- grep(paste0("^", COMMENT_CHAR, "\\s*[a-zA-Z]*:"), header_lines)
      if(any(all_label_lines > label_line)) {
        comment_end2 <- min(all_label_lines[all_label_lines > label_line])
      } else {
        comment_end2 <- NA  # no next label
      }
      # ...whichever comes first
      comment_end <- min(comment_end1, comment_end2 - 1, na.rm = TRUE)
    } else {
      comment_end <- label_line
    }

    # Pull out the information and return it
    header_lines[label_line:comment_end] %>%
      gsub(label_regex, "", .) %>%
      gsub(paste0("^", COMMENT_CHAR), "", .) %>%
      trimws ->
      info

    if(any(grepl(",,+$", info))) {
      warning("Multiple commas at end of header line in ", filename)
    }
    if(nchar(paste(info, collapse = "")) == 0) {
      stop("Empty metadata label '", label, "' found in ", basename(filename))
    }
    return(info)
  } else {
    if(required) {
      stop("Required metadata label '", label, "' not found in ", basename(filename))
    }
    NULL  # label not present
  }
}
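# Illustrative usage (a sketch, assuming COMMENT_CHAR is "#"):
#   hdr <- c("# File: demo.csv", "# Units: tonnes", "# Source: line one",
#            "#   line two")
#   extract_header_info(hdr, "Units?:", "demo.csv")                      # "tonnes"
#   extract_header_info(hdr, "Sources?:", "demo.csv", multiline = TRUE)  # c("line one", "line two")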
#' parse_csv_header
#'
#' Parse a CSV file's header, if present.
#'
#' @param obj The object to attach attributes to
#' @param filename Fully-qualified filename
#' @param header A vector of strings comprising the file header
#' @param enforce_requirements Enforce mandatory fields?
#' @details Headers are given at the top of files and consist of labels ("Title:", "Units:", etc.)
#' prefixed by comment characters (#). The parser looks for these and calls \code{\link{add_title}} and
#' similar functions to attach the corresponding attributes to \code{obj}.
#' @return \code{obj}, a \code{\link{tibble}}, with the appropriate attributes filled in.
#' @export
parse_csv_header <- function(obj, filename, header, enforce_requirements = TRUE) {
  assert_that(tibble::is_tibble(obj))
  assert_that(is.character(filename))
  assert_that(is.character(header))
  assert_that(is.logical(enforce_requirements))

  # Excel tries to be 'helpful' and, when working with CSV files, quotes lines with
  # commas in them...which you CAN'T SEE when re-opening in Excel. Trap this problem.
  if(any(grepl(paste0('^"', COMMENT_CHAR), header))) {
    stop('A quoted comment (# prefixed by a double quote, probably due to Excel) detected in ', basename(filename))
  }

  # The 'File:' field has to match the actual filename
  filecheck <- extract_header_info(header, "File:", filename, required = enforce_requirements)
  # Remove trailing commas - stupid Excel
  filecheck <- gsub(",*$", "", filecheck)
  if(enforce_requirements && !identical(filecheck, basename(filename))) {
    stop("'File:' given in header (", filecheck, ") doesn't match filename in ", filename)
  }

  obj %>%
    add_title(extract_header_info(header, "Title:", filename, required = enforce_requirements)) %>%
    add_units(extract_header_info(header, "Units?:", filename, required = enforce_requirements)) %>%
    add_comments(extract_header_info(header, "(Comments|Description):", filename, multiline = TRUE)) %>%
    add_reference(extract_header_info(header, "(References?|Sources?):", filename, multiline = TRUE))
}
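# Illustrative usage (a sketch; the header vector is invented):
#   hdr <- c("# File: demo.csv", "# Title: Demo data", "# Units: none")
#   parse_csv_header(tibble::tibble(), "demo.csv", hdr)
#   # returns the tibble with the title, units, etc. attributes attached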
#' find_csv_file
#'
#' Find an internal, i.e. included with the package, data file.
#' @param filename Filename (extension optional) to find
#' @param optional Logical: is the file optional?
#' @param quiet Logical - suppress messages?
#' @return Full name of file, or NULL if file not found but is optional.
#' @details Throws an error if file not found (and file is not optional).
#' @importFrom assertthat assert_that
find_csv_file <- function(filename, optional, quiet = FALSE) {
  assert_that(is.character(filename))
  assert_that(length(filename) == 1)
  assert_that(is.logical(optional))
  assert_that(is.logical(quiet))

  # readr::read_csv reads .gz files transparently, so allow that extension too
  extensions <- c("", ".csv", ".csv.gz")
  for(ex in extensions) {
    fqfn <- system.file("extdata", paste0(filename, ex), package = "gcamkordata")
    if(fqfn != "") {
      if(!quiet) cat("Found", fqfn, "\n")
      return(fqfn)  # found it
    }
  }

  if(optional) {
    return(NULL)
  } else {
    stop("Couldn't find required data ", filename)
  }
}
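# Illustrative usage (a sketch; "common/iso_GCAM_regID" is a hypothetical
# packaged file name):
#   find_csv_file("common/iso_GCAM_regID", optional = FALSE)  # full path, or error
#   find_csv_file("no/such/file", optional = TRUE)            # NULL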
#' Write data produced by chunks to csv files.
#'
#' Write the data produced by the chunks to their output files. This is mostly
#' a wrapper around \code{write_csv} that figures out file names, processes
#' table flags, writes metadata comments, and so forth.
#'
#' One thing to be aware of is that there is a wart in \code{readr} v 1.1 and
#' later that causes floating point data to be written as integers if they
#' happen to have integer values. This can cause problems if there are so many
#' apparently-integer values before the first obviously-float value that
#' \code{read_csv} concludes that the column should have integer type. If this
#' seems to be happening to your table, add the PROTECT_FLOAT flag to it, and
#' any floating point data in your table will be protected before it is
#' written. Use this option sparingly, as the data written that way tends to be
#' a lot bigger, owing to the large number of digits we have to write.
#'
#' @param chunkdata Named list of tibbles (data frames) to write
#' @param write_inputs Write data that were read as inputs, not computed? Logical
#' @param create_dirs Create directory if necessary, and delete contents? Logical
#' @param write_outputs Write all chunk outputs to disk?
#' @param write_xml Write XML Batch chunk outputs to disk?
#' @param outputs_dir Directory to save data into
#' @param xml_dir Directory to save XML results into
#' @importFrom assertthat assert_that
save_chunkdata <- function(chunkdata, write_inputs = FALSE, create_dirs = FALSE,
                           write_outputs = TRUE, write_xml = write_outputs,
                           outputs_dir = OUTPUTS_DIR, xml_dir = XML_DIR) {
  assert_that(is_data_list(chunkdata))
  assert_that(is.logical(write_inputs))
  assert_that(is.logical(create_dirs))
  assert_that(is.character(outputs_dir))

  # Create directories if necessary, and remove any previous outputs
  if(create_dirs) {
    dir.create(outputs_dir, showWarnings = FALSE, recursive = TRUE)
    unlink(file.path(outputs_dir, "*.csv"))
    dir.create(xml_dir, showWarnings = FALSE, recursive = TRUE)
    unlink(file.path(xml_dir, "*.xml"))
    koreaXmlPath <- file.path(xml_dir, "korea")
    dir.create(koreaXmlPath, showWarnings = FALSE, recursive = TRUE)
    unlink(file.path(koreaXmlPath, "*.xml"))
  }

  for(cn in names(chunkdata)) {
    cd <- get_data(chunkdata, cn)
    if(is.null(cd)) next  # optional file that wasn't found

    if(FLAG_XML %in% get_flags(cd)) {
      if(write_xml) {
        # TODO: worry about absolute paths?
        cd$xml_file <- file.path(xml_dir, cd$xml_file)
        run_xml_conversion(cd)
      }
    } else if(write_outputs) {
      fqfn <- file.path(outputs_dir, paste0(cn, ".csv"))
      suppressWarnings(file.remove(fqfn))
      cmnts <- get_comments(cd)
      flags <- get_flags(cd)

      # If these data have been tagged as input data, don't write
      if(FLAG_NO_OUTPUT %in% flags ||
         (FLAG_INPUT_DATA %in% flags && !write_inputs)) {
        next
      }

      # If the data are in a different form than in the original data system,
      # indicate that by writing the flags to the first line of the file
      if(!is.null(flags)) {
        cat(paste(COMMENT_CHAR, paste(flags, collapse = " ")), file = fqfn, sep = "\n")
      }
      if(!is.null(cmnts)) {
        cat(paste(COMMENT_CHAR, cmnts), file = fqfn, sep = "\n", append = TRUE)
      }
      readr::write_csv(cd, fqfn, append = TRUE, col_names = TRUE)
    }
  }
}
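# Illustrative usage (a sketch; assumes `all_data` is a named list of tibbles
# produced by the chunks):
#   save_chunkdata(all_data, create_dirs = TRUE)   # write everything to OUTPUTS_DIR/XML_DIR
#   save_chunkdata(all_data, write_xml = FALSE)    # skip the XML conversion step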
#' find_chunks
#'
#' Get a list of chunks in this package.
#' These are functions with a name of "module_{modulename}_{chunkname}".
#' @param pattern Regular expression pattern to search for
#' @param include_disabled Return names of disabled chunks?
#' @return A data frame with fields 'name', 'module', and 'chunk'.
#' @details If a chunk name ends with \code{_DISABLED}, by default its name
#' will not be returned.
#' @importFrom magrittr "%>%"
#' @export
find_chunks <- function(pattern = "^module_[a-zA-Z\\.]*_.*$", include_disabled = FALSE) {
  . <- name <- disabled <- x <- NULL  # silence notes on package check
  assertthat::assert_that(is.character(pattern))
  ls(name = parent.env(environment()), pattern = pattern) %>%
    tibble::tibble(name = ., disabled = grepl("_DISABLED$", name)) %>%
    filter(include_disabled | !disabled) %>%
    tidyr::separate(name, into = c("x", "module", "chunk"), remove = FALSE, sep = "_", extra = "merge") %>%
    dplyr::select(-x)
}
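# Illustrative usage (a sketch; the chunk names shown are hypothetical):
#   find_chunks()                                # e.g. module_energy_L100.data, ...
#   find_chunks(pattern = "^module_water_.*$")   # restrict to one module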
#' chunk_inputs
#'
#' @param chunks A character vector of chunk names
#' @return A tibble with columns 'name' (chunk name), 'input' (name of data),
#' 'from_file' (whether the object is read from a file), and 'optional' (whether
#' the object is optional or not).
#' @export
chunk_inputs <- function(chunks = find_chunks()$name) {
  assertthat::assert_that(is.character(chunks))

  # Get the list of data required by each chunk
  chunk_names <- character()
  inputs <- character()
  from_files <- logical()
  optionals <- logical()
  for(ch in chunks) {
    cl <- call(ch, driver.DECLARE_INPUTS)
    reqdata <- eval(cl)

    # Chunks mark their file inputs specially, using vector names
    if(is.null(names(reqdata))) {
      file_inputs <- rep(FALSE, times = length(reqdata))
      optional_file_inputs <- rep(FALSE, times = length(reqdata))
    } else {
      file_inputs <- names(reqdata) %in% c("FILE", "OPTIONAL_FILE")
      optional_file_inputs <- names(reqdata) == "OPTIONAL_FILE"
    }

    if(!is.null(reqdata)) {
      chunk_names <- c(chunk_names, rep(ch, times = length(reqdata)))
      inputs <- c(inputs, as.vector(unlist(reqdata)))
      from_files <- c(from_files, file_inputs)
      optionals <- c(optionals, optional_file_inputs)
    }
  }

  tibble(name = chunk_names, input = inputs, from_file = from_files, optional = optionals)
}
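# Illustrative usage (a sketch; the chunk name is hypothetical):
#   chunk_inputs("module_energy_L100.data")
#   # tibble with columns: name | input | from_file | optional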
#' inputs_of
#'
#' Convenience function for getting the inputs of one or more chunks
#'
#' @param chunks Names of chunks, character
#' @return Character vector of inputs.
#' @export
inputs_of <- function(chunks) {
  if(is.null(chunks) || identical(chunks, "")) return(NULL)
  chunk_inputs(chunks)$input
}
#' chunk_outputs
#'
#' List all chunk outputs.
#'
#' @param chunks A character vector of chunk names
#' @return A tibble with columns 'name' (chunk name), 'output' (name of data),
#' and 'to_xml' (whether or not this is an XML structure).
#' @export
chunk_outputs <- function(chunks = find_chunks()$name) {
  assertthat::assert_that(is.character(chunks))

  chunk_names <- character()
  outputs <- character()
  to_xmls <- logical()
  for(ch in chunks) {
    cl <- call(ch, driver.DECLARE_OUTPUTS)
    reqdata <- eval(cl)

    # Chunks mark any XML file outputs using vector names
    if(is.null(names(reqdata))) {
      fileoutputs <- rep(FALSE, times = length(reqdata))
    } else {
      fileoutputs <- names(reqdata) == "XML"
    }

    if(!is.null(reqdata)) {
      chunk_names <- c(chunk_names, rep(ch, times = length(reqdata)))
      outputs <- c(outputs, as.vector(unlist(reqdata)))
      to_xmls <- c(to_xmls, fileoutputs)
    }
  }

  tibble(name = chunk_names, output = outputs, to_xml = to_xmls)
}
#' outputs_of
#'
#' Convenience function for getting the outputs of one or more chunks
#'
#' @param chunks Names of chunks, character
#' @return Character vector of outputs.
#' @export
outputs_of <- function(chunks) {
  if(is.null(chunks) || identical(chunks, "")) return(NULL)
  chunk_outputs(chunks)$output
}
#' screen_forbidden
#'
#' Screen a function for use of functions forbidden by data system style guide.
#'
#' Certain functions are forbidden by the dsr style guide from being used in
#' code chunks. This function tests a function for calls to forbidden functions
#' and flags the offending lines.
#'
#' @param fn The function to be tested. This is the actual function object, not
#' the name of the function.
#' @return Nx2 Character matrix of flagged lines and the test that tripped them
#' (empty vector, if none)
#' @author RL 19 Apr 2017
#' @importFrom utils capture.output
screen_forbidden <- function(fn) {
  forbidden <- c("(?<!error_no_)match(?!es)", "ifelse",
                 "melt", "cast",
                 "rbind", "cbind", "merge",
                 "read\\.csv", "write\\.csv",
                 "summarise_each", "mutate_each")
  code <- capture.output(fn)
  code <- gsub("#.*$", "", code)     # remove comments
  code <- gsub('"[^"]*"', "", code)  # remove double-quoted material
  code <- gsub("'[^']*'", "", code)  # remove single-quoted material

  # For some reason the R package check process seems to concatenate certain lines;
  # in particular a mutate() followed by a replace_na() ends up on a single line, which
  # can cause false positives below if it's then followed by another mutate(). This
  # does not occur during 'normal' testthat testing.
  # Anyway, ensure all %>% operations are on separate lines
  code <- unlist(vapply(code, strsplit, split = "%>%", fixed = TRUE, FUN.VALUE = list(1)))

  # Special multiline case: consecutive mutate calls
  rslt <- character()
  mutates <- grep("^\\s*mutate\\(", code)
  diff1s <- base::diff(mutates) == 1
  if(any(diff1s)) {
    rslt <- cbind("consecutive mutate calls", code[mutates[which(diff1s)]])
  }

  # General screen-forbidden search, single lines only
  for(f in unique(forbidden)) {
    bad <- grep(f, code, perl = TRUE)
    if(length(bad) > 0) {
      rslt <- rbind(rslt, cbind(f, code[bad]))
    }
  }
  rslt
}
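# Illustrative usage (a self-contained sketch; `bad_fn` is invented):
#   bad_fn <- function(x) ifelse(x > 0, "pos", "neg")
#   screen_forbidden(bad_fn)
#   # two-column matrix flagging the line containing the forbidden ifelse()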