## File: R/dirdf_parse.R

#' Path Metadata Parsing
#'
#' Creates a data frame using information from the paths and file names. It
#' accepts either a template or a regular expression and column names. Similar
#' to [dirdf()], but this takes a vector of pathnames and tries to
#' match them directly, rather than calling [base::dir()] on them and
#' matching those results. This is helpful if you want to filter or transform
#' the set of paths before matching, e.g. to remove any irrelevant filenames
#' like \file{.gitignore}, \file{.DS_Store}, \file{desktop.ini}.
#'
#' @seealso [dirdf()]
#'
#' @param pathnames character vector of pathname(s), e.g. the result of calling
#'   [base::dir()].
#'
#' @param template [template][templates] character string, e.g.
#'   `"Country/Province/City/StationID_Date.ext"`.
#' 
#' @param regexp regular expression used to parse the file names.
#'   Exactly one of the arguments `regexp` and `template` must be specified,
#'   i.e. exactly one of them must be non-`NULL`.
#' 
#' @param colnames character vector containing the names of the columns in the
#'   data frame. Not required if using `template` or if `regexp` uses
#'   named capturing groups (see examples), but may still be used to override
#'   column names.
#' 
#' @param missing value to use for unmatched optional template elements or
#'   regexp capturing groups.
#' 
#' @param ignore.case,perl If `regexp` is used, these are passed to
#'   [base::regexpr()]. Note that unlike `regexpr()`, the default value
#'   for `perl` is `TRUE` (to make it more convenient to use named
#'   capture groups, which are only supported in Perl mode).
#'
#' @example incl/dirdf_parse.R
#'
#' @export
dirdf_parse <- function(pathnames,
                        template = NULL,
                        regexp = NULL,
                        colnames = NULL,
                        missing = NA_character_,
                        ignore.case = FALSE,
                        perl = TRUE) {
  ## Argument checks: exactly one of 'template' and 'regexp' may be supplied,
  ## and 'missing' has to be a single value.
  stop_if_not(xor(!is.null(template), !is.null(regexp)))
  stop_if_not(length(missing) == 1L)

  ## A template is converted to a regular expression; when that happens the
  ## caller-supplied 'ignore.case' and 'perl' values are overridden.
  use_template <- !is.null(template)
  if (use_template) {
    regexp <- templateToRegex(template)
    ignore.case <- FALSE
    perl <- TRUE
  }

  ## Match every pathname against the pattern; any pathname whose match
  ## position is -1 (no match) makes the whole call fail.
  hits <- regexpr(
    pattern = regexp,
    text = pathnames,
    ignore.case = ignore.case,
    perl = perl
  )
  unmatched <- pathnames[hits %in% -1]
  if (length(unmatched) > 0L) {
    stop("Unexpected path(s) found:", paste0("\n", unmatched))
  }

  ## Turn the match result into one column per captured field
  result <- regexprMatchToDF(pathnames, hits, colnames = colnames, missing = missing)

  ## Remove columns whose names carry the drop marker
  drop_idx <- grep("^_DROP_BEGIN_(.*)_DROP_END_$", colnames(result))
  if (length(drop_idx) > 0L) {
    result <- result[-drop_idx]
  }

  ## Append the original pathnames as column 'pathname' and prepend the
  ## 'dirdf' class to the result
  result <- cbind(result, pathname = pathnames, stringsAsFactors = FALSE)
  class(result) <- c("dirdf", class(result))

  result
}
## ropenscilabs/dirdf documentation built on May 27, 2019, 8:32 p.m.