R/list.data.R

Defines functions .prepare.data.ignore.regex .parse.extensions .list.data list.data

Documented in .list.data list.data .parse.extensions .prepare.data.ignore.regex

#' Listing the data for the current project
#'
#' This function produces a data.frame of all data files in the project, with
#' meta data on if and how the file will be loaded by \code{load.project}.
#'
#' @param ... Named arguments to override configuration from
#'   \code{config/global.dcf} and \code{lib/global.R}.
#'
#' @return A data.frame listing the available data, with relevant meta data
#'
#' @details The returned data.frame contains the following variables, with one
#'   observation per file in \code{data/}:
#'
#' \tabular{ll}{
#'    \code{filename} \tab Character variable containing the filename relative
#'    to \code{data/} directory. \cr
#'    \code{varname} \tab Character variable containing the name of the variable
#'    into which the file will be imported. * \cr
#'    \code{is_ignored} \tab Logical variable that indicates whether the file
#'    is ignored through the \code{data_ignore} option in the configuration. \cr
#'    \code{is_directory} \tab Logical variable that indicates whether the file
#'    is a directory. \cr
#'    \code{is_cached} \tab Logical variable that indicates whether the file is
#'    already available in the \code{cache/} directory. \cr
#'    \code{cache_only} \tab Logical variable that indicates whether the
#'    variable is only available in the \code{cache/} directory. This occurs
#'    when calling the cache function with a code fragment in a munge script.
#'    \cr
#'    \code{reader} \tab Character variable containing the name of the reader
#'    function that will be used to load the data. Contains an empty string
#'    if no suitable reader was found.
#' }
#'
#' * Note that some readers return more than one variable, usually with the
#'   listed variable name as prefix. This is true, for example, for the
#'   \code{xls.reader} and \code{xlsx.reader}.
#'
#' @export
#'
#' @seealso \code{\link{load.project}}, \code{\link{show.project}},
#'    \code{\link{project.config}}
#'
#' @examples
#' library('ProjectTemplate')
#'
#' \dontrun{list.data()}
list.data <- function(...) {
  # Turn the caller's named arguments into configuration overrides, load the
  # configuration with those overrides applied, and hand the result to the
  # internal worker that builds the listing
  overrides <- .parse.override.config(list(...))
  .list.data(.load.config(overrides))
}


#' Build the list of data available for loading into memory
#'
#' This function produces a data.frame of all data files in the project, with
#' meta data on if and how the file will be loaded by \code{load.project}.
#'
#' @param config List containing the configuration to use.
#'
#' @inherit list.data description details return
#'
#' @keywords internal
#'
#' @rdname internal.list.data
.list.data <- function(config) {
  # Everything below data/ (files and directories, at any depth). The listing
  # is always recursive so that variables belonging to nested files can be
  # excluded from the cache-only entries built further down.
  every.entry <- list.files(path = 'data', recursive = TRUE,
                            include.dirs = TRUE)
  # Entries selected by the configured recursive_loading; only used to filter
  # the rows of the data.frame afterwards
  selected.entries <- list.files(path = 'data',
                                 recursive = config$recursive_loading,
                                 include.dirs = !config$recursive_loading)

  # Variable name and reader derived from each file name
  parsed <- .parse.extensions(every.entry, config)

  # One row per entry found in data/
  data.rows <- data.frame(
    filename = every.entry,
    varname = parsed$varnames,
    is_ignored = grepl(.prepare.data.ignore.regex(config$data_ignore),
                       every.entry),
    is_directory = file.info(file.path('data', every.entry))$isdir,
    is_cached = .is.cached(parsed$varnames),
    cache_only = rep(FALSE, length(parsed$varnames)),
    stringsAsFactors = FALSE
  )
  # Added after construction, as a separate column assignment
  data.rows$reader <- parsed$readers

  # Keep only the rows that match the configured recursive_loading setting
  in.selection <- data.rows$filename %in% selected.entries
  data.rows <- data.rows[in.selection, ]
  # Rows whose reader is file.reader are moved to the top
  file.reader.first <- order(data.rows$reader == "file.reader",
                             decreasing = TRUE)
  data.rows <- data.rows[file.reader.first, ]
  ## data.rows <- data.rows[!duplicated(data.rows$varname, incomparables = ""),]

  # Variables present in cache/ that do not correspond to any entry in data/
  cache.vars <- setdiff(.cached.variables(), parsed$varnames)
  n.cache <- length(cache.vars)

  # One row per cache-only variable: no file name and no reader.
  # .cached.variables returns all variables without checking validity, so
  # .is.cached is called on them to perform this check.
  cache.rows <- data.frame(
    filename = rep('', n.cache),
    varname = cache.vars,
    is_ignored = grepl(.prepare.data.ignore.regex(config$data_ignore),
                       cache.vars),
    is_directory = rep(FALSE, n.cache),
    is_cached = .is.cached(cache.vars),
    cache_only = rep(TRUE, n.cache),
    reader = rep('', n.cache),
    row.names = NULL,
    stringsAsFactors = FALSE
  )

  rbind(data.rows, cache.rows)
}


#' Match readers to the extensions of the data files
#'
#' Each pattern found in \code{extensions.dispatch.table} is matched, ignoring
#' case, against the supplied paths. For every matching path the associated
#' table entry is stored as reader, and the variable name is the path with the
#' matched part removed, passed through \code{clean.variable.name}. Paths that
#' match no pattern keep an empty reader and an empty variable name.
#'
#' @param data.files a vector of paths to data files
#' @param config the configuration, passed on to \code{clean.variable.name}
#'
#' @return A list of \code{readers} and \code{varnames}
#'
#' @keywords internal
#'
#' @rdname internal.parse.extensions
.parse.extensions <- function(data.files, config) {
  n.files <- length(data.files)
  readers <- vector('character', n.files)
  varnames <- vector('character', n.files)

  # Patterns are applied one after another, so when a path matches several
  # patterns the one processed last determines its reader and variable name
  for (pattern in ls(extensions.dispatch.table)) {
    hits <- grepl(pattern, data.files, ignore.case = TRUE, perl = TRUE)
    # Assigning a list turns readers into a list, even when nothing matched
    readers[hits] <- list(extensions.dispatch.table[[pattern]])
    # Strip the matched extension, then normalise what is left into a name
    stripped <- sub(pattern, '', data.files[hits],
                    ignore.case = TRUE, perl = TRUE)
    varnames[hits] <- clean.variable.name(stripped, config)
  }

  list(readers = readers, varnames = varnames)
}


#' Prepare a regular expression for matching files to be ignored
#'
#' Constructs a single regular expression for matching file names in data that
#' should not be imported. It can detect literal names, globs with wildcards and
#' regular expressions.
#'
#' Patterns are separated by commas. Whitespace around the commas and at the
#' start or end of the whole string is discarded. Each pattern is then handled
#' as follows:
#' \itemize{
#'   \item A pattern enclosed in slashes (\code{/.../}) is used as a regular
#'     expression, with the enclosing slashes removed.
#'   \item Any other pattern is a literal that has to match the complete name:
#'     regex special characters are escaped, \code{*} acts as a wildcard and a
#'     trailing \code{/} matches everything below that path.
#' }
#' Because the patterns are split on commas, a single pattern cannot contain a
#' comma itself.
#'
#' @param ignore_files A character string holding a comma separated list of
#'   all patterns to be matched for ignoring. Only the first element is used.
#'
#' @return A chained regular expression that matches all patterns in the
#'   \code{ignore_files} variable. For an empty string this is \code{"^$"}.
#'
#' @keywords internal
#'
#' @rdname internal.prepare.data.ignore.regex
.prepare.data.ignore.regex <- function(ignore_files) {
  # Drop whitespace at both ends first: the split below only removes the
  # whitespace surrounding a comma, so ' a.csv' or 'a.csv ' would otherwise be
  # kept as a literal including the blank and never match a file name
  ignore_files <- gsub('^\\s+|\\s+$', '', ignore_files)
  ignore_files <- strsplit(ignore_files, '\\s*,\\s*')[[1]]

  # Patterns written as /.../ are regular expressions, the rest are literals
  regexes <- ignore_files[grepl('^/.*/$', ignore_files)]
  literals <- setdiff(ignore_files, regexes)

  # Create regex for special characters in regex to be escaped
  #  (welcome to backslash hell)
  # Note that * is a regex special character but often used in literals as
  #  wildcard
  regex.special <- c('.', '\\', '|', '(', ')', '[', '{', '^', '$', '+', '?')
  regex.special <- paste0('([',
                          paste0('\\', regex.special, collapse = '|'),
                          '])')
  # Escape special characters in literal strings
  literals <- gsub(regex.special, '\\\\\\1', literals)
  # Escape wildcard * in literal strings
  literals <- gsub('\\*', '\\.\\*', literals)
  # Convert trailing slash to wildcard
  literals <- gsub('/$', '/\\.\\*', literals)
  # Anchor literals so that they have to match the complete name. With no
  # literals at all paste0 still yields '^$', which only matches an empty name
  literals <- paste0('^', literals, '$')

  # Remove starting and trailing slashes from regexes
  regexes <- gsub('(^/)|(/$)', '', regexes)

  # Combine and return prepared regexes
  paste0(c(literals, regexes), collapse = '|')
}

Try the ProjectTemplate package in your browser

Any scripts or data that you put into this service are public.

ProjectTemplate documentation built on July 31, 2021, 5:07 p.m.