R/frk.R

Defines functions frk_summarise frk_summarise_ frk_read_delim frk_stack

Documented in frk_read_delim frk_stack frk_summarise frk_summarise_

#' Guess all structural characteristics of one or more flat files
#' 
#' @description
#' Runs all functions from the `guess` family to gather all structural
#' characteristics of given file(s).
#' 
#' @param path a character vector of full path names; the default corresponds to the working directory, getwd(). Tilde expansion (see path.expand) is performed. Missing values will be ignored.
#' @param pattern an optional regular expression. Only file names which match the regular expression will be returned.
#' @param recursive logical. Should the listing recurse into directories?
#' @param guess_max Maximum number of records to use for guess
#' @param skip atomic numeric specifying the number of lines to skip in each file. If NULL, it will be guessed.
#' @param verbose Whether to output guess as message
#' @param progress Show progress bar?
#' @return A tibble whose lines stores the guesses for each file
#' 
#' @seealso [guess_delim()], [guess_col_names()], and the whole `guess` family
#' 
#' @examples
#' \dontrun{
#' # Create sample files
#' file <- tempfile()
#' write.table(
#'   dplyr::storms, file, sep = "|", dec = ",",
#'   col.names = TRUE, row.names = FALSE)
#' file2 <- tempfile()
#' write_csv(dplyr::storms, file2)
#' 
#' # Run all guesses
#' summ <- frk_summarise(file)
#' str(summ)
#' 
#' # Run all guesses on multiple files
#' summ <- frk_summarise(c(file, file2))
#' str(summ)
#' }
#' 
#' @export
frk_summarise <- function(path, 
                          pattern = NULL, 
                          recursive = FALSE, 
                          guess_max = 10, 
                          skip = NULL,
                          verbose = FALSE, 
                          progress = TRUE) {
  
  # If path is a directory, list all the files.
  files <- list.files(path = path, 
                      pattern = pattern, 
                      all.files = FALSE, 
                      full.names = TRUE,
                      recursive = recursive,
                      ignore.case = FALSE,
                      include.dirs = FALSE,
                      no.. = FALSE)
  
  # If path is a single file then it will return a length zero object.
  if(length(files) == 0) files <- path
  
  
  # Handle verbose
  if (length(files) > 1 & verbose) {
    warning("Only use verbose = TRUE when summarizing one file")
    verbose <- FALSE
  }
  
  message("Summarising files")
  
  # Get summary for all files
  if(progress) {
    pb <- progress::progress_bar$new(total = length(files),
                                     format = "[:bar] :current of :total")
    summary <- purrr::map_df(files, ~{
      pb$tick()
      frk_summarise_(.x, guess_max, verbose, skip = skip)
    })
  } else {
    summary <- purrr::map_df(files, ~frk_summarise_(.x, guess_max, verbose))
  }
  
  return(summary)
}

#' Guess all structural characteristics of a flat file
#' 
#' @param file Path to file
#' @param guess_max Maximum number of records to use for guess
#' @param verbose Whether to output guess as message
#' @return A named list with the guesses for the specified file
frk_summarise_ <- function(file, guess_max = 10, skip = NULL, verbose = FALSE) {
  
  # file extension
  file_ext <- base::tolower(tools::file_ext(file))
  
  if(file_ext %in% c("txt", "tsv", "csv", "dat", "")) {
    # Guess encoding
    guessed_encoding <- guess_encoding(file, guess_max, verbose)
    
    # Guess lines to skip
    guessed_skip <- if(is.null(skip)) guess_skip(file, guess_max, verbose) else skip
    
    # Guess delim
    guessed_delim <- guess_delim(file, guess_max, verbose, encoding = guessed_encoding, skip = guessed_skip)$char[1]
    
    # Guess has header
    guessed_has_header <- guess_has_header(file, guess_max, verbose, skip = guessed_skip, encoding = guessed_encoding)
    
    # Guess quote
    guessed_quote <- guess_quote(file, guess_max, verbose, skip = guessed_skip)
    
    # Guess col types
    guessed_col_types <- guess_col_types(file, guess_max, verbose, delim = guessed_delim, skip = guessed_skip, encoding = guessed_encoding)
    
    # Gues col names
    guessed_col_names <- guess_col_names(file, guess_max, verbose, delim = guessed_delim, header = guessed_has_header, quote = guessed_quote, encoding = guessed_encoding, skip = guessed_skip)
    
    # Suggested col names
    suggested_col_names <- if(guessed_has_header) tide_names(guessed_col_names) else FALSE
    
    # Guess decimal and grouping marks
    guessed_decimal_mark <- guess_decimal_mark(file, guess_max, verbose, delim = guessed_delim, quote = guessed_quote, skip = guessed_skip)
    guessed_grouping_mark <- guess_grouping_mark(file, guess_max, verbose)
  } else {
    # Guess encoding
    guessed_encoding <- as.character(NA)
    
    # Guess lines to skip
    guessed_skip <- as.numeric(NA)
    guessed_skip <- as.numeric(NA)
    
    # Guess delim
    guessed_delim <- as.character(NA)
    
    # Guess has header
    guessed_has_header <- as.logical(NA)
    
    # Guess quote
    guessed_quote <- as.character(NA)
    
    # Guess col types
    guessed_col_types <- as.list(NA)
    
    # Guess col names
    guessed_col_names <- as.list(NA)
    
    # Suggested col names
    suggested_col_names <- as.list(NA)
    
    # Guess decimal and grouping marks
    guessed_decimal_mark <- as.character(NA)
    guessed_grouping_mark <- as.character(NA)
  }
  
  
  if(verbose) message(sprintf("%s\n", file))
  
  return(list(
    file = file,
    file_ext = file_ext,
    delim = guessed_delim,
    encoding = guessed_encoding,
    has_header = guessed_has_header,
    col_types = list(guessed_col_types),
    col_names = list(guessed_col_names),
    suggested_col_names = list(suggested_col_names),
    quote = guessed_quote,
    skip = guessed_skip,
    decimal_mark = guessed_decimal_mark,
    grouping_mark = guessed_grouping_mark,
    escape_backslash = FALSE,
    escape_double = FALSE,
    na = list(c("", "NA")),
    quoted_na = TRUE,
    comment = "",
    trim_ws = TRUE,
    n_max = Inf
  ))
}


#' Read flat file knowing nothing about them
#' 
#' @description
#' This is a wrapper for [readr::read_delim()] that guesses the main arguments,
#' namely `delim`, `quote`, `col_names`, `encoding`, `decimal_mark`, `grouping_mark`, and `skip`.
#' 
#' @param file Path to file
#' @param delim (Guessed) Delimiter of file
#' @param quote (Guessed) Character used to quote strings
#' @param escape_backslash Whether backslashed are used to escape special characters
#' @param escape_double Whether quotes are escaped by doubling them
#' @param col_names (Guessed) Whether file has column names or vector with column names
#' @param col_types Vector with types to be forced on columns or NULL for guess
#' @param locale (Guessed) Some region-dependent information
#' @param na String or character vector to use for missing values
#' @param quoted_na Whether missing values can be quoted
#' @param comment String that indicates comments
#' @param trim_ws Whether to trim whitespace from each field
#' @param skip (Guessed) Number of lines to skip before reading data
#' @param n_max Maximum number of records to read
#' @param guess_max Maximum number of records to use for guess
#' 
#' @seealso [readr::read_delim()], [readr::locale()]
#' 
#' @examples
#' \dontrun{
#' # Create sample file
#' file <- tempfile()
#' write.table(
#'   dplyr::storms, file, sep = "|", dec = ",",
#'   col.names = TRUE, row.names = FALSE)
#' 
#' # Read file
#' tbl <- frk_read_delim(file)
#' glimpse(tbl)
#' }
#' 
#' @export
frk_read_delim <- function(file,
                           delim = guess_delim(file, guess_max)$char[1],
                           quote = guess_quote(file, guess_max),
                           escape_backslash = FALSE,
                           escape_double = TRUE,
                           col_names = guess_has_header(file, guess_max),
                           col_types = NULL,
                           locale = readr::locale(
                             encoding = guess_encoding(file, guess_max)$encoding[1],
                             decimal_mark = guess_decimal_mark(file, guess_max),
                             grouping_mark = guess_grouping_mark(file, guess_max)),
                           na = c("", "NA"),
                           quoted_na = TRUE,
                           comment = "",
                           trim_ws = TRUE,
                           skip = guess_skip(file, guess_max),
                           n_max = Inf,
                           guess_max = min(10, n_max)) {
  
  # Use read delim with new arguments
  out <- readr::read_delim(
    file, delim, quote, escape_backslash, escape_double,
    col_names, col_types, locale, na, quoted_na, comment,
    trim_ws, skip, n_max, guess_max)
  
  return(out)
}


#' Stack a set of files from a instruction
#' 
#' @description
#' This is a wrapper for [readr::read_delim()] that guesses the main arguments,
#' namely `delim`, `quote`, `col_names`, `encoding`, `decimal_mark`, `grouping_mark`, and `skip`.
#' 
#' @param file A tibble returned by \code{frk_summarise()}.
#' 
#' @seealso [frk_summarise()], [frk_read_delim()]
#' 
#' @examples
#' \dontrun{
#' dir <- "~/folder_full_of_flat_files"
#' stacked_files <- dir %>% frk_summarise %>% frk_stack
#' }
#' 
#' @export
frk_stack <- function(instruction) {
  purrr::map(instruction, ~frk_read_delim)
}
Athospd/forkliftr documentation built on May 14, 2019, 9:42 a.m.