# R/data_reading.R
#
# Defines functions: read_directory, detect_input, split_by_colnum

# Data Reading ------------------------------------------------------------

#' Read all files from a specific directory
#'
#' Lists the files directly inside \code{path}, keeps those whose name ends in
#' a 1-3 letter extension and that are neither R files (extension starting
#' with \code{.R}) nor zip/xlsx/rar files, optionally filters them by
#' \code{pattern}, and reads each remaining file with
#' \code{data.table::fread()}.
#'
#' @param path Directory to read. Default is getwd().
#' @param pattern Regular expression to match in filenames (the directory part
#'   of the path is not considered). If NULL, all files in the directory are read.
#' @param bind Whether or not to rbind all files into a single table.
#' @param to_envir Whether or not to also load the files directly into
#'   .GlobalEnv. Only applies when \code{bind = FALSE}.
#' @param ... Additional specifications for data.table::fread()
#' @return If \code{bind = TRUE}, a single data.table whose first column,
#'   \code{filenames}, holds the path each row was read from. Otherwise a list
#'   of data.tables read from \code{path}, named after the files without
#'   directory and extension.
read_directory <- function(path = getwd(), pattern = NULL, bind = FALSE, to_envir = FALSE, ...){

  # list the directory once and run every name filter on the bare file names
  all_paths <- list.files(path, full.names = TRUE)
  base_names <- basename(all_paths)

  has_short_ext <- grepl(pattern = '\\.[[:alpha:]]{1,3}$', x = base_names)
  # anchored so that only the extension is tested, not any ".R" inside the name
  is_r_file <- grepl(pattern = '\\.R[[:alpha:]]{0,2}$', x = base_names)
  # dots escaped and anchored: a name such as "unzipped.csv" must not be dropped
  is_excluded <- grepl(pattern = '\\.(zip|xlsx|rar)$', x = base_names)

  filenames <- all_paths[has_short_ext & !is_r_file & !is_excluded]

  if(!is.null(pattern)){
    # match on the file name only, so the directory name cannot cause a match
    filenames <- filenames[grepl(pattern, basename(filenames))]
  }

  if(bind){
    # read all files directly into a same data.table, no rbind needed
    # reference: https://twitter.com/Hung_TT_Nguyen/status/1230519551076339712
    dat <- data.table::data.table(filenames = filenames)[
      ,
      data.table::fread(file = as.character(.BY),
                        encoding = "UTF-8",
                        na.strings = "NA",
                        integer64 = 'character',
                        ...),
      by = filenames
    ]
    return(dat)
  }

  cat('\nReading the following files:\n', paste(filenames, collapse = '\n'), sep  = '\n', '')

  # creates names for all tables from their filenames: drop directory and extension
  tablenames <- sub(pattern = '\\.[^.]+$', replacement = '', x = basename(filenames))

  # reads all files into a table list
  data_list <- lapply(filenames, data.table::fread,
                      encoding = "UTF-8", na.strings = "NA",
                      integer64 = 'character', nThread = 30, ...)
  names(data_list) <- tablenames

  if(to_envir){
    list2env(data_list, envir = .GlobalEnv)
  }
  return(data_list)
}

#' Helper function to detect \code{input} parameter from dq::data_quality_report()
#'
#' @param input A data.frame, a list of data frames, a character ending in a
#'   file extension (eg .csv, .txt, etc.) or a character specifying a directory
#'   to read.
#' @param dir.pattern If \code{input} is a directory, an optional matching pattern for filenames in said directory.
#' @param ... Passed on to \code{data.table::fread()} (single file) or to
#'   \code{read_directory()} (directory).
#' @return A data.frame/data.table when \code{input} is a data.frame or a
#'   single file; whatever \code{read_directory()} returns when \code{input} is
#'   a directory; \code{input} itself when it is a list of data.frames.
# detects if the input is a list, a data frame, a directory or a filename and
# returns either a df or a list; any other kind of input is an error
detect_input <- function(input, dir.pattern = NULL,...){
  # determine if the analysis will be done on a single file, a data.frame, a list of data.frames or a directory
  if(is.character(input)){
    # the branches below test a scalar condition, so exactly one path is required
    if(length(input) != 1L || is.na(input)){
      stop("`input` must be a single file or directory path when it is a character.",
           call. = FALSE)
    }

    looks_like_file <- grepl(pattern = '\\.[[:alpha:]]', input)

    # an existing directory always wins, even if its name contains ".<letter>"
    if(looks_like_file && !dir.exists(input)){
      # single file
      cat('\nLoading file from directory... \n')
      dat <- data.table::fread(input,  encoding = "UTF-8", na.strings = "NA", integer64 = 'character', nThread = 30, ...)
    }else{
      # entire directory
      cat('\nLoading all files from directory... \n')
      dat <- read_directory(input, pattern = dir.pattern, ...)
    }
    return(dat)
  }

  # single data.frame
  if(is.data.frame(input)){
    return(input)
  }

  # list of data.frames
  if(is.list(input) && all(vapply(input, is.data.frame, logical(1)))){
    return(input)
  }

  stop("`input` must be a data.frame, a list of data.frames, a file name or a directory.",
       call. = FALSE)
}

#' Read plain text files with an inconsistent amount of separators in different rows
#'
#' Lines are grouped by how many times \code{sep} occurs in them, and each
#' group is parsed on its own with \code{data.table::fread(text = ...)}.
#'
#' @param txt Either a filename (a single string ending in a 1-3 letter
#'   extension) or a character vector containing the lines of a plain text file.
#' @param sep Character used as separator in the read file. It is used as a
#'   regular expression when counting separators per line and is not forwarded
#'   to \code{fread()}, which detects the separator by itself unless one is
#'   given through \code{...}.
#' @param table_name Prefix to add on the names of the tables returned
#' @param ... Additional specifications for data.table::fread()
#' @return A list of data.tables, where each data.table has a consistent amount of separators
split_by_colnum <- function(txt, sep = ',', table_name = NULL, ...){
  # if txt is a filename
  # `&&` keeps the condition scalar when txt holds many lines; the pattern has
  # no trailing space after `$`, otherwise it could never match
  if(length(txt) == 1 && grepl('\\.[[:alpha:]]{1,3}$', txt)){
    txt <- readr::read_lines(txt)
  }
  Encoding(txt) <-  'UTF-8'

  # number of separators found on each line
  sep_count <- stringr::str_count(string = txt, pattern = sep)

  # one group of lines per distinct separator count, each parsed separately
  tbls <- split(txt, sep_count)
  tbls <- purrr::map(tbls, function(x) data.table::fread(text = x, nThread = 30, ...))

  # n separators means n + 1 columns
  names(tbls) <- paste(table_name, (as.integer(names(tbls))+1), 'columns', sep = '_')
  return(tbls)
}
# pheymanss/dq documentation built on March 12, 2020, 1:29 a.m.