# Data Reading ------------------------------------------------------------
#' Read all files from a specific directory
#'
#' Lists the files directly inside \code{path}, keeps those whose extension is
#' one to three letters long, drops files whose extension starts with "R"
#' (scripts, R data) as well as .zip/.xlsx/.rar files, and reads the remaining
#' files with \code{data.table::fread()}.
#'
#' @param path Name of the subdirectory to read. Default is getwd().
#' @param pattern Regular expression to match against the file names (not the
#'   full paths). If NULL, all eligible files in the directory are read.
#' @param bind Whether or not to rbind all files into a single table.
#' @param to_envir Whether or not to additionally load the tables into
#'   .GlobalEnv (only used when \code{bind = FALSE}).
#' @param ... Additional specifications for data.table::fread()
#' @return If \code{bind = FALSE}, a list of data.tables read from \code{path},
#'   named after the files without their extension. If \code{bind = TRUE}, a
#'   single data.table with a leading \code{filenames} column holding the path
#'   each row was read from.
read_directory <- function(path = getwd(), pattern = NULL, bind = FALSE, to_envir = FALSE, ...) {
  # list the directory once; test the bare file names so that dots or matching
  # text in the directory part of the path cannot influence the filters
  filenames <- list.files(path, full.names = TRUE)
  base_names <- basename(filenames)

  has_short_ext <- grepl("\\.[[:alpha:]]{1,3}$", base_names)
  is_r_file <- grepl("\\.R[[:alnum:]]*$", base_names)
  is_unreadable <- grepl("\\.(zip|xlsx|rar)$", base_names)
  keep <- has_short_ext & !is_r_file & !is_unreadable

  if (!is.null(pattern)) {
    keep <- keep & grepl(pattern, base_names)
  }
  filenames <- filenames[keep]

  if (bind) {
    # read every file and stack the results; the list names become the
    # `filenames` id column of the combined table
    tables <- lapply(
      filenames,
      data.table::fread,
      encoding = "UTF-8",
      na.strings = "NA",
      integer64 = "character",
      ...
    )
    names(tables) <- filenames
    return(data.table::rbindlist(tables, idcol = "filenames"))
  }

  cat("\nReading the following files:\n", paste(filenames, collapse = "\n"), "", sep = "\n")

  # table names are the file names without directory and extension
  tablenames <- tools::file_path_sans_ext(basename(filenames))

  # reads all files into a table list
  data_list <- lapply(
    filenames,
    data.table::fread,
    encoding = "UTF-8",
    na.strings = "NA",
    integer64 = "character",
    nThread = 30,
    ...
  )
  names(data_list) <- tablenames

  if (to_envir) {
    list2env(data_list, envir = .GlobalEnv)
  }
  data_list
}
#' Helper function to detect \code{input} parameter from dq::data_quality_report()
#'
#' Detects whether \code{input} is a data.frame, a list of data.frames, a
#' directory or a file name, and returns either a single table or a list of
#' tables.
#'
#' @param input A data.frame, list of data frames, character ending in a file extension (eg .csv, .txt, etc.) or a character specifying a directory to read
#' @param dir.pattern If \code{input} is a directory, an optional matching pattern for filenames in said directory.
#' @param ... Additional arguments forwarded to data.table::fread() (directly
#'   for a single file, through read_directory() for a directory).
#' @return \code{input} itself when it is a data.frame or a list of
#'   data.frames; the table read by data.table::fread() when it is a file; the
#'   result of read_directory() when it is a directory.
detect_input <- function(input, dir.pattern = NULL, ...) {
  if (is.character(input)) {
    if (length(input) != 1L || is.na(input)) {
      stop("`input` must be a single file or directory path when it is a character.",
           call. = FALSE)
    }

    # entire directory: ask the file system first, so that directories whose
    # path contains a dot are not mistaken for files
    if (dir.exists(input)) {
      cat('\nLoading all files from directory... \n')
      return(read_directory(input, pattern = dir.pattern, ...))
    }

    # single file: either it exists on disk or it looks like a name with an
    # extension (fread reports the error itself if it cannot be read)
    if (file.exists(input) || grepl(pattern = '\\.[[:alpha:]]', input)) {
      cat('\nLoading file from directory... \n')
      return(data.table::fread(input, encoding = "UTF-8", na.strings = "NA",
                               integer64 = 'character', nThread = 30, ...))
    }

    stop("`input` is neither an existing directory nor a readable file: ", input,
         call. = FALSE)
  }

  # single data.frame
  if (is.data.frame(input)) {
    return(input)
  }

  # list of data.frames
  if (is.list(input) && all(vapply(input, is.data.frame, logical(1)))) {
    return(input)
  }

  stop("`input` must be a data.frame, a list of data.frames, a file path or a directory path.",
       call. = FALSE)
}
#' Read plain text files with an inconsistent amount of separators in different rows
#'
#' Lines are grouped by how many times \code{sep} occurs in them and each group
#' is parsed separately with data.table::fread().
#'
#' @param txt Either a filename or a character vector containing the lines of a plain text file.
#' @param sep Character used as separator in the read file. It is matched
#'   literally (not as a regular expression) when counting separators.
#' @param table_name Prefix to add on the names of the tables returned
#' @param ... Additional arguments forwarded to data.table::fread()
#' @return A list of data.tables, where each data.table has a consistent amount
#'   of separators. Each element is named
#'   \code{<table_name>_<separator count + 1>_columns}.
split_by_colnum <- function(txt, sep = ',', table_name = NULL, ...) {
  # if txt is a filename: a single string ending in a 1-3 letter extension
  is_filename <- length(txt) == 1 && grepl('\\.[[:alpha:]]{1,3}$', txt)
  if (is_filename) {
    txt <- readr::read_lines(txt)
  }
  Encoding(txt) <- 'UTF-8'

  # count literal occurrences of the separator on every line; fixed matching
  # keeps separators such as "|" or "." from being read as regex syntax
  sep_hits <- gregexpr(sep, txt, fixed = TRUE, useBytes = TRUE)
  sep_count <- vapply(sep_hits, function(hit) sum(hit > 0L), integer(1))

  # one table per distinct separator count
  tbls <- split(txt, sep_count)
  tbls <- lapply(tbls, function(lines) data.table::fread(text = lines, nThread = 30, ...))
  names(tbls) <- paste(table_name, (as.integer(names(tbls)) + 1), 'columns', sep = '_')
  tbls
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.