#' Ingest a directory of uniform files
#'
#' This function reads in all files from a directory using the chosen import
#' function. Use the 'pattern' argument to specify a set of files, or a
#' single file type. If collapse = TRUE \code{\link[dplyr]{bind}} is used
#' to match column names and bind the imported data into a single object.
#' \strong{All ingest functions use the source file name as an identifying
#' column to track provenance and relate data and metadata read from files.
#' Please check that you have unique file names.}
#'
#' If \code{check.duplicates = "remove"} then only a single set of records will
#' be retained when files have identical contents. This does not provide rowwise
#' checking for duplicates. A separate data.frame is created specifying the removed
#' input_source, the number of records removed, and the reason for removal.
#'
#'
#' @param directory A character vector with the name of the directory that
#' contains your data files. Defaults to the working directory.
#' @param ingest.function The function to use to read in the files, defaults to
#' \code{\link[utils]{read.table}} but can take any ingestr or standard import function.
#' @param pattern A character vector providing the pattern to match filenames as
#' in \code{\link[base]{list.files}}. Defaults to all files "*".
#' @param collapse A logical argument, when true a single object is returned,
#' when false an object is returned for each file. Defaults to \code{TRUE}.
#' @param recursive A logical argument, when true files are read recursively,
#' defaults to \code{TRUE}. See \code{\link[base]{list.files}} for more
#' information..
#' @param check.duplicates A character argument specifying the action that
#' should be taken if files with duplicate contents are detected. One of
#' "warn", "remove", or NULL to disable checking. Defaults to "warn".
#' @param ... Additional arguments to pass to the input method
#'
#' @return When \code{collapse = T} a single object matching the output class
#' of \code{fun} is returned. When \code{collapse = F} a single object is
#' returned matching the output class of \code{fun} in the parent environment
#' of the function. The names of the input sources are used as object names.
#'
#' @export
ingest_directory <- function(directory = getwd(),
ingest.function = utils::read.csv,
pattern = "*",
collapse = TRUE,
recursive = FALSE,
check.duplicates = "warn",
...){
# Check parameter inputs
all_character(check.duplicates, directory, pattern)
all_logical(collapse, recursive)
function.exists <-
is.logical(try(is.function(ingest.function), silent = T))
if(!function.exists){
stop("The function specified in ingest.function is not found.
Check your spelling or ensure that the necessary library is loaded.")
}
# Wrap the function in try() to keep a bad file from ending the function.
import_function <-
function(x, ...){try(ingest.function(x, ...))}
# Get file list from directory
file_list <- list.files(directory,
pattern = pattern,
recursive = recursive,
full.names = TRUE)
if(any(duplicated(file_list))){
duplicated_file_names <-
file_list[duplicated(file_list)]
stop("The following files have duplicate file names: ",
duplicated_file_names,
". Filenames are used by ingestr to track data provenance and must be unique.")
}
names(file_list) <- file_list
file_count <- length(file_list)
# Read in files
suppressMessages(imported_list <-
lapply(file_list,
import_function,
...))
imported_list <-
imported_list[which(vapply(imported_list,
function(x){all(!grepl("try-error",
class(x)))},
logical(1)))]
successful_file_count <-
length(imported_list)
# Collect header information generated by ingest_* functions
function_as_string <-
lapply(as.list(match.call()), as.character)[["ingest.function"]]
is_ingestr <-
any(grepl(function_as_string, getNamespaceExports("ingestr")))
if(is_ingestr){
suppressMessages(headers <-
lapply(names(imported_list),
ingest_header))
headers_df <-
do.call("rbind", args = headers)
export_header(headers_df, directory)
}
if(successful_file_count != file_count){
message(file_count - successful_file_count,
" files were not successfully ingested.")
}
if(!is.null(check.duplicates) && any(duplicated(imported_list))){
duplicated_content_files <-
names(imported_list)[duplicated(imported_list)]
if(check.duplicates == "warn"){
stop("The following files look like they have duplicate contents. Use
check.duplicates = NULL to suppress this warning or check.duplicated
= 'remove' to automatically remove duplicate files: ",
duplicated_content_files)}
if(check.duplicates == "remove"){
message("Duplicate file contents were removed from ",
duplicated_content_files,
". See removed_data_",
basename(directory))
removed_data <-
dplyr::bind_rows(lapply(seq_len(length(imported_list)),
function(x){
df <-
data.frame(data_removed = paste("All", nrow(x), "records from file."),
reason_for_removal = "duplicate file contents",
stringsAsFactors = FALSE)
if(!is_ingestr){
df$input_source <- names(imported_list)[x]
}
df}))
assign(removed_data,
paste0("removed_data_", basename(directory)),
envir = parent.frame(n = 1))
}
}
if(!collapse){
lapply(seq_along(imported_list),
function(x){
assign(names(imported_list)[x], imported_list[[x]], pos = 1)
})
}
if(collapse & successful_file_count > 1){
column_names <-
purrr::map(imported_list,
~names(.x))
if(!all(purrr::map2_lgl(column_names,
column_names[c(2:successful_file_count, 1)],
~identical(.x, .y)))){
stop("Imported files do not have the same column names.
If you would still like to combine these datasets then run with collapse =
FALSE and manually create a single data set from the returned objects.")
}
imported_attributes <-
purrr::map(imported_list,
~purrr::map(.x, ~attributes(.x)))
template_attributes <-
imported_attributes[[1]]
if(!all(purrr::map2_lgl(imported_attributes,
imported_attributes[c(2:successful_file_count, 1)],
~identical(.x, .y)))){
stop("Imported files do not have the same column attributes.
If you would still like to combine these datasets then run with collapse =
FALSE and manually create a single data set from the returned objects.")
}
if(is_ingestr){
importedData <-
dplyr::bind_rows(imported_list)
} else {
importedData <-
dplyr::bind_rows(imported_list, .id = "input_source")
}
lapply(names(importedData)[-1],
function(x){
attributes(importedData[[x]]) <<- template_attributes[[x]]
})
if(!identical(lapply(importedData[, -1],
attributes),
template_attributes)){
message("Unable to restore lost column attributes when the datasets were combined.")
}
return(importedData)
} else {
imported_list[[1]]
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.