R/ingest_directory.R

Defines functions ingest_directory

Documented in ingest_directory

#' Ingest a directory of uniform files
#'
#' This function reads in all files from a directory using the chosen import
#' function.  Use the 'pattern' argument to specify a set of files, or a
#' single file type. If collapse = TRUE \code{\link[dplyr]{bind}} is used
#' to match column names and bind the imported data into a single object.
#' \strong{All ingest functions use the source file name as an identifying
#' column to track provenance and relate data and metadata read from files.
#' Please check that you have unique file names.}
#'
#' If \code{check.duplicates = "remove"} then only a single set of records will
#' be retained when files have identical contents. This does not provide rowwise
#' checking for duplicates. A separate data.frame is created specifying the removed
#' input_source, the number of records removed, and the reason for removal.
#'
#'
#' @param directory A character vector with the name of the directory that
#'   contains your data files. Defaults to the working directory.
#' @param ingest.function The function to use to read in the files, defaults to
#'   \code{\link[utils]{read.table}} but can take any ingestr or standard import function.
#' @param pattern A character vector providing the pattern to match filenames as
#'   in \code{\link[base]{list.files}}. Defaults to all files "*".
#' @param collapse A logical argument, when true a single object is returned,
#'   when false an object is returned for each file. Defaults to \code{TRUE}.
#' @param recursive A logical argument, when true files are read recursively,
#'   defaults to \code{TRUE}. See \code{\link[base]{list.files}} for more
#'   information..
#' @param check.duplicates A character argument specifying the action that
#'   should be taken if files with duplicate contents are detected. One of
#'   "warn", "remove", or NULL to disable checking. Defaults to "warn".
#' @param ... Additional arguments to pass to the input method
#'
#' @return  When \code{collapse = T} a single object matching the output class
#'   of \code{fun} is returned. When \code{collapse = F} a single object is
#'   returned matching the output class of \code{fun} in the parent environment
#'   of the function. The names of the input sources are used as object names.
#'
#' @export

ingest_directory <- function(directory = getwd(),
                             ingest.function = utils::read.csv,
                             pattern = "*",
                             collapse = TRUE,
                             recursive = FALSE,
                             check.duplicates = "warn",
                             ...){

  # Check parameter inputs

  all_character(check.duplicates, directory, pattern)

  all_logical(collapse, recursive)

  function.exists <-
    is.logical(try(is.function(ingest.function), silent = T))

  if(!function.exists){
    stop("The function specified in ingest.function is not found.
         Check your spelling or ensure that the necessary library is loaded.")
  }

  # Wrap the function in try() to keep a bad file from ending the function.
  import_function <-
    function(x, ...){try(ingest.function(x, ...))}

  # Get file list from directory
  file_list <- list.files(directory,
                          pattern = pattern,
                          recursive = recursive,
                          full.names = TRUE)

  if(any(duplicated(file_list))){
    duplicated_file_names <-
      file_list[duplicated(file_list)]
    stop("The following files have duplicate file names: ",
         duplicated_file_names,
         ". Filenames are used by ingestr to track data provenance and must be unique.")
  }

  names(file_list) <- file_list

  file_count <- length(file_list)
  
  # Read in files
  suppressMessages(imported_list <-
                     lapply(file_list,
                            import_function,
                            ...))

  imported_list <- 
    imported_list[which(vapply(imported_list,
                               function(x){all(!grepl("try-error",
                                                      class(x)))},
                               logical(1)))]

  successful_file_count <-
    length(imported_list)

  # Collect header information generated by ingest_* functions
  function_as_string <- 
    lapply(as.list(match.call()), as.character)[["ingest.function"]]
  
  is_ingestr <- 
    any(grepl(function_as_string, getNamespaceExports("ingestr")))
  
  if(is_ingestr){
    suppressMessages(headers <- 
                       lapply(names(imported_list),
                              ingest_header))
    
    headers_df <- 
      do.call("rbind", args = headers)
    
    export_header(headers_df, directory)
  }
  
  if(successful_file_count != file_count){
    message(file_count - successful_file_count,
            " files were not successfully ingested.")
  }

  if(!is.null(check.duplicates) && any(duplicated(imported_list))){
    duplicated_content_files <-
      names(imported_list)[duplicated(imported_list)]

    if(check.duplicates == "warn"){
      stop("The following files look like they have duplicate contents. Use
           check.duplicates = NULL to suppress this warning or check.duplicated
           = 'remove' to automatically remove duplicate files: ",
           duplicated_content_files)}
    if(check.duplicates == "remove"){
      message("Duplicate file contents were removed from ",
              duplicated_content_files,
              ". See removed_data_",
              basename(directory))
      removed_data <-
        dplyr::bind_rows(lapply(seq_len(length(imported_list)),
                                function(x){
                                  
                                  df <- 
                                    data.frame(data_removed = paste("All", nrow(x), "records from file."),
                                               reason_for_removal = "duplicate file contents",
                                               stringsAsFactors = FALSE)
                                  
                                  if(!is_ingestr){
                                    df$input_source <- names(imported_list)[x]
                                  }
                                  
                                  df}))
      
      assign(removed_data,
             paste0("removed_data_", basename(directory)),
             envir = parent.frame(n = 1))
    }
  }
  
  if(!collapse){
    lapply(seq_along(imported_list),
           function(x){
             assign(names(imported_list)[x], imported_list[[x]], pos = 1)
           })
  }

  if(collapse & successful_file_count > 1){

    column_names <-
      purrr::map(imported_list,
                 ~names(.x))

    if(!all(purrr::map2_lgl(column_names,
                            column_names[c(2:successful_file_count, 1)],
                            ~identical(.x, .y)))){
      stop("Imported files do not have the same column names.
           If you would still like to combine these datasets then run with collapse =
           FALSE and manually create a single data set from the returned objects.")
    }

    imported_attributes <-
      purrr::map(imported_list,
                 ~purrr::map(.x, ~attributes(.x)))

    template_attributes <-
      imported_attributes[[1]]

    if(!all(purrr::map2_lgl(imported_attributes,
                            imported_attributes[c(2:successful_file_count, 1)],
                            ~identical(.x, .y)))){
      stop("Imported files do not have the same column attributes.
           If you would still like to combine these datasets then run with collapse =
           FALSE and manually create a single data set from the returned objects.")
    }

    if(is_ingestr){
      importedData <-
        dplyr::bind_rows(imported_list) 
    } else {
      importedData <-
        dplyr::bind_rows(imported_list, .id = "input_source")
    }
    
    lapply(names(importedData)[-1],
           function(x){
             attributes(importedData[[x]]) <<- template_attributes[[x]]
           })

    if(!identical(lapply(importedData[, -1],
                         attributes),
                  template_attributes)){
      message("Unable to restore lost column attributes when the datasets were combined.")
    }

    return(importedData)
  } else {
    imported_list[[1]]
  }
}
jpshanno/ingestr documentation built on Sept. 24, 2020, 11:40 a.m.