R/data_reading.R

Defines functions split_by_colnum detect_input read_directory

# Data Reading ------------------------------------------------------------

# Reads several delimited text files from the same directory.
#
# Arguments:
#   path      Directory to scan (defaults to the working directory).
#   pattern   Optional regular expression; only files whose *file name*
#             (not the directory part of the path) matches it are read.
#   bind      If TRUE, all tables are row-bound with data.table::rbindlist()
#             and the single resulting table is returned.
#   to_envir  If TRUE (and `bind` is FALSE), every table is also assigned into
#             the global environment under its table name.
#   ...       Further arguments forwarded to data.table::fread().
#
# Returns a named list of tables (one per file, named after the file without
# its extension), or a single bound table when `bind = TRUE`.
read_directory <- function(path = getwd(), pattern = NULL, bind = FALSE, to_envir = FALSE, ...){

  # list the directory once; all filters below work on the bare file names
  filenames <- list.files(path, full.names = TRUE)
  filenames <- filenames[!dir.exists(filenames)]
  files <- basename(filenames)

  # keep files with a 1-3 letter extension, skipping R files (.R, .Rmd, .Rds...)
  # and archives / Excel files. The patterns are anchored on the extension so
  # that names such as 'library.csv' or 'unzipped.csv' are not dropped.
  has_short_ext <- grepl(pattern = '\\.[[:alpha:]]{1,3}$', x = files)
  is_r_file     <- grepl(pattern = '\\.R[[:alpha:]]{0,2}$', x = files)
  is_excluded   <- grepl(pattern = '\\.(zip|xlsx|rar)$', x = files, ignore.case = TRUE)
  keep <- has_short_ext & !is_r_file & !is_excluded

  if(!is.null(pattern)){
    # match against the file name so directory names in `path` cannot match
    keep <- keep & grepl(pattern, files)
  }
  filenames <- filenames[keep]

  cat('\nReading the following files:\n', paste(filenames, collapse = '\n'), sep  = '\n', '')

  # creates names for all tables from their filenames (no directory, no extension)
  tablenames <- tools::file_path_sans_ext(basename(filenames))

  # reads all files into a table list
  data_list <- lapply(filenames, data.table::fread, encoding = "UTF-8", na.strings = "NA", integer64 = 'character', nThread = 30, ...)
  names(data_list) <- tablenames

  # binding takes precedence over `to_envir`: the bound table is returned directly
  if(bind){
    return(data.table::rbindlist(data_list))
  }

  if(to_envir){
    list2env(data_list, envir = .GlobalEnv)
  }
  data_list
}

# Detects whether the input is a file name, a directory, a data.frame or a
# list of data.frames, and returns either a single table or a list of tables.
#
# Arguments:
#   input        A single path (file or directory), a data.frame, or a list
#                whose elements are all data.frames.
#   dir.pattern  Optional regular expression passed to read_directory() as
#                `pattern` when `input` is a directory.
#   ...          Further arguments forwarded to data.table::fread() (single
#                file) or read_directory() (directory).
#
# Returns the table read from a file, the list returned by read_directory(),
# or `input` itself when it already is a data.frame / list of data.frames.
# Any other input raises an error.
detect_input <- function(input, dir.pattern = NULL,...){
  # determine if the analysis will be done on a single file, a data.frame, a list of data.frames or a directory
  if(is.character(input)){
    if(length(input) != 1L || is.na(input)){
      stop('`input` must be a single file or directory path.', call. = FALSE)
    }
    # single file: looks like it has an extension and is not an existing
    # directory (directories may legitimately contain dots in their path)
    if(!dir.exists(input) && grepl(pattern = '\\.[[:alpha:]]', input)){
      cat('\nLoading file from directory... \n')
      return(data.table::fread(input,  encoding = "UTF-8", na.strings = "NA", integer64 = 'character', nThread = 30, ...))
    }
    # entire directory
    cat('\nLoading all files from directory... \n')
    return(read_directory(input, pattern = dir.pattern, ...))
  }

  # single data.frame
  if(is.data.frame(input)){
    return(input)
  }

  # list of data.frames
  if(is.list(input) && all(vapply(input, is.data.frame, logical(1)))){
    return(input)
  }

  # fail explicitly instead of returning an undefined (or unrelated, inherited
  # from an enclosing environment) `dat`
  stop('`input` must be a file path, a directory path, a data.frame or a list of data.frames.', call. = FALSE)
}

# Reads several tables from the same text into a list, split by the number of
# columns each line has.
#
# Arguments:
#   txt         Either a character vector of text lines, or a single string
#               ending in a 1-3 letter extension, which is treated as a file
#               name and read with readr::read_lines().
#   sep         Column separator. It is used as a regular expression when
#               counting separators per line, so regex metacharacters such as
#               '|' or '.' must be escaped by the caller.
#   table_name  Optional prefix for the names of the resulting tables.
#   fread_fill  Passed to data.table::fread() as `fill`.
#
# Returns a list of tables, one per distinct column count, named
# '<table_name>_<n columns>_columnas' (or '<n columns>_columnas' when
# `table_name` is NULL).
split_by_colnum <- function(txt, sep = ',', table_name = NULL, fread_fill = FALSE){
  # if txt is a filename. `&&` keeps the condition scalar for multi-line input,
  # and the pattern must not have anything after the `$` anchor or it never matches
  if(length(txt) == 1 && grepl('\\.[[:alpha:]]{1,3}$', txt)){
    txt <- readr::read_lines(txt)
  }
  Encoding(txt) <-  'UTF-8'

  # lines with the same number of separators belong to the same table
  sep_count <- stringr::str_count(string = txt, pattern = sep)
  tbls <- split(txt, sep_count)
  tbls <- lapply(tbls, function(x) data.table::fread(text = x, nThread = 30, fill = fread_fill))

  # n separators means n + 1 columns
  names(tbls) <- paste(table_name, (as.integer(names(tbls))+1), 'columnas', sep = '_')
  tbls
}
pheymanss/dq documentation built on Jan. 17, 2020, 1:09 p.m.