# R/read_description.R
#
# Defines functions: read_sas, read_dictionary, read_description
#
# Documented in: read_description

#' @importFrom magrittr "%>%"
#' @importFrom dplyr "mutate"
NULL

#' Builds the variable description table for an IBGE dataset
#'
#' Joins the dictionary spreadsheet and the SAS input layout found in
#' the given directory by variable name, then derives from each SAS read
#' format whether the variable is a factor or a double, and its format size.
#'
#' @param directory Path to directory where IBGE files are located
#'
#' @return A tibble with columns position, variable, label, value,
#'   value_label, factor, double and format_size
#'
#' @export
read_description <- function(directory) {
  dictionary <- read_dictionary(directory)
  layout <- read_sas(directory)

  # Keep variables present in either source
  joined <- dplyr::full_join(dictionary, layout, by = "variable")

  described <- mutate(
    joined,
    # SAS character (factor) formats start with $
    factor = grepl("^\\$", read_format),
    # Doubles have digits on both sides of the dot
    double = grepl("^\\d+\\.\\d+", read_format),
    # Strip the leading $ and the trailing . so only the size remains
    format_size = gsub("^\\$|\\.$", "", read_format)
  )

  dplyr::select(
    described,
    c("position", "variable", "label", "value", "value_label", "factor", "double", "format_size")
  )
}

# Reads the IBGE variable dictionary from the single .xls file in `directory`.
#
# directory: path to the folder that holds the dictionary spreadsheet.
#
# Returns the table read by readxl::read_excel() with columns position, size,
# variable, variable_category, label, value and value_label, keeping only rows
# whose position is made of digits, with variable names upper-cased.
#
# Stops with an error when no .xls file is found, or when more than one is
# found (read_excel() accepts a single path only, so the choice would be
# ambiguous).
read_dictionary <- function(directory) {
  xls <- dir(directory, pattern = "\\.xls$", full.names = TRUE)
  if (length(xls) == 0) {
    stop("Dictionary file not found in specified directory (maybe it is in a subdirectory?)")
  }
  if (length(xls) > 1) {
    stop(
      "More than one dictionary file (.xls) found in specified directory: ",
      paste(basename(xls), collapse = ", ")
    )
  }

  col_names <- c("position", "size", "variable", "variable_category", "label", "value", "value_label")
  readxl::read_excel(
    xls,
    range = readxl::cell_limits(c(4, NA), c(NA, 7)), # Critical assumption: 4 useless rows and all information in first 7 columns
    col_names = col_names
  ) %>%
    # Variable-level cells are only filled on the first row of each variable,
    # so carry them down to the rows holding the remaining value labels
    tidyr::fill(position, size, variable, label) %>%
    dplyr::filter(grepl(position, pattern = "^\\d+$")) %>% # Removes more useless rows (section headers for example)
    dplyr::mutate(variable = toupper(variable)) # Need consistent variable names for subsequent join
}

# Reads the SAS input layout from the single .txt file in `directory`.
#
# directory: path to the folder that holds the SAS input script.
#
# Returns the table parsed by readr::read_table2() with character columns
# at_position, variable and read_format, one row per line between the INPUT
# statement and the last line starting with @, with variable names upper-cased.
#
# Stops with an error when no .txt file is found, when more than one is found,
# when the file has no INPUT statement, or when no @ line follows it.
read_sas <- function(directory) {
  sas_path <- dir(directory, pattern = "\\.txt$", full.names = TRUE)
  if (length(sas_path) == 0) {
    stop("Input file not found in specified directory (maybe it is in a subdirectory?)")
  }
  if (length(sas_path) > 1) {
    stop(
      "More than one input file (.txt) found in specified directory: ",
      paste(basename(sas_path), collapse = ", ")
    )
  }

  sas <- readr::read_lines(sas_path, locale = readr::locale(encoding = "latin1")) %>%
    gsub(pattern = "\\/\\*.*$", replacement = "") %>% # Removes comments
    trimws() %>% # Removes whitespace from edges
    paste0("\n") # Adds newline to avoid problems with strings ending in null byte

  input_lines <- grep(sas, pattern = "^INPUT", ignore.case = TRUE)
  if (length(input_lines) == 0) {
    stop("No INPUT statement found in input file: ", basename(sas_path))
  }
  # The layout body starts on the line after the (first) INPUT statement
  body_begin <- input_lines[1] + 1

  field_lines <- grep(sas, pattern = "^@")
  field_lines <- field_lines[field_lines >= body_begin]
  if (length(field_lines) == 0) {
    stop("No variable definitions (lines starting with @) found after INPUT in input file: ", basename(sas_path))
  }
  body_end <- field_lines[length(field_lines)]

  readr::read_table2(
    sas[body_begin:body_end],
    col_names = c("at_position", "variable", "read_format"),
    col_types = readr::cols_only("c", "c", "c")
  ) %>%
    dplyr::mutate(variable = toupper(variable)) # Need consistent variable names for subsequent join
}
# datazoompuc/IBGEreadR documentation built on March 3, 2021, 12:32 a.m.