#' @importFrom magrittr "%>%"
#' @importFrom dplyr "mutate"
NULL
#' Builds the variable description table of an IBGE dataset
#'
#' Full-joins the dictionary spreadsheet and the SAS input script found in
#' `directory` on `variable`, then derives type information from the SAS read
#' format of every variable.
#'
#' @param directory Path to directory where IBGE files are located
#'
#' @return A tibble with the columns `position`, `variable`, `label`, `value`,
#'   `value_label`, `factor`, `double` and `format_size`
#'
#' @export
read_description <- function(directory) {
  dictionary <- read_dictionary(directory)
  sas_input <- read_sas(directory)

  joined <- dplyr::full_join(dictionary, sas_input, by = "variable")

  described <- dplyr::mutate(
    joined,
    # Factors in SAS start with $
    factor = grepl("^\\$", read_format),
    # Doubles have digits on both sides of the dot
    double = grepl("^\\d+\\.\\d+", read_format),
    # Dropping the leading $ and the trailing . leaves only the size
    format_size = gsub("^\\$|\\.$", "", read_format)
  )

  dplyr::select(
    described,
    c(
      "position", "variable", "label", "value", "value_label",
      "factor", "double", "format_size"
    )
  )
}
# Reads the dictionary spreadsheet (.xls) located in `directory`.
#
# directory: path to the directory that holds exactly one dictionary file.
#
# Returns the spreadsheet contents with the columns position, size, variable,
# variable_category, label, value and value_label; only rows whose position
# is made of digits are kept and `variable` is upper-cased.
# Stops when no dictionary file, or more than one, is found.
read_dictionary <- function(directory) {
  xls <- dir(directory, pattern = "\\.xls$", full.names = TRUE)
  if (length(xls) == 0) {
    stop("Dictionary file not found in specified directory (maybe it is in a subdirectory?)")
  }
  # read_excel() takes a single path, so an ambiguous directory is rejected
  # here with an explicit message instead of failing inside readxl
  if (length(xls) > 1) {
    stop(
      "Multiple dictionary files found in specified directory: ",
      paste(basename(xls), collapse = ", ")
    )
  }

  col_names <- c(
    "position", "size", "variable", "variable_category",
    "label", "value", "value_label"
  )

  readxl::read_excel(
    xls,
    # Critical assumption: reading starts at row 4 (everything above it is
    # useless) and all information is in the first 7 columns
    range = readxl::cell_limits(c(4, NA), c(NA, 7)),
    col_names = col_names
  ) %>%
    # Rows that only list a value/value_label inherit the variable above them
    tidyr::fill(position, size, variable, label) %>%
    # Removes more useless rows (section headers for example)
    dplyr::filter(grepl(position, pattern = "^\\d+$")) %>%
    # Need consistent variable names for subsequent join
    dplyr::mutate(variable = toupper(variable))
}
# Reads the SAS input script (.txt) located in `directory` and parses the
# variable definitions listed between the INPUT statement and the last line
# starting with "@".
#
# directory: path to the directory that holds exactly one input file.
#
# Returns a table with the character columns at_position, variable and
# read_format, with `variable` upper-cased.
# Stops when no input file, or more than one, is found, and when the INPUT
# statement or the "@" definition lines cannot be located in the file.
read_sas <- function(directory) {
  sas_path <- dir(directory, pattern = "\\.txt$", full.names = TRUE)
  if (length(sas_path) == 0) {
    stop("Input file not found in specified directory (maybe it is in a subdirectory?)")
  }
  # Several candidate files would be mixed together (or misread) by
  # read_lines(), so an ambiguous directory is rejected explicitly
  if (length(sas_path) > 1) {
    stop(
      "Multiple input files found in specified directory: ",
      paste(basename(sas_path), collapse = ", ")
    )
  }

  sas <- readr::read_lines(sas_path, locale = readr::locale(encoding = "latin1")) %>%
    gsub(pattern = "\\/\\*.*$", replacement = "") %>% # Removes comments
    trimws() %>% # Removes whitespace from edges
    paste0("\n") # Adds newline to avoid problems with strings ending in null byte

  input_lines <- grep(sas, pattern = "^INPUT", ignore.case = TRUE)
  if (length(input_lines) == 0) {
    stop("INPUT statement not found in input file: ", basename(sas_path))
  }
  # The body starts right after the first INPUT statement
  body_begin <- input_lines[1] + 1

  at_lines <- grep(sas, pattern = "^@")
  body_end <- if (length(at_lines) > 0) at_lines[length(at_lines)] else NA_integer_
  # Guards against a missing body and against `:` building a reversed range
  if (is.na(body_end) || body_end < body_begin) {
    stop(
      "No variable definitions (lines starting with '@') found after the ",
      "INPUT statement in input file: ", basename(sas_path)
    )
  }

  readr::read_table2(
    sas[body_begin:body_end],
    col_names = c("at_position", "variable", "read_format"),
    col_types = readr::cols_only("c", "c", "c")
  ) %>%
    # Need consistent variable names for subsequent join
    dplyr::mutate(variable = toupper(variable))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.