#' Writes .dct Stata file from dictionary and input folder
#'
#' @param directory Path to directory where IBGE files are located
#' @param output Full name of output file, should end with ".dct"
#' @param force_labels Data frame to manually set labels for variables
#'
#' @return Nothing
#'
#' @export
stata.dct <- function(directory, output, force_labels = NULL) {
df <-
read_description(directory) %>%
dplyr::select(-c("value", "value_label")) %>% # Currently we do not use value labels in Stata packages
dplyr::distinct(variable, .keep_all = TRUE) # Removing above columns leads to lots of repeated rows
# exclude_irrelevant_terms_in_label() %>% # Sounds good, doesn't work
if (!is.null(force_labels)) {
df <- df %>%
dplyr::left_join(force_labels, by = "variable")
df$label <- df$label.y
empty_values <- is.na(df$label.y)
df$label[empty_values] <- df$label.x[empty_values]
df <- dplyr::select(df, -c("label.x", "label.y"))
}
body <- df %>%
plyr::mdply(to_string) %>%
dplyr::pull(V1) # V1 is the column created with the application of `to_string`
fileConn <- file(output, encoding = "utf8")
writeLines(c("dictionary {", body, "}"), fileConn)
close(fileConn)
}
to_string <- function(position, variable, label, factor, double, format_size) {
paste0(
" ",
"_column(", position, ")", " ",
if (factor) paste0("str", format_size)
else if (double) "double"
else if (as.numeric(format_size) <= 2) "byte"
else if (as.numeric(format_size) <= 9) "int"
else "long", " ",
variable, " ",
"%", format_size, if(factor) "s" else "f", " ",
"\"", clean(label), "\""
)
}
clean <- function(string) {
string %>%
gsub(pattern = "[[:space:]]", replacement = " ") %>% # Avoid specially newlines inside labels
gsub(pattern = "_+", replacement = "_") %>% # Reduce wasted space on label with multiple _
gsub(pattern = "[\"“”\']", replacement = "-") # Quotation characters can lead to error in writing the dictionary
}
#' Excludes terms with low tf-idf score.
#' Searches for the smallest score cutoff that keeps descriptions at 80 character.
exclude_irrelevant_terms_in_label <- function(df) {
tfidf <- df %>%
tidytext::unnest_tokens(term, label) %>%
dplyr::count(variable, term, sort = TRUE) %>%
tidytext::bind_tf_idf(term, variable, n)
for (i in 1:nrow(df)) {
label <- df[[i, "label"]]
label_terms <- strsplit(label, " ")[[1]]
clean_terms <- tolower(label_terms)
var <- df[[i, "variable"]]
terms <- tfidf[order(-tfidf$tf_idf), ] %>%
dplyr::filter(variable == var) %>%
dplyr::pull(term)
new_label <- label
lower <- 1
upper <- 15
while(upper >= lower) {
middle <- (upper + lower) %/% 2
if(upper == lower + 1) middle <- upper
new_label <- paste0(label_terms[clean_terms %in% terms[1:middle]], collapse = " ")
if(upper == lower) break
if (nchar(new_label) > 80) upper <- middle - 1
else lower <- middle
}
df[i, ]$label <- new_label
}
df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.