R/stata.R

Defines functions exclude_irrelevant_terms_in_label clean to_string stata.dct

Documented in stata.dct

#' Writes .dct Stata file from dictionary and input folder
#'
#' @param directory Path to directory where IBGE files are located
#' @param output Full name of output file, should end with ".dct"
#' @param force_labels Data frame to manually set labels for variables
#'
#' @return Nothing
#'
#' @export
stata.dct <- function(directory, output, force_labels = NULL) {
  df <-
    read_description(directory) %>%
    dplyr::select(-c("value", "value_label")) %>% # Currently we do not use value labels in Stata packages
    dplyr::distinct(variable, .keep_all = TRUE) # Removing above columns leads to lots of repeated rows
    # exclude_irrelevant_terms_in_label() %>% # Sounds good, doesn't work

  if (!is.null(force_labels)) {
    df <- df %>%
      dplyr::left_join(force_labels, by = "variable")

    df$label <- df$label.y
    empty_values <- is.na(df$label.y)
    df$label[empty_values] <- df$label.x[empty_values]

    df <- dplyr::select(df, -c("label.x", "label.y"))
  }

  body  <- df %>%
    plyr::mdply(to_string) %>%
    dplyr::pull(V1) # V1 is the column created with the application of `to_string`

  fileConn <- file(output, encoding = "utf8")
  writeLines(c("dictionary {", body, "}"), fileConn)
  close(fileConn)
}

to_string <- function(position, variable, label, factor, double, format_size) {
  paste0(
    "  ",
    "_column(", position, ")", " ",
    if (factor) paste0("str", format_size)
    else if (double) "double"
    else if (as.numeric(format_size) <= 2) "byte"
    else if (as.numeric(format_size) <= 9) "int"
    else "long", " ",
    variable, " ",
    "%", format_size, if(factor) "s" else "f", " ",
    "\"", clean(label), "\""
  )
}

clean <- function(string) {
  string %>%
    gsub(pattern = "[[:space:]]", replacement = " ") %>% # Avoid specially newlines inside labels
    gsub(pattern = "_+", replacement = "_") %>% # Reduce wasted space on label with multiple _
    gsub(pattern = "[\"“”\']", replacement = "-") # Quotation characters can lead to error in writing the dictionary
}

#' Excludes terms with low tf-idf score.
#' Searches for the smallest score cutoff that keeps descriptions at 80 character.
exclude_irrelevant_terms_in_label <- function(df) {
  tfidf <- df %>%
    tidytext::unnest_tokens(term, label) %>%
    dplyr::count(variable, term, sort = TRUE) %>%
    tidytext::bind_tf_idf(term, variable, n)

  for (i in 1:nrow(df)) {
    label <- df[[i, "label"]]

    label_terms <- strsplit(label, " ")[[1]]
    clean_terms <- tolower(label_terms)
    var <- df[[i, "variable"]]

    terms <-  tfidf[order(-tfidf$tf_idf), ] %>%
      dplyr::filter(variable == var) %>%
      dplyr::pull(term)

    new_label <- label
    lower <- 1
    upper <- 15
    while(upper >= lower) {
      middle <- (upper + lower) %/% 2

      if(upper == lower + 1) middle <- upper

      new_label <- paste0(label_terms[clean_terms %in% terms[1:middle]], collapse = " ")

      if(upper == lower) break

      if (nchar(new_label) > 80) upper <- middle - 1
      else lower <- middle
    }
    df[i, ]$label <- new_label
  }
  df
}
datazoompuc/IBGEreadR documentation built on March 3, 2021, 12:32 a.m.