R/sas_formatter.R

Defines functions sas_formatter

#' Format Categorical Variables using using SAS VALUE statements in a file
#'
#' @description This function reads a SAS program, extracts the value
#' statements, and returns a function that takes a numeric variable and maps it
#' to descriptive categories based on the "format.sas" attribute that is created
#' as SAS dataset is read into R with the Haven package.
#'
#' @param sas_file a SAS program with VALUE statements
#'
#' @return a function that takes a variable in a SAS dataset read in by the
#' Haven package and maps the numeric values to the categories setup in the
#' SAS program specified in the sas_file argument
#' @import purrr
#' @import tidytext
#' @export
sas_formatter <- function(sas_file){


  # Read in the formatting SAS code

  sas_formats <- stringi::stri_enc_toutf8(read_file(sas_file))
  Encoding(sas_formats) <- "UTF-8"
  sas_formats <- iconv(sas_formats, "UTF-8", "UTF-8",sub='')

  sas_formats_df <- data_frame(txt = sas_formats) %>%
    # Parse so that each statement (ends in a ;) is a sep. line
    unnest_tokens(stmt, txt, token = "regex", pattern = ";") %>%
    # Only want VALUE statements
    filter(grepl("value", stmt)) %>%
    # Get rid of whitespace at beginning
    mutate(trimed_stmt = stringr::str_trim(stmt)) %>%
    # Create columns for VALUE, the name of the format, and the format definition
    tidyr::separate(trimed_stmt, c("value", "format_name", "formats"), extra = "merge")

  sas_formats_lst <-
    sas_formats_df %>%
    # Create a list of vectors, each vector is the cleaned up format definition
    pmap(.f = function(formats, ...) stringr::str_trim(strsplit(formats, "\\n")[[1]])) %>%
    # foramt names are all in upper case.
    set_names(toupper(sas_formats_df$format_name)) %>%
    map_if(is.character, function(x) gsub("'", "", x)) %>%
    # make vectors data_frames
    map(~ data_frame(value_def = .x)) %>%
    # filter out non-numeric values (these are missing - haven converts them to NA)
    map(~ filter(.x, grepl("^[0-9]", value_def))) %>%
    # sep out into key and value for makemap function
    map(~ separate(.x, value_def, c("keys", "values"),
                   sep = "=", extra = "merge", convert = TRUE)) %>%
    map(~ makeMap(keys = as.integer(.x$keys), values = .x$values))

  # This is the closure that will be returned
  map_factor <- function(column_var) {

    # Each variable has a format.sas attribute (if it was read in by Haven) Get
    # it to figure out which categories to apply
    format.sas <- toupper(attr(column_var, "format.sas"))

    # If there is isn't one, just return the original column without changes
    # Should this throw a waring?
    if (is.null(format.sas))
      column_var

    # If the variable DOES have a format.sas attr, but there isn't a
    # corresponding format that was read from the sas_file program, just return
    # the column without changes. This really should throw a warning
    else if (is.null(sas_formats_lst[[format.sas]]))
      column_var

    else {
      outvar <- as.factor(sas_formats_lst[[format.sas]](column_var))
      attr(outvar, "label") <- attr(column_var, "label")
      outvar
    }
  }

  map_factor
}
FredHutch/sasHelpers documentation built on May 3, 2019, 3:32 p.m.