#' Format Categorical Variables using using SAS VALUE statements in a file
#'
#' @description This function reads a SAS program, extracts the value
#' statements, and returns a function that takes a numeric variable and maps it
#' to descriptive categories based on the "format.sas" attribute that is created
#' as SAS dataset is read into R with the Haven package.
#'
#' @param sas_file a SAS program with VALUE statements
#'
#' @return a function that takes a variable in a SAS dataset read in by the
#' Haven package and maps the numeric values to the categories setup in the
#' SAS program specified in the sas_file argument
#' @import purrr
#' @import tidytext
#' @export
sas_formatter <- function(sas_file){
# Read in the formatting SAS code
sas_formats <- stringi::stri_enc_toutf8(read_file(sas_file))
Encoding(sas_formats) <- "UTF-8"
sas_formats <- iconv(sas_formats, "UTF-8", "UTF-8",sub='')
sas_formats_df <- data_frame(txt = sas_formats) %>%
# Parse so that each statement (ends in a ;) is a sep. line
unnest_tokens(stmt, txt, token = "regex", pattern = ";") %>%
# Only want VALUE statements
filter(grepl("value", stmt)) %>%
# Get rid of whitespace at beginning
mutate(trimed_stmt = stringr::str_trim(stmt)) %>%
# Create columns for VALUE, the name of the format, and the format definition
tidyr::separate(trimed_stmt, c("value", "format_name", "formats"), extra = "merge")
sas_formats_lst <-
sas_formats_df %>%
# Create a list of vectors, each vector is the cleaned up format definition
pmap(.f = function(formats, ...) stringr::str_trim(strsplit(formats, "\\n")[[1]])) %>%
# foramt names are all in upper case.
set_names(toupper(sas_formats_df$format_name)) %>%
map_if(is.character, function(x) gsub("'", "", x)) %>%
# make vectors data_frames
map(~ data_frame(value_def = .x)) %>%
# filter out non-numeric values (these are missing - haven converts them to NA)
map(~ filter(.x, grepl("^[0-9]", value_def))) %>%
# sep out into key and value for makemap function
map(~ separate(.x, value_def, c("keys", "values"),
sep = "=", extra = "merge", convert = TRUE)) %>%
map(~ makeMap(keys = as.integer(.x$keys), values = .x$values))
# This is the closure that will be returned
map_factor <- function(column_var) {
# Each variable has a format.sas attribute (if it was read in by Haven) Get
# it to figure out which categories to apply
format.sas <- toupper(attr(column_var, "format.sas"))
# If there is isn't one, just return the original column without changes
# Should this throw a waring?
if (is.null(format.sas))
column_var
# If the variable DOES have a format.sas attr, but there isn't a
# corresponding format that was read from the sas_file program, just return
# the column without changes. This really should throw a warning
else if (is.null(sas_formats_lst[[format.sas]]))
column_var
else {
outvar <- as.factor(sas_formats_lst[[format.sas]](column_var))
attr(outvar, "label") <- attr(column_var, "label")
outvar
}
}
map_factor
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.