R/generate_sas_readfile.R

Defines functions generate_sas_readfile

Documented in generate_sas_readfile

#' Generates .csv and .sas files from a data.table
#'
#' @param data A data.table object
#' @param filename_stub A character string which will be the filename of the generated files, eg, `filename_stub.csv` and `filename_stub.sas`.
#' @param shortener A function which shortens variable names if longer than 30 characters
#' @details The function generates 3 files:
#' * filename_stub.csv
#' * filename_stub.sas
#' * filename_stub-variable-listing.csv
#' 
#' The variable listing provides details on variable names, type, length, and the original variable name (unabridged_name) if 
#' * the variable name was changed to comply with variable name character count restrictions, or
#' * non alpha-numeric characters were removed from the variable name.  
#' @md
#' @keywords SAS
#' @export
#' @examples
#' iris_dt <- as.data.table(iris)
#' generate_sas_readfile(iris_dt, "iris")
#' 
#' # Long variable names
#' # Default shortener simply cuts the variable name at 30 characters
#' iris_dt$very_long_variable_name_that_exceeds_character_count_restrictions <- 1
#' generate_sas_readfile(iris_dt, "iris2")
#' 
#' # Custom shortener
#' shorten <- function(x){
#'   y <- gsub("a|e|i|o|u","",x)
#'   substr(y,1,30)
#' }
#' generate_sas_readfile(iris_dt, "iris3", shortener = shorten)

generate_sas_readfile <- function(data, filename_stub, shortener = function(x){substr(x,1,30)}){

  if (!requireNamespace("data.table", quietly = TRUE)) {
    stop("The data.table package is needed for this function to work.",
         call. = FALSE)
  }
  if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("The dplyr package is needed for this function to work.",
         call. = FALSE)
  }
  if (!requireNamespace("dtplyr", quietly = TRUE)) {
    stop("The dtplyr package is needed for this function to work.",
         call. = FALSE)
  }  
  
  if(!"data.table" %in% class(data)) data <- as.data.table(data)
  
  data <- copy(data)
  vc <- data[,vapply(.SD,class,NA_character_)]
  vn <- copy(names(data))
  for(j in 1:ncol(data)){
    vj <- vn[j]
    switch(
      vc[j]
      , factor = data[,(vj) := as.character(get(vj))]
      , logical = data[,(vj) := 1*get(vj)]
    )
    if(nchar(vj)>30){
      new <- shortener(vj)
      setnames(data, vj, new)
      message("SAS name limit: " %|% vj %|% " renamed to " %|% new)
    }
    if(length(grep("\\W", vj))){
      new <- gsub("\\W","_", vj)
      setnames(data, vj, new)
      message("Punctuation in variable name: " %|% vj %|% " renamed to " %|% new)
    }
  }
  
  tpe <- function(x){ifelse(sum(!is.na(x))==0, "empty",typeof(x))}
  nch <- function(x){ifelse(is.character(x), max(nchar(x)), NA)}
  dt <- data.table(
    varname = names(data)
    , type = unlist(data[,lapply(.SD, tpe)])
    , length = unlist(data[,lapply(.SD,nch)])
    , unabridged_name = vn
  )
  dt[varname == unabridged_name, unabridged_name := ""]
  
  dt[,infile_command := varname %|% ifelse(type=="character", " :$" %|% length %|% ".", "") %|% "\n"]
  
  infile <- "DATA DATA;\nINFILE \"" %|% filename_stub %|% ".csv\" 	DSD DLM=','  LRECL=20000  missover firstobs = 2 termstr=lf;\nINPUT\n"
  
  cat(infile, dt[,infile_command], ";\nRUN;", file = filename_stub %|% ".sas")
  
  write.csv(
    dt[,.(varname, type, length, unabridged_name)],
    file = filename_stub %|% "-variable-list.csv",
    row.names = FALSE,
    na = ""
  )
  
  write.csv(
    data
    , file = filename_stub %|% ".csv"
    , row.names = FALSE
  )
  
  if(dt[,sum(duplicated(varname))>0]){
    warning("The following variable names are duplicated in the dataset")
    dt[varname %in% dt[,.N,varname][N > 1, varname]][,.(varname,unabridged_name)] %>% 
      (function(x){paste(capture.output(print(x)), collapse = "\n")}) %>% 
      warning
  }
}
thomasgstewart/tgsify documentation built on June 18, 2020, 11:10 a.m.