R/importSAS.R

Defines functions contentSAS importSAS

Documented in contentSAS importSAS

##' Selective import of sas7bdat files into R data.table format
##'
##' This function first runs SAS proc contents in the background and uses the result
##' to write temporary SAS code that writes the data in csv format to a temporary file.
##' The data are then read with data.table::freads into R. The function tries to help
##' with the formatting of numeric and date variables and with the understanding of missing values.
##' However, in some cases this help is counterproductive and can therefore be turned off,
##' to some extent at least, with arguments na.strings, force.numeric, skip.date.conversion.
##' Also, the user selects which columns and rows to import. See examples.
##' @title importSAS
##' @aliases importSAS contentSAS
##' @usage importSAS(filename,wd=NULL,keep=NULL,drop = NULL,
##'                  where = NULL,obs = NULL,filter = NULL,
##'                  filter.by = NULL, filter.negative = FALSE,
##'                  set.hook=NULL,step.hook=NULL,pre.hook=NULL,
##'                  post.hook=NULL,savefile = NULL,overwrite = TRUE,
##'                  show.sas.code=FALSE,save.tmp = FALSE,content=FALSE,
##'                  na.strings="dot",date.vars=NULL,datetime.vars=NULL,
##'                  character.vars="pnr", numeric.vars = NULL,sas.program,
##'                  sas.switches, sas.runner, use.colClasses=TRUE,
##'                  skip.date.conversion=FALSE, force.numeric=TRUE,
##'                  sas.data.extension="sas7bdat", verbose=FALSE,...)
##'        contentSAS(filename,wd=NULL)
##' @param filename The filename (with full path) of the SAS dataset to import.
##' So, \code{"x:/data/rawdata/701111/lmdb.sas7bdat"} and also
##' \code{"v:/data/workdata/701111/MeMe/project1/data/mydata.sas7bdat"} work but \code{"./data/mydata.sas7bdat"}
##' does not even not if the working directory is set to \code{"v:/data/workdata/701111/MeMe/project1/"}.
##' @param wd The directory used to store temporarily created files (SAS script, log file, csv file). You need to
##'           have permission to write to this directory. The default value is the current working directory, see \code{getwd()},
##'           (which you may not have access to write to!). On Gentofte's Danmark Statistics servers it may help to set
##'           the working directory to the fast X drive.
##' @param keep A vector of variable names, i.e., the variables (columns) to read and keep from the dataset. Default is to read and keep all variables.
##' @param drop Specifies the variables (columns) to leave out from the dataset. Default is to leave out no variables.
##' @param where Specifies which conditions the observations (rows) from the dataset should fulfil. Default is no conditions. Use SAS syntax (see examples).
##' @param obs Number of observations to read from the dataset. Setting this to \code{Inf} has the same effect as not setting it, i.e, read all observations.
##' @param filter Alternative or in addition to the where statement it is
##'               possible to filter the rows of \code{filename} based on a data.table.
##'               E.g., filter can be a data.table with one column consisting of *unique*
##'               PNRs to specify that only the matching rows should be imported from filename.
##' @param filter.by Vector of variable names to filter by. By default all variables present in the filter data are used.
##' @param filter.negative Vector of length two that defines how to merge the \code{filter} data
##'                    with the SAS dataset \code{filename}:
##' \itemize{
##' \item \code{c(1,1)} means observation are only included if they are present both the filter data and the SAS data.
##' \item \code{c(-1,1)} means observation are only included if they are not present filter data but present in the SAS data.
##' }
##' @param set.hook Quoted SAS statments (within use single quotes) to be placed in addition to set options (where, keep, drop, obs) when setting the data set \code{filename}. See examples.
##' @param step.hook Quoted SAS statments (within use single quotes) to be placed after setting the data set \code{filename}. See examples.
##' @param pre.hook Quoted SAS code (within use single quotes) to be set in the beginning of the SAS program. For example, it maybe useful to specify options such as \code{'options obs=100;'} in combination with a where statement.
##' @param post.hook Quoted SAS code (within use single quotes) to be set at the end of the SAS program. For example, it maybe useful to specify 'proc print data=df;' in case of trouble.
##' @param savefile If specified, the generated csv file will be saved with the given name. The name should end with ".csv". The file will be saved in the working directory or in the directory given under \code{wd} if this is specified.
##' @param overwrite Logical. Determines whether or not to overwrite files already existing with the same name as files generated by this function. This is the temporary SAS file, log file and csv file, and possibly a permanent file with the name given under "savefile". If the value is FALSE and some files already exist, the function will abort and print the name of the problematic files. The default value is TRUE.
##' @param show.sas.code Logical. If \code{TRUE} show sas code in R console before running it.
##' @param save.tmp Logical. Option to save all temporary files. Even though this is set to FALSE, the csv file will be saved if there is given a filename in "savefile". Default value is FALSE.
##' @param content Logical. If true, the function will only read and return the content of the import file. Together with save.tmp=TRUE, this can be used to generate the SAS file without running it.
##' @param na.strings A vector of strings to interpret values of character variables as NA. Each element should be a regular expression
##'                   that can be understood by \code{grepl}. For example, the default value \code{"^\\.$"} matches fields that contain a single dot and nothing else than a dot.
##' @param date.vars Vector of variables to be converted to date variables. For these variables a SAS format statement \code{yymmdd10.} is
##'                  used to force the correct order of year months and days and the conversion is done with \code{lubridate::ymd}.
##'                  Conversion can be skipped with argument \code{skip.date.conversion}.
##' @param datetime.vars Vector of variables to be converted to datetime variables. For these variables a SAS format statement \code{datetime.} is
##'                  used and the conversion is done with \code{lubridate::dmy_hms}.
##'                  Conversions in both SAS and R programs can be skipped with argument \code{skip.date.conversion}.
##' @param character.vars names of variables that should be converted to character. Case does not matter. Default is \code{"pnr"}.
##' @param numeric.vars character.vars names of variables that should be converted to numeric. case does not matter.
##' @param sas.program sas program. On linux where \code{.Platform$OS.type=="unix"} this defaults to \code{"sas"} on any other system to "C:/Program Files/SASHome/SASFoundation/9.4/sas.exe"
##' @param sas.switches On linux this defaults to {""} on any other system to \code{"-batch -nosplash -noenhancededitor -sysin"}
##' @param sas.runner How sas is invoked. On linux this defaults to \code{"system"} on any other system to \code{"shell"}.
##' @param use.colClasses Logical. If TRUE learn about the variable types from SAS's proc contents and
##'        pass to \code{fread} as argument \code{colClasses}.
##' @param skip.date.conversion if TRUE do not try to convert any dates or datetime variables. If \code{"SAS"} only skip the format statements but format
##'        via lubridate (see \code{date.vars} and \code{datetime.vars}). If \code{"R"} skip lubridate conversion but keep the format statements.
##' @param force.numeric if TRUE force numeric format on numeric variables specified by argument \code{numeric.vars}.
##' @param sas.data.extension String to be checked against the file extenstion of filename. Default is \code{"sas7bdat"}.
##' @param verbose Logical. When \code{TRUE} warnings and errors are shown, otherwise not shown.
##'        This can be useful to turn on when there are problems with the result.
##' @param ... Arguments passed to \code{fread} for reading the created .csv file. OBS: try to avoid specifying \code{colClasses} and instead use the arguments of importSAS: \code{character.vars}, \code{date.vars}, \code{datetime.vars} and \code{numeric.vars}.
##' @return The output is a data.table with the columns requested in keep (or all columns) and the rows requested in where (or all rows) up to obs many rows.
##' @author Anders Munch \email{a.munch@sund.ku.dk} and Thomas A Gerds \email{tag@biostat.ku.dk}
##' @references This function is based on pioneering work by Jesper Lindhardsen.
##' @details As \code{R} is case-sensitive while \code{SAS} is not, to avoid confusion all variable names are converted to lower case.
##' @examples
##' # We first set a working directory in which we have read and write permission
##' # These functions will produce temporary files which, if save.tmp is not set to TRUE, will
##' # be removed afterwards.
##'
##' \dontrun{
##' setwd("v:/Data/Workdata/704791/AndersMunch/readSAS/R")
##'
##' # Before importing a SAS data file it is useful to look into the
##' # format of the variables:
##' contentSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl")
##' contentSAS(filename="x:/Data/Rawdata_Hurtig/704791/pop.sas7bdat")
##'
##' # Also, it is often a good idea to initially only read a limited amount of data
##' df101 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",obs=101,save.tmp=TRUE)
##' # and to examine the result
##' str(df101)
##' df101
##'
##' # Format, dates, numeric, character
##' df101 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",obs=101,
##'                    save.tmp=TRUE,date.vars="inddto",
##'                    numeric.vars="pnr",character.vars="packsize")
##'
##' # we can also use the pre.hook to limit the number of observations via sas options:
##' importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'           pre.hook="options obs=17;",where="diag='DN899'",keep=c("PNR","diag"),show.sas.code=1L)
##'
##' # NOTE: In combination with a where statement SAS will find the first 101 observations that
##' #       satisfy the where statement. When the where statement finds nothing then SAS
##' #       will run through the whole file without finding anything
##'
##' # To import the whole file just remove the limitations:
##' # df <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",show.sas.code=TRUE)
##' # But: this is not useful when files are huge and you should to look
##' # at the examples below which show how to read selected columns and rows only
##'
##' # In huge data it is efficient to select only the required columns
##' # and to read only the rows of the data set that meet a criterion.
##' # The following example shows how to select columns with the keep option
##' # and how to specify the "where statement" of the SAS data step:
##' df0 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=101,
##'                  keep=c("diag","diagtype"),
##'                  where="diagtype = 'A' and diag = '45490'")
##'
##' # Examine the imported data.table
##' str(df0)
##' df0
##' # using where contains is powerful
##' importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=10,
##'                  save.tmp=TRUE,
##'                  where="diag contains 'DI2'",
##'                  keep=c("PNR","diag","inddto","uddto"))
##'
##'
##' # Another typical task is to import only the rows which correspond to
##' # a set of pnr numbers or a set of diagnoses or both.
##' # To achieve this the function importSAS is merging files during the import.
##' # The feature is called 'filter' and illustrated in the following examples:
##' # Example 1: import only rows of diag_indl which correspond to one of the
##' #            first 23 pnr's in pop
##' pop <- importSAS(filename="x:/Data/Rawdata_Hurtig/704791/pop.sas7bdat",
##'                  obs=23,keep="PNR")
##' df1 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=8501,filter=pop)
##' str(df1)
##' df1
##'
##' # Example 2: import only rows of diag_indl which correspond the
##' #            first pnr's in pop for which also diag is equal to DK409
##' pop2 <- importSAS(filename="x:/Data/Rawdata_Hurtig/704791/pop.sas7bdat",
##'                  obs=4223,keep="PNR")
##' pop2 <- pop2[,diag:="DZ508"]
##' df2 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=8501,keep=c("pnr","diag","inddto","uddto","pattype"),
##'                  where="pattype ne 3",filter=pop2)
##' str(df2)
##' df2
##'
##' ## Sometimes the sas data file cannot be read due to unknown formats
##' ## here is how to solve this:
##' df2a <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=101,
##'                  pre.hook="options nofmterr;")
##'
##'
##' # The hooks set.hook and step.hook can be used as follows:
##' df3 <- importSAS(filename="X:/Data/Rawdata_Hurtig/704791/diag_indl",
##'                  obs=101,
##'                  keep=c("PNR","diag","diagtype"),
##'                  set.hook="firstobs=5",
##'                  step.hook="by PNR; firstPNR=first.PNR;",
##'                  where="diag contains 'I'")
##' str(df3)
##' df3
##'
##' # if you are more fluent in SAS than in R it may help to
##' # know how to communicate e.g., a keep statement, instead of
##' # using the keep argument:
##'
##' df4 <-importSAS(filename="X:/Data/Rawdata_Hurtig/999999/Dream201801",
##'                 set.hook="keep=pnr branch;",
##'                 obs=1000)
##'
##' # Because the "overwrite" argument is FALSE, running the above code again will abort the import
##' # to not overwrite the temporary files.
##' # Setting "overwrite=TRUE" will allow the function to overwrite the files.
##'
##' }
##' @export
importSAS <- function(filename, wd = NULL, keep = NULL, drop = NULL, where = NULL,
                       obs = NULL, filter = NULL, filter.by = NULL, filter.negative = FALSE,
                       set.hook = NULL, step.hook = NULL, pre.hook = NULL,
                       post.hook = NULL, savefile = NULL, overwrite = TRUE, show.sas.code = FALSE,
                       save.tmp = FALSE, content = FALSE, na.strings = c("dot"), date.vars = NULL,datetime.vars=NULL,
                       character.vars = "pnr", numeric.vars = NULL, sas.program,
                       sas.switches, sas.runner, use.colClasses=TRUE,skip.date.conversion = FALSE,force.numeric=TRUE,
                       sas.data.extension="sas7bdat",verbose = FALSE,
                       ...)
{
    if (!file.exists(filename)){
        stop(paste0("A file with name ",filename," does not exist."))
    }else{
        if (tolower(tools::file_ext(filename))!=sas.data.extension)
            stop("The filename exists, but file extension does not match sas.data.extension: ",sas.data.extension)
    }
    ## DD <- dirname(filename)
    ## FF <- basename(filename)
    ## fullname <- list.files(path=DD,pattern=paste0("^",filename,"$"),full.names=TRUE)
    .SD = NULL
    keep <- tolower(keep)
    drop <- tolower(drop)
    date.vars <- tolower(date.vars)
    datetime.vars <- tolower(datetime.vars)
    olddir <- getwd()
    if (length(wd) == 0){
        wd <- getwd()
    }else{
        setwd(wd)
    }
                                        # cleaning up old temporary directories
    olddirectories <- list.files(wd,pattern="heaven_tempSASfiles[a-z0-9]+")
    for (old in olddirectories){
        message("Cleaning up temporary directories from previous calls.")
        unlink(old,recursive=TRUE,force=TRUE)
    }
    tmpname <- basename(tempfile(pattern="heaven_tempSASfiles"))
    tmpdir = paste0(wd,"/",tmpname)
    if (verbose) message("Writing temporary SAS files (log, lst, input data, output data) to directory\n",tmpdir)
    if (file.exists(tmpdir)) {
        stop(paste("file.exists:",tmpdir))
    }else{
        try.val <- try(dir.create(tmpdir))
        if (class(try.val)[[1]]=="try-error")
            stop("Cannot create temporary directory.\nYou probably do not have permission to write to the directory: \n ",
                 tmpdir, "\nTry to specify another directory with the argument \"wd\" or change the working directory.",
                 sep = "")
    }
    if (length(savefile) > 0) {
        outfile <- paste(wd, "/", savefile, sep = "")
        if (file.exists(outfile)) {
            if (interactive()){
                maybestop <- utils::askYesNo(paste0("Overwrite existing file: ",outfile,"? "))
                if (is.na(maybestop)||maybestop==FALSE){
                    stop("File exists:",outfile)
                }
            }else{
                stop("File exists:",outfile)
            }
        }
        message("The output of SAS will be saved to file:\n",outfile)
    }
    else {
        outfile <- paste(tmpdir, "/", "sasimport_internal_tmpout.csv",
                         sep = "")
    }
    setwd(tmpdir)
    on.exit({
        setwd(olddir)
        if (!save.tmp) {
            unlink(tmpdir,recursive=TRUE,force=TRUE)
        }else{
            cat("\nTemporary directory:\n ",tmpdir,"\ncan now be inspected -- and should be removed manually afterwards!\n")
        }
    })
    if (.Platform$OS.type == "unix") {
        if (missing(sas.program)) {
            sas.program <- "sas"
        }
        if (missing(sas.switches)) {
            sas.switches <- ""
        }
        if (missing(sas.runner)) {
            sas.runner <- "system"
        }
    }
    else {
        if (missing(sas.program)) {
            sas.program <- "C:/Program Files/SASHome/SASFoundation/9.4/sas.exe"
        }
        if (missing(sas.switches)) {
            sas.switches <- "-batch -nosplash -noenhancededitor -sysin "
        }
        if (missing(sas.runner)) {
            sas.runner <- "shell"
        }
    }
    existing.files <- NULL
    tmp.SASfile <- paste(tmpdir, "/", "sasimport_internal_tmpfile.sas",
                         sep = "")
    tmp.log <- paste(tmpdir, "/", "sasimport_internal_tmpfile.log",
                     sep = "")
    tmp.filterfile <- paste(tmpdir, "/", "sasimport_internal_tmpfilterfile.csv",
                            sep = "")
    tmp.SASproccont <- paste(tmpdir, "/", "sasimport_internal_tmpproccont.sas",
                             sep = "")
    tmp.proccontout <- paste(tmpdir, "/", "sasimport_internal_tmpproccontout.csv",
                             sep = "")
    tmp.proccontlog <- paste(tmpdir, "/", "sasimport_internal_tmpproccont.log",
                             sep = "")
    try.val <- try(file.create(tmp.SASfile))

    files <- c(tmp.SASfile, tmp.log, tmp.filterfile, outfile,
               tmp.SASproccont, tmp.proccontout, tmp.proccontlog)
    for (file in files) {
        if (file.exists(file))
            existing.files <- c(existing.files, basename(file))
    }
    if (length(existing.files) > 0 & overwrite == FALSE) {
        stop(paste("Aborted to not overwrite the file(s):", paste(" ",
                                                                  existing.files, collapse = "\n"), "in the directory:",
                   paste(" ", tmpdir), "Set the argument \"overwrite\" equal to \"TRUE\" to allow overwriting.",
                   sep = "\n"))
    }
    for (file in files) {
        if (file.exists(file))
            file.remove(file)
    }
    cond <- ""

    ## ----------------------------- start proc contents -------------------------
    file.create(tmp.SASproccont)
    cat("ods listing close;\nODS OUTPUT variables=dcontent; \n proc contents data='",
        filename, "';\nrun;\nproc sort data=dcontent;\nby num;\nrun; \ndata _NULL_; \nset dcontent; \n file '",
        tmp.proccontout, "' dsd; \nif _n_ eq 1 then link names; \nput (_all_)(~); return; \nnames:\nlength _name_ $32; \ndo while(1); \ncall vnext(_name_); \nif upcase(_name_) eq '_NAME_' then leave; \nput _name_ ~ @; \nend; \nput; \nreturn; \nrun;\n",
        sep = "", file = tmp.SASproccont, append = TRUE)
    if (.Platform$OS.type == "unix")
        fprog <- paste0(sas.program, " ", sas.switches, " ",
                        tmp.SASproccont)
    else fprog <- paste0("\"\"", sas.program, "\" ", sas.switches,
                         "\"", tmp.SASproccont, "\"\"")
    runcontents <- try(do.call(sas.runner, list(fprog)), silent = FALSE)
    if (class(runcontents)[1] == "try-error") {
        warning(paste("Running sas on", fprog, "yielded the error shown above."))
    }
    if (file.exists(tmp.proccontout)) {
        suppressWarnings(dt.content <- data.table::fread(file = tmp.proccontout,
                                                         header = TRUE))
        dt.content.vars <- names(dt.content)
        dt.content <- dt.content[,match(tolower(c("Variable","Type","Format","Informat")),tolower(dt.content.vars),nomatch=0),with=FALSE]
    }
    else {
        stop(paste("Running sas on", fprog, "did not produce the expected output file."))
    }
    ## ----------------------------- end proc contents -------------------------
    ## if pnr is not in var.names remove it
    if (!("pnr" %in% tolower(dt.content$Variable))) {
        character.vars <- character.vars[character.vars!="pnr"]
    }
    if (length(keep) > 0) {

        keep <- unique(c(keep,date.vars,datetime.vars,numeric.vars,character.vars))
        if ("pnr" %in% tolower(dt.content$Variable)) {
            if (!("pnr" %in% keep)) keep <- c("pnr",keep)
        }
        cond <- paste(cond, "keep=", paste(keep, collapse = " "),
                      " ", sep = "")
    }else{
        # keep all variables
        keep <- tolower(dt.content$Variable)
    }
    if (length(drop) > 0) {
        cond <- paste(cond, "drop=", paste(drop, collapse = " "),
                      " ", sep = "")
    }
    if (length(where) > 0) {
        cond <- paste(cond, "where=(", where, ") ", sep = "")
    }
    if (length(obs) > 0 && !is.infinite(obs)) {
        cond <- paste(cond, "obs=", format(obs, scientific = FALSE),
                      " ", sep = "")
    }
    
    if (length(set.hook) > 0 & is.character(set.hook))
        cond <- paste("(", cond, set.hook, ")", sep = " ")
    else cond <- paste("(", cond, ")", sep = " ")
    
    keep.check <- drop.check <- TRUE
    if (length(keep) > 0) {
        keep.check <- tolower(keep) %in% tolower(dt.content$Variable)
    }
    if (length(drop) > 0) {
        drop.check <- tolower(drop) %in% tolower(dt.content$Variable)
    }
    if (length(filter)>0){
        if (length(filter.by) == 0) {
            filter.names <- names(filter)
        }
        else {
            filter.names <- filter.by
        }
        if (any((filter.vars.pos <- (match(tolower(filter.names),tolower(dt.content$Variable),nomatch=0)))==0)){
            stop(paste0("\nThe following variables in the filter data set are not found in the data that should be imported:\n",
                        paste0(filter.names[filter.vars.pos==0],collapse=", "),
                        "\nThe following names are in the data:\n",
                        paste0(dt.content$Variable,collapse=", ")))
        }else{
            ## convert filter variables to same case as in imported data
            orig.filter.names <- data.table::copy(names(filter))
            setnames(filter,filter.names,dt.content$Variable[filter.vars.pos])
        }
    }
    # autodetect variable types and formats
    # ----------------------------------------------------------------
    # restrict var.format and var.type to interesting variables

    dt.content <- dt.content[tolower(Variable)%in%keep]
    # assess formats according to SAS proc contents
    dt.content[,target.type:="character"]
    dt.content[grepl("num", Type, ignore.case = TRUE),target.type:="numeric"]
    if ("Format"%in%dt.content.vars){
        dt.content[grepl("date|dato|DDMM|MMDD", Format, ignore.case = TRUE),target.type:="date"]
        dt.content[grepl("datetime", Format, ignore.case = TRUE) ,target.type:="datetime"]
        # take care of time variables
        dt.content[grepl("^time", Format, ignore.case = TRUE) ,target.type:="character"]
    }
    if ("Informat"%in%dt.content.vars){
        dt.content[grepl("date|dato|DDMM|MMDD", Informat, ignore.case = TRUE),target.type:="date"]
        dt.content[grepl("datetime", Informat, ignore.case = TRUE) ,target.type:="datetime"]
        # take care of time variables 
        dt.content[grepl("^time", Informat, ignore.case = TRUE) ,target.type:="character"]
    }
    # user may force different types
    if (!is.null(date.vars)){
        for (name in tolower(date.vars)) {
            if (0==(hit <- match(name,tolower(dt.content$Variable),nomatch=0))){
                warning(paste(name, "not found in dataset."))
            } else {
                name <- dt.content$Variable[hit]
                dt.content[name == Variable,target.type:="date"]
            }
        }
    }
    if (!is.null(datetime.vars)){
        for (name in tolower(datetime.vars)) {
            if (0==(hit <- match(name,tolower(dt.content$Variable),nomatch=0))){
                warning(paste(name, "not found in dataset."))
            } else {
                name <- dt.content$Variable[hit]
                dt.content[name == Variable,target.type:="datetime"]
            }
        }
    }
    if (!is.null(numeric.vars)){
        for (name in tolower(numeric.vars)) {
            if (0==(hit <- match(name,tolower(dt.content$Variable),nomatch=0))){
                warning(paste(name, "not found in dataset."))
            } else {
                name <- dt.content$Variable[hit]
                dt.content[name == Variable,target.type:="numeric"]
            }
        }
    }
    if (!is.null(character.vars)){
        for (name in tolower(character.vars)) {
            if (0==(hit <- match(name,tolower(dt.content$Variable),nomatch=0))){
                warning(paste(name, "not found in dataset."))
            } else {
                name <- dt.content$Variable[hit]
                dt.content[name == Variable,target.type:="character"]
            }
        }
    }
    dt.content[tolower(Variable)%in%numeric.vars,target.type:="numeric"]
    dt.content[tolower(Variable)%in%character.vars,target.type:="character"]
    #
    datetime.vars <- dt.content[target.type=="datetime"]$Variable
    date.vars <- dt.content[target.type=="date"]$Variable
    numeric.vars <- dt.content[target.type=="numeric"]$Variable
    character.vars <- dt.content[target.type=="character"]$Variable
    #
    # could consider a format statement for variables where user specifies
    # numeric.vars or character.vars. for now only formatting date
    # and datetime variables
    #
    skip.date.conversion <- tolower(as.character(skip.date.conversion[[1]]))
    format.statement <- if (length(date.vars)==0||skip.date.conversion %in% c("sas","TRUE"))
                            ""
                        else paste("format ", paste(date.vars, collapse = " "), " yymmdd10.;")
    if (length(datetime.vars)>0 && !(skip.date.conversion %in% c("TRUE","sas")))
        format.statement <- paste(format.statement,paste("\nformat ", paste(datetime.vars, collapse = " "), " datetime.;"))
    if (length(pre.hook) > 0 & is.character(pre.hook)) {
        cat(pre.hook, file = tmp.SASfile, append = TRUE)
    }
    if (length(filter) > 0) {
        data.table::fwrite(filter, quote = TRUE, file = tmp.filterfile)
        cat("proc import datafile='", tmp.filterfile, "' \n",
            "out = csv_import \ndbms =csv; \nrun; \n", sep = "",
            file = tmp.SASfile, append = TRUE)
    }
    cat("data df; \nset '",filename,"'",cond,";\n",format.statement,sep = "",file = tmp.SASfile,append = TRUE)
    if (length(step.hook) > 0 & is.character(step.hook)) {
        cat(step.hook, sep = "", file = tmp.SASfile, append = TRUE)
    }
    cat("\nrun;\n", sep = "", file = tmp.SASfile, append = TRUE)
    if (length(filter) > 0) {
        cat("proc sort data=csv_import; \nby ", paste(filter.names,
                                                      collapse = " "), "; \nrun; \nproc sort data=df; \nby ",
            paste(filter.names, collapse = " "), "; \nrun;\n ",
            sep = "", file = tmp.SASfile, append = TRUE)
        if (filter.negative){
            merge.cond.statement <- paste("if ( NOT a ) AND ( b ) ;\n")
        }else{
            merge.cond.statement <- paste("if ( a ) AND ( b ) ;\n")
        }
        cat("data df; \nmerge csv_import(IN=a) df(IN=b); \nby ",
            paste(filter.names, collapse = " "), ";\n", merge.cond.statement,
            "run;\n", sep = "", file = tmp.SASfile, append = TRUE)
    }
    if (length(post.hook) > 0 & is.character(post.hook)) {
        cat(post.hook, file = tmp.SASfile, append = TRUE)
    }
    if (show.sas.code == TRUE) {
        cat("\nRunning the following sas code in the background.\nYou can cancel SAS at any time.\n")
        cat("\n-------------------------SAS-code-------------------------\n")
        cat(readChar(tmp.SASfile, file.info(tmp.SASfile)$size))
        cat("\n----------------------------------------------------------\n")
    }
    else {
        if (verbose)
            cat("\nRunning sas code in the background. You can cancel SAS at any time.\n")
    }
    tmp.lines <- paste("data _NULL_; \nset df; \n file '", outfile,
                       "' dsd; \nif _n_ eq 1 then link names; \nput (_all_)(~); return; \nnames:\nlength _name_ $32; \ndo while(1); \ncall vnext(_name_); \nif upcase(_name_) eq '_NAME_' then leave; \nput _name_ ~ @; \nend; \nput; \nreturn; \nrun;\n")
    cat(tmp.lines, file = tmp.SASfile, append = TRUE)
    if (!(prod(keep.check) * prod(drop.check))) {
        error.mes <- "Some of the variables specified in the keep or drop statement are not found in the import file.\n"
        if (length(keep) > 0)
            error.mes <- paste(error.mes, "The KEEP argument(s): \n  ",
                               paste(tolower(keep[!keep.check]), collapse = "\n"),
                               "\nare not found in the import file.\n", sep = "")
        if (length(drop) > 0)
            error.mes <- paste(error.mes, "The DROP argument(s): \n  ",
                               paste(tolower(drop[!drop.check]), collapse = "\n"),
                               "\nare not found in the import file.\n", sep = "")
        error.mes <- paste(error.mes, "\nThe content of the import file is:\n",
                           sep = "")
    }
    if (content == TRUE) {
        df <- dt.content
        if (!(prod(keep.check) * prod(drop.check))) {
            cat(paste("Warning:\n", error.mes, sep = ""))
            print(df)
            cat("\nThis will give an error when content=FALSE.\n")
        }
    }
    else {
        if (!(prod(keep.check) * prod(drop.check))) {
            cat(paste("Error:\n", error.mes, sep = ""))
            print(dt.content)
            stop("Aborted.")
        }
        else {
            if (.Platform$OS.type == "unix") {
                fprog <- paste0(sas.program, " ", sas.switches,
                                " ", tmp.SASfile)
            }
            else {
                fprog <- paste0("\"\"", sas.program, "\" ", sas.switches,
                                "\"", tmp.SASfile, "\"\"")
            }
            runSAS <- try(do.call(sas.runner, list(fprog)), silent = FALSE)
            if (class(runSAS)[1] == "try-error") {
                warning(paste("Running sas on", fprog, "yielded the error shown above."))
            }
            if (!file.exists(outfile)) {
                stop(paste0("SAS did not produce output file. Maybe you have misspecified a SAS statement?",
                            ifelse(save.tmp == FALSE, "\nRun with save.tmp=TRUE and then",
                                   "\nPlease"), " check the log file:\n", tmp.log))
            }
            info <- file.info(outfile)
            ia <- c(list(file = outfile, header = TRUE),
                    list(...))
            # types of variables (case of variable names according to dt.contents)
            if (length(ia$colClasses) == 0){
                ia$colClasses <- list(character = NULL, numeric = NULL)
            }
            if (use.colClasses){
                ia$colClasses[["character"]] <- c(character.vars,date.vars,datetime.vars)
                ia$colClasses[["numeric"]] <- numeric.vars
            }else{
                ia$colClasses <- NULL
            }
            # reset filternames
            if (length(filter) > 0){
                setnames(filter,orig.filter.names)
            }
            if (info$size == 0) {
                warning("The dataset produced by SAS appears to be empty.")
                # empty data set
                return(df)
            }
            else {
                # read 1 line of data and adapt colClasses etc in case that user has used keep or drop in set.hook
                tryH1 <- try(h1 <- do.call(data.table::fread,list(file=ia$file,nrow=1,header=TRUE)))
                h1names <- names(h1)
                ia$colClasses <- lapply(ia$colClasses,function(x){intersect(x,h1names)})
                numeric.vars <- intersect(numeric.vars,h1names)
                character.vars <- intersect(character.vars,h1names)
                date.vars <- intersect(date.vars,h1names)
                datetime.vars <- intersect(datetime.vars,h1names)
                # now read the data
                if (verbose)
                    tryread <- try(df <- do.call(data.table::fread,ia))
                else
                    suppressWarnings(tryread <- try(df <- do.call(data.table::fread,ia)))
                if ("try-error" %in% class(tryread)) {
                    warning("Could not read the constructed dataset into R. \nSomething probably went wrong during SAS program execution. ")
                    df <- NULL
                }
            }
            # change case of variable names
            names(df) <- tolower(names(df))
            # force numeric format
            if (force.numeric[[1]] == TRUE && length(numeric.vars) > 0) {
                numeric.vars <- tolower(numeric.vars)
                if (verbose)
                    try(df[, `:=`((numeric.vars), lapply(.SD, as.numeric)),
                           .SDcols = numeric.vars], silent = FALSE)
                else
                    suppressWarnings(try(df[, `:=`((numeric.vars), lapply(.SD, as.numeric)),
                                            .SDcols = numeric.vars], silent = FALSE))
            }
            # deal with na.strings in character variables
            na.strings[na.strings == "dot"] <- "^\\.$"
            if (length(na.strings)>0 && length(character.vars) > 0) {
                character.vars <- tolower(character.vars)
                if (verbose)
                    try(df[, `:=`((character.vars), lapply(.SD, function(x){x[grepl(paste0(na.strings,collapse="|"),x)] <- NA;x})),
                           .SDcols = character.vars], silent = FALSE)
                else
                    suppressWarnings(
                        try(df[, `:=`((character.vars), lapply(.SD, function(x){x[grepl(paste0(na.strings,collapse="|"),x)] <- NA;x})),
                               .SDcols = character.vars], silent = FALSE)
                    )
            }
            # date format
            if (!(skip.date.conversion %in% c("r","true")) && length(date.vars)>0) {
                date.vars <- tolower(date.vars)
                # respect if user wants character or numeric format instead of date format
                if (any(c(character.vars,numeric.vars)%in%date.vars))
                    date.vars <- setdiff(date.vars,c(character.vars,numeric.vars))
                if (verbose)
                    try(df[, `:=`((date.vars), lapply(.SD, lubridate::ymd)),
                           .SDcols = date.vars], silent = FALSE)
                else
                    suppressWarnings(try(df[, `:=`((date.vars), lapply(.SD, lubridate::ymd)),
                                            .SDcols = date.vars], silent = FALSE))
            }
            # datetime format
            if (!(skip.date.conversion %in% c("r","TRUE")) && length(datetime.vars)>0) {
                datetime.vars <- tolower(datetime.vars)
                # respect if user wants character or numeric format instead of date format
                if (any(c(character.vars,numeric.vars)%in%date.vars))
                    datetime.vars <- setdiff(datetime.vars,c(character.vars,numeric.vars))
                if (verbose)
                    try(df[, `:=`((datetime.vars), lapply(.SD, lubridate::dmy_hms)),
                           .SDcols = datetime.vars], silent = FALSE)
                else{
                    suppressWarnings(try(df[, `:=`((datetime.vars), lapply(.SD, lubridate::dmy_hms)),
                                            .SDcols = datetime.vars], silent = FALSE))
                }
            }
        }
    }
    return(try(df[], silent = TRUE))
}

##' @export
contentSAS <- function(filename,wd=NULL){
    importSAS(filename=filename,wd=wd,content = TRUE)
}
tagteam/heaven documentation built on April 26, 2024, 6:22 a.m.