R/sas.get.s

Defines functions sasdsLabels readSAScsv lookupSASContents read.xportDataload sasxport.get as.data.frame.special.miss code.levels sas.codes print.special.miss format.special.miss is.special.miss ddmmmyy print.timePOSIXt importConvertDateTime sas.get

Documented in as.data.frame.special.miss code.levels code.levels ddmmmyy format.special.miss format.special.miss importConvertDateTime is.special.miss is.special.miss lookupSASContents print.special.miss print.special.miss print.timePOSIXt readSAScsv read.xportDataload sas.codes sas.codes sasdsLabels sas.get sas.get sasxport.get

## $Id$
sas.get <- 
  function(libraryName,
           member,
           variables = character(0), 
           ifs = character(0), 
           format.library = libraryName,
           id, 
           dates. = c("sas","yymmdd","yearfrac","yearfrac2"), 
           keep.log = TRUE,
           log.file = "_temp_.log", 
           macro = sas.get.macro,
           data.frame.out = existsFunction("data.frame"), 
           clean.up = FALSE,
           quiet = FALSE,
           temp = tempfile("SaS"), 
           formats=TRUE,
           recode=formats, 
           special.miss=FALSE,
           sasprog="sas",
           as.is=.5,
           check.unique.id=TRUE,
           force.single=FALSE,
           pos,
           uncompress=FALSE,
           defaultencoding="latin1")
{
  if(force.single) stop('force.single does not work under R')
  dates. <- match.arg(dates.)

  fexists <- function(name) {
    w <- file.exists(name)
    attr(w, 'which') <- name[w]
    w
  }

  file.is.dir <- function(name) {
    isdir <- file.info(name)$isdir
    isdir && !is.na(isdir)
  }

  file.is.readable <- function(name) file.access(name,4)==0

  fileShow <- function(x) file.show(x)

  if(recode) formats <- TRUE

  if(missing(formats) || formats) {
    ## *****  Next line begins mod from Mike Kattan edits 11 Sep 97
    ## Redone FEH 22Oct00
    no.format <- all(!fexists(file.path(format.library,
                                        c('formats.sc2','formats.sct','formats.sct01','formats.sas7bcat'))))
    if(no.format) {
      if((!missing(formats) && formats) || (!missing(recode) && recode))
        warning(paste(paste(format.library, 
                            "/formats.sc? or formats.sas7bcat",sep = ""), 
                      " not found. Formatting ignored. \n"))
      formats <- recode <- FALSE
    }
    ## ***** End Mike Kattan edits 11 Sep 97
  }
  
  ## 5 Changes here from Claudie Berger <[email protected]> 19feb00
  ## Allows work on sas v7.
  sasin   <- paste(temp, ".3.sas", sep = "")
  sasout1 <- paste(temp, ".1.sas", sep = "")
  sasout2 <- paste(temp, ".2.sas", sep = "")
  sasout3 <- paste(temp, ".4.sas", sep = "")
  sasout4 <- paste(temp, ".5.sas", sep = "")
  nvariables <- length(variables)
  if(nvariables>0) {
    if(any(jdup <- duplicated(variables)))
      stop(paste("duplicate variables requested: ", variables[jdup]))
  }
  
  varstring <- paste(variables, collapse = "\n ")
  ifs <- paste("'",paste(ifs, collapse = ";\n "),"'",sep="")
  if(length(sasin) != 1)
    stop("Illegal temporary file name")
  
  temp.files <- c(sasin, sasout1, sasout2, sasout3, sasout4)
  if(!keep.log)
    temp.files <- c(temp.files, log.file)
  
  if(clean.up)
    on.exit(unlink(temp.files))
  ##on.exit(sys(paste("rm -f", paste(temp.files, collapse = " "))))
  ##  4oct03
  
  if(missing(member))
    stop("SAS member name is required")
  
  if(missing(libraryName))
    stop("SAS library name is required")

  ## Encoding added by Reinhold Koch 24Jan14 <[email protected]>
  cat("%LET DEFAULTE=", defaultencoding, ";\n", sep="", file=sasin)
  cat(macro, sep="\n", file=sasin, append=TRUE)

  sasds.suffix <- c('sd2','sd7','ssd01','ssd02','ssd03','ssd04','sas7bdat') 
  ## 22Oct00

  if(libraryName == "") libraryName <- "."
  if(!file.is.dir(libraryName))
    stop(paste(sep = "", "library, \"", libraryName, 
               "\", is not a directory"))

  unix.file <- file.path(libraryName, paste(member, sasds.suffix, sep="."))

  if(uncompress) {
    if(any(fe <- fexists(paste(unix.file,".gz",sep=""))))
      system(paste("gunzip ", attr(fe,'which'),'.gz',sep=''))
    else if(any(fe <- fexists(paste(unix.file,".Z",sep=""))))
      system(paste("uncompress ",attr(fe,'which'),'.Z',sep=''))
  }

  if(!any(fe <- fexists(unix.file))) {
    stop(paste(sep = "", "Unix file, \"",
               paste(unix.file,collapse=' '), 
               "\", does not exist"))
  } else {
    file.name <- attr(fe,'which')
    if(!file.is.readable(file.name)) {
      stop(paste(sep = "", 
                 "You do not have read permission for Unix file, \"",
                 file.name, "\""))   # 22Oct00
    }
  }
    
  cat("libname temp '", libraryName, "';\n", file = sasin, append = TRUE,
      sep = "")
  
  ## format.library should contain formats.sct containing user defined
  ## formats used by this dataset.  It must be present.
  cat("libname library '", format.library, "';\n", file = sasin,
      append = TRUE, sep = "")
  cat("%sas_get(temp.", member, ",\n",
      "  ", sasout1, ",\n",
      "  ", sasout2, ",\n",
      "  ", sasout3, ",\n",
      "  ", sasout4, ",\n",
      "  dates=", dates., ",\n",
      "  vars=",  varstring, ",\n",
      "  ifs=",   ifs, ",\n",
      "  formats=", as.integer(formats), "\n,",
      "  specmiss=", as.integer(special.miss), ");\n",
      file = sasin, append = TRUE, sep = "")

  
  status <- system(paste(shQuote(sasprog), shQuote(sasin), "-log",
                         shQuote(log.file)), intern=FALSE)
  ## 24nov03 added output=F
  if(status != 0) {
    if(!quiet && fexists(log.file)) fileShow(log.file)  ## 4oct03
    stop(paste("SAS job failed with status", status))
  }
  ##
  ## Read in the variable information
  ##
  if(!(fexists(sasout1) && fexists(sasout2))) {
    if(!quiet)
      fileShow(log.file)  ## 4oct03
    
    stop("SAS output files not found")
  }
  
  vars <-
    scan(sasout1, list(name = "", type = 0, length = 0,
                       format = "", label = "", n = 0),
         multi.line = FALSE, sep = "\022",
         flush=TRUE, comment.char='', quote='')
  ## Thanks Don MacQueen for scan fix for R
  
  nvar <- length(vars$name)
  if(nvar == 0) {
    if(!quiet)
      fileShow(log.file)  ## 4oct03
    
    stop("First SAS output is empty")
  }
  
  nrow <- vars$n[1]	#n is the same for each variable
  
  ## Read the data in
  ##  We try to be clever about the variable type.  If SAS is character
  ##  use char of course.  If is numeric and length >4, use double.  If
  ##  numeric and length <4, use single.  We could also use the format to
  ##  choose further, if it consists of a number followed by a "."
  ##  can we safely assume integer.
  ##
  type <- ifelse(vars$type == 2, "character(nrow)", 
                 ifelse(force.single,  ##28Mar01
                        "single(nrow)", "double(nrow)"))
  ##BILL: I corrected the macro so the following isn't needed:
  ## get rid of trailing blank on names
  ##	vars$name <- unix("sed 's/ $//'", vars$name)
  inlist <- paste("\"", vars$name, "\"=", type,
                  sep = "", collapse = ", ")
  
  inlist <- parse(text = paste("list(", inlist, ")"))
  ## Inlist would now be the size of the final data structure, if I had
  ## evaluated it.

  ## Read the data
  ds <-
    scan(sasout2, eval(inlist), sep = "\022", multi.line = FALSE,
         flush=TRUE, comment.char='', quote='')
  
  if(length(ds) < nvariables) {
    m <- variables[is.na(match(variables, names(ds)))]
    if(length(m) > 0) {
      warning(paste(length(m), 
                    "requested variables did not exist:", 
                    paste("\"", m, "\"", sep = "", collapse = " "), 
                    "\n\t(use sas.contents())"))
    }
  }
  
  format <- vars$format
  format[format=='$'] <- ' '    # 1Mar00
  label <- vars$label
  name <- vars$name
  esasout3 <- formats && fexists(sasout3)   #added formats && 1/20/93
  if(recode && !esasout3) recode <- FALSE
  FORMATS <- NULL

  if(formats && esasout3) {
    FORMATS <- dget(sasout3)
    if(length(FORMATS)==0) {
      FORMATS <- NULL;
      recode <- FALSE
    }	
  }
  
  smiss <- NULL
  if(special.miss && fexists(sasout4))
    smiss <-
      scan(sasout4, list(name="", code="", obs=integer(1)),
           multi.line=FALSE, flush=TRUE, sep="\022",
           comment.char='', quote='')
  
  sasdateform <- c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy",
                   "julian","qtr","weekdate","weekdatx","weekday","month")
  dateform <- 	
    list(as.name("ddmmmyy"),"m/d/y","y/m/d","d/m/y",as.name("ddmmmyy"),
         "mon year",as.name("ddmmmyy"),"mon",as.name("ddmmmyy"),
         as.name("ddmmmyy"), as.name("ddmmmyy"),"m")
  
  sastimeform <- c("hhmm","hour","mmss","time")
  timeform <- c("h:m","h","m:s","h:m:s")
  sasdatetimeform <- c("datetime","tod")
  datetimeform <- list(list(as.name("ddmmmyy"),"h:m:s"), c("m/d/y"," "))
  z <- "%02d%b%Y"
  dateform4 <-
    c(z,"%02m/%02d/%Y","%Y/%02m/%02d","%02d/%02m/%Y", z,"%02m %Y",
      z,"%02m", z, z, z,"%02m")
  
  timeform4 <- c("%02H:%02M","%02H","%02M:%02S","%02H:%02M:%02S")
  datetimeform4 <- c("%02d%b%Y %02h:%02m:%02s","%02m/%02d/%Y")
  
  ## Don MacQueen
  days.to.adj <- as.numeric(difftime(ISOdate(1970,1,1,0,0,0) , 
                                     ISOdate(1960,1,1,0,0,0), 'days'))
  secs.to.adj <- days.to.adj*24*60*60
  
  for(i in 1:nvar) {
    atr <- list()
    dsi <- ds[[i]]
    fname <- format[i]
    rec <- FALSE
    if(fname!=" ") {
      ff <- fname
      if(dates.=="sas" & (m <- match(fname,sasdateform,0)) >0) {
        ##look for partial dates
        dd <- dsi-floor(dsi)
        ddn <- !is.na(dd)
        if(any(ddn) && any(dd[ddn]!=0)) {
          ll <- 1:length(dd)
          atr$partial.date <- 
            list(month=ll[dd==.5],day=ll[dd==.25],both=ll[dd==.75])
          atr$imputed <- ll[dd!=0]
          dsi <- floor(dsi)
        }
        dsi <- importConvertDateTime(dsi, 'date', 'sas',
                                     form=dateform[m])
        
        if(length(atr$imputed)) 
          attr(dsi,'class') <- c("impute",attr(dsi,'class'))
        
        ff <- NULL
      } else {
        if((m <- match(fname,sastimeform,0)) >0) {
          dsi <- importConvertDateTime(dsi, 'time', 'sas', 
                                       form=timeform[m])
          ff <- NULL			
        } else if((m <- match(fname,sasdatetimeform,0))>0) {
          dsi <- importConvertDateTime(dsi, 'datetime', 'sas',
                                       form=datetimeform[m])
          
          ff <- NULL					
        }
      }

      atr$format <- ff
      if(recode & length(g <- FORMATS[[fname]])) {
        labs <- g$labels
        if(!is.logical(recode)) {
          labs <- if(recode==1) paste(g$values,":",labs,sep="")
                  else paste(labs,"(",g$values,")",sep="")
        }

	dsi <- factor(dsi, g$values, labs)
        atr$sas.codes <- g$values
        rec <- TRUE
      }   
    }

    if(data.frame.out && !rec && vars$type[i]==2 &&
       ((is.logical(as.is) && !as.is) || 
        (is.numeric(as.is) && length(unique(dsi)) < as.is*length(dsi))))
      dsi <- factor(dsi, exclude="") #exclude added 5Mar93
    
    ## For data frames, char. var usually factors
    if(label[i]!=" ")
      label(dsi) <- label[i]  #atr$label <- label[i]
    
    if(length(smiss$name)) {
      j <- smiss$name==name[i]
      if(any(j)) {
        atr$special.miss <- 
          list(codes=smiss$code[j],obs=smiss$obs[j])
        attr(dsi,'class') <- c("special.miss",attr(dsi,'class'))
      }
    }

    if(!is.null(atr))
      attributes(dsi) <- c(attributes(dsi),atr)

    if(missing(pos))
      ds[[i]] <- dsi
    else
      assign(name[i], dsi, pos=pos)				
  }

  if(!missing(pos))
    return(structure(pos, class="pos"))

  atr <- list()
  
  if(missing(id)) {
    if(data.frame.out)
      atr$row.names <- as.character(1:nrow)
  } else {
    idname <- id 
    jj <- match(idname, names(ds), 0)
    if(any(jj==0))
      stop(paste("id variable(s) not in dataset:",
                 paste(idname[jj==0],collapse=" ")))
    
    if(length(idname)==1) {
      id <- ds[[idname]] #Need since not use data.frame
    } else {		 
      id <- as.character(ds[[idname[1]]])
      for(jj in 2:length(idname))
        id <- paste(id, as.character(ds[[idname[jj]]]))
    }

    if(check.unique.id) {
      dup <- duplicated(id)
      if(any(dup))
        warning(paste("duplicate IDs:",
                      paste(id[dup], collapse=" ")))
    }

    if(data.frame.out)
      atr$row.names <- as.character(id)
    else atr$id <- id	
  }

  if(!is.null(FORMATS))
    atr$formats <- FORMATS

  if(data.frame.out)
    atr$class <- "data.frame"

  attributes(ds) <- c(attributes(ds),atr)
  ds
}

importConvertDateTime <- 
  function(x, type=c('date','time','datetime'),
           input=c('sas','spss','dataload'), form) {
  type <- match.arg(type)
  input <- match.arg(input)

  if(input != 'sas' && type != 'date')
    stop('only date variables are support for spss, dataload')
		
  adjdays <- c(sas=3653, spss=141428, dataload=135080)[input]
  ## 1970-1-1 minus 1960-1-1, 1582-10-14, or 1600-3-1
  origin <- c(sas='1960-01-01', spss='1582-10-14', dataload='1600-03-01')[input]
  if(input=='spss') x <- x/86400
  
  switch(type,
         date = structure(x - adjdays, class='Date'),
         time = {
           ## Don MacQueen 3Apr02
           z <- structure(x, class=c('POSIXt','POSIXct'))
           f <- format(z, tz='GMT')
           z <- as.POSIXct(format(z, tz='GMT'), tz='')
           structure(z, class=c('timePOSIXt','POSIXt','POSIXct'))},
         datetime = as.POSIXct(x, origin=origin, tz='GMT'))
#           chron((x - adjdays*86400)/86400,
#                 out.format=c(dates='day mon year', times='h:m:s'))})
}


## Don MacQueen 3Apr02
## slightly modified copy of format.POSIXct() from R base
format.timePOSIXt <- function (x, format = "%H:%M:%S", tz = "",
                               usetz = FALSE, ...) {
  if (!inherits(x, c("timePOSIXt","POSIXct"))) stop("wrong class")
  class(x) <- class(x)[-1]
  structure(format.POSIXlt(as.POSIXlt(x, tz), format, usetz, ...),
            names = names(x))
}

print.timePOSIXt <- function(x, ...) print(format(x, ...))


##if(!.R.) {
## Output format routine needed by chron for usual SAS date format
ddmmmyy <- function(x)
{
  y <- chron::month.day.year(trunc(unclass(x)), attr(x,"origin"))
  yr <- y$year
  m <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct",
         "Nov","Dec")[y$month]
  ifelse(yr<1900 | yr>=2000, paste(y$day,m,yr,sep=""),
         paste(y$day,m,yr-1900,sep=""))
}


## Functions to handle special.miss class
is.special.miss <- function(x, code)
{
  sm <- attr(x, "special.miss")
  if(!length(sm))
    return(rep(FALSE, length(x)))
  
  if(missing(code)) {
    z <- rep(FALSE, length(x))
    z[sm$obs] <- TRUE
  } else {
    z <- rep(FALSE, length(x))
    z[sm$obs[sm$codes==code]] <- TRUE
  }
  
  z
}


"[.special.miss" <- function(x, ..., drop=FALSE)
{
  ats <- attributes(x)
  ats$dimnames <- NULL
  ats$dim <- NULL
  ats$names <- NULL
  attr(x,'class') <- NULL
  y <- x[..., drop = drop]
  if(length(y) == 0)
    return(y)
  
  k <- seq(along=x)
  names(k) <- names(x)
  k <- k[...]
  attributes(y) <- c(attributes(y), ats)
  smiss <- attr(y, "special.miss")
  codes <- rep("ZZ",length(x))
  codes[smiss$obs] <- smiss$codes
  codes <- codes[...]
  which <- codes!="ZZ"
  if(sum(which)) attr(y,"special.miss") <- 
    list(obs=seq(along=k)[codes!="ZZ"],codes=codes[codes!="ZZ"])
  else {
    attr(y,"special.miss") <- NULL
    attr(y,'class') <- attr(y,'class')[attr(y,'class') != "special.miss"]
    if(length(attr(y,'class'))==0)
      attr(y,'class') <- NULL
  }
  
  y
}


format.special.miss <- function(x, ...)
{
  w <-
    if(is.factor(x))
      as.character(x)
    else {
      cl <- attr(x,'class');
      cl <- cl[cl!="special.miss"]
      if(length(cl)) {
        attr(x,'class') <- cl;
        format(x, ...)
      } else format.default(x, ...)
    }
  
  sm <- attr(x, "special.miss")
  names(w) <- names(x)
  if(!length(sm))
    return(w)
  
  w[sm$obs] <- sm$codes
  attr(w,"label") <- attr(w,"special.miss") <- attr(w,"class") <- NULL
  w
}


print.special.miss <- function(x, ...)
{
  sm <- attr(x, "special.miss")
  if(!length(sm)) {
    print.default(x)
    return(invisible())
  }
  
  w <- format.special.miss(x)
  print.default(w, quote=FALSE)
  invisible()
}


sas.codes <- function(object) attr(object, "sas.codes")


code.levels <- function(object) {
  if(length(cod <- attr(object,"sas.codes"))) 
    levels(object) <- paste(cod,":",levels(object),sep="")
  
  object
}


as.data.frame.special.miss <- function(x, row.names = NULL, optional = FALSE, ...)
{
  nrows <- length(x)
  if(is.null(row.names)) {
    ## the next line is not needed for the 1993 version of data.class and is
    ## included for compatibility with 1992 version
    if(length(row.names <- names(x)) == nrows &&
       !anyDuplicated(row.names)) {
    }
    else if(optional)
      row.names <- character(nrows)
    else row.names <- as.character(1:nrows)
  }

  value <- list(x)
  if(!optional)
    names(value) <- deparse(substitute(x))[[1]]

  structure(value, row.names=row.names, class='data.frame')
}


## val{nval}=compress(value)||"" was =value  23mar04
sas.get.macro <-
  c("/* Macro sas_get (modified by F. Harrell 30Jan90, Bill Dunlap Dec90, FH Mar92,",
    "\t\t\tFH Apr95 (extend LENGTH smiss))", 
    "    Sets up for conversion of SAS dataset to S dataset.", 
    "    Arguments:", "\tdataset - name of SAS dataset", 
    "\ttemp1\t- Name of temporary dataset to contain data dictionar (unquoted)",
    "\t\t  default=/tmp/file.1", 
    "\ttemp2\t- Name of temporary dataset to contain ASCII version of SAS", 
    "\t\t  dataset (unquoted)", "\t\t  default=/tmp/file.2", 
    "\ttemp3   - Name of temporary dataset to contain ASCII file with S", 
    "\t\t  program to store format values and labels", 
    "\ttemp4   - Name of temporary dataset to contain ASCII file with", 
    "\t\t  locations of special missing values", 
    "\tdates\t- SAS to store date variables in SAS format ( # days from 1/1/60)",
    "\t\t  (default)", 
    "\t\t- YEARFRAC to store as days from 1/1/1900, divided by 365.25", 
    "\t\t- YEARFRAC2 to store as year + fraction of current year", 
    "\t\t- YYMMDD to store as numeric YYMMDD", 
    "\tvars    - list of variable in dataset that you want returned to S",
    "                  (unquoted, separate variable names with spaces)  If empty,",
    "                  then return all variables.", 
    "        ifs     - sequence of SAS subsetting if statements, (unquoted,",
    "                  separated by semicolons).", 
    "\tformats - 0 (default) - do not create file on temp3 containing S", 
    "\t\t  statements to store format values and labels, 1 do create", 
    "\tspecmiss- 0 (default).  Set to 1 to write a data file on temp4 with",
    "\t\t  the fields: variable name, special missing value code,", 
    "\t\t  observation number", 
    "\tdefencod - default encoding of dataset if it does not specify",
    "                                                                              */",
    "%macro sas_get(dataset,  temp1, temp2, temp3, temp4, dates=SAS, vars=, ifs=, ",
    "\tformats=0, specmiss=0, defencod=&DEFAULTE);", 
    "OPTIONS NOFMTERR;",
    "%LET DSID=%SYSFUNC(open(&dataset,i));",
    "%LET ENCODE=%SCAN(%SYSFUNC(ATTRC(&DSID,ENCODING)),1);",
    "%IF &ENCODE=Default %THEN %LET dataset=&dataset(encoding=&defencod);",
    "%IF %QUOTE(&temp1)=  %THEN %LET temp1=/tmp/file.1;", 
    "%IF %QUOTE(&temp2)=  %THEN %LET temp2=/tmp/file.2;", 
    "%IF %QUOTE(&temp3)=  %THEN %LET temp3=/tmp/file.3;", 
    "%IF %QUOTE(&temp4)=  %THEN %LET temp4=/tmp/file.4;",
    ## Next line had %QUOTE(&ifs),1,\"'\"  31oct02
    "%LET dates=%UPCASE(&dates);", "%LET ifs=%SCAN(%QUOTE(&ifs),1,'');", 
    "%LET _s_=_sav_;", 
    "/* BILL: Can these 2 subsets be combined into one pass of the data? -Frank*/",
    "/* Subset by observation first */", "%IF %QUOTE(&ifs)^= %THEN %DO;", 
    " data _osub_ ;", "  set &dataset ;", "  &ifs ;", 
    " %LET dataset=_osub_ ;", " %END;", "/* Then subset by variable */", 
    "%IF &vars^= %THEN %DO;", " data _vsub_ ;", "  set &dataset ;", 
    "  keep &vars ;", " %LET dataset=_vsub_ ;", " %END;", 
    "proc contents data=&dataset out=&_s_(KEEP=name type length label format nobs ",
    " varnum) noprint; ", "%IF &formats=1 %THEN %DO;", 
    "   PROC FORMAT LIBRARY=LIBRARY CNTLOUT=f(KEEP=fmtname type start end label);",
    "   DATA f; SET f; RETAIN n 0; n+1; IF type=\"C\" THEN fmtname=\"$\"||fmtname;",
    "   PROC SORT DATA=f OUT=f(DROP=n); BY fmtname n; ", 
    "  *Sort by n instead of start for numerics so 13 sorts after 2;", 
    "  *Dont consider formats containing ANY range of values;", 
    "  *Dont consider formats that dont have at least one non-missing (if", 
    "   numeric) starting value.  This gets rid of formats that are used", 
    "   only to label special missing values;", 
    "   DATA f2; SET f; BY fmtname; RETAIN anyrange 0 anynmiss 0;", 
    "      IF FIRST.fmtname THEN DO;anyrange=0;anynmiss=0;END;", 
    "      IF start^=end THEN anyrange=1;", 
    "      IF TYPE=\"C\" THEN anynmiss=1; ", 
    "      ELSE IF (start+0)>. THEN anynmiss=1;", 
    "      IF LAST.fmtname & anynmiss & ^anyrange THEN OUTPUT; KEEP fmtname;",
    "   DATA f; MERGE f f2(IN=in2); BY fmtname; IF in2;", 
    "      IF TYPE=\"N\" THEN DO; IF (start+0)>.;  *S cannot handle special missings;",
    "         END;", "      RENAME fmtname=format start=value; DROP end;", 
    "   PROC SORT DATA=&_s_(KEEP=format) OUT=sform; BY format;", 
    "   DATA sform; SET sform; BY format; IF LAST.format;", 
    "   DATA f; MERGE sform(IN=in1) f(IN=in2); BY format; ", 
    "      IF in1 & in2;", 
    "   *This keeps formats ever used by any variable;", 
    "   DATA _NULL_; SET f END=_eof_; BY format;", 
    "      ARRAY val{*} $ 16 val1-val500; ARRAY lab{*} $ 40 lab1-lab500; ", 
    "      RETAIN done 0 nform 0 nval 0 val1-val500 \" \" lab1-lab500 \" \" bk -1; ",
    "      FILE \"&temp3\" LRECL=4096;", "      IF FIRST.format THEN DO;", 
    "         IF ^done THEN PUT 'list(' @@;  done=1;", 
    "         nform=nform+1; nval=0;", 
    "         format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",", 
    "                                 \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 
    "          IF nform=1 THEN PUT '\"' format +bk '\"=list(' @@;", 
    "         ELSE PUT ', \"' format +bk '\"=list(' @@;", "         END;", 
    "      nval=nval+1; ", 
    "      IF nval>500 THEN DO; ERROR \">500 format values not allowed\";ABORT ABEND;",
    "         END;", '      val{nval}=compress(value)||""; lab{nval}=label; ', 
    "      IF LAST.format THEN DO;", "         PUT \"values=c(\" @@; ", 
    "         DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;", 
    "            IF type=\"N\" THEN PUT val{i} +bk @@;", 
    "            ELSE PUT '\"' val{i} +bk '\"'  @@;", "            END;", 
    "         PUT \"),labels=c(\" @@;", 
    "         DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;", 
    "            PUT '\"' lab{i} +bk '\"' @@;", "            END;", 
    "         PUT \"))\";", "         END;", 
    "      IF _eof_ THEN PUT \")\";", "   %END;", 
    "PROC SORT DATA=&_s_;BY varnum;", "data _null_;", " set &_s_ end=eof;", 
    " FILE \"&temp1\";  RETAIN _bk_ -1;", " if _n_ = 1 then do;", 
    "%IF &specmiss=0 %THEN %LET ofile=_NULL_; ", 
    "%ELSE %LET ofile=smiss(KEEP=vname val obs);", 
    "  put \"data &ofile; set &dataset end=eof;\";", 
    "  put '  file \"&temp2\" RECFM=D LRECL=4096;';", 
    "  put \"  retain __delim 18 _bk_ -1 obs 0; LENGTH _xx_ $ 20 obs 5;obs+1; \";",
    "%IF &specmiss=1 %THEN %DO;", 
    "  put \"LENGTH vname $ 8 val $ 1;\"; %END;", "  end;", 
    " IF type=2 THEN DO;", "  PUT 'FORMAT ' name ';' @;", 
    "  PUT 'IF ' name '=\" \" THEN PUT __delim IB1. @;';", 
    "/* $char added F.H. 24Mar92, dropped  +_bk_ before __delim */", 
    "/* $CHAR. removed FEH 2Aug92, added null FORMAT above, added back +_bk_ */",
    "  PUT 'ELSE PUT ' name '+_bk_ __delim IB1. @;';", "  END;", 
    " ELSE DO; ", "  PUT 'IF ' name '<=.Z THEN _xx_=\"NA\";' @;", 
    "  PUT 'ELSE _xx_=LEFT(PUT(' @;", "  format=UPCASE(format);", 
    "  IF format=\"DATE\"|format=\"MMDDYY\"|format=\"YYMMDD\"|",
    "format=\"DDMMYY\"|format=\"YYQ\"|format=\"MONYY\"|format=\"JULIAN\" THEN DO;",
    "   %IF &dates=SAS %THEN", "    PUT name \",BEST18.)\";", 
    "   %ELSE %IF &dates=YYMMDD %THEN", "    PUT name \",YYMMDD6.)\";", 
    "   %ELSE %IF &dates=YEARFRAC %THEN", 
    "    PUT \"(\" name \"-MDY(1,1,1900))/365.25,7.3)\";", 
    "   %ELSE %IF &dates=YEARFRAC2 %THEN %DO;", 
    "    PUT \"YEAR(\" name \")-1900+(\" name \"-MDY(1,1,YEAR(\" name \")))/\" @;",
    "    PUT \"(MDY(12,31,YEAR(\" name \"))-MDY(1,1,YEAR(\" name \"))+1),7.3)\";",
    "    %END;", "   ;", "   END;\t", 
    "  ELSE DO;PUT name \",BEST18.)\" @;END;", 
    "  PUT ');  PUT _xx_ +_bk_ __delim IB1. @;';  *Added +_bk_ 2Aug92;", 
    "%IF &specmiss=1 %THEN %DO;", 
    "  put 'IF .A<=' name '<=.Z THEN DO;",
    "   vname=\"' name +_bk_ '\"; val=put(' name ',1.); OUTPUT; END;';",
    "  %END;", "  END;", "if eof then PUT 'PUT; RUN;';", "run;", 
    "%include \"&temp1\";", "data _null_; set &_s_;", 
    " retain __delim 18 _bk_ -1; ", " file \"&temp1\" LRECL=4096;", 
    " name=TRANSLATE(name,\".abcdefghijklmnopqrstuvwxyz\",", 
    "\t\t     \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 
    " format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",", 
    "                         \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 
    " put name +_bk_ __delim IB1. type +_bk_ __delim IB1. length +_bk_ __delim IB1.",
    "  format +_bk_ __delim IB1. label +_bk_ __delim IB1. nobs +_bk_ __delim IB1.;",
    "run;", "%IF &specmiss=1 %THEN %DO;", 
    " PROC SORT DATA=smiss OUT=smiss;BY vname val obs;", 
    " DATA _NULL_; SET smiss;FILE \"&temp4\" RECFM=D LRECL=30;", 
    " RETAIN _bk_ -1 __delim 18;", 
    " vname=TRANSLATE(vname,\".abcdefghijklmnopqrstuvwxyz\",", 
    "\t\t       \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");", 
    " PUT vname +_bk_ __delim IB1. val +_bk_ __delim IB1. obs +_bk_ __delim IB1.;",
    " RUN;", " %END;", "%mend sas_get;")


sasxport.get <- function(file, lowernames=TRUE,force.single=TRUE,
                         method=c('read.xport','dataload','csv'),
                         formats=NULL, allow=NULL, out=NULL,
                         keep=NULL, drop=NULL, as.is=0.5, FUN=NULL) {
  method <- match.arg(method)
  if(length(out) && method!='csv')
    stop('out only applies to method="csv"')
  
  rootsoftware <- if(method=='dataload')'dataload'
  else 'sas'

  sasdateform <-
    toupper(c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy",
              "julian","qtr","weekdate","weekdatx","weekday","month"))
  sastimeform     <- toupper(c("hhmm","hour","mmss","time"))
  sasdatetimeform <- toupper(c("datetime","tod"))

  ## Note: unlike read.spss, read.dta, SAS xport reading functions do not
  ## support URLs.  And thanks to Kurt Hornik for https
  if(grepl("^https?://", tolower(file))) {
    tf <- tempfile()
    download.file(file, tf, mode='wb', quiet=TRUE)
    file <- tf
  }

  dsinfo <-
    if(method == 'csv') lookupSASContents(file)
    else lookup.xport(file)

  whichds <-
    if(length(keep))
      keep
    else
      setdiff(names(dsinfo), c(drop,'_CONTENTS_','_contents_'))
  
  ds <- switch(method,
               read.xport= read.xport(file),
               dataload  = read.xportDataload(file, whichds),
               csv       = if(!length(out))
               readSAScsv(file, dsinfo, whichds))

  if(method=='read.xport' && (length(keep) | length(drop)))
    ds <- ds[whichds]

  ## PROC FORMAT CNTLOUT= dataset present?
  fds <- NULL
  if(!length(formats)) {
    fds <- sapply(dsinfo, function(x)
                  all(c('FMTNAME','START','END','MIN','MAX','FUZZ')
                      %in% x$name))
    fds <- names(fds)[fds]
    if(length(fds) > 1) {
      warning('transport file contains more than one PROC FORMAT CNTLOUT= dataset; using only the first')
      fds <- fds[1]
    }
  }
  
  finfo <- NULL
  if(length(formats) || length(fds)) {
    finfo <-
      if(length(formats))
        formats
      else if(length(out))
        readSAScsv(file, dsinfo, fds)
      else ds[[fds]]

    ## Remove leading $ from char format names
    ##  fmtname <- sub('^\\$','',as.character(finfo$FMTNAME))
    fmtname <- as.character(finfo$FMTNAME)
    finfo <- split(finfo[c('START','END','LABEL')], fmtname)
    finfo <- lapply(finfo,
                    function(f)
                    {
                      rb <- function(a)
                        {  # remove leading + trailing blanks
                          a <- sub('[[:space:]]+$', '', as.character(a))
                          sub('^[[:space:]]+', '', a)
                        }

                      st <- rb(f$START)
                      en <- rb(f$END)
                      lab <- rb(f$LABEL)
                      ##j <- is.na(st) | is.na(en)
                      ##  st %in% c('','.','NA') | en %in% c('','.','NA')
                      j <- is.na(st) | is.na(en) | st == '' | en == ''
                      if(any(j)) {
                        warning('NA in code in FORMAT definition; removed')
                        st <- st[!j]; en <- en[!j]; lab <- lab[!j]
                      }

                      if(!all(st==en))
                        return(NULL)

                      list(value = all.is.numeric(st, 'vector'),
                           label = lab)
                    })
  }

  ## Number of non-format datasets
  nods <- length(whichds)
  nds  <- nods - (length(formats) == 0 && length(finfo) > 0)
  which.regular <- setdiff(whichds, fds)
  dsn <- tolower(which.regular)
  
  if((nds > 1) && !length(out)) {
    res <- vector('list', nds)
    names(res) <- gsub('_','.',dsn)
  }

  if(length(FUN)) {
    funout <- vector('list', length(dsn))
    names(funout) <- gsub('_','.',dsn)
  }
  possiblyConvertChar <- if(method=='read.xport')
    (is.logical(as.is) && as.is)  ||
      (is.numeric(as.is) && as.is < 1) else
  (is.logical(as.is) && !as.is) ||
    (is.numeric(as.is) && as.is > 0)
  ## reverse logic because read.xport always converts characters to factors
  j <- 0
  for(k in which.regular) {
    j   <- j + 1
    cat('Processing SAS dataset', k, '\t ')
    w   <-
      if(length(out))
        readSAScsv(file, dsinfo, k)
      else if(nods==1)
        ds
      else ds[[k]]

    cat('.')
    if(!length(w)) {
      cat('Empty dataset', k, 'ignored\n')
      next
    }

    chcase <- if(lowernames) tolower else function(x) x
    nam      <- chcase(makeNames(names(w), allow=allow))
    names(w) <- nam
    dinfo    <- dsinfo[[k]]
    fmt      <- sub('^\\$','',dinfo$format)
    lab      <- dinfo$label
    ndinfo   <- chcase(makeNames(dinfo$name, allow=allow))
    names(lab) <- names(fmt) <- ndinfo
    for(i in 1:length(w)) {
      changed <- FALSE
      x  <- w[[i]]
      fi <- fmt[nam[i]]; names(fi) <- NULL
      if(fi != '' && length(finfo) && (fi %in% names(finfo))) {
        f <- finfo[[fi]]
        if(length(f)) {  ## may be NULL because had a range in format
          x <- factor(x, f$value, f$label)
          attr(x, 'format') <- fi
          changed <- TRUE
        }
      }
      if(is.numeric(x)) {
        if(fi %in% sasdateform) {
          x <- importConvertDateTime(x, 'date', rootsoftware)
          changed <- TRUE
        } else if(fi %in% sastimeform) {
          x <- importConvertDateTime(x, 'time', rootsoftware)
          changed <- TRUE
        } else if(fi %in% sasdatetimeform) {
          x <- importConvertDateTime(x, 'datetime', rootsoftware)
          changed <- TRUE
        } else if(force.single) {
          if(all(is.na(x))) {
            storage.mode(x) <- 'integer'
            changed <- TRUE
          } else if(max(abs(x),na.rm=TRUE) <= (2^31-1) &&
                    all(floor(x) == x, na.rm=TRUE)) {
            storage.mode(x) <- 'integer'
            changed <- TRUE
          }
        }
      } else if(method=='read.xport' && possiblyConvertChar && is.factor(x)) {
        if((is.logical(as.is) && as.is) ||
           (is.numeric(as.is) && length(unique(x)) >= as.is*length(x))) {
          x <- as.character(x)
          changed <- TRUE
        }
      } else if(possiblyConvertChar && is.character(x)) {
        if((is.logical(as.is) && !as.is) || 
           (is.numeric(as.is) && length(unique(x)) < as.is*length(x))) {
          x <- factor(x, exclude='')
          changed <- TRUE
        }
      }

      lz <- lab[nam[i]]
      if(lz != '') {
        names(lz) <- NULL
        label(x)  <- lz
        changed   <- TRUE
      }
      
      if(changed)
        w[[i]] <- x
    }

    cat('.\n')
    if(length(out)) {
      nam <- gsub('_','.',dsn[j])
      assign(nam, w)
      ## ugly, but a way to get actual data frame name into first
      ## argument of save( )
      eval(parse(text=paste('save(',nam,', file="',
                   paste(out, '/', nam,'.rda',sep=''),
                   '", compress=TRUE)',sep='')))
      if(length(FUN) && length(w))
        funout[[nam]] <- FUN(w)

      remove(nam)
    } else if(nds > 1)
      res[[j]] <- w
  }

  if(length(out)) {
    names(dsinfo) <- gsub('_','.',tolower(names(dsinfo)))
    if(length(FUN))
      attr(dsinfo, 'FUN') <- funout

    invisible(dsinfo)
  } else if(nds > 1)
    res
  else w
}

## Use dataload program to create a structure like read.xport does
read.xportDataload <- function(file, dsnames) {
  outf <- substring(tempfile(tmpdir=''),2)
  file.copy(file, paste(tempdir(),outf,sep='/'))
  curwd <- getwd()
  on.exit(setwd(curwd))
  setwd(tempdir())
  n <- length(dsnames)
  w <- vector('list', n); names(w) <- dsnames
  for(a in dsnames) {
    status <- system(paste('dataload', outf, 'zzzz.rda', a),
                     intern=FALSE)
    if(status==0) {
      load('zzzz.rda')
      names(zzzz) <- makeNames(names(zzzz))
      w[[a]] <- zzzz
    }
  }

  w
}

utils::globalVariables(c("NOBS", "memname", "memlabel"))
## Read _contents_.csv and store it like lookup.xport output
lookupSASContents <- function(sasdir) {
  w <- read.csv(paste(sasdir,'_contents_.csv',sep='/'), as.is=TRUE)
  z <- tapply(w$NOBS, w$MEMNAME, function(x)x[1])
  if(any(z == 0)) {
    cat('\nDatasets with 0 observations ignored:\n')
    print(names(z)[z == 0], quote=FALSE)
    w <- subset(w, NOBS > 0)
  }

  w$TYPE <- ifelse(w$TYPE==1, 'numeric', 'character')
  names(w) <- tolower(names(w))
  unclass(split(subset(w,select=-c(memname,memlabel)), w$memname))
}

## Read all SAS csv export files and store in a list
readSAScsv <- function(sasdir, dsinfo, dsnames=names(dsinfo)) {
  sasnobs <- sapply(dsinfo, function(x)x$nobs[1])
  multi <- length(dsnames) > 1
  if(multi) {
    w <- vector('list', length(dsnames))
    names(w) <- dsnames
  }

  for(a in dsnames) {
    z <- read.csv(paste(sasdir,'/',a,'.csv', sep=''),
                  as.is=TRUE, blank.lines.skip=FALSE,
                  comment.char="")

    importedLength <- length(z[[1]])
    if(importedLength != sasnobs[a])
      cat('\nError: NOBS reported by SAS (',sasnobs[a],') for dataset ',
          a,' is not the same as imported length (', importedLength,
          ')\n', sep='')

    if(multi)
      w[[a]] <- z
  }

  if(multi)
    w
  else z
}

sasdsLabels <- function(file)
{
  w <- scan(file, sep='\n', what='', quiet=TRUE)
  i <- grep('Data Set Name:', w)
  if(!length(i))
    return(NULL)
  
  n <- tolower(sub('.*\\.([A-Z0-9\\_]*)[[:space:]]+.*','\\1',w[i]))
  w <- gsub('\t','',w)
  labs <- ifelse(nchar(w[i-1])==0,w[i-2],w[i-1])
  names(labs) <- n
  labs
}
harrelfe/Hmisc documentation built on Oct. 3, 2019, 9:34 p.m.