R/read.dta.R

Defines functions write.dta read.dta

Documented in read.dta write.dta

### This file is part of the 'foreign' package for R.

#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/

read.dta <- function(file, convert.dates = TRUE,
                     convert.factors = TRUE, missing.type = FALSE,
                     convert.underscore = FALSE, warn.missing.labels = TRUE)
{
    if(length(grep("^(http|ftp|https)://", file))) {
        tmp <- tempfile()
        download.file(file, tmp, quiet = TRUE, mode = "wb")
        file <- tmp
        on.exit(unlink(file))
    }
    rval <- .External(do_readStata, file)

    if(convert.underscore)
        names(rval) <- gsub("_", ".", names(rval))

    types <- attr(rval, "types")
    stata.na <- data.frame(type = 251L:255L,
                           min = c(101, 32741, 2147483621, 2^127, 2^1023),
                           inc = c(1,1,1,2^115,2^1011)
                           )


    if(!missing.type) {
        if (abs(attr(rval, "version")) >= 8L) {
            for(v in which(types > 250L)) {
                this.type <- types[v] - 250L
                rval[[v]][rval[[v]] >= stata.na$min[this.type]] <- NA
            }
        }
    } else {
        if (abs(attr(rval, "version")) >= 8L) {
            missings <- vector("list", length(rval))
            names(missings) <- names(rval)
            for(v in which(types > 250L)) {
                this.type <- types[v] - 250L
                nas <- is.na(rval[[v]]) |  rval[[v]] >= stata.na$min[this.type]
                natype <- (rval[[v]][nas] - stata.na$min[this.type])/stata.na$inc[this.type]
                natype[is.na(natype)] <- 0L
                missings[[v]] <- rep(NA, NROW(rval))
                missings[[v]][nas] <- natype
                rval[[v]][nas] <- NA
            }
            attr(rval,"missing") <- missings
        } else
            warning("'missing.type' only applicable to version >= 8 files")
    }

    convert_dt_c <- function(x)
        as.POSIXct((x+0.1)/1000, origin = "1960-01-01") # avoid rounding down

    convert_dt_C <- function(x) {
        ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200
        z <- (x+0.1)/1000 # avoid rounding down
        z <- z - rowSums(outer(z, ls, ">="))
        as.POSIXct(z, origin = "1960-01-01")
    }

    if (convert.dates) {
        ff <- attr(rval, "formats")
        ## dates <- grep("%-*d", ff)
        ## Stata 12 introduced 'business dates'
        ## 'Formats beginning with %t or %-t are Stata's date and time formats.'
        ## but it seems some are earlier.
        ## The dta_115 description suggests this is too inclusive:
        ## 'Stata has an old *%d* format notation and some datasets
        ##  still have them. Format *%d*... is equivalent to modern
        ##  format *%td*... and *%-d*... is equivalent to *%-td*...'

        dates <- if (attr(rval, "version") >= 8L) grep('^%(-|)(d|td)', ff)
        else grep("%-*d", ff)
        ## avoid as.Date in case strptime is messed up
        base <- structure(-3653L, class = "Date") # Stata dates are integer vars
        for(v in dates) rval[[v]] <- structure(base + rval[[v]], class = "Date")

        for(v in grep("%tc", ff)) rval[[v]] <- convert_dt_c(rval[[v]])
        for(v in grep("%tC", ff)) rval[[v]] <- convert_dt_C(rval[[v]])
    }
    if (convert.factors %in% c(TRUE, NA)) {
        if (attr(rval, "version") == 5L)
            warning("cannot read factor labels from Stata 5 files")
        else {
            ll <- attr(rval, "val.labels")
            tt <- attr(rval, "label.table")
            factors <- which(ll != "")
            for(v in factors) {
                labels <- tt[[ll[v]]]
                if (warn.missing.labels && is.null(labels)) {
                    warning(gettextf("value labels (%s) for %s are missing",
                                     sQuote(ll[v]), sQuote(names(rval)[v])),
                            domain = NA)
                    next
                }
                if(!is.na(convert.factors)) {
                    ## some levels don't have labels, so skip
                    if (!all(rval[[v]] %in% c(NA, NaN, tt[[ll[v]]])))
                        next
                }
                rval[[v]] <- factor(rval[[v]], levels=tt[[ll[v]]],
                                    labels=names(tt[[ll[v]]]))
            }
        }
    }

    att <- attributes(rval)
    ##rval <- as.data.frame(rval, stringsAsFactors=FALSE)
    class(rval) <- "data.frame"
    newatt <- attributes(rval)
    newatt <- c(newatt, att[!(names(att) %in% names(newatt))])
    attributes(rval) <- newatt
    rval
}

write.dta <-
    function(dataframe, file, version = 7L,
             convert.dates = TRUE, tz = "GMT",
             convert.factors = c("labels","string","numeric","codes"))
{

    if(!is.data.frame(dataframe))
        stop("The object \"dataframe\" must have class data.frame")
    if (version < 6L) stop("Version must be 6-12")
    if (version == 9L) version <- 8L
    if (version == 11L) version <- 10L
    if (version == 12L) version <- 10L
    if (version > 12L) {
        warning("Version must be 6-12: using 7")
        version <- 7L
    }


    ## assume this is in chars: probably only works for ASCII
    ## But Stata formats are ASCII only
    namelength <- if (version == 6L) 8L else 31L
    oldn <- names(dataframe)
    nn <- abbreviate(oldn, namelength)
    if (any(nchar(nn) > namelength))
        stop("cannot uniquely abbreviate variable names")
    if (any(nchar(oldn) > namelength))
        warning("abbreviating variable names")
    names(dataframe) <- nn
    attr(dataframe,"orig.names") <- oldn

    if (convert.dates) {
        dates <- which(vapply(dataframe,
                              function(x) inherits(x, "Date"), NA))
        for(v in dates)
            dataframe[[v]] <- as.vector(julian(dataframe[[v]],
                                               as.Date("1960-1-1", tz="GMT")))
        dates <- which(vapply(dataframe,
                              function(x) inherits(x, "POSIXt"), NA))
        for(v in dates)
            dataframe[[v]] <- as.vector(round(julian(dataframe[[v]],
                                                     ISOdate(1960,1,1, tz=tz))))
        ## It would be possible to write these as %tc format,
        ## milliseconds since 01jan1960 00:00:00.000
        ## dataframe[[v]] <- 1000*as.vector(as.POSIXct(dataframe[[v]], tz=tz) + 315619200)
    }
    convert.factors <- match.arg(convert.factors)
    factors <- which(vapply(dataframe, is.factor, NA))
    if(convert.factors == "string") {
        for(v in factors)
            dataframe[[v]] <- I(as.character(dataframe[[v]]))
    } else if (convert.factors == "numeric") {
        for(v in factors)
            dataframe[[v]] <- as.numeric(as.character(dataframe[[v]]))
    } else if (convert.factors == "codes") {
        for (v in factors)
            dataframe[[v]] <- as.numeric(dataframe[[v]])
    }

    shortlevels <- function(f) {
        ll <- levels(f)
        if (is.null(ll)) return(NULL)
        ## avoid warning if non-ASCII strings are used (unwisely)
        if (all(nchar(ll, "bytes") <= 80L)) ll else abbreviate(ll, 80L)
    }
    leveltable <- lapply(dataframe, shortlevels)

    if (any(vapply(dataframe, function(x) {
        d <- dim(x)
        !is.null(d) && d[1L] < length(x)
        }, NA)))
        stop("cannot handle multicolumn columns")
    invisible(.External(do_writeStata, file, dataframe, version, leveltable))
}

Try the foreign package in your browser

Any scripts or data that you put into this service are public.

foreign documentation built on June 26, 2024, 9:06 a.m.