R/tools.R

Defines functions maxchar saveToExport set.lang varlabel set.label get.label.tables get.label get.origin.codes get.label.name get.lang get.filepath dir.exists13 save.encoding read.encoding

Documented in get.label get.label.name get.label.tables get.lang get.origin.codes maxchar saveToExport set.label set.lang varlabel

#
# Copyright (C) 2014-2021 Jan Marvin Garbuszus and Sebastian Jeworutzki
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.

# Wrapper Around iconv Calls for Code Readability
#
# @param x element to be converted
# @param encoding encoding to be used.
# @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
# @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
read.encoding <- function(x, fromEncoding, encoding) {
  iconv(x,
        from=fromEncoding,
        to=encoding ,
        sub="byte")
}

save.encoding <- function(x, encoding) {
  sapply(x, function(s)
           ifelse(Encoding(s) == "unknown",
                    iconv(s,
                          to=encoding,
                          sub="byte"),
                    iconv(s,  from=Encoding(s),
                          to=encoding,
                          sub="byte")
           )
        )
}

# Function to check if directory exists
# @param x file path
dir.exists13 <-function(x) {
  path <- dirname(x)
  return(file.exists(path))
}

# Construct File Path
#
# @param path path to dta file
# @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
# @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
get.filepath <- function(path="") {
  if (substring(path, 1, 1) == "~") {
    filepath <- path.expand(path)
  } else {
    filepath <- path
  }
  if (!file.exists(filepath)) {
    return("File does not exist.")
  }

  return(filepath)
}

#' Show Default Label Language
#'
#' Displays informations about the defined label languages.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param print \emph{logical.} If \code{TRUE}, print available languages and
#'  default language.
#' @return Returns a list with two components:
#' \describe{
#' \item{languages:}{Vector of label languages used in the dataset}
#' \item{default:}{Name of the actual default label language, otherwise NA}
#' }
#' @details Stata allows to define multiple label sets in different languages.
#'  This functions reports the available languages and the selected default
#'  language.
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @export
get.lang <- function(dat, print=T) {
  ex <- attr(dat, "expansion.fields")

  lang <- list()
  if (length(grep("_lang_list", ex)) > 0) {
    lang$languages <- strsplit(ex[[grep("_lang_list", ex)]][3], " ")[[1]]
  } else {
    lang$languages <- NA
  }
  lang$default <- ifelse(length(grep("_lang_c", ex)) > 0,
                         ex[[grep("_lang_c", ex)]][3],
                         NA)

  if (print) {
    cat("Available languages:\n ")
    cat(paste0(lang$languages, "\n"))
    cat("\nDefault language:\n")
    cat(paste0(" ",lang$default, "\n"))
    return(invisible(lang))
  }

  return(lang)
}

#' Get Names of Stata Label Set
#'
#' Retrieves the Stata label set in the dataset for all or an vector of variable
#' names.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get
#'  names of all label sets.
#' @param lang \emph{character.} Label language. Default language defined by
#'  \code{\link{get.lang}} is used if NA
#' @return Returns an named vector of variable labels
#' @details Stata stores factor labels in variable independent labels sets. This
#'  function retrieves the name of the label set for a variable.
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @export
get.label.name <- function(dat, var.name=NULL, lang=NA) {
  vnames  <- names(dat)
  if (is.na(lang) | lang == get.lang(dat, F)$default) {
    labelsets <- attr(dat, "val.labels")
    names(labelsets) <- vnames
  } else if (is.character(lang)) {
    ex <- attr(dat, "expansion.fields")

    has_no_label_lang <- identical(
      integer(0),
      unlist(lapply(ex, grep, pattern ="_lang_l_"))
    )

    if (has_no_label_lang) {
      return("")
    }

    varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
                      function(x) x[1])
    labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
                            function(x) x[3])
    names(labelsets.tmp) <- varname

    labelsets <- rep("", length(vnames))
    names(labelsets) <- vnames
    labelsets[varname] <- labelsets.tmp[varname]
  }

  if (is.null(var.name)) {
    return(labelsets)
  } else {
    return(labelsets[var.name])
  }
}

#' Get Origin Code Numbers for Factors
#'
#' Recreates the code numbers of a factor as stored in the Stata dataset.
#'
#' @param x \emph{factor.} Factor to obtain code for
#' @param label.table \emph{table.} Table with factor levels obtained by
#'  \code{\link{get.label}}.
#' @return Returns an integer with original codes
#' @details While converting numeric variables into factors, the original code
#'  numbers are lost. This function reconstructs the codes from the attribute
#'  \code{label.table}.
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
#' labname <- get.label.name(dat,"type")
#' labtab <- get.label(dat, labname)
#'
#' # comparsion
#' get.origin.codes(dat$type, labtab)
#' as.integer(dat$type)
#' @export
get.origin.codes <- function(x, label.table) {
  if (is.factor(x)) {
    fac <- as.character(x)
    return(as.integer(label.table[fac]))
  } else {
    message("x is no factor.")
  }
}

#' Get Stata Label Table for a Label Set
#'
#' Retrieve the value labels for a specific Stata label set.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param label.name \emph{character.} Name of the Stata label set
#' @return Returns a named vector of code numbers
#' @details This function returns the table of factor levels which represent
#'  a Stata label set. The name of a label set for a variable can be obtained
#'  by \code{\link{get.label.name}}.
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
#' labname <- get.label.name(dat,"type")
#' get.label(dat, labname)
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @export
get.label <- function(dat, label.name) {
  return(attr(dat, "label.table")[label.name][[1]])
}

#' Get all Stata Label Sets for a Data.frame
#'
#' Retrieve the value labels for all variables.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @return Returns a named list of label tables
#' @details This function returns the factor levels which represent
#'  a Stata label set for all variables.
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
#' get.label.tables(dat)
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @importFrom stats setNames
#' @export
get.label.tables <- function(dat) {
  varnames <- setNames(names(dat), names(dat))
  lapply(varnames, function(varname) get.label(dat, get.label.name(dat, varname)))
}

#' Assign Stata Labels to a Variable
#'
#' Assign value labels from a Stata label set to a variable. If duplicated
#'  labels are found, unique labels will be generated according the following
#'  scheme: "label_(integer code)". Levels without labels will become <NA>.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param var.name \emph{character.} Name of the variable in the data.frame
#' @param lang \emph{character.} Label language. Default language defined by
#'  \code{\link{get.lang}} is used if NA
#' @return Returns a labeled factor
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"),
#'                   convert.factors=FALSE)
#'
#' # compare vectors
#' set.label(dat, "type")
#' dat$type
#'
#' # German label
#' set.label(dat, "type", "de")
#' @export
set.label <- function(dat, var.name, lang=NA) {
  if (is.factor(dat[,var.name])) {
    tmp <- get.origin.codes(dat[,var.name],
                            get.label(dat, get.label.name(dat, var.name)))
  } else {
    tmp <- dat[,var.name]
  }

  labtable <- get.label(dat, get.label.name(dat, var.name, lang))

  #check for duplicated labels
  labcount <- table(names(labtable))
  if (any(labcount > 1)) {


    warning(paste0("\n  ",var.name, ":\n  Duplicated factor levels detected -",
                   "generating unique labels.\n"))
    labdups <- names(labtable) %in% names(labcount[labcount > 1])
    # generate unique labels from assigned label and code number
    names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(",
                                       labtable[labdups], ")")
  }

  return(factor(tmp, levels=labtable,
                labels=names(labtable))
  )
}

#' Get and assign Stata Variable Labels
#'
#' Retrieve or set variable labels for a dataset.
#'
#' @name varlabel
#' @rdname varlabel
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param var.name \emph{character vector.} Variable names. If NULL, get label
#'  for all variables.
#' @param lang \emph{character.} Label language. Default language defined by
#'  \code{\link{get.lang}} is used if NA
#' @param value \emph{character vector.} Character vector of size ncol(data) with variable names.
#' @return Returns an named vector of variable labels
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @aliases varlabel
#' @aliases 'varlabel<-'
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"),
#'                   convert.factors=FALSE)
#'
#' # display variable labels 
#' varlabel(dat)
#' 
#' # display german variable labels
#' varlabel(dat, lang="de")
#' 
#' # display german variable label for brand
#' varlabel(dat, var.name = "brand", lang="de")
#' 
#' # define new variable labels
#' varlabel(dat) <- letters[1:ncol(dat)]
#'
#' # display new variable labels
#' varlabel(dat)
NULL

#' @rdname varlabel
#' @export
varlabel <- function(dat, var.name=NULL, lang=NA) {
  vnames <- names(dat)
  if (is.na(lang) | lang == get.lang(dat, F)$default) {
    varlabel <- attr(dat, "var.labels")
    names(varlabel) <- vnames
  } else if (is.character(lang)) {
    ex <- attr(dat, "expansion.fields")
    varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1])
    varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3])
    names(varlabel) <- varname
  }
  if (is.null(var.name)) {
    # order by data.frame columns and return
    return(varlabel[vnames])
  } else {
    return(varlabel[var.name])
  }
}

#' @rdname varlabel
#' @export
'varlabel<-' <- function(dat, value) {
  nlabs <- ncol(dat)
  if (length(value)==nlabs) {
    attr(dat, "var.labels") <- value
  } else {
      warning(paste("Vector of new labels must have", nlabs, "entries."))
    }
  dat
}


#' Assign Stata Language Labels
#'
#' Changes default label language for a dataset. 
#' Variables with generated labels (option generate.labels=TRUE) are kept unchanged.
#'
#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
#' @param lang \emph{character.} Label language. Default language defined by
#'  \code{\link{get.lang}} is used if NA
#' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels
#'  are generated.
#' @return Returns a data.frame with value labels in language "lang".
#' @examples
#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
#' get.lang(dat)
#' varlabel(dat)
#'
#' # set German label
#' datDE <- set.lang(dat, "de")
#' get.lang(datDE)
#' varlabel(datDE)
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
#' @importFrom stats na.omit
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
  if (is.na(lang) | lang == get.lang(dat, F)$default) {
    return(dat)
  } else if (is.character(lang)) {
    vnames <- names(dat)
    types <- attr(dat, "types")
    label <- attr(dat, "label.table")
    val.labels <- get.label.name(dat, NULL, lang)
    oldval.labels <- get.label.name(dat)
    oldval.labels <- oldval.labels[!is.na(oldval.labels)]
    oldval.labtab <- lapply(oldval.labels, function(x) get.label(dat, x))

    oldlang <- get.lang(dat, F)$default

    cat("Replacing value labels. This might take some time...\n")
    pb <- txtProgressBar(min=1,max=length(val.labels)+1)



    for (i in which(val.labels != "")) {

        labname <- val.labels[i]
        vartype <- types[i]
        labtable <- label[[labname]]
        varname <- names(val.labels)[i]

        # get old codes
        if (is.factor(dat[, varname])) {
          oldlabname <- oldval.labels[names(oldval.labels) == varname]
          oldlabtab <- oldval.labtab[[names(oldlabname)]]
          codes <- get.origin.codes(dat[,varname], oldlabtab)
          varunique <- na.omit(unique(codes))
        } else {
          varunique <- na.omit(unique(dat[,varname]))
        }

        if (labname %in% names(label) & is.factor(dat[,varname])) {
                     
          # assign label if label set is complete
          if (all(varunique %in% labtable)) {

            dat[,varname] <- factor(codes, levels=labtable,
                                    labels=names(labtable))
          }
          # else generate labels from codes
        } else if (generate.factors) {
          names(varunique) <- as.character(varunique)
          gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))

          dat[,varname] <- factor(codes, levels=gen.lab,
                                  labels=names(gen.lab))
        } else {
          warning(paste(vnames[i], "Missing factor labels - no labels assigned.
                        Set option generate.factors=T to generate labels."))
        }

        setTxtProgressBar(pb, i)
    }
    close(pb)

    # Save old default labels to expansion.fields. This is necessary to save
    # original labels for further use.
    vnames <- names(oldval.labels)
    names(oldval.labels) <- NULL
    tmp <- list()
    for (i in seq_along(val.labels)) {
      tmp[[i]] <- c(vnames[i],paste0("_lang_l_",oldlang), oldval.labels[i])
    }
    attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp)

    # variable label
    old.varlabel <- attr(dat, "var.labels")
    tmp <- list()
    for (i in seq_along(old.varlabel)) {
      tmp[[i]] <- c(vnames[i],paste0("_lang_v_", oldlang), old.varlabel[i])
    }
    attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp)

    ex <- attr(dat, "expansion.fields")
    varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1])
    varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3])
    names(varlabel) <- varname
    varlabel.out <- as.character(varlabel[vnames])
    varlabel.out[is.na(varlabel.out)] <- ""
    attr(dat, "var.labels") <- varlabel.out

    # set new default lang and store string as default attributes
    names(val.labels) <- NULL
    attr(dat, "val.labels") <- val.labels
    attr(dat, "expansion.fields")[[
      grep("_lang_c", attr(dat, "expansion.fields"))
      ]][3] <- lang

    return(dat)
  }
  }

#' Check if numeric vector can be expressed as integer vector
#'
#' Compression can reduce numeric vectors as integers if the vector does only
#' contain integer type data.
#'
#' @param x vector of data frame
saveToExport <- function(x) {
  ifelse(any(is.infinite(x)), FALSE, 
         ifelse(any(!is.na(x) & (x > .Machine$integer.max | x < -.Machine$integer.max)), FALSE, 
                isTRUE(all.equal(x, as.integer(x)))))
}


#' Check max char length of data.frame vectors
#'
#' Stata requires us to provide the maximum size of a charactervector as every
#' row is stored in a bit region of this size.
#'
#' Ex: If the max chars size is four, _ is no character in this vector:
#' 1. row: four
#' 3. row: one_
#' 4. row: ____
#'
#' If a character vector contains only missings or is empty, we will assign it a
#' value of one, since Stata otherwise cannot handle what we write.
#'
#' @param x vector of data frame
maxchar <- function(x) {
  z <- max(nchar(x, type="byte"), na.rm = TRUE)

  # Stata does not allow storing a string of size 0
  if (is.infinite(z) | (z == 0))
    z <- 1

  z
}
sjewo/readstata13 documentation built on Aug. 25, 2023, 7:45 a.m.