R/GUIfunctions.R

Defines functions writeSafeFile subsetMicrodata importProblem readMicrodata tryCatchFn varToNumeric varToFactor removeDirectID generateStrata mergeHouseholdData selectHouseholdData extractLabels changeVarLabel addVarLabels

Documented in generateStrata importProblem mergeHouseholdData readMicrodata removeDirectID selectHouseholdData subsetMicrodata varToFactor varToNumeric writeSafeFile

# Adds attributes to dataframe with variabe label information from list created by extractLabels
#
# For each variable in the dataset, a variable label is added in the label attribute,
# which can be exported by read_dta from haven
#
# @keywords internal
# @param datManip a data.frame with manipulated data (variable names should not be changed)
# @param lab list with label information generated by extractLabels
# @return the dataframe datManip with attrributes containing variable label information
# @author Thijs Benschop
addVarLabels <- function(datManip, lab){
  if (is.null(lab[[1]])) {
    warning("There are no variable labels!")
    return(datManip)
  } else {
    # Collect colnames of datManip and match labels from lab
    # Variable names with no matching names get an empty variable label (e.g. a newly created strata variable)
    newLabels <- cbind(colnames(datManip), unlist(lapply(colnames(datManip), function(x) {
      if (length(which(x == lab[[1]][,"var.name"])) == 0) {
        return("")
      } else {
        lab[[1]][which(x == lab[[1]][,"var.name"]),"var.label"]
      }
    })))

    # Add label attributes to all variables from newLabels
    datManip2 <- lapply(colnames(datManip), function(x) {
      attr(datManip[[x]], "label") <- as.character(newLabels[which(x == newLabels[,1]),2])
      datManip[[x]]
    })

    # Convert to data.frame
    datManip2 <- as.data.frame(datManip2)
    colnames(datManip2) <- colnames(datManip)
    return(datManip2)
  }
}

# Changes a variable label in the list generated by extractLabels
#
# The variable label can be changed for variables that were already in the dataframe used for extractLabels
#
# @keywords internal
# @param lab list with label information generated by extractLabels
# @param varname the variable name
# @param newlabel the new label for variable varname
# @return a list with both variable labels (as matrix) and value labels (as list) (see extractLabels)
# @author Thijs Benschop
changeVarLabel <- function(lab, varname, newlabel){
  # Check whether varname is in lab
  if (!varname %in% lab[[1]][,"var.name"]) {
    stop("The variable is not in the label information/original dataset. Labels can only be changed for variables in the original dataset.")
  }
  # Replace label with newlabel for variable varname
  lab[[1]][which(lab[[1]][,"var.name"] == varname), "var.label"] <- newlabel
  return(lab)
}

# Extracts label information from dataframe read by read_dta from haven package.
#
# Both variable labels and value labels are extracted.
# @keywords internal
# @param dat a data.frame loaded by read_dta from have package
# @return a list with both variable labels (as data.frame) and value labels (as list)
# @author Thijs Benschop
extractLabels <- function(dat){
  # Check whether there are variable labels available
  if (!all(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null))) {
    # Save all variable labels
    varLab <- as.data.frame(cbind(colnames(dat), lapply(dat, function(x){attr(x, "label")})))
    colnames(varLab) <- c("var.name", "var.label")
    rownames(varLab) <- NULL
    # Set to NA values in var.label that have more than one element (value labels)
    varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA
    # Set to NULL values in var.label to NA
    varLab[which(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null)), 2] <- NA
    # Set to NA values in var.label that have more than one element (value labels)
    varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA

    # Convert all variable labels to UTF-8
    nonUTFvarlabel <- NULL
    varLab[, 2] <- unlist(varLab[,2], use.names = FALSE)
    if(any(!validUTF8(varLab[,2]))){
      whichNotUTF8 <- which(!is.na(varLab[,2]) & !validUTF8(varLab[,2]))
      nonUTFvarlabel <- varLab[whichNotUTF8, c(1,2)] # Save list of all labels that aren't encoded in UTF-8
      varLab[whichNotUTF8, 2] <- enc2utf8(varLab[whichNotUTF8, 2])
      varLab[whichNotUTF8, 2] <- iconv(varLab[whichNotUTF8, 2], "UTF-8", "UTF-8", sub='')
      nonUTFvarlabel <- cbind(nonUTFvarlabel, varLab[whichNotUTF8, 2])
    }
  } else {
    varLab <- NULL
    nonUTFvarlabel <- NULL
  }

  # Check whether there are value labels available
  if (!all(sapply(sapply(dat, function(x) { attr(x, "labels") }), is.null))) {
    # Save all value labels for variables of class labelled
    valLab <- lapply(dat, function(x){attr(x, "labels")})
  } else {
    valLab <- NULL
  }
  return(list(varLab, valLab, nonUTFvarlabel))
}

#' Creates a household level file from a dataset with a household structure.
#'
#' It removes individual level variables and selects one record per household based on a household ID. The function can also be used for other hierachical structures.
#'
#' @note It is of great importance that users select a variable with containing information on household-ids and weights in \code{hhVars}.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the variable with the household (cluster) ID
#' @param hhVars character vector with names of all household level variables
#' @return a data.frame with only household level variables and one record per household
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## ori-hid: household-ids; household_weights: sampling weights for households
#' x_hh <- selectHouseholdData(dat=testdata, hhId="ori_hid",
#'   hhVars=c("urbrur", "roof",  "walls", "water", "electcon", "household_weights"))
selectHouseholdData <- function(dat, hhId, hhVars) {
  # Check whether specified variables are available in the data
  if (!all(hhVars %in% colnames(dat))) {
    stop("Some selected household variables aren't available in the data.\nRespecify hhVars\n")
  }
  if (!hhId %in% colnames(dat)) {
    stop("The selected household ID isn't available in the data.\nRespecify hhId\n")
  }
  # Remove any records with missing household ID (these cannot be matched later on in the process)
  res <- dat[stats::complete.cases(dat[,hhId]), ]

  # Keep only one observation per household
  res <- res[which(!duplicated(res[,hhId])),]

  # Sort hhVars on the order of the variables in dat
  hhVars <- colnames(dat)[which(colnames(dat) %in% hhVars)]

  # Drop all variables that are not at the household level
  res <- res[,c(hhId, hhVars), drop=FALSE]
  invisible(res)
}

#' Replaces the raw household-level data with the anonymized household-level data in the full dataset
#' for anonymization of data with a household structure (or other hierarchical structure).
#' Requires a matching household ID in both files.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the household (cluster) ID (identical in both datasets)
#' @param dathh a dataframe with the treated household level data (generated for example with \link{selectHouseholdData})
#' @return a data.frame with the treated household level variables and the raw individual level variables
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## Load data
#' x <- testdata
#' \donttest{
#' ## donttest is necessary because of 
#' ## Examples with CPU time > 2.5 times elapsed time
#' ## caused by using C++ code and/or data.table
#' ## Create household level dataset
#' x_hh <- selectHouseholdData(dat=x, hhId="ori_hid",
#'   hhVars=c("urbrur", "roof",  "walls", "water", "electcon", "household_weights"))
#' ## Anonymize household level dataset and extract data
#' sdc_hh <- createSdcObj(x_hh, keyVars=c('urbrur','roof'), w='household_weights')
#' sdc_hh <- kAnon(sdc_hh, k = 3)
#' x_hh_anon <- extractManipData(sdc_hh)
#'  
#' ## Merge anonymized household level data back into the full dataset
#' x_anonhh <- mergeHouseholdData(x, "ori_hid", x_hh_anon)
#'  
#' ## Anonymize full dataset and extract data
#' sdc_full <- createSdcObj(x_anonhh, keyVars=c('sex', 'age', 'urbrur', 'roof'), w='sampling_weight')
#' sdc_full <- kAnon(sdc_full, k = 3)
#' x_full_anon <- extractManipData(sdc_full)
#' }
mergeHouseholdData <- function(dat, hhId, dathh) {
  # Check whether household ID is available in both datasets
  if (!(hhId %in% colnames(dathh)))
    stop("The selected household ID isn't available in the anonymized household data.\nRespecify hhId")
  if (!(hhId %in% colnames(dat)))
    stop("The selected household ID isn't available in the full dataset.\nRespecify hhId")

  # Drop all variables from the untreated dataset that are in the household level dataset except the household ID and the weight variable
  # The weights from the household level file aren't exported because they either are contained in the complete dataset or generated and can
  # therefore be generated by the user.
  toBeDropped <- colnames(dathh)
  toBeDropped <- toBeDropped[-which(toBeDropped %in% hhId)]

  res <- dat[,-which(colnames(dat) %in% toBeDropped), drop=FALSE]

  # Merge the individual level and household level files by the household ID
  # Individuals without matching household ID in dathh are kept and have all missings for the household level variables
  res <- merge(res, dathh, by = hhId, all.x = TRUE)

  # Move hhId to first column
  res <- res[,c(hhId, colnames(res)[-which(colnames(res) %in% hhId)])]
  invisible(res)
}

#' Generate one strata variable from multiple factors
#'
#' For strata defined by multiple variables (e.g. sex,age,country) one combined
#' variable is generated.
#'
#' @param df a data.frame
#' @param stratavars character vector with variable name
#' @param name name of the newly generated variable
#' @return The original data set with one new column.
#' @author Alexander Kowarik
#' @export
#' @examples
#'
#' x <- testdata
#' x <- generateStrata(x,c("sex","urbrur"),"strataIDvar")
#' head(x)
#'
generateStrata <- function(df, stratavars, name) {
  strata <- rep("", nrow(df))
  for (i in seq_along(stratavars)) {
    strata <- paste(strata, df[, stratavars[i]], sep="")
    if (length(stratavars) > i) {
      strata <- paste(strata, "-", sep="")
    }
  }
  df <- cbind(df, strata)
  colnames(df)[length(colnames(df))] <- name
  return(df)
}

#' Remove certain variables from the data set inside a sdc object.
#'
#' Delete variables without changing anything else in the sdcObject (writing
#' NAs).
#'
#'
#' @name removeDirectID
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the variable(s) to be remove
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @author Alexander Kowarik
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata, keyVars=c('urbrur','roof'),
#'  numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- removeDirectID(sdc, var="age")
removeDirectID <- function(obj, var) {
  removeDirectIDX(obj=obj, var=var)
}

setGeneric("removeDirectIDX", function(obj, var) {
  standardGeneric("removeDirectIDX")
})

setMethod(f="removeDirectIDX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")]
  nV <- colnames(obj@origData)[get.sdcMicroObj(obj, "numVars")]
  wV <- colnames(obj@origData)[get.sdcMicroObj(obj, "weightVar")]
  sV <- colnames(obj@origData)[get.sdcMicroObj(obj, "strataVar")]
  hV <- colnames(obj@origData)[get.sdcMicroObj(obj, "hhId")]

  if (any(var %in% kV))
    stop("A direct identifier should not be seleceted as key variable.\n Therefore it can not be removed.")
  if (any(var %in% nV))
    stop("A direct identifier should not be seleceted as numerical key variable.\n Therefore it can not be removed.")
  if (any(var %in% wV))
    stop("A direct identifier should not be seleceted as weight variable.\n Therefore it can not be removed.")
  if (any(var %in% sV))
    stop("A direct identifier should not be seleceted as strata variable.\n Therefore it can not be removed.")
  if (any(var %in% hV))
    stop("A direct identifier should not be seleceted as cluster ID.\n Therefore it can not be removed.")

  o <- obj@origData
  if (any(!var %in% colnames(o)))
    stop("direct identifier variable not found on data set")
  o <- o[, !colnames(o) %in% var, drop=FALSE]
  obj <- nextSdcObj(obj)
  obj@deletedVars <- c(obj@deletedVars, var)
  obj@origData <- o
  obj
})

#' Change the a keyVariable of an object of class \code{\link{sdcMicroObj-class}} from Numeric to
#' Factor or from Factor to Numeric
#'
#' Change the scale of a variable
#'
#' @name varToFactor
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the keyVariable to change
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata2,
#'   keyVars=c('urbrur','roof','walls','water','electcon','relat','sex'),
#'   numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- varToFactor(sdc, var="urbrur")
#'
varToFactor <- function(obj, var) {
  varToFactorX(obj=obj, var=var)
}
setGeneric("varToFactorX", function(obj, var) {
  standardGeneric("varToFactorX")
})

setMethod(f="varToFactorX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  obj <- nextSdcObj(obj)
  x <- get.sdcMicroObj(obj, type="manipKeyVars")
  x2 <- varToFactor(x, var=var)
  obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
  obj
})

setMethod(f="varToFactorX", signature=c("data.frame"),
definition=function(obj, var) {
  #if ( length(var)!=1) {
  #  stop("More than 1 variable specified in 'var'!\n")
  #}
  if (!all(var %in% colnames(obj))) {
    stop("at least one variable specified in 'var' is not available in 'obj'!\n")
  }
  for (vv in var) {
    obj[[vv]] <- as.factor(obj[[vv]])
  }
  obj
})


#' @export
#' @rdname varToFactor
varToNumeric <- function(obj, var) {
  varToNumericX(obj=obj, var=var)
}

setGeneric("varToNumericX", function(obj, var) {
  standardGeneric("varToNumericX")
})

setMethod(f="varToNumericX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  obj <- nextSdcObj(obj)
  x <- get.sdcMicroObj(obj, type="manipKeyVars")
  suppressWarnings(tmpvar <- as.numeric(as.character(x[, var])))
  x2 <- varToNumeric(x, var=var)
  obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
  obj
})

setMethod(f="varToNumericX", signature=c("data.frame"),
definition=function(obj, var) {
  if (!all(var %in% colnames(obj))) {
    stop("at least one variable specified in 'var' is not available in 'obj'!\n")
  }
  for (vv in var) {
    if (inherits(obj[[vv]], "factor")) {
      obj[[vv]] <- as.numeric(levels(obj[[vv]]))[obj[[vv]]]
    } else{
      obj[[vv]] <- as.numeric(obj[[vv]])
    }
  }
  obj
})

# wrapper for tryCatch()
tryCatchFn <- function(expr) {
  result <- tryCatch({expr},
   error=function(e) {
     return(e)
   })
  return(result)
}

#' readMicrodata
#'
#' reads data from various formats into R. Used in \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @param type which format does the file have. currently allowed values are
#' \itemize{
#' \item \code{sas}
#' \item \code{spss}
#' \item \code{stata}
#' \item \code{R}
#' \item \code{rdf}
#' \item \code{csv}
#' }
#' @param convertCharToFac (logical) if TRUE, all character vectors are automatically
#' converted to factors
#' @param drop_all_missings (logical) if TRUE, all variables that contain NA-values only
#' will be dropped
#' @param ... additional parameters. Currently used only if \code{type='csv'} to pass
#' arguments to \code{read.table()}.
#'
#' @note if \code{type} is either \code{'sas'}, \code{'spss'} or \code{'stata'}, values read in as \code{NaN}
#' will be converted to \code{NA}.
#' @return a data.frame or an object of class 'simple.error'. If a stata file was read in, the resulting \code{data.frame}
#' has an additional attribute \code{lab} in which variable and value labels are stored.
#' @author Bernhard Meindl
#' @export
readMicrodata <- function(path, type, convertCharToFac=TRUE, drop_all_missings=TRUE, ...) {
  nonUTFvarname <- NULL
  if (type=="sas") {
    res <- tryCatchFn(haven::read_sas(data_file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    }
  if (type=="spss") {
    res <- tryCatchFn(haven::read_spss(file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    }
  if (type=="stata") {
    res <- tryCatchFn(haven::read_dta(file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    lab <- extractLabels(res)
  }
  if (type=="R") {
    res <- tryCatchFn(get(load(file=path)))
  }
  if (type=="rdf") {
    res <- tryCatchFn(get(paste(path)))
  }
  if (type=="csv") {
    opts <- list(...)
    header <- ifelse(opts$header==TRUE, TRUE, FALSE)
    sep <- opts$sep
    quote <- "\""
    comment.char <- ""
    res <- tryCatchFn(utils::read.table(path, sep=sep, header=header, quote=quote, comment.char=comment.char))
  }
  if (inherits(res, "simpleError")) {
    return(res)
  }
  if (!inherits(res, "data.frame")) {
    res$message <- paste0(res$message,"\ndata read into the system was not of class 'data.frame'!")
    return(res)
  }
  # convert result to clas 'data.frame' if it is a 'tbl_df'...
  if (inherits(res, "tbl_df")) {
    class(res) <- "data.frame"
  }

  # convert NaN to NA if data was read in with haven
  if (type %in% c("sas","spss","stata")) {
    res[is.na(res)] <- NA
  }

  # check if any variable has class 'labelled' or 'haven_labelled' (from haven 2.0.0) and convert it to factors.
  # this might happen if we read data with read_xxx() from haven
  cl_lab <- which(sapply(res, inherits, "labelled") | sapply(res, inherits, "haven_labelled"))
  if (length(cl_lab) > 0) {
    if (length(cl_lab)==1) {
      res[[cl_lab]] <- haven::as_factor(res[[cl_lab]], levels="default")
    } else {
      res[,cl_lab] <- lapply(res[,cl_lab] , function(x) {
        haven::as_factor(x, levels="default")
      })
    }
  }

  if (convertCharToFac) {
    # convert character-variables to factors
    cl_char <- which(sapply(res, class)=="character")
    if (length(cl_char) >0) {
      if (length(cl_char) == 1) {
        res[[cl_char]] <- as.factor(res[[cl_char]])
      } else {
        res[,cl_char] <- lapply(res[,cl_char], as.factor)
      }
    }
  }
  if (drop_all_missings) {
    # drop all variables that are NA-only
    keep <- which(sapply(res, function(x) sum(is.na(x))!=length(x)))
    dropped <- colnames(res)[-keep]
    res <- res[,keep,drop=FALSE]
    # save names of dropped variables
    if(length(dropped) > 0){
      attr(res, "dropped") <- dropped
    }
  }

  # Convert levels in factor and character variables to utf8
  nonUTFvallabels <- data.frame(varName = character(), initLabel = character(0), convLabel = character(0), stringsAsFactors = FALSE)
  for (i in 1:dim(res)[2]) {
    # Character strings
    if (inherits(res[, i], "character")) {
      if (any(!validUTF8(res[,i]))) {
        nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(unique(res[which(!validUTF8(res[,i])),i]))),
          unique(res[which(!validUTF8(res[,i])),i]),
          iconv(enc2utf8(unique(res[which(!validUTF8(res[,i])),i])), "UTF-8", "UTF-8", sub='')))
        res[which(!validUTF8(res[,i])),i] <- enc2utf8(res[which(!validUTF8(res[,i])),i])
        # Remove any non UTF8 characters
        res[which(!validUTF8(res[,i])),i] <- iconv(res[which(!validUTF8(res[,i])),i], "UTF-8", "UTF-8", sub='')
      }
    }
    # Factor variables
    if (inherits(res[, i], "factor")) {
      if (any(!validUTF8(levels(res[,i])))) {
        nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(levels(res[,i])[which(!validUTF8(levels(res[,i])))])),
          levels(res[,i])[which(!validUTF8(levels(res[,i])))],
          iconv(enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))]), "UTF-8", "UTF-8", sub='')))
        # Convert to UTF8, encoding unknown
        levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))])
        # Remove any non UTF8 characters
        levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- iconv(levels(res[,i])[which(!validUTF8(levels(res[,i])))], "UTF-8", "UTF-8", sub='')
      }
    }
  }
  if (!is.null(nonUTFvarname)) {
    if (dim(nonUTFvarname)[1] == 0) {
      nonUTFvarname <- NULL
    }
  }  # Set to NULL if no changed labels
  if (!is.null(nonUTFvallabels)) {
    if (dim(nonUTFvallabels)[1] == 0) {
      nonUTFvallabels <- NULL
    }
  } # Set to NULL if no changed labels

  if (type=="stata") {
    attr(res, "lab") <- lab
  }

  # Collect variable names, variable labels and value labels that were encoded to UTF8
  if (type=="stata") {
    attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels, lab[[3]])
  } else{
    attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels)
  }
  res
}

#' importProblem
#'
#' reads an sdcProblem with code that has been exported within \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @return an object of class \code{sdcMicro_GUI_export} or an object of class 'simple.error'
#' @author Bernhard Meindl
#' @export
importProblem <- function(path) {
  res <- tryCatchFn(get(load(file=path)))
  if (inherits(res, "simpleError")) {
    return(res)
  }
  if (!inherits(res, "sdcMicro_GUI_export")) {
    res$message <- paste0(
      res$message,
      "\ndata read into the system was not of class 'sdcMicro_GUI_export'!"
    )
    return(res)
  }
  res
}

#' subsetMicrodata
#'
#' allows to restrict original data to only a subset. This may be useful to test some anonymization
#' methods. This function will only be used in the graphical user interface \code{\link{sdcApp}}.
#'
#' @param obj an object of class \code{\link{data.frame}} containing micro data
#' @param type algorithm used to sample from original microdata. Currently supported choices are
#' \describe{
#' \item{\code{n_perc}}{ the restricted microdata will be a \code{n-percent} sample of the original microdata.}
#' \item{\code{first_n}}{ only the first \code{n} observations will be used.}
#' \item{\code{every_n}}{ the restricted microdata set consists of every \code{n-th} record.}
#' \item{\code{size_n}}{ a total of \code{n} observations will be randomly drawn.}
#' }
#' @param n numeric vector of length 1 specifying the specific parameter with respect to argument \code{type}.
#' @return an object of class \code{\link{sdcMicroObj-class}} with modified slot \code{@origData}.
#' @author Bernhard Meindl
#' @rdname subsetMicrodata
subsetMicrodata <- function(obj, type, n) {
  if (!type %in% c("n_perc","first_n","every_n","size_n")) {
    stop("invalid value in argument 'type'\n")
  }
  if (n < 1) {
    stop("argument 'n' must be >=1\n")
  }

  dat <- obj
  nrObs <- nrow(dat)
  if (type=="n_perc") {
    ssize <- ceiling((nrObs/100)*n)
    dat <- dat[sample(1:nrObs, ssize),,drop=FALSE]
  }
  if (type=="first_n") {
    dat <- dat[1:n,,drop=F]
  }
  if (type=="every_n") {
    ssize <- (1:nrObs)%%n==1
    dat <- dat[ssize,,drop=F]
  }
  if (type=="size_n") {
    dat <- dat[sample(1:nrObs, n),,drop=F]
  }
  dim(dat)
  return(dat)
}


#' writeSafeFile
#'
#' writes an anonymized dataset to a file. This function should be used in the
#' graphical user interface [sdcApp()] only.
#'
#' @param obj a `data.frame` containing micro data
#' @param randomizeRecords (logical) specifies, if the output records should
#' be randomized. The following options are possible:
#' - `"no"`: default, no randomization takes place
#' - `"simple"`: records are randomly swapped
#' - `"byHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by this
#' variable are randomized across the dataset. If slot `"hhId"` is `NULL`, the
#' records or the dataset are randomly changed.
#' - `"withinHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by
#' this variable are randomized across the dataset and additionally, the order
#' of records within the clusters are also randomly changed. If slot `"hhId"`
#' is `NULL`, the records or the dataset are randomly changed.
#'
#' @param format (character) specifies the output file format. Accepted
#' values are:
#' - `"rdata"`: output will be saved in the R binary file-format
#' - `"sav"`: output will be saved as SPSS-file
#' - `"dta"`: ouput will be saved as STATA-file
#' - `"csv"`: output will be saved as comma seperated (text)-file
#' - `"sas"`: output will be saved as SAS-file (sas7bdat)
#' @param fileOut (character) file to which output should be written
#' @param ... optional arguments used for [utils::write.table()] if
#' argument `"format"` equals `"csv"`
#' @return invisible `NULL` if the file was successfully written
#' @author Bernhard Meindl
#' @rdname writeSafeFile
#' @md
#' @export
writeSafeFile <- function(obj, format, randomizeRecords, fileOut, ...) {
  if (!inherits(obj, "sdcMicroObj")) {
    stop("invalid input in argument 'obj'\n")
  }
  dat <- extractManipData(obj, randomizeRecords=randomizeRecords)

  if (format=="rdata") {
    save(dat, file=fileOut)
  }
  if (format=="sav") {
    haven::write_sav(data=dat, path=fileOut)
  }
  if (format=="sas") {
    haven::write_sas(data=dat, path=fileOut)
  }
  if (format=="dta") {
    # add label information
    inp <- list(...)
    new_labs <- inp$lab
    if (!is.null(new_labs)) {
      # restrict to existing variables in anonymized dataset
      ll1 <- new_labs[[1]]
      ii <- which(ll1$var.name %in% colnames(dat))
      ll1 <- ll1[ii,]
      new_labs[[1]] <- ll1
      ll2 <- new_labs[[2]]
      if (!is.null(ll2)) {
        ii <- which(names(ll2) %in% colnames(dat))
        ll2 <- ll2[ii]
        new_labs[[2]] <- ll2
      }
      dat <- addVarLabels(dat, lab=new_labs)
    }
    haven::write_dta(data=dat, path=fileOut, version=inp$version)
  }
  if (format=="csv") {
    utils::write.table(dat, file=fileOut, ...)
  }
  return(invisible(NULL))
}
sdcTools/sdcMicro documentation built on March 15, 2024, 12:32 p.m.