sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Documented in generateStrata importProblem mergeHouseholdData readMicrodata removeDirectID selectHouseholdData subsetMicrodata varToFactor varToNumeric writeSafeFile

# Adds attributes to dataframe with variabe label information from list created by extractLabels
#
# For each variable in the dataset, a variable label is added in the label attribute,
# which can be exported by read_dta from haven
#
# @keywords internal
# @param datManip a data.frame with manipulated data (variable names should not be changed)
# @param lab list with label information generated by extractLabels
# @return the dataframe datManip with attrributes containing variable label information
# @author Thijs Benschop
addVarLabels <- function(datManip, lab){
  if (is.null(lab[[1]])) {
    warning("There are no variable labels!")
    return(datManip)
  } else {
    # Collect colnames of datManip and match labels from lab
    # Variable names with no matching names get an empty variable label (e.g. a newly created strata variable)
    newLabels <- cbind(colnames(datManip), unlist(lapply(colnames(datManip), function(x) {
      if (length(which(x == lab[[1]][,"var.name"])) == 0) {
        return("")
      } else {
        lab[[1]][which(x == lab[[1]][,"var.name"]),"var.label"]
      }
    })))

    # Add label attributes to all variables from newLabels
    datManip2 <- lapply(colnames(datManip), function(x) {
      attr(datManip[[x]], "label") <- as.character(newLabels[which(x == newLabels[,1]),2])
      datManip[[x]]
    })

    # Convert to data.frame
    datManip2 <- as.data.frame(datManip2)
    colnames(datManip2) <- colnames(datManip)
    return(datManip2)
  }
}

# Changes a variable label in the list generated by extractLabels
#
# The variable label can be changed for variables that were already in the dataframe used for extractLabels
#
# @keywords internal
# @param lab list with label information generated by extractLabels
# @param varname the variable name
# @param newlabel the new label for variable varname
# @return a list with both variable labels (as matrix) and value labels (as list) (see extractLabels)
# @author Thijs Benschop
changeVarLabel <- function(lab, varname, newlabel){
  # Check whether varname is in lab
  if (!varname %in% lab[[1]][,"var.name"]) {
    stop("The variable is not in the label information/original dataset. Labels can only be changed for variables in the original dataset.")
  }
  # Replace label with newlabel for variable varname
  lab[[1]][which(lab[[1]][,"var.name"] == varname), "var.label"] <- newlabel
  return(lab)
}

# Extracts label information from dataframe read by read_dta from haven package.
#
# Both variable labels and value labels are extracted.
# @keywords internal
# @param dat a data.frame loaded by read_dta from have package
# @return a list with both variable labels (as data.frame) and value labels (as list)
# @author Thijs Benschop
extractLabels <- function(dat){
  # Check whether there are variable labels available
  if (!all(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null))) {
    # Save all variable labels
    varLab <- as.data.frame(cbind(colnames(dat), lapply(dat, function(x){attr(x, "label")})))
    colnames(varLab) <- c("var.name", "var.label")
    rownames(varLab) <- NULL
    # Set to NA values in var.label that have more than one element (value labels)
    varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA
    # Set to NULL values in var.label to NA
    varLab[which(sapply(sapply(dat, function(x) { attr(x, "label") }), is.null)), 2] <- NA
    # Set to NA values in var.label that have more than one element (value labels)
    varLab[which(sapply(dat, function(x) { length(attr(x, "label")) }) > 1), 2] <- NA

    # Convert all variable labels to UTF-8
    nonUTFvarlabel <- NULL
    varLab[, 2] <- unlist(varLab[,2], use.names = FALSE)
    if(any(!validUTF8(varLab[,2]))){
      whichNotUTF8 <- which(!is.na(varLab[,2]) & !validUTF8(varLab[,2]))
      nonUTFvarlabel <- varLab[whichNotUTF8, c(1,2)] # Save list of all labels that aren't encoded in UTF-8
      varLab[whichNotUTF8, 2] <- enc2utf8(varLab[whichNotUTF8, 2])
      varLab[whichNotUTF8, 2] <- iconv(varLab[whichNotUTF8, 2], "UTF-8", "UTF-8", sub='')
      nonUTFvarlabel <- cbind(nonUTFvarlabel, varLab[whichNotUTF8, 2])
    }
  } else {
    varLab <- NULL
    nonUTFvarlabel <- NULL
  }

  # Check whether there are value labels available
  if (!all(sapply(sapply(dat, function(x) { attr(x, "labels") }), is.null))) {
    # Save all value labels for variables of class labelled
    valLab <- lapply(dat, function(x){attr(x, "labels")})
  } else {
    valLab <- NULL
  }
  return(list(varLab, valLab, nonUTFvarlabel))
}

#' Creates a household level file from a dataset with a household structure.
#'
#' It removes individual level variables and selects one record per household based on a household ID. The function can also be used for other hierachical structures.
#'
#' @note It is of great importance that users select a variable with containing information on household-ids and weights in \code{hhVars}.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the variable with the household (cluster) ID
#' @param hhVars character vector with names of all household level variables
#' @return a data.frame with only household level variables and one record per household
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## ori-hid: household-ids; household_weights: sampling weights for households
#' x_hh <- selectHouseholdData(dat=testdata, hhId="ori_hid",
#'   hhVars=c("urbrur", "roof",  "walls", "water", "electcon", "household_weights"))
selectHouseholdData <- function(dat, hhId, hhVars) {
  # Check whether specified variables are available in the data
  if (!all(hhVars %in% colnames(dat))) {
    stop("Some selected household variables aren't available in the data.\nRespecify hhVars\n")
  }
  if (!hhId %in% colnames(dat)) {
    stop("The selected household ID isn't available in the data.\nRespecify hhId\n")
  }
  # Remove any records with missing household ID (these cannot be matched later on in the process)
  res <- dat[stats::complete.cases(dat[,hhId]), ]

  # Keep only one observation per household
  res <- res[which(!duplicated(res[,hhId])),]

  # Sort hhVars on the order of the variables in dat
  hhVars <- colnames(dat)[which(colnames(dat) %in% hhVars)]

  # Drop all variables that are not at the household level
  res <- res[,c(hhId, hhVars), drop=FALSE]
  invisible(res)
}

#' Replaces the raw household-level data with the anonymized household-level data in the full dataset
#' for anonymization of data with a household structure (or other hierarchical structure).
#' Requires a matching household ID in both files.
#'
#' @param dat a data.frame with the full dataset
#' @param hhId name of the household (cluster) ID (identical in both datasets)
#' @param dathh a dataframe with the treated household level data (generated for example with \link{selectHouseholdData})
#' @return a data.frame with the treated household level variables and the raw individual level variables
#' @author Thijs Benschop and Bernhard Meindl
#' @export
#' @examples
#' ## Load data
#' x <- testdata
#' \donttest{
#' ## donttest is necessary because of 
#' ## Examples with CPU time > 2.5 times elapsed time
#' ## caused by using C++ code and/or data.table
#' ## Create household level dataset
#' x_hh <- selectHouseholdData(dat=x, hhId="ori_hid",
#'   hhVars=c("urbrur", "roof",  "walls", "water", "electcon", "household_weights"))
#' ## Anonymize household level dataset and extract data
#' sdc_hh <- createSdcObj(x_hh, keyVars=c('urbrur','roof'), w='household_weights')
#' sdc_hh <- kAnon(sdc_hh, k = 3)
#' x_hh_anon <- extractManipData(sdc_hh)
#'  
#' ## Merge anonymized household level data back into the full dataset
#' x_anonhh <- mergeHouseholdData(x, "ori_hid", x_hh_anon)
#'  
#' ## Anonymize full dataset and extract data
#' sdc_full <- createSdcObj(x_anonhh, keyVars=c('sex', 'age', 'urbrur', 'roof'), w='sampling_weight')
#' sdc_full <- kAnon(sdc_full, k = 3)
#' x_full_anon <- extractManipData(sdc_full)
#' }
mergeHouseholdData <- function(dat, hhId, dathh) {
  # Check whether household ID is available in both datasets
  if (!(hhId %in% colnames(dathh)))
    stop("The selected household ID isn't available in the anonymized household data.\nRespecify hhId")
  if (!(hhId %in% colnames(dat)))
    stop("The selected household ID isn't available in the full dataset.\nRespecify hhId")

  # Drop all variables from the untreated dataset that are in the household level dataset except the household ID and the weight variable
  # The weights from the household level file aren't exported because they either are contained in the complete dataset or generated and can
  # therefore be generated by the user.
  toBeDropped <- colnames(dathh)
  toBeDropped <- toBeDropped[-which(toBeDropped %in% hhId)]

  res <- dat[,-which(colnames(dat) %in% toBeDropped), drop=FALSE]

  # Merge the individual level and household level files by the household ID
  # Individuals without matching household ID in dathh are kept and have all missings for the household level variables
  res <- merge(res, dathh, by = hhId, all.x = TRUE)

  # Move hhId to first column
  res <- res[,c(hhId, colnames(res)[-which(colnames(res) %in% hhId)])]
  invisible(res)
}

#' Generate one strata variable from multiple factors
#'
#' For strata defined by multiple variables (e.g. sex,age,country) one combined
#' variable is generated.
#'
#' @param df a data.frame
#' @param stratavars character vector with variable name
#' @param name name of the newly generated variable
#' @return The original data set with one new column.
#' @author Alexander Kowarik
#' @export
#' @examples
#'
#' x <- testdata
#' x <- generateStrata(x,c("sex","urbrur"),"strataIDvar")
#' head(x)
#'
generateStrata <- function(df, stratavars, name) {
  strata <- rep("", nrow(df))
  for (i in seq_along(stratavars)) {
    strata <- paste(strata, df[, stratavars[i]], sep="")
    if (length(stratavars) > i) {
      strata <- paste(strata, "-", sep="")
    }
  }
  df <- cbind(df, strata)
  colnames(df)[length(colnames(df))] <- name
  return(df)
}

#' Remove certain variables from the data set inside a sdc object.
#'
#' Delete variables without changing anything else in the sdcObject (writing
#' NAs).
#'
#'
#' @name removeDirectID
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the variable(s) to be remove
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @author Alexander Kowarik
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata, keyVars=c('urbrur','roof'),
#'  numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- removeDirectID(sdc, var="age")
removeDirectID <- function(obj, var) {
  removeDirectIDX(obj=obj, var=var)
}

setGeneric("removeDirectIDX", function(obj, var) {
  standardGeneric("removeDirectIDX")
})

setMethod(f="removeDirectIDX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")]
  nV <- colnames(obj@origData)[get.sdcMicroObj(obj, "numVars")]
  wV <- colnames(obj@origData)[get.sdcMicroObj(obj, "weightVar")]
  sV <- colnames(obj@origData)[get.sdcMicroObj(obj, "strataVar")]
  hV <- colnames(obj@origData)[get.sdcMicroObj(obj, "hhId")]

  if (any(var %in% kV))
    stop("A direct identifier should not be seleceted as key variable.\n Therefore it can not be removed.")
  if (any(var %in% nV))
    stop("A direct identifier should not be seleceted as numerical key variable.\n Therefore it can not be removed.")
  if (any(var %in% wV))
    stop("A direct identifier should not be seleceted as weight variable.\n Therefore it can not be removed.")
  if (any(var %in% sV))
    stop("A direct identifier should not be seleceted as strata variable.\n Therefore it can not be removed.")
  if (any(var %in% hV))
    stop("A direct identifier should not be seleceted as cluster ID.\n Therefore it can not be removed.")

  o <- obj@origData
  if (any(!var %in% colnames(o)))
    stop("direct identifier variable not found on data set")
  o <- o[, !colnames(o) %in% var, drop=FALSE]
  obj <- nextSdcObj(obj)
  obj@deletedVars <- c(obj@deletedVars, var)
  obj@origData <- o
  obj
})

#' Change the a keyVariable of an object of class \code{\link{sdcMicroObj-class}} from Numeric to
#' Factor or from Factor to Numeric
#'
#' Change the scale of a variable
#'
#' @name varToFactor
#' @docType methods
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param var name of the keyVariable to change
#' @return the modified \code{\link{sdcMicroObj-class}}
#' @keywords methods
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata2,
#'   keyVars=c('urbrur','roof','walls','water','electcon','relat','sex'),
#'   numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- varToFactor(sdc, var="urbrur")
#'
varToFactor <- function(obj, var) {
  varToFactorX(obj=obj, var=var)
}
setGeneric("varToFactorX", function(obj, var) {
  standardGeneric("varToFactorX")
})

setMethod(f="varToFactorX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  obj <- nextSdcObj(obj)
  x <- get.sdcMicroObj(obj, type="manipKeyVars")
  x2 <- varToFactor(x, var=var)
  obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
  obj
})

setMethod(f="varToFactorX", signature=c("data.frame"),
definition=function(obj, var) {
  #if ( length(var)!=1) {
  #  stop("More than 1 variable specified in 'var'!\n")
  #}
  if (!all(var %in% colnames(obj))) {
    stop("at least one variable specified in 'var' is not available in 'obj'!\n")
  }
  for (vv in var) {
    obj[[vv]] <- as.factor(obj[[vv]])
  }
  obj
})


#' @export
#' @rdname varToFactor
varToNumeric <- function(obj, var) {
  varToNumericX(obj=obj, var=var)
}

setGeneric("varToNumericX", function(obj, var) {
  standardGeneric("varToNumericX")
})

setMethod(f="varToNumericX", signature=c("sdcMicroObj"),
definition=function(obj, var) {
  obj <- nextSdcObj(obj)
  x <- get.sdcMicroObj(obj, type="manipKeyVars")
  suppressWarnings(tmpvar <- as.numeric(as.character(x[, var])))
  x2 <- varToNumeric(x, var=var)
  obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(as.data.frame(x2)))
  obj
})

setMethod(f="varToNumericX", signature=c("data.frame"),
definition=function(obj, var) {
  if (!all(var %in% colnames(obj))) {
    stop("at least one variable specified in 'var' is not available in 'obj'!\n")
  }
  for (vv in var) {
    if (inherits(obj[[vv]], "factor")) {
      obj[[vv]] <- as.numeric(levels(obj[[vv]]))[obj[[vv]]]
    } else{
      obj[[vv]] <- as.numeric(obj[[vv]])
    }
  }
  obj
})

# wrapper for tryCatch()
tryCatchFn <- function(expr) {
  result <- tryCatch({expr},
   error=function(e) {
     return(e)
   })
  return(result)
}

#' readMicrodata
#'
#' reads data from various formats into R. Used in \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @param type which format does the file have. currently allowed values are
#' \itemize{
#' \item \code{sas}
#' \item \code{spss}
#' \item \code{stata}
#' \item \code{R}
#' \item \code{rdf}
#' \item \code{csv}
#' }
#' @param convertCharToFac (logical) if TRUE, all character vectors are automatically
#' converted to factors
#' @param drop_all_missings (logical) if TRUE, all variables that contain NA-values only
#' will be dropped
#' @param ... additional parameters. Currently used only if \code{type='csv'} to pass
#' arguments to \code{read.table()}.
#'
#' @note if \code{type} is either \code{'sas'}, \code{'spss'} or \code{'stata'}, values read in as \code{NaN}
#' will be converted to \code{NA}.
#' @return a data.frame or an object of class 'simple.error'. If a stata file was read in, the resulting \code{data.frame}
#' has an additional attribute \code{lab} in which variable and value labels are stored.
#' @author Bernhard Meindl
#' @export
readMicrodata <- function(path, type, convertCharToFac=TRUE, drop_all_missings=TRUE, ...) {
  nonUTFvarname <- NULL
  if (type=="sas") {
    res <- tryCatchFn(haven::read_sas(data_file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    }
  if (type=="spss") {
    res <- tryCatchFn(haven::read_spss(file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    }
  if (type=="stata") {
    res <- tryCatchFn(haven::read_dta(file=path))
    # Convert column names to utf8
    nonUTFvarname <- cbind(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))], iconv(enc2utf8(colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))]), "UTF-8", "UTF-8", sub='')) # Save list of all variable names that aren't encoded in UTF-8
    colnames(res)[which(!validUTF8(colnames(res)) & !is.na(colnames(res)))] <- nonUTFvarname[,2]
    lab <- extractLabels(res)
  }
  if (type=="R") {
    res <- tryCatchFn(get(load(file=path)))
  }
  if (type=="rdf") {
    res <- tryCatchFn(get(paste(path)))
  }
  if (type=="csv") {
    opts <- list(...)
    header <- ifelse(opts$header==TRUE, TRUE, FALSE)
    sep <- opts$sep
    quote <- "\""
    comment.char <- ""
    res <- tryCatchFn(utils::read.table(path, sep=sep, header=header, quote=quote, comment.char=comment.char))
  }
  if (inherits(res, "simpleError")) {
    return(res)
  }
  if (!inherits(res, "data.frame")) {
    res$message <- paste0(res$message,"\ndata read into the system was not of class 'data.frame'!")
    return(res)
  }
  # convert result to clas 'data.frame' if it is a 'tbl_df'...
  if (inherits(res, "tbl_df")) {
    class(res) <- "data.frame"
  }

  # convert NaN to NA if data was read in with haven
  if (type %in% c("sas","spss","stata")) {
    res[is.na(res)] <- NA
  }

  # check if any variable has class 'labelled' or 'haven_labelled' (from haven 2.0.0) and convert it to factors.
  # this might happen if we read data with read_xxx() from haven
  cl_lab <- which(sapply(res, inherits, "labelled") | sapply(res, inherits, "haven_labelled"))
  if (length(cl_lab) > 0) {
    if (length(cl_lab)==1) {
      res[[cl_lab]] <- haven::as_factor(res[[cl_lab]], levels="default")
    } else {
      res[,cl_lab] <- lapply(res[,cl_lab] , function(x) {
        haven::as_factor(x, levels="default")
      })
    }
  }

  if (convertCharToFac) {
    # convert character-variables to factors
    cl_char <- which(sapply(res, class)=="character")
    if (length(cl_char) >0) {
      if (length(cl_char) == 1) {
        res[[cl_char]] <- as.factor(res[[cl_char]])
      } else {
        res[,cl_char] <- lapply(res[,cl_char], as.factor)
      }
    }
  }
  if (drop_all_missings) {
    # drop all variables that are NA-only
    keep <- which(sapply(res, function(x) sum(is.na(x))!=length(x)))
    dropped <- colnames(res)[-keep]
    res <- res[,keep,drop=FALSE]
    # save names of dropped variables
    if(length(dropped) > 0){
      attr(res, "dropped") <- dropped
    }
  }

  # Convert levels in factor and character variables to utf8
  nonUTFvallabels <- data.frame(varName = character(), initLabel = character(0), convLabel = character(0), stringsAsFactors = FALSE)
  for (i in 1:dim(res)[2]) {
    # Character strings
    if (inherits(res[, i], "character")) {
      if (any(!validUTF8(res[,i]))) {
        nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(unique(res[which(!validUTF8(res[,i])),i]))),
          unique(res[which(!validUTF8(res[,i])),i]),
          iconv(enc2utf8(unique(res[which(!validUTF8(res[,i])),i])), "UTF-8", "UTF-8", sub='')))
        res[which(!validUTF8(res[,i])),i] <- enc2utf8(res[which(!validUTF8(res[,i])),i])
        # Remove any non UTF8 characters
        res[which(!validUTF8(res[,i])),i] <- iconv(res[which(!validUTF8(res[,i])),i], "UTF-8", "UTF-8", sub='')
      }
    }
    # Factor variables
    if (inherits(res[, i], "factor")) {
      if (any(!validUTF8(levels(res[,i])))) {
        nonUTFvallabels <- rbind(nonUTFvallabels, cbind(rep(colnames(res)[i], length(levels(res[,i])[which(!validUTF8(levels(res[,i])))])),
          levels(res[,i])[which(!validUTF8(levels(res[,i])))],
          iconv(enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))]), "UTF-8", "UTF-8", sub='')))
        # Convert to UTF8, encoding unknown
        levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- enc2utf8(levels(res[,i])[which(!validUTF8(levels(res[,i])))])
        # Remove any non UTF8 characters
        levels(res[,i])[which(!validUTF8(levels(res[,i])))] <- iconv(levels(res[,i])[which(!validUTF8(levels(res[,i])))], "UTF-8", "UTF-8", sub='')
      }
    }
  }
  if (!is.null(nonUTFvarname)) {
    if (dim(nonUTFvarname)[1] == 0) {
      nonUTFvarname <- NULL
    }
  }  # Set to NULL if no changed labels
  if (!is.null(nonUTFvallabels)) {
    if (dim(nonUTFvallabels)[1] == 0) {
      nonUTFvallabels <- NULL
    }
  } # Set to NULL if no changed labels

  if (type=="stata") {
    attr(res, "lab") <- lab
  }

  # Collect variable names, variable labels and value labels that were encoded to UTF8
  if (type=="stata") {
    attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels, lab[[3]])
  } else{
    attr(res, "nonUTF") <- list(nonUTFvarname, nonUTFvallabels)
  }
  res
}

#' importProblem
#'
#' reads an sdcProblem with code that has been exported within \code{\link{sdcApp}}.
#'
#' @param path a file path
#' @return an object of class \code{sdcMicro_GUI_export} or an object of class 'simple.error'
#' @author Bernhard Meindl
#' @export
importProblem <- function(path) {
  res <- tryCatchFn(get(load(file=path)))
  if (inherits(res, "simpleError")) {
    return(res)
  }
  if (!inherits(res, "sdcMicro_GUI_export")) {
    res$message <- paste0(
      res$message,
      "\ndata read into the system was not of class 'sdcMicro_GUI_export'!"
    )
    return(res)
  }
  res
}

#' subsetMicrodata
#'
#' allows to restrict original data to only a subset. This may be useful to test some anonymization
#' methods. This function will only be used in the graphical user interface \code{\link{sdcApp}}.
#'
#' @param obj an object of class \code{\link{data.frame}} containing micro data
#' @param type algorithm used to sample from original microdata. Currently supported choices are
#' \describe{
#' \item{\code{n_perc}}{ the restricted microdata will be a \code{n-percent} sample of the original microdata.}
#' \item{\code{first_n}}{ only the first \code{n} observations will be used.}
#' \item{\code{every_n}}{ the restricted microdata set consists of every \code{n-th} record.}
#' \item{\code{size_n}}{ a total of \code{n} observations will be randomly drawn.}
#' }
#' @param n numeric vector of length 1 specifying the specific parameter with respect to argument \code{type}.
#' @return an object of class \code{\link{sdcMicroObj-class}} with modified slot \code{@origData}.
#' @author Bernhard Meindl
#' @rdname subsetMicrodata
subsetMicrodata <- function(obj, type, n) {
  if (!type %in% c("n_perc","first_n","every_n","size_n")) {
    stop("invalid value in argument 'type'\n")
  }
  if (n < 1) {
    stop("argument 'n' must be >=1\n")
  }

  dat <- obj
  nrObs <- nrow(dat)
  if (type=="n_perc") {
    ssize <- ceiling((nrObs/100)*n)
    dat <- dat[sample(1:nrObs, ssize),,drop=FALSE]
  }
  if (type=="first_n") {
    dat <- dat[1:n,,drop=F]
  }
  if (type=="every_n") {
    ssize <- (1:nrObs)%%n==1
    dat <- dat[ssize,,drop=F]
  }
  if (type=="size_n") {
    dat <- dat[sample(1:nrObs, n),,drop=F]
  }
  dim(dat)
  return(dat)
}


#' writeSafeFile
#'
#' writes an anonymized dataset to a file. This function should be used in the
#' graphical user interface [sdcApp()] only.
#'
#' @param obj a `data.frame` containing micro data
#' @param randomizeRecords (logical) specifies, if the output records should
#' be randomized. The following options are possible:
#' - `"no"`: default, no randomization takes place
#' - `"simple"`: records are randomly swapped
#' - `"byHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by this
#' variable are randomized across the dataset. If slot `"hhId"` is `NULL`, the
#' records or the dataset are randomly changed.
#' - `"withinHH"`: if slot `"hhId"` is not `NULL`, the clusters defined by
#' this variable are randomized across the dataset and additionally, the order
#' of records within the clusters are also randomly changed. If slot `"hhId"`
#' is `NULL`, the records or the dataset are randomly changed.
#'
#' @param format (character) specifies the output file format. Accepted
#' values are:
#' - `"rdata"`: output will be saved in the R binary file-format
#' - `"sav"`: output will be saved as SPSS-file
#' - `"dta"`: ouput will be saved as STATA-file
#' - `"csv"`: output will be saved as comma seperated (text)-file
#' - `"sas"`: output will be saved as SAS-file (sas7bdat)
#' @param fileOut (character) file to which output should be written
#' @param ... optional arguments used for [utils::write.table()] if
#' argument `"format"` equals `"csv"`
#' @return invisible `NULL` if the file was successfully written
#' @author Bernhard Meindl
#' @rdname writeSafeFile
#' @md
#' @export
writeSafeFile <- function(obj, format, randomizeRecords, fileOut, ...) {
  if (!inherits(obj, "sdcMicroObj")) {
    stop("invalid input in argument 'obj'\n")
  }
  dat <- extractManipData(obj, randomizeRecords=randomizeRecords)

  if (format=="rdata") {
    save(dat, file=fileOut)
  }
  if (format=="sav") {
    haven::write_sav(data=dat, path=fileOut)
  }
  if (format=="sas") {
    haven::write_sas(data=dat, path=fileOut)
  }
  if (format=="dta") {
    # add label information
    inp <- list(...)
    new_labs <- inp$lab
    if (!is.null(new_labs)) {
      # restrict to existing variables in anonymized dataset
      ll1 <- new_labs[[1]]
      ii <- which(ll1$var.name %in% colnames(dat))
      ll1 <- ll1[ii,]
      new_labs[[1]] <- ll1
      ll2 <- new_labs[[2]]
      if (!is.null(ll2)) {
        ii <- which(names(ll2) %in% colnames(dat))
        ll2 <- ll2[ii]
        new_labs[[2]] <- ll2
      }
      dat <- addVarLabels(dat, lab=new_labs)
    }
    haven::write_dta(data=dat, path=fileOut, version=inp$version)
  }
  if (format=="csv") {
    utils::write.table(dat, file=fileOut, ...)
  }
  return(invisible(NULL))
}
sdcTools/sdcMicro documentation built on March 15, 2024, 12:32 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sdcTools/sdcMicro
Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/GUIfunctions.R
In sdcTools/sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Defines functions writeSafeFile subsetMicrodata importProblem readMicrodata tryCatchFn varToNumeric varToFactor removeDirectID generateStrata mergeHouseholdData selectHouseholdData extractLabels changeVarLabel addVarLabels

Documented in generateStrata importProblem mergeHouseholdData readMicrodata removeDirectID selectHouseholdData subsetMicrodata varToFactor varToNumeric writeSafeFile

R Package Documentation

Browse R Packages

We want your feedback!

sdcTools/sdcMicro Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/GUIfunctions.R In sdcTools/sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Defines functions writeSafeFile subsetMicrodata importProblem readMicrodata tryCatchFn varToNumeric varToFactor removeDirectID generateStrata mergeHouseholdData selectHouseholdData extractLabels changeVarLabel addVarLabels

Documented in generateStrata importProblem mergeHouseholdData readMicrodata removeDirectID selectHouseholdData subsetMicrodata varToFactor varToNumeric writeSafeFile

R Package Documentation

Browse R Packages

We want your feedback!

sdcTools/sdcMicro
Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/GUIfunctions.R
In sdcTools/sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation