# obj: an sdcMicroObj-object
# v: either a numeric vector specifying column-indices or a character vector specifying
# column-names
standardizeInput <- function(obj, v) {
if (!inherits(obj, "sdcMicroObj")) {
stop("obj must be an object of class 'sdcMicroObj'!", call. = FALSE)
}
if (is.null(v)) {
return(NULL)
}
if (is.numeric(v)) {
if (all(v %in% 1:ncol(get.sdcMicroObj(obj, type = "origData")))) {
return(v)
} else {
stop("please specify valid column-indices!", call. = FALSE)
}
} else if (is.character(v)) {
m <- match(v, colnames(get.sdcMicroObj(obj, type = "origData")))
if (!any(is.na(m))) {
return(m)
} else {
stop("please specify valid column-names!", call. = FALSE)
}
} else {
stop(
"please specify either a numeric vector specifying column-indices or a character vector containing valid variable names!",
call. = FALSE
)
}
}
#' @name sdcMicroObj-class
#' @param dat The microdata set. A numeric matrix or data frame containing the data.
#' @param keyVars Indices or names of categorical key variables. They must, of
#' course, match with the columns of \sQuote{dat}.
#' @param pramVars Indices or names of categorical variables considered to be pramed.
#' @param numVars Index or names of continuous key variables.
#' @param ghostVars if specified a list which each element being a list of exactly two elements.
#' The first element must be a character vector specifying exactly one variable name that was
#' also specified as a categorical key variable (\code{keyVars}), while the second element is
#' a character vector of valid variable names (that must not be listed as \code{keyVars}).
#' If \code{\link{localSuppression}} or \code{\link{kAnon}} was applied, the resulting
#' suppression pattern for each key-variable is transferred to the depending variables.
#' @param weightVar Indices or name determining the vector of sampling weights.
#' @param hhId Index or name of the cluster ID (if available).
#' @param strataVar Indices or names of stratification variables.
#' @param sensibleVar Indices or names of sensible variables (for l-diversity)
#' @param excludeVars which variables of \code{dat} should not be included in
#' result-object? Users may specify a vector of variable-names available in \code{dat}
#' that were not specified in either \code{keyVars}, \code{numVars}, \code{pramVars},
#' \code{ghostVars}, \code{hhId}, \code{strataVar} or \code{sensibleVar}.
#' @param options additional options (if specified, a list must be used as input)
#' @param seed (numeric) number specifiying the seed which will be set to allow for
#' reproducablity. The number will be rounded and saved as element \code{seed} in slot \code{options}.
#' @param randomizeRecords (logical) if \code{TRUE}, the order of observations in the input microdata set
#' will be randomized.
#' @param alpha numeric between 0 and 1 specifying the fraction on how much keys containing \code{NAs} should
#' contribute to the frequency calculation which is also crucial for risk-estimation.
#' @references Templ, M. and Meindl, B. and Kowarik, A.: \emph{Statistical Disclosure Control for
#' Micro-Data Using the R Package sdcMicro}, Journal of Statistical Software,
#' 67 (4), 1--36, 2015. \doi{10.18637/jss.v067.i04}
#' @export
#' @examples
#' ## we can also specify ghost (linked) variables
#' ## these variables are linked to some categorical key variables
#' ## and have the sampe suppression pattern as the variable that they
#' ## are linked to after \code{\link{localSuppression}} has been applied
#' data(testdata)
#' testdata$electcon2 <- testdata$electcon
#' testdata$electcon3 <- testdata$electcon
#' testdata$water2 <- testdata$water
#'
#' keyVars <- c("urbrur","roof","walls","water","electcon","relat","sex")
#' numVars <- c("expend","income","savings")
#' w <- "sampling_weight"
#'
#' ## we want to make sure that some variables not used as key-variables
#' ## have the same suppression pattern as variables that have been
#' ## selected as key variables. Thus, we are using 'ghost'-variables.
#' ghostVars <- list()
#'
#' ## we want variables 'electcon2' and 'electcon3' to be linked
#' ## to key-variable 'electcon'
#' ghostVars[[1]] <- list()
#' ghostVars[[1]][[1]] <- "electcon"
#' ghostVars[[1]][[2]] <- c("electcon2","electcon3")
#'
#' \donttest{
#' ## donttest because Examples with CPU time > 2.5 times elapsed time
#' ## we want variable 'water2' to be linked to key-variable 'water'
#' ghostVars[[2]] <- list()
#' ghostVars[[2]][[1]] <- "water"
#' ghostVars[[2]][[2]] <- "water2"
#'
#' ## create the sdcMicroObj
#' obj <- createSdcObj(testdata, keyVars=keyVars,
#' numVars=numVars, w=w, ghostVars=ghostVars)
#'
#' ## apply 3-anonymity to selected key variables
#' obj <- kAnon(obj, k=3); obj
#'
#' ## check, if the suppression patterns are identical
#' manipGhostVars <- get.sdcMicroObj(obj, "manipGhostVars")
#' manipKeyVars <- get.sdcMicroObj(obj, "manipKeyVars")
#' all(is.na(manipKeyVars$electcon) == is.na(manipGhostVars$electcon2))
#' all(is.na(manipKeyVars$electcon) == is.na(manipGhostVars$electcon3))
#' all(is.na(manipKeyVars$water) == is.na(manipGhostVars$water2))
#'
#' ## exclude some variables
#' obj <- createSdcObj(testdata, keyVars=c("urbrur","roof","walls"), numVars="savings",
#' weightVar=w, excludeVars=c("relat","electcon","hhcivil","ori_hid","expend"))
#' colnames(get.sdcMicroObj(obj, "origData"))
#' }
createSdcObj <- function(dat, keyVars, numVars=NULL, pramVars=NULL, ghostVars=NULL, weightVar=NULL,
hhId=NULL, strataVar=NULL, sensibleVar=NULL, excludeVars=NULL, options=NULL, seed=NULL,
randomizeRecords=FALSE, alpha=1) {
obj <- new("sdcMicroObj")
if(!is.null(options)){
# check if options is a named list
if(!(is.list(options) && !is.null(names(options)))){
stop("`options` must be either `NULL` or a named list")
}
}
if (!is.null(seed) && is.numeric(seed)) {
ss <- round(seed)
set.seed(ss)
options$seed <- ss
} else {
options$seed <- NA
}
# max_size for undo-functionality (refers to rows of data.frame input
# can be set via env-var `sdcMicro_maxsize_undo`
res <- tryCatch(
expr = as.numeric(Sys.getenv("sdcMicro_maxsize_undo")),
error = function(e) e,
warning = function(w) w
)
if (inherits(res, "error") || is.na(res) || res < 1) {
options$max_size <- 1e5
} else {
options$max_size <- res
}
if (!is.data.frame(dat)) {
dat <- as.data.frame(dat)
}
class(dat) <- "data.frame" # removing e.g data.table attributes
if (randomizeRecords==TRUE) {
dat <- dat[sample(1:nrow(dat)),]
rownames(dat) <- NULL
}
options$randomizeRecords <- randomizeRecords
obj <- set.sdcMicroObj(obj, type="origData", input=list(dat))
usedVars <- c(standardizeInput(obj, keyVars),
standardizeInput(obj, numVars), standardizeInput(obj, pramVars),
standardizeInput(obj, weightVar), standardizeInput(obj, hhId),
standardizeInput(obj, strataVar), standardizeInput(obj, sensibleVar))
if (!is.null(ghostVars)) {
for (i in seq_along(ghostVars)) {
usedVars <- c(usedVars, standardizeInput(obj, ghostVars[[i]][[2]]))
}
}
usedVars <- unique(usedVars)
# exclude variables if required
if (!is.null(excludeVars)) {
excludeVarsInd <- standardizeInput(obj, excludeVars)
if (any(excludeVarsInd %in% usedVars)) {
stop("You have specified variables in 'excludeVars' that cannot be removed!\n")
}
obj@origData <- obj@origData[,-c(excludeVarsInd),drop=FALSE]
obj@deletedVars <- excludeVars
dat <- get.sdcMicroObj(obj, type="origData")
}
# key-variables
keyVarInd <- standardizeInput(obj, keyVars)
TFcharacter <- lapply(dat[, keyVarInd, drop=FALSE], class) %in% "character"
if (any(TFcharacter)) {
for (kvi in which(TFcharacter)) {
dat[, keyVarInd[kvi]] <- as.factor(dat[, keyVarInd[kvi]])
}
}
obj <- set.sdcMicroObj(obj, type="keyVars", input=list(keyVarInd))
obj <- set.sdcMicroObj(obj, type="manipKeyVars", input=list(dat[, keyVarInd, drop=FALSE]))
if (!is.null(pramVars)) {
pramVarInd <- standardizeInput(obj, pramVars)
obj <- set.sdcMicroObj(obj, type="pramVars", input=list(pramVarInd))
# variable only consists of NA values?
all.na <- which(sapply(obj@origData[, pramVars], function(x) {
all(is.na(x))
}))
if (length(all.na) > 0) {
warnMsg <- "at least one pramVar only contains NA values! --> we do not use this variable!\n"
obj <- addWarning(obj, warnMsg=warnMsg, method="createSdcObj", variable=NA)
warning(warnMsg)
obj <- set.sdcMicroObj(obj, type="pramVars", list(get.sdcMicroObj(obj, type="pramVars")[-all.na]))
pramVarInd <- pramVarInd[-all.na]
}
pramData <- dat[, pramVarInd, drop=FALSE]
obj <- set.sdcMicroObj(obj, type="manipPramVars", input=list(dat[, pramVarInd,
drop=FALSE]))
}
# numeric-variables
if (!is.null(numVars)) {
numVarInd <- standardizeInput(obj, numVars)
obj <- set.sdcMicroObj(obj, type="numVars", input=list(numVarInd))
obj <- set.sdcMicroObj(obj, type="manipNumVars", input=list(dat[, numVarInd, drop=FALSE]))
}
# ghostVars
if ( !is.null(ghostVars) ) {
for ( i in seq_along(ghostVars) ) {
gV <- standardizeInput(obj, ghostVars[[i]][[1]])
sV <- standardizeInput(obj, ghostVars[[i]][[2]])
if ( length(gV) != 1 ) {
stop("only one (existing) key variable name can be specified as idenpendent variables in a ghostVars-element!\n")
}
if ( any(sV %in% keyVarInd) ) {
stop("one variables that are no categorical key variables can be specified as dependent variables in a ghostVars element.\n")
}
ghostVars[[i]][[1]] <- gV
ghostVars[[i]][[2]] <- sV
}
obj <- set.sdcMicroObj(obj, type="ghostVars", input=list(ghostVars))
ghostVarInd <- unlist(lapply(ghostVars, function(x) { x[[2]]}))
obj <- set.sdcMicroObj(obj, type ="manipGhostVars", input=list(dat[, ghostVarInd, drop=FALSE]))
}
# weight-variable
if (!is.null(weightVar)) {
weightVarInd <- standardizeInput(obj, weightVar)
obj <- set.sdcMicroObj(obj, type="weightVar", input=list(weightVarInd))
}
# hhId-variable
if (!is.null(hhId)) {
hhIdInd <- standardizeInput(obj, hhId)
obj <- set.sdcMicroObj(obj, type="hhId", input=list(hhIdInd))
}
# strata-variable
if (!is.null(strataVar)) {
strataVarInd <- standardizeInput(obj, strataVar)
obj <- set.sdcMicroObj(obj, type="strataVar", input=list(strataVarInd))
}
# sensible-variable
if (!is.null(sensibleVar)) {
sensibleVarInd <- standardizeInput(obj, sensibleVar)
obj <- set.sdcMicroObj(obj, type="sensibleVar", input=list(sensibleVarInd))
}
options$alpha <- alpha
obj <- set.sdcMicroObj(obj, type="options", input=list(options))
obj <- measure_risk(obj)
obj@originalRisk <- obj@risk
if (length(numVars) > 0) {
obj <- dRisk(obj)
obj <- dUtility(obj)
}
obj
}
computeNumberPrev <- function(obj) {
tmpo <- obj
for (i in 1:1000) {
tmpo <- tmpo@prev
if (is.null(tmpo)) {
return(i - 1)
}
}
}
deletePrevSave <- function(obj, m) {
nprev <- computeNumberPrev(obj)
if (m >= 1 && m <= nprev) {
cmd <- paste("obj@", paste(rep("prev", m), collapse="@"), "<-NULL", sep="")
eval(parse(text=cmd))
}
return(obj)
}
#' nextSdcObj
#'
#' internal function used to provide the undo-functionality.
#'
#' @param obj a \code{\link{sdcMicroObj-class}} object
#' @return a modified \code{\link{sdcMicroObj-class}} object
#' @export
#' @docType methods
nextSdcObj <- function(obj) {
nextSdcObjX(obj)
}
setGeneric("nextSdcObjX", function(obj) {
standardGeneric("nextSdcObjX")
})
setMethod(f="nextSdcObjX", signature=c("sdcMicroObj"), definition=function(obj) {
options <- get.sdcMicroObj(obj, type = "options")
if (("noUndo" %in% options)) {
return(obj)
}
if (nrow(obj@origData) > options$max_size) {
warnMsg <- paste("No previous states are saved because your data set has more than", options$max_size, "observations.\n")
obj <- addWarning(
obj = obj,
warnMsg = warnMsg,
method = "nextSdcObj",
variable = NA
)
warning(warnMsg)
return(obj)
}
if (length(grep("maxUndo", options)) > 0) {
maxUndo <- as.numeric(substr(
x = options[grep("maxUndo", options)],
start = 9,
stop = nchar(options[grep("maxUndo", options)], type = "width")
))
} else {
maxUndo <- 1
}
obj <- deletePrevSave(
obj = obj,
m = maxUndo
)
obj <- set.sdcMicroObj(
object = obj,
type = "prev",
input = list(obj)
)
return(obj)
})
#' Recompute Risk and Frequencies for a sdcMicroObj
#'
#' Recomputation of Risk should be done after manual changing the content of an
#' object of class [sdcMicroObj-class]
#'
#' By applying this function, the dislosure risk is re-estimated and the
#' corresponding slots of an object of class [sdcMicroObj-class] are updated.
#' This function mostly used internally to automatically update the risk after
#' an sdc method is applied.
#'
#' @name calcRisks
#' @param obj a [sdcMicroObj-class] object
#' @param ... no arguments at the moment
#' @seealso [sdcMicroObj-class]
#' @return a [sdcMicroObj-class] object with updated risk values
#' @export
#' @md
#' @examples
#' data(testdata2)
#' sdc <- createSdcObj(testdata2,
#' keyVars=c('urbrur','roof','walls','water','electcon','relat','sex'),
#' numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- calcRisks(sdc)
#'
calcRisks <- function(obj, ...) {
calcRisksX(obj=obj, ...)
}
setGeneric("calcRisksX", function(obj, ...) {
standardGeneric("calcRisksX")
})
setMethod(f="calcRisksX", signature=c("sdcMicroObj"), definition=function(obj, ...) {
risk <- get.sdcMicroObj(obj, type="risk")
modelSet <- (!is.null(risk$model))
suda2Set <- (!is.null(risk$suda2))
obj <- measure_risk(obj)
if (modelSet) {
kv <- colnames(get.sdcMicroObj(obj, type = "manipKeyVars"))
obj <- modRisk(
obj = obj,
method = "IPF",
formulaM = as.formula(paste(" ~ ", paste(kv, collapse="+"))))
}
if (suda2Set) {
obj <- suda2(obj)
}
if (length(get.sdcMicroObj(obj, type="manipNumVars")) > 0) {
obj <- dRisk(obj)
}
obj
})
#' Remove certain variables from the data set inside a sdc object.
#'
#' Extract the manipulated data from an object of class \code{\link{sdcMicroObj-class}}
#'
#' @name extractManipData
#' @param obj object of class \code{\link{sdcMicroObj-class}}
#' @param ignoreKeyVars If manipulated KeyVariables should be returned or the
#' unchanged original variables
#' @param ignorePramVars if manipulated PramVariables should be returned or the
#' unchanged original variables
#' @param ignoreNumVars if manipulated NumericVariables should be returned or
#' the unchanged original variables
#' @param ignoreGhostVars if manipulated Ghost (linked) Variables should be returned or
#' the unchanged original variables
#' @param ignoreStrataVar if manipulated StrataVariables should be returned or
#' the unchanged original variables
#' @param randomizeRecords (logical) specifies, if the output records should be randomized. The following
#' options are possible:
#' \describe{
#' \item{'no'}{default, no randomization takes place}
#' \item{'simple'}{records are just randomly swapped.}
#' \item{'byHH'}{if slot 'hhId' is not \code{NULL}, the clusters defined by this variable are randomized across the dataset. If
#' slot 'hhId' is \code{NULL}, the records or the dataset are randomly changed.}
#' \item{'withinHH'}{if slot 'hhId' is not \code{NULL}, the clusters defined by this variable are randomized across the dataset and
#' additionally, the order of records within the clusters are also randomly changed. If slot 'hhId' is \code{NULL}, the records or the dataset are
#' randomly changed.}}
#' @return a \code{data.frame} containing the anonymized data set
#' @author Alexander Kowarik, Bernhard Meindl
#' @export
#' @examples
#' ## for objects of class sdcMicro:
#' data(testdata2)
#' sdc <- createSdcObj(testdata,
#' keyVars=c('urbrur','roof'),
#' numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- removeDirectID(sdc, var="age")
#' dataM <- extractManipData(sdc)
extractManipData <- function(obj, ignoreKeyVars=FALSE, ignorePramVars=FALSE, ignoreNumVars=FALSE, ignoreGhostVars=FALSE, ignoreStrataVar=FALSE, randomizeRecords="no") {
extractManipDataX(obj, ignoreKeyVars=ignoreKeyVars, ignorePramVars=ignorePramVars, ignoreNumVars=ignoreNumVars,
ignoreGhostVars=ignoreGhostVars, ignoreStrataVar=ignoreStrataVar, randomizeRecords=randomizeRecords)
}
setGeneric("extractManipDataX", function(obj, ignoreKeyVars=FALSE, ignorePramVars=FALSE,
ignoreNumVars=FALSE, ignoreGhostVars=FALSE, ignoreStrataVar=FALSE, randomizeRecords="no") {
standardGeneric("extractManipDataX")
})
setMethod(f="extractManipDataX", signature=c("sdcMicroObj"), definition=function(obj,
ignoreKeyVars=FALSE, ignorePramVars=FALSE, ignoreNumVars=FALSE,
ignoreGhostVars=FALSE, ignoreStrataVar=FALSE, randomizeRecords="no") {
hhid <- clusterid <- newid <- N <- id <- NULL
if (!randomizeRecords %in% c("no","simple","byHH", "withinHH")) {
stop("invalid value in argument 'randomizeRecords'", call. = FALSE)
}
o <- get.sdcMicroObj(obj, type = "origData")
k <- get.sdcMicroObj(obj, type = "manipKeyVars")
p <- get.sdcMicroObj(obj, type = "manipPramVars")
n <- get.sdcMicroObj(obj, type = "manipNumVars")
g <- get.sdcMicroObj(obj, type = "manipGhostVars")
s <- get.sdcMicroObj(obj, type = "manipStrataVar")
origKeys <- o[, colnames(k), drop = FALSE]
if (!is.null(k) && !ignoreKeyVars) {
o[, colnames(k)] <- k
}
if (!is.null(p) && !ignorePramVars) {
o[, colnames(p)] <- p
}
if (!is.null(n) && !ignoreNumVars) {
o[, colnames(n)] <- n
}
if (!is.null(g) && !ignoreGhostVars) {
o[, colnames(g)] <- g
}
if (!is.null(s) && !ignoreStrataVar) {
o$sdcGUI_strataVar <- s
}
## quick and dirty: ensure that keyVars are factors:
if (!is.null(k) && !ignoreKeyVars) {
for (i in seq_len(ncol(k))) {
cc <- class(origKeys[[colnames(k)[i]]])
vname <- colnames(k)[i]
v_p <- o[[vname]]
if (cc != class(v_p)) {
if (cc == "integer") {
o[[vname]] <- as.integer(v_p)
}
if (cc == "character") {
o[[vname]] <- as.character(v_p)
}
if (cc == "numeric") {
o[[vname]] <- as.numeric(v_p)
}
if (cc == "logical") {
o[[vname]] <- as.logical(v_p)
}
}
}
}
if (randomizeRecords!="no") {
hhid <- get.sdcMicroObj(obj, "hhId")
if (is.null(hhid) | randomizeRecords=="simple") {
# just simple randomization
o <- o[sample(1:nrow(o)),]
} else {
tmp <- data.table(id=1:nrow(o), clusterid=o[[hhid]])
setkey(tmp, clusterid)
if (randomizeRecords=="withinHH") {
tmp <- tmp[,lapply(.SD, function(x) {
if(length(x)==1) { return(x)} else { return(sample(x))}
}), by=key(tmp), .SDcols="id"]
}
neworder <- tmp[,.N, by=key(tmp)]
neworder[,clusterid:=sample(clusterid)]
neworder[,newid:=.I]
neworder[,N:=NULL]
tmp <- tmp[neworder]
if (randomizeRecords=="byHH") {
setkey(tmp, newid, id)
}
if (randomizeRecords=="withinHH") {
setkey(tmp, newid)
}
o <- o[tmp$id,]
# randomize order of clusters
if (randomizeRecords=="byHH") {
setkey(tmp, newid, id)
o <- o[tmp$id,]
}
}
rownames(o) <- NULL
}
return(o)
})
addWarning <- function(obj, warnMsg, method, variable=NA) {
if (!inherits(obj, "sdcMicroObj")) {
stop("'obj' must be a of class 'sdcMicroObj'!\n")
}
if (!is.character(method)) {
stop("'method' must be a of class 'character'!\n")
}
if (!is.character(warnMsg)) {
stop("'warnMsg' must be a of class 'character'!\n")
}
addRes <- get.sdcMicroObj(obj, type="additionalResults")
df <- data.frame(warnMsg=warnMsg, method=method, variable=variable, stringsAsFactors=FALSE)
if (is.null(addRes)) {
addRes <- list()
addRes$sdcMicro_warnings <- df
} else {
if (is.null(addRes$sdcMicro_warnings)) {
addRes$sdcMicro_warnings <- df
} else {
addRes$sdcMicro_warnings <- rbind(addRes$sdcMicro_warnings, df)
}
}
obj <- set.sdcMicroObj(obj, type="additionalResults", list(addRes))
obj
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.