R/sdcMicro-package.R

#' @keywords internal
"_PACKAGE"
#' Statistical Disclosure Control (SDC) for the generation of protected
#' microdata for researchers and for public use.
#'
#' This package includes all methods of the popular software mu-Argus plus
#' several new methods. In comparison with mu-Argus the advantages of this
#' package are that the results are fully reproducible even with the included
#' GUI, that the package can be used in batch-mode from other software, that
#' the functions can be used in a very flexible way, that everybody can look
#' at the source code and that no time-consuming meta-data management
#' is necessary. However, the user should have a detailed knowledge about SDC
#' when applying the methods on data.
#'
#' The package is programmed using S4-classes and it comes with a well-defined
#' class structure.
#'
#' The implemented graphical user interface (GUI) for microdata protection
#' serves as an easy-to-handle tool for users who want to use the sdcMicro
#' package for statistical disclosure control but are not used to the native R
#' command line interface.  In addition to that, interactions between objects
#' which result from the anonymization process are provided within the GUI.
#' This allows automated recalculation and display of the
#' frequency counts, individual risk, information loss and data utility after
#' each anonymization step. In addition to that, the code for every
#' anonymization step carried out within the GUI is saved in a script which can
#' then be easily modified and reloaded.
#'
#' \tabular{ll}{ Package: \tab sdcMicro\cr Type: \tab Package\cr Version: \tab
#' 2.5.9\cr Date: \tab 2009-07-22\cr License: \tab GPL 2.0 \cr }
#'
#' @name sdcMicro-package
#' @aliases sdcMicro-package sdcMicro
#' @author Matthias Templ, Alexander Kowarik, Bernhard Meindl
#'
#' Maintainer: Matthias Templ <templ@@statistik.tuwien.ac.at>
#' @references
#' Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R.
#' \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4.
#' \doi{10.1007/978-3-319-50272-4}
#'
#' Templ, M. and Kowarik, A. and Meindl, B.
#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro.
#' \emph{Journal of Statistical Software}, \strong{67} (4), 1--36, 2015. \doi{10.18637/jss.v067.i04}
#'
#' Templ, M. and Meindl, B. \emph{Practical Applications in
#' Statistical Disclosure Control Using R}, Privacy and Anonymity in
#' Information Management Systems, Bookchapter, Springer London, pp. 31-62,
#' 2010. \doi{10.1007/978-1-84996-238-4_3}
#'
#' Kowarik, A. and Templ, M. and Meindl, B. and Fonteneau, F. and Prantner, B.:
#' \emph{Testing of IHSN Cpp Code and Inclusion of New Methods into sdcMicro},
#' in: Lecture Notes in Computer Science, J. Domingo-Ferrer, I. Tinnirello
#' (editors); Springer, Berlin, 2012, ISBN: 978-3-642-33626-3, pp. 63-77.
#' \doi{10.1007/978-3-642-33627-0_6}
#'
#' Templ, M.  \emph{Statistical Disclosure Control for Microdata Using the
#' R-Package sdcMicro}, Transactions on Data Privacy, vol. 1, number 2, pp.
#' 67-85, 2008. \url{http://www.tdp.cat/issues/abs.a004a08.php}
#'
#' Templ, M.  \emph{New Developments in Statistical Disclosure Control and
#' Imputation: Robust Statistics Applied to Official Statistics},
#' Suedwestdeutscher Verlag fuer Hochschulschriften, 2009, ISBN: 3838108280,
#' 264 pages.
#' @examples
#' \donttest{
#' ## example from Capobianchi, Polettini and Lucarelli:
#' data(francdat)
#' f <- freqCalc(francdat, keyVars=c(2, 4:6), w = 8)
#' f
#' f$fk
#' f$Fk
#'
#' ## dealing with missing values:
#' x <- francdat
#' x[3,5] <- NA
#' x[4,2] <- x[4,4] <- NA
#' x[5,6]  <- NA
#' x[6,2]  <- NA
#' f2 <- freqCalc(x, keyVars = c(2, 4:6), w = 8)
#' f2$fk
#' f2$Fk
#'
#' ## individual risk calculation:
#' indivf <- indivRisk(f)
#' indivf$rk
#'
#' ## Local Suppression
#' localS <- localSupp(f, keyVar = 2, threshold = 0.25)
#' f2 <- freqCalc(localS$freqCalc, keyVars=c(2, 4:6), w = 8)
#' indivf2 <- indivRisk(f2)
#' indivf2$rk
#'
#' ## select another keyVar and run localSupp() once again,
#' ## if you think the table is not fully protected
#' data(free1)
#' free1 <- as.data.frame(free1)
#' f <- freqCalc(x = free1, keyVars = 1:3, w = 30)
#' ind <- indivRisk(f)
#' ## and now you can use the interactive plot for individual risk objects:
#' ## plot(ind)
#'
#' ## example from Capobianchi, Polettini and Lucarelli:
#' data(francdat)
#' l1 <- localSuppression(
#'   obj = francdat,
#'   keyVars=c(2, 4:6),
#'   importance = c(1, 3, 2, 4)
#' )
#' l1
#' l1$x
#' l2 <- localSuppression(obj = francdat, keyVars=c(2, 4:6), k = 2)
#' l3 <- localSuppression(obj = francdat, keyVars=c(2, 4:6), k = 4)
#'
#' ## Global recoding:
#' data(free1)
#' free1 <- as.data.frame(free1)
#' free1[, "AGE"] <- globalRecode(
#'   obj = free1[, "AGE"],
#'   breaks = c(1,9,19,29,39,49,59,69,100),
#'   labels = 1:8
#' )
#'
#' ## Top coding:
#' topBotCoding(
#'   obj = free1[, "DEBTS"],
#'   value = 9000,
#'   replacement = 9100,
#'   kind = "top"
#' )
#'
#' ## Numerical Rank Swapping:
#' data(Tarragona)
#' Tarragona1 <- rankSwap(Tarragona, P = 10, K0 = NULL, R0 = NULL)
#'
#' ## Microaggregation:
#' m1 <- microaggregation(Tarragona, method = "onedims", aggr = 3)
#' m2 <- microaggregation(Tarragona, method = "pca", aggr = 3)
#'
#' ## using a subset because of computation time
#' valTable(Tarragona[1:50, ], method = c("simple", "onedims", "pca"))
#'
#' data(microData)
#' microData <- as.data.frame(microData)
#' m_micro <- microaggregation(microData, method = "mdav")
#' summary(m_micro)
#' plotMicro(m_micro, 1, which.plot = 1)  # not enough observations...
#' data(free1)
#' free1 <- as.data.frame(free1)
#' plotMicro(
#'   x = microaggregation(free1[,31:34], method = "onedims"),
#'   p = 1,
#'   which.plot = 1
#' )
#'
#' ## disclosure risk (interval) and data utility:
#' m1 <- microaggregation(Tarragona, method = "onedims", aggr = 3)
#' dRisk(obj = Tarragona, xm = m1$mx)
#' dRisk(obj = Tarragona, xm = m2$mx)
#' dUtility(obj = Tarragona, xm = m1$mx)
#' dUtility(obj = Tarragona, xm = m2$mx)
#'
#' ## Fast generation of synthetic data with approximately
#' ## the same covariance matrix as the original one.
#' data(mtcars)
#' cov(mtcars[, 4:6])
#' df_gen <- dataGen(obj = mtcars[, 4:6], n = 200)
#' cov(df_gen)
#' pairs(mtcars[, 4:6])
#' pairs(df_gen)
#'
#' ## Post-Randomization (PRAM)
#' x <- factor(sample(1:4, 250, replace = TRUE))
#' pr1 <- pram(x)
#' length(which(pr1$x_pram == x))
#' summary(pr1)
#' x2 <- factor(sample(1:4, 250, replace=TRUE))
#' length(which(pram(x2)$x_pram == x2))
#'
#' data(free1)
#' marstat <- as.factor(free1[,"MARSTAT"])
#' marstatPramed <- pram(marstat)
#' summary(marstatPramed)
#'
#' ## The same functionality can also be applied to `sdcMicroObj`-objects
#' data(testdata)
#'
#' ## undo-functionality is by default restricted to data sets
#' ## with <= `1e5` rows; to modify, env-var `sdcMicro_maxsize_undo`
#' ## can be changed before creating a problem instance
#' Sys.setenv("sdcMicro_maxsize_undo" = 1e6)
#'
#' ## create an object
#' testdata$water <- factor(testdata$water)
#' sdc <- createSdcObj(
#'   dat = testdata,
#'   keyVars = c("urbrur", "roof", "walls", "electcon", "water", "relat", "sex"),
#'   numVars = c("expend", "income", "savings"),
#'   w = "sampling_weight"
#' )
#' head(sdc@@manipNumVars)
#'
#' ## Display risk-measures
#' sdc@@risk$global
#' sdc <- dRisk(sdc)
#' sdc@@risk$numeric
#'
#' ## Generation of synthetic data
#' synthdat <- dataGen(sdc)
#'
#' ## use addNoise with default parameters (not suggested)
#' sdc <- addNoise(sdc, variables = c("expend", "income"))
#' head(sdc@@manipNumVars)
#' sdc@@risk$numeric
#'
#' ## undo the last step (remove the added noise)
#' sdc <- undolast(sdc)
#' head(sdc@@manipNumVars)
#' sdc@@risk$numeric
#'
#' ## apply addNoise() with custom parameters
#' sdc <- addNoise(sdc, noise = 0.2)
#' head(sdc@@manipNumVars)
#' sdc@@risk$numeric
#'
#' ## LocalSuppression
#' sdc <- undolast(sdc)
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#' sdc <- localSuppression(sdc)
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#'
#' ## microaggregation
#' sdc <- undolast(sdc)
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#' sdc <- microaggregation(sdc)
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#'
#' ## Post-Randomization
#' sdc <- undolast(sdc)
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#' sdc <- pram(sdc, variables = "water")
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#'
#' ## rankSwap
#' sdc <- undolast(sdc)
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#' sdc <- rankSwap(sdc)
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#' head(sdc@@risk$individual)
#' sdc@@risk$global
#'
#'
#' ## topBotCoding
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#' sdc@@risk$numeric
#' sdc <- topBotCoding(
#'   obj = sdc,
#'   value = 60000000,
#'   replacement = 62000000,
#'   column = "income"
#' )
#' head(get.sdcMicroObj(sdc, type = "manipNumVars"))
#' sdc@@risk$numeric
#'
#' ## LocalRecProg
#' data(testdata2)
#' keyVars <- c("urbrur", "roof", "walls", "water", "sex")
#' w <- "sampling_weight"
#' sdc <- createSdcObj(testdata2,
#'   keyVars = keyVars,
#'   weightVar = w
#' )
#' sdc@@risk$global
#' sdc <- LocalRecProg(sdc)
#' sdc@@risk$global
#'
#' ## Model-based risks using a formula
#' form <- as.formula(paste("~", paste(keyVars, collapse = "+")))
#' sdc <- modRisk(sdc, method = "default", formulaM = form)
#' get.sdcMicroObj(sdc, "risk")$model
#'
#' sdc <- modRisk(sdc, method = "CE", formulaM = form)
#' get.sdcMicroObj(sdc, "risk")$model
#'
#' sdc <- modRisk(sdc, method = "PML", formulaM = form)
#' get.sdcMicroObj(sdc, "risk")$model
#'
#' sdc <- modRisk(sdc, method = "weightedLLM", formulaM = form)
#' get.sdcMicroObj(sdc, "risk")$model
#'
#' sdc <- modRisk(sdc, method = "IPF", formulaM = form)
#' get.sdcMicroObj(sdc, "risk")$model
#' }
NULL
sdcTools/sdcMicro documentation built on March 15, 2024, 12:32 p.m.