Nothing
#' R6 Class defining statistical tables that can be perturbed
#'
#' Such objects are typically generated using [ck_setup()].
#' @md
#' @rdname cellkey_pkg
ck_class <- R6::R6Class("cellkey_obj", cloneable = FALSE,
public = list(
#' @description Create a new table instance
#' @param x an object coercible to a `data.frame`
#' @param rkey either a column name within `x` referring to a variable containing record keys
#' or a single integer(ish) number > `5` that referns to the number of digits for record keys that
#' will be generated internally.
#' @param dims a list containing slots for each variable that should be
#' tabulated. Each slot consists should be created/modified using [sdcHierarchies::hier_create()],
#' [sdcHierarchies::hier_add()] and other functionality from package `sdcHierarchies`.
#' @param w (character) a scalar character referring to a variable in `x` holding sampling
#' weights. If `w` is `NULL` (the default), all weights are assumed to be `1`
#' @param countvars (character) an optional vector containing names of binary (0/1 coded)
#' variables withing `x` that should be included in the problem instance.
#' These variables can later be perturbed.
#' @param numvars (character) an optional vector of numerical variables that can later be tabulated.
#' @return A new `cellkey_obj` object. Such objects (internally) contain the fully computed
#' statistical tables given input microdata (`x`), the hierarchical definitionals (`dims`) as
#' well as the remaining inputs. Intermediate results are stored internally and can only be
#' modified / accessed via the exported public methods described below.
initialize = function(x, rkey, dims, w = NULL, countvars = NULL, numvars = NULL) {
type <- is_perturbed <- NULL
if (!inherits(x, "data.frame")) {
stop("`x` is not coercible to a `data.frame`.", call. = FALSE)
}
x <- as.data.table(x)
# checking record keys
rkeyvar <- .tmpvarname("rkey")
if (is.character(rkey)) {
stopifnot(is_scalar_character(rkey))
stopifnot(rkey %in% names(x))
# check the record keys
rkeys <- x[[rkey]]
check_rkeys(rkeys = rkeys)
x[[rkeyvar]] <- rkeys
} else if (is_scalar_integerish(rkey)) {
if (rkey < 5) {
stop("number of digits for record keys must be >= 5!", call. = FALSE)
}
x[[rkeyvar]] <- ck_generate_rkeys(x, nr_digits = rkey)
} else {
e <- "Argument `rkey` must either be character or integer(ish) number > 0!"
stop(e, call. = FALSE)
}
# checking dimensions
if (!rlang::is_named(dims)) {
stop("Please provide names to argument `dims`", call. = FALSE)
}
if (!is.list(dims)) {
stop("Argument `dims` must be a list.", call. = FALSE)
}
dimvars <- names(dims)
if (!all(dimvars %in% names(x))) {
stop("Some variable names provided in `dims` don't exist in `x`.", call. = FALSE)
}
chk <- sapply(dims, function(x) {
!inherits(x, "sdc_hierarchy")
})
if (any(chk)) {
stop("Some elements provided in `dims` are not valid hierarchy objects.", call. = FALSE)
}
ck_log("compute dim_info")
h_info <- lapply(dims, hier_info)
h_info <- lapply(h_info, function(x) {
data.table::rbindlist(lapply(x, function(y) {
data.frame(
code = y$name,
level = y$level,
is_leaf = y$is_leaf,
parent = y$parent,
stringsAsFactors = FALSE)
}))
})
# checking weights
wvar <- .tmpvarname("weightvar")
if (!is.null(w)) {
if (!w %in% names(x)) {
stop("weight variable `w` does not exist in `x`.", call. = FALSE)
}
if (!is.numeric(x[[w]])) {
stop("weight variable `w` must be numeric.", call. = FALSE)
}
x[[wvar]] <- x[[w]]
is_weighted <- TRUE
} else {
x[[wvar]] <- 1
is_weighted <- FALSE
}
# adding a frequency variable
freqvar <- .tmpvarname("freq")
x[[freqvar]] <- 1
if (!is.null(countvars)) {
# check countvars
if (!is.character(countvars)) {
stop("Argument `countvars` needs to be character specifying variable names.", call. = FALSE)
}
if (!all(countvars %in% names(x))) {
stop("Some variable names provided in `countvars` don't exist in `x`.", call. = FALSE)
}
chk <- sapply(countvars, function(cv) {
!is.numeric(x[[cv]])
})
if (any(chk)) {
stop("Some elements provided in `countvars` are not numeric.", call. = FALSE)
}
# check if all count-vars are integerish 0/1
chk <- sapply(countvars, function(cv) {
all(unique(x[[cv]]) %in% c(0, 1))
})
if (!all(chk)) {
stop("Some elements provided in `countvars` are not 0/1 coded.", call. = FALSE)
}
# compute weighted countvars
countvars_w <- paste0("wc_", countvars)
suppressWarnings(
x[, (countvars_w) := lapply(.SD, function(v) v * get(wvar)), .SDcols = countvars]
)
# record_keys using an indicator
countvars_rec <- paste0("rkey_", countvars)
x[, (countvars_rec) := lapply(.SD, function(v) v * get(rkeyvar)), .SDcols = countvars]
} else {
countvars <- countvars_w <- countvars_rec <- c()
}
if (!is.null(numvars)) {
# check numvars
if (!is.character(numvars)) {
stop("Argument `numvars` needs to be character specifying variable names.", call. = FALSE)
}
if (!all(numvars %in% names(x))) {
stop("Some variable names provided in `numvars` don't exist in `x`.", call. = FALSE)
}
chk <- sapply(numvars, function(nv) {
!is.numeric(x[[nv]])
})
if (any(chk)) {
stop("Some elements provided in `numvars` are not numeric.", call. = FALSE)
}
setnames(x, numvars, tolower(numvars))
numvars <- tolower(numvars)
# compute weighted variables and index to compute record keys for only
# units actually contributing
numvars_w <- paste0("ws_", numvars)
nozero_reckeys_nums <- paste0("rkey_nozero_", numvars)
numvars_ind <- paste0("ind_nonzero", numvars)
for (i in seq_along(numvars)) {
v <- numvars[i]
x[[numvars_w[i]]] <- x[[wvar]] * x[[v]]
# compute index
index <- rep(1, nrow(x))
index[is.na(x[[v]])] <- 0
index[is.nan(x[[v]])] <- 0
index[x[[v]] == 0] <- 0
# record keys is 0 for non-contributing units
x[[nozero_reckeys_nums[i]]] <- x[[rkeyvar]] * index
}
} else {
max_contributions <- numvars_w <- nozero_reckeys_nums <- NULL
}
nv <- c(
rkeyvar, wvar, countvars, countvars_w, countvars_rec,
numvars, numvars_w, nozero_reckeys_nums
)
prob <- sdcTable::makeProblem(
data = x,
dimList = dims,
freqVarInd = match(freqvar, names(x)),
numVarInd = match(nv, names(x)),
weightInd = NULL,
sampWeightInd = NULL)
# with perturbations
tab <- sdcTable::sdcProb2df(
obj = prob,
addDups = TRUE,
addNumVars = TRUE,
dimCodes = "original")
ck_log("mark duplicated cells")
dims <- prob@dimInfo@dimInfo
tab$is_bogus <- FALSE
for (i in seq_len(length(dims))) {
dd <- dims[[i]]
tab$is_bogus[tab[[dd@vName]] %in% dd@dups] <- TRUE
}
# duplicated cells (bogus codes) have the same
# strID as their parent cell!
strids <- unique(tab[["strID"]])
# calculate contributing indices
# we do this in any case!
ck_log("compute contributing indices")
# the .contributing_indices_tmp() should be removed
# once sdcTable 0.32 is on cran as this version has the
# appropriate fixes;
.contributing_indices_tmp <- function(prob, ids = NULL) {
# returns all contributing codes for each dimensions
# of a sdcProblem-object; dimensions are converted
# sdcHierarchies-trees and hier_info() is then used.
.get_all_contributing_codes <- function(x) {
.sdchier_from_sdc <- function(d) {
df <- data.frame(
levels = slot(d, "levels"),
codes = slot(d, "codesOriginal"),
stringsAsFactors = FALSE
)
df$levels <- sapply(seq_len(nrow(df)), function(x) {
paste0(rep("@", df$levels[x]), collapse = "")
})
hier_import(df, from = "df")
}
stopifnot(inherits(x, "sdcProblem"))
dims_hier <- lapply(x@dimInfo@dimInfo, function(x) {
.sdchier_from_sdc(x)
})
dims_info <- lapply(dims_hier, function(x) {
hier_info(x)
})
all_contr_codes <- lapply(dims_info, function(x) {
if ("is_rootnode" %in% names(x)) {
res <- list()
res[[x$name]] <- list(
is_root = x$is_rootnode,
contr_codes = x$contributing_codes
)
} else {
res <- lapply(x, function(y) {
list(
is_root = y$is_rootnode,
contr_codes = y$contributing_codes
)
})
}
return(res)
})
all_contr_codes
}
. <- NULL
dt <- sdcProb2df(prob, addDups = FALSE, dimCodes = "original")
poss_ids <- dt$strID
if (is.null(ids)) {
ids <- poss_ids
} else {
if (!is.character(ids)) {
stop("Please provide a character vector in argument `ids`.", call. = FALSE)
}
if (!all(ids %in% poss_ids)) {
e <- c(
"Some values provided in `ids` are not valid. ",
"See column `strID` in `sdcProb2df()` for valid ids."
)
stop(paste(e, collapse = " "), call. = FALSE)
}
}
dimvars <- slot(prob, "dimInfo")@vNames
nr_dims <- length(dimvars)
dt <- dt[, c("strID", "freq", dimvars), with = FALSE]
data.table::setnames(dt, old = "strID", new = "id")
# we compute all unique codes once
unique_codes <- lapply(dt[, dimvars, with = FALSE], function(x) {
sort(unique(x))
})
# get contributing codes
contr_codes <- .get_all_contributing_codes(prob)
# positions in strID
str_info <- prob@dimInfo@strInfo
names(str_info) <- dimvars
# merge inner cell-info to data
dt_inner <- data.table(id = prob@dimInfo@strID, is_inner = TRUE)
dt_inner$idx <- seq_len(nrow(dt_inner))
dt_inner <- dt[dt_inner, on = "id"]
dt_inner$tmp <- apply(dt_inner[, dimvars, with = FALSE], 1, paste0, collapse = "")
setkeyv(dt_inner, "tmp")
# subsetting dt to those ids, we want to compute the contributing indices from
dt <- dt[.(ids), on = "id"]
# prepare output
res <- vector("list", length(ids))
names(res) <- ids
for (i in seq_len(nrow(dt))) {
str_id <- dt$id[i]
if (dt$freq[i] > 0) {
index_vec <- which(dt_inner$id == str_id)
if (length(index_vec) > 0) {
res[[str_id]] <- dt_inner$idx[index_vec]
} else {
lev_info <- vector("list", length = nr_dims)
names(lev_info) <- dimvars
for (dv in dimvars) {
code <- dt[[dv]][i]
info <- contr_codes[[dv]][[code]]
if (!info$is_root) {
lev_info[[dv]] <- info$contr_codes
} else {
lev_info[[dv]] <- unique_codes[[dv]]
}
}
cell_indices <- sdcTable:::pasteStrVec(unlist(expand.grid(lev_info)), nr_dims)
res[[str_id]] <- which(dt_inner$tmp %in% cell_indices)
}
}
}
return(res)
}
if (length(numvars) == 0) {
ck_log("no numerical variables were defined in `ck_setup()`, ",
"therefore no contributing indices need to be computed.")
max_contributions <- contr_indices <- NULL
} else {
ck_log("compute contributing indices for (sub)-totals")
contr_indices <- sdcTable::contributing_indices(
prob = prob, ids = NULL
)
ck_log("find top k contributors for each cell and numerical variable")
# finding top_k contributors for each cell and numerical variable
microdat <- prob@dataObj@rawData[, c(names(dims), numvars, wvar), with = FALSE]
microdat$.tmpid <- seq_len(nrow(x))
# perhaps c++?
# for each numerical variable and each cell, get the top_k contributions
# along with its ids and its values
.get_max_contributions <- function(indices, microdat, wvar, nv, top_k) {
res <- vector("list", length = length(indices))
names(res) <- names(indices)
# we have no numerical-variables -> return empty list
if (length(nv) == 0) {
return(res)
}
for (i in seq_len(length(res))) {
out <- vector("list", length = length(nv))
names(out) <- nv
xx <- subset(microdat, .tmpid %in% indices[[i]])
top_k <- min(top_k, nrow(xx))
for (v in nv) {
xx$.tmpordervar <- abs(xx[[v]])
xx$.tmpweightvar <- xx[[v]] * xx[[wvar]]
setorderv(xx, c(".tmpordervar", wvar), order = c(-1L, -1L))
if (nrow(xx) == 0) {
out[[v]]$uw_vals <- out[[v]]$w_vals <- 0
out[[v]]$uw_ids <- out[[v]]$w_ids <- NA
out[[v]]$uw_spread <- out[[v]]$w_spread <- 0
out[[v]]$uw_sum <- out[[v]]$w_sum <- 0
out[[v]]$uw_mean <- out[[v]]$w_mean <- 0
} else {
out[[v]]$uw_vals <- xx[[v]][1:top_k]
out[[v]]$uw_ids <- out[[v]]$w_ids <- xx$.tmpid[1:top_k]
out[[v]]$uw_spread <- diff(range(xx[[v]], na.rm = TRUE))
out[[v]]$uw_sum <- sum(xx[[v]], na.rm = TRUE)
out[[v]]$uw_mean <- out[[v]]$uw_sum / nrow(xx)
out[[v]]$w_vals <- xx$.tmpweightvar[1:top_k]
out[[v]]$w_spread <- diff(range(xx[[v]], na.rm = TRUE))
out[[v]]$w_sum <- sum(xx$.tmpweightvar, na.rm = TRUE)
out[[v]]$w_mean <- out[[v]]$w_sum / sum(xx[[wvar]], na.rm = TRUE)
}
# we compute if the number of contributors to the cell
# is even or odd. This information can later be used if
# we have different ptables (parity-case)
out[[v]]$even_contributors <- nrow(xx) %% 2 == 0
}
res[[i]] <- out
}
res
}
# top_k is hardcoded to 6;
# this is the maximum allowed value for top_k, also in params_nums()
max_contributions <- .get_max_contributions(
indices = contr_indices,
microdat = microdat,
nv = numvars,
wvar = wvar,
top_k = 6)
}
res <- tab[, c("strID", "freq", dimvars, "is_bogus", nv), with = FALSE]
cols_ck <- gen_vnames(c("total", countvars), prefix = "ck")
if (length(countvars) > 0) {
setnames(res, c(rkeyvar, paste0("rkey_", countvars)), cols_ck)
} else {
setnames(res, rkeyvar, cols_ck)
}
# fix cell keys
for (i in seq_len(length(cols_ck))) {
res[[cols_ck[i]]] <- res[[cols_ck[i]]] %% 1
}
cols_n <- gen_vnames(c("total", countvars), prefix = "uwc")
setnames(res, c("freq", countvars), cols_n)
cols_wc <- gen_vnames(c("total", countvars), prefix = "wc")
if (length(countvars) > 0) {
setnames(res, c(wvar, paste0("wc_", countvars)), cols_wc)
} else {
setnames(res, wvar, cols_wc)
}
cv <- c("total", countvars)
rescnts <- vector("list", length = length(cv) + 1)
names(rescnts) <- c("dims", cv)
rescnts[[1]] <- res[, c("strID", dimvars, "is_bogus"), with = FALSE]
for (i in seq_along(cv)) {
index <- i + 1
vv <- c(
gen_vnames(cv[i], prefix = "ck"),
gen_vnames(cv[i], prefix = "uwc"),
gen_vnames(cv[i], prefix = "wc")
)
rescnts[[index]] <- res[, vv, with = FALSE]
}
nr_numvars <- length(numvars)
if (nr_numvars > 0) {
seq_nums <- seq_len(nr_numvars)
resnums <- vector("list", length = length(numvars))
names(resnums) <- numvars
cols_uws <- gen_vnames(numvars, prefix = "uws")
setnames(res, numvars, cols_uws)
cols_ws <- gen_vnames(numvars, prefix = "ws")
for (j in seq_nums) {
cn1 <- gen_vnames("total", prefix = "ck")
cn2 <- gen_vnames(numvars[j], prefix = "rkey_nozero")
vv <- c(cn1, cn2, cols_uws[j], cols_ws[j])
vv <- res[, vv, with = FALSE]
vv[[cn2]] <- vv[[cn2]] %% 1 # normalize cellkey
setnames(vv, cn1, gen_vnames(numvars[j], prefix = "ck"))
setnames(vv, cn2, gen_vnames(numvars[j], prefix = "ck_nz"))
vv[["special_protection"]] <- FALSE
resnums[[j]] <- vv
}
rescnts <- append(rescnts, resnums)
}
varsdt <- data.table(
vname = c(freqvar, wvar, rkeyvar),
type = c("freqvar", "wvar", "rkeyvar")
)
varsdt <- rbind(varsdt, data.table(
vname = dimvars, type = "dimvars"
))
varsdt <- rbind(varsdt, data.table(
vname = c("total", countvars), type = "countvars"
))
if (length(numvars) > 0) {
varsdt <- rbind(varsdt, data.table(
vname = numvars, type = "numvars"
))
}
varsdt$is_perturbed <- NA
varsdt[type %in% c("countvars", "numvars"), is_perturbed := FALSE]
# modifications and perturbation parameters
mods <- pert_params <- list(
cnts = NULL,
nums = NULL
)
dupsinfo <- data.table(
id = seq_len(nrow(rescnts[[1]])),
is_bogus = rescnts[[1]]$is_bogus)
rescnts[[1]]$is_bogus <- NULL
private$.prob <- prob
private$.max_contributions <- max_contributions
private$.varsdt <- varsdt
private$.is_weighted <- is_weighted
private$.pert_params <- pert_params
private$.modifications <- mods
private$.results <- rescnts
private$.dupsinfo <- dupsinfo
private$.h_info <- h_info
private$.is_initialized <- TRUE
private$.validate()
invisible(self)
},
#' @description Perturb a count- or magnitude variable
#' @param v name(s) of count- or magnitude variables that should be perturbed.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. Updated data can be accessed using other exported
#' methods like `$freqtab()` or `$numtab()`.
perturb = function(v) {
# important variables
countvars <- self$cntvars()
numvars <- self$numvars()
avail <- c(countvars, numvars)
.check_avail(
v = v,
avail = avail,
msg = "Invalid variables specified in `v`:",
single_v = FALSE)
for (i in seq_len(length(v))) {
vname <- v[i]
if (vname %in% countvars) {
private$.ck_perturb_cnts(v = vname)
} else {
private$.ck_perturb_nums(v = vname)
}
}
return(invisible(self))
},
#' @description Extract results from already perturbed count variables as a
#' `data.table`
#' @param v a vector of variable names for count variables. If `NULL`
#' (the default), the results are returned for all available count
#' variables. For variables that have not yet perturbed, columns
#' `puwc` and `pwc` are filled with `NA`.
#' @param path if not `NULL`, a scalar character defining a (relative
#' or absolute) path to which the result table should be written. A `csv`
#' file will be generated and, if specified, `path` must have
#' ".csv" as file-ending
#' @return This method returns a `data.table` containing all combinations of the dimensional variables in
#' the first n columns. Additionally, the following columns are shown:
#' - `vname`: name of the perturbed variable
#' - `uwc`: unweighted counts
#' - `wc`: weighted counts
#' - `puwc`: perturbed unweighted counts or `NA` if `vname` was not yet perturbed
#' - `pwc`: perturbed weighted counts or `NA` if `vname` was not yet perturbed
freqtab = function(v = NULL, path = NULL) {
if (!is.null(path)) {
.valid_path(path, ext = "csv", check_exists = FALSE)
}
avail <- self$cntvars()
if (is.null(v)) {
v <- avail
} else {
if (!is.character(v)) {
stop("Argument `v` must be a character vector specifying variable names.", call. = FALSE)
}
v <- tolower(v)
.check_avail(
v = v,
avail = avail,
msg = "Invalid variables specified in `v`:",
single_v = FALSE)
}
# return a table for a single perturbed count variable
.onevar_tab_freq <- function(results, vname, is_perturbed) {
dt1 <- results$dims
dt1[["strID"]] <- NULL
dt1$vname <- vname
dt2 <- results[[vname]]
cn <- names(dt2)
setnames(dt2, gsub(paste0("_", vname), "", cn))
if (is_perturbed) {
dt2$pert <- NULL
dt2$ck <- dt2$wcavg <- NULL
} else {
dt2$puwc <- NA_real_
dt2$pwc <- NA_real_
dt2$ck <- NULL
}
dt <- cbind(dt1, dt2)
dt[]
}
results <- copy(private$.results)
res <- rbindlist(lapply(v, function(x) {
.onevar_tab_freq(
results = results,
vname = x,
is_perturbed = private$.is_perturbed_countvar(x))
}))
if (!is.null(path)) {
if (file.exists(path)) {
file.remove(path)
}
res <- tryCatch(fwrite(res, file = path, sep = ";"), error = function(e) e)
if (inherits(res, "error")) {
warning("File ", shQuote(path), " could not be written to disk", call. = FALSE)
}
message("File ", shQuote(path), " successfully written to disk.")
}
return(res)
},
#' @description Extract results from already perturbed continuous variables
#' as a `data.table`.
#' @param v a vector of variable names of continuous variables. If `NULL`
#' (the default), the results are returned for all available numeric variables.
#' @param mean_before_sum (logical); if `TRUE`, the perturbed values are adjusted
#' by a factor `((n+p))⁄n` with
#' - `n`: the original weighted cell value
#' - `p`: the perturbed cell value
#'
#' This makes sense if the the accuracy of the variable mean is considered to be
#' more important than accuracy of sums of the variable. The default value is
#' `FALSE` (no adjustment is done)
#' @param path if not `NULL`, a scalar character defining a (relative or absolute)
#' path to which the result table should be written. A `csv` file will be generated
#' and, if specified, `path` must have ".csv" as file-ending
#'
#' @return This method returns a `data.table` containing all combinations of the
#' dimensional variables in the first n columns. Additionally, the following
#' columns are shown:
#' - `vname`: name of the perturbed variable
#' - `uws`: unweighted sum of the given variable
#' - `ws`: weighted cellsum
#' - `pws`: perturbed weighted sum of the given cell or `NA` if `vname`
#' has not not perturbed
numtab = function(v = NULL, mean_before_sum = FALSE, path = NULL) {
if (!is.null(path)) {
.valid_path(path, ext = "csv", check_exists = FALSE)
}
if (!rlang::is_scalar_logical(mean_before_sum)) {
stop("Argument `mean_before_sum` needs to be a logical scalar.", call. = FALSE)
}
avail <- self$numvars()
if (length(avail) == 0) {
stop("No numerical variables found in this table-instance.", call. = FALSE)
}
if (is.null(v)) {
v <- avail
}
if (!is.character(v)) {
stop("Argument `v` must be a character vector specifying variable names.", call. = FALSE)
} else {
.check_avail(
v = v,
avail = avail,
msg = "Invalid continuous variables specified in `v`:",
single_v = FALSE)
}
tab <- rbindlist(lapply(v, function(x) {
tmp <- private$.results[[x]]
tmp$vname <- x
if (private$.is_perturbed_numvar(x)) {
tmp <- tmp[, c(8, 3:4, 7)]
} else {
tmp <- tmp[, c(6, 3:4)]
tmp$pws <- NA_real_
}
setnames(tmp, c("vname", "uws", "ws", "pws"))
cbind(private$.results$dims, tmp)[, -1]
}))
if (mean_before_sum) {
ck_log("applying correction based on argument `mean_before_sum`")
tab$pws <- tab$pws * (tab$pws / tab$ws)
}
if (!is.null(path)) {
if (file.exists(path)) {
file.remove(path)
}
res <- tryCatch(fwrite(tab, file = path, sep = ";"), error = function(e) e)
if (inherits(res, "error")) {
warning("File ", shQuote(path), " could not be written to disk", call. = FALSE)
}
message("File ", shQuote(path), " successfully written to disk.")
}
tab
},
#' @description Utility measures for perturbed count variables
#' @param v name of a count variable for which utility measures
#' should be computed.
#' @param exclude_zeros should empty (zero) cells in the original values
#' be excluded when computing distance measures
#' @return This method returns a `list` containing a set of utility
#' measures based on some distance functions. For a detailed description
#' of the computed measures, see [ck_cnt_measures()]
measures_cnts = function(v, exclude_zeros = TRUE) {
if (!is_scalar_character(v)) {
stop("Argument `v` needs to be a scalar character!", call. = FALSE)
}
if (!rlang::is_scalar_logical(exclude_zeros)) {
stop("Argument `exclude_zeros` needs to be a scalar logical value.", call. = FALSE)
}
v <- tolower(v)
if (private$.is_perturbed_countvar(v)) {
return(private$.ck_utility_cnts(v, exclude_zeros = exclude_zeros))
} else {
avail <- private$.ck_perturbed_vars("countvars")
e <- c(
"Variable `v` is neither a perturbed count ",
"nor a perturbed numeric variable.\n")
if (length(avail) > 0) {
e <- c(e, "Possible choices are: ",
paste(shQuote(avail), collapse = "\n"))
} else {
e <- c(e, "Please perturb one of the following variables first: ",
paste(shQuote(self$cntvars()), collapse = "\n"))
}
stop(paste(e, collapse = ""), call. = FALSE)
}
},
#' @description Utility measures for continuous variables (not yet implemented)
#' @param v name of a continuous variable for which utility measures
#' should be computed.
#' @return for (now) an empty list; In future versions of the package, the
#' Method will return utility measures for perturbed magnitude tables.
measures_nums = function(v) {
warning("No utility measures for continuous variables are available yet.", call. = FALSE)
return(list())
},
#' @description Names of variables that can be perturbed / tabulated
#' @return returns a `list` with the following two elements:
#' - `cntvars`: character vector with names of available count
#' variables for perturbation
#' - `numvars`: character vector with names of available numerical
#' variables for perturbation
allvars = function() {
list(
cntvars = self$cntvars(),
numvars = self$numvars()
)
},
#' @description Names of count variables that can be perturbed
#' @return a character vector containing variable names
cntvars = function() {
private$.ck_vars("countvars")
},
#' @description Names of continuous variables that can be perturbed
#' @return a character vector containing variable names
numvars = function() {
private$.ck_vars("numvars")
},
#' @description Information about hierarchies
#' @return a `list` (for each dimensional variable) with
#' information on the hierarchies. This may be used to restrict output tables to
#' specific levels or codes. Each list element is a `data.table` containing
#' the following variables:
#' - `code`: the name of a code within the hierarchy
#' - `level`: number defining the level of the code; the higher the number,
#' the lower the hierarchy with `1` being the overall total
#' - `is_leaf`: if `TRUE`, this code is a leaf node which means no other codes
#' contribute to it
#' - `parent`: name of the parent code
#'
hierarchy_info = function() {
return(private$.h_info)
},
#' @description Modifications applied to count variables
#' @return a `data.table` containing modifications applied to count variables
mod_cnts = function() {
private$modifications(type = "cnts")
},
#' @description Modifications applied to numerical variables
#' @return a `data.table` containing modifications applied to numerical variables
mod_nums = function() {
private$modifications(type = "nums")
},
#' @description Identify sensitive cells based on minimum frequency rule
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param n a number defining the threshold. All cells `<= n` are considered as unsafe.
#' @param weighted if `TRUE`, the weighted number of contributors to a cell are compared to
#' the threshold specified in `n` (default); else the unweighted number of contributors is used.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_freq = function(v, n, weighted = TRUE) {
if (!rlang::is_scalar_logical(weighted)) {
stop("argument `weighted` is not a logical value.", call. = FALSE)
}
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE)
if (!rlang::is_scalar_double(n)) {
stop("`n` is not a number.", call. = FALSE)
}
if (!weighted) {
testv <- "freq"
} else {
testv <- "tmpweightvarfortabulation"
}
res <- sdcProb2df(private$.prob, addDups = TRUE, addNumVars = TRUE)[[testv]]
pat <- res <= n
private$.update_supps(v = v, pat = pat, rule = "freq-rule")
invisible(self)
},
#' @description Identify sensitive cells based on weighted or unweighted cell value
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param n a number defining the threshold. All cells `<= n` are considered as unsafe.
#' @param weighted if `TRUE`, the weighted cell value of variable `v` is compared to
#' the threshold specified in `n` (default); else the unweighted number is used.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_val = function(v, n, weighted = TRUE) {
if (!rlang::is_scalar_logical(weighted)) {
stop("argument `weighted` is not a logical value.", call. = FALSE)
}
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE)
if (!rlang::is_scalar_double(n)) {
stop("`n` is not a number.", call. = FALSE)
}
if (weighted) {
testv <- paste0("ws_", v)
} else {
testv <- v
}
res <- sdcProb2df(private$.prob, addDups = TRUE, addNumVars = TRUE)[[testv]]
pat <- res <= n
private$.update_supps(v = v, pat = pat, rule = "val-rule")
invisible(self)
},
#' @description Identify sensitive cells based on their `names`
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param inp a `data.frame` where each colum represents a dimensional variable. Each row of
#' this input is then used to compute the relevant cells to be identified as sensitive where
#' `NA`-values are possible and used to match any characteristics of the dimensional variable.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_cells = function(v, inp) {
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE
)
if (!inherits(inp, "data.frame")) {
stop("argument `inp` needs to be a `data.frame`", call. = FALSE)
}
dv <- sort(private$.ck_vars("dimvars"))
if (!identical(sort(names(inp)), dv)) {
e <- c(
"invalid and/or missing columns detected in argument `inp`",
"(needs to match names of dimensional variables)"
)
stop(paste(e, collapse = " "), call. = FALSE)
}
for (vname in names(inp)) {
if (!all(na.omit(unique(inp[[vname]])) %in% private$.results$dims[[vname]])) {
e <- c(
"invalid/unknown values detected in argument `inp`",
"in column", shQuote(vname)
)
stop(paste(e, collapse = " "), call. = FALSE)
}
}
inp <- as.data.frame(inp)
inp <- inp[, dv, drop = FALSE]
res <- sdcProb2df(private$.prob, addDups = TRUE, addNumVars = TRUE, dimCodes = "original")
res <- res[, c("strID", "freq", dv), with = FALSE]
res$supp <- FALSE
inp$tmpid <- seq_len(nrow(inp))
for (i in seq_len(nrow(inp))) {
xx <- inp[i, , drop = FALSE]
xx <- xx[, !apply(xx, 2, is.na), drop = FALSE]
res <- merge(res, xx, by = setdiff(names(xx), "tmpid"), all.x = TRUE)
res[!is.na(tmpid), supp := TRUE]
res[, tmpid := NULL]
}
res <- setkey(res, strID) # required because merge changes order
private$.update_supps(v = v, pat = res$supp, rule = "cell-rule")
invisible(self)
},
#' @description Identify sensitive cells based on the p%-rule rule. Please note that this rule
#' can only be applied to positive-only variables.
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param p a number defining a percentage between `1` and `99`.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_p = function(v, p) {
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE)
if (private$.has_negative_values(v = v)) {
stop("dominance rule can only be computed on strictly positive variables.", call. = FALSE)
}
if (!rlang::is_scalar_integerish(p)) {
stop("`p` is not an integerish number.", call. = FALSE)
}
if (p < 1 | p > 99) {
stop("`p` must be >= 1 and < 100.", call. = FALSE)
}
res <- sdcTable::primarySuppression(
object = private$.prob,
type = "p",
numVarName = v,
p = p)
pat <- sdcProb2df(res, addDups = TRUE)[["sdcStatus"]] == "u"
private$.update_supps(v = v, pat = pat, rule = "p%-rule")
invisible(self)
},
#' @description Identify sensitive cells based on the pq-rule. Please note that this rule
#' can only be applied to positive-only variables.
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param p a number defining a percentage between `1` and `99`.
#' @param q a number defining a percentage between `1` and `99`. This value must be larger than `p`.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_pq = function(v, p, q) {
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE)
if (private$.has_negative_values(v = v)) {
stop("dominance rule can only be computed on strictly positive variables.", call. = FALSE)
}
if (!rlang::is_scalar_integerish(p)) {
stop("`p` is not an integerish number.", call. = FALSE)
}
if (p < 1 | p > 99) {
stop("`p` must be >= 1 and < 100.", call. = FALSE)
}
if (!rlang::is_scalar_integerish(q)) {
stop("`q` is not an integerish number.", call. = FALSE)
}
if (q < 1 | q > 99) {
stop("`q` must be >= 1 and < 100.", call. = FALSE)
}
if (p > q) {
stop("Argument `p` must be < than `q`", call. = FALSE)
}
res <- sdcTable::primarySuppression(
object = private$.prob,
type = "pq",
numVarName = v,
p = p,
q = q)
pat <- sdcProb2df(res, addDups = TRUE)[["sdcStatus"]] == "u"
private$.update_supps(v = v, pat = pat, rule = "pq-rule")
invisible(self)
},
#' @description Identify sensitive cells based on the nk-dominance rule. Please note that this rule
#' can only be applied to positive-only variables.
#' @param v a single variable name of a continuous variable (see method `numvars()`)
#' @param n an integerish number `>= 2`
#' @param k a number defining a percentage between `1` and `99`. All cells to which the top `n`
#' contributers contribute more than `k%` is considered unsafe
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other methods (e.g `$perturb()`).
supp_nk = function(v, n, k) {
.check_avail(
v = v,
avail = self$numvars(),
msg = "Invalid variable specified in `v`:",
single_v = TRUE)
if (private$.has_negative_values(v = v)) {
stop("dominance rule can only be computed on strictly positive variables.", call. = FALSE)
}
if (!rlang::is_scalar_double(n)) {
stop("`n` is not a number.", call. = FALSE)
}
if (n < 2) {
stop("`n` must be >= 2", call. = FALSE)
}
if (!rlang::is_scalar_integerish(k)) {
stop("`k` is not an integerish number.", call. = FALSE)
}
if (k < 1 | k > 99) {
stop("`k` must be >= 1 and < 100.", call. = FALSE)
}
res <- sdcTable::primarySuppression(
object = private$.prob,
type = "nk",
numVarName = v,
n = n,
k = k)
pat <- sdcProb2df(res, addDups = TRUE)[["sdcStatus"]] == "u"
private$.update_supps(v = v, pat = pat, rule = "nk-rule")
invisible(self)
},
#' @description Return perturbation parameters of count variables
#' @return a named list in which each list-element contains the
#' active perturbation parameters for the specific count variable
#' defined by the list-name.
params_cnts_get = function() {
return(private$.pert_params$cnts)
},
#' @description Set perturbation parameters for count variables
#' @param val a perturbation object created with [ck_params_cnts()]
#' @param v a character vector (or `NULL`). If `NULL` (the default),
#' the perturbation parameters provided in `val` are set for all
#' count variables; otherwise one may specify the names of
#' the count variables for which the parameters should be set.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other
#' methods (e.g `$perturb()`).
params_cnts_set = function(val, v = NULL) {
if (!inherits(val, "ck_params")) {
stop("Please create the input using `ck_params_cnts()`", call. = FALSE)
}
if (val$type != "cnts") {
stop("Please create the input using `ck_params_cnts()`", call. = FALSE)
}
cv <- self$cntvars()
if (is.null(v)) {
v <- cv
} else {
if (!is.character(v)) {
stop("Argument `v` needs to be a character vector.", call. = FALSE)
}
.check_avail(
v = v,
avail = cv,
msg = "Invalid variables specified in `v`:",
single_v = FALSE)
}
ex_params <- private$.pert_params$cnts
for (curvar in v) {
if (private$.is_perturbed_countvar(curvar)) {
message("--> Variable ", shQuote(curvar), " was already perturbed: parameters are not updated.")
} else {
if (curvar %in% names(ex_params)) {
message("--> replacing perturbation parameters for variable ", shQuote(curvar))
} else {
message("--> setting perturbation parameters for variable ", shQuote(curvar))
}
ex_params[[curvar]] <- val
}
}
private$.pert_params$cnts <- ex_params
return(invisible(self))
},
#' @description reset results and parameters for already perturbed
#' count variables
#' @param v if `v` equals `NULL` (the default), the results are reset
#' for all perturbed count variables; otherwise it is possible to specify
#' the names of already perturbed count variables.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other
#' methods (e.g `$perturb()` or `$freqtab()`).
reset_cntvars = function(v = NULL) {
avail <- private$.ck_perturbed_vars(what = "countvars")
if (length(avail) == 0) {
message("No perturbed count variables available.")
return(invisible(self))
}
if (is.null(v)) {
v <- avail
} else {
if (!is.character(v)) {
stop("Argument `v` needs to be a character vector.", call. = FALSE)
}
.check_avail(v = v, avail = avail, msg = "Invalid variables specified in `v`:")
}
private$.reset_vars(vars = v)
return(invisible(self))
},
#' @description reset results and parameters for already perturbed
#' numerical variables
#' @param v if `v` equals `NULL` (the default), the results are reset for all perturbed
#' numerical variables; otherwise it is possible to specify the names of already
#' perturbed continuous variables.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other
#' methods (e.g `$perturb()` or `$numtab()`).
reset_numvars = function(v = NULL) {
avail <- private$.ck_perturbed_vars(what = "numvars")
if (length(avail) == 0) {
message("No perturbed numerical variables available.")
return(invisible(self))
}
if (is.null(v)) {
v <- avail
} else {
if (!is.character(v)) {
stop("Argument `v` needs to be a character vector.", call. = FALSE)
}
.check_avail(v = v, avail = avail, msg = "Invalid variables specified in `v`:")
}
private$.reset_vars(vars = v)
return(invisible(self))
},
#' @description reset results and parameters for all already perturbed variables.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other
#' methods (e.g `$perturb()`, `$freqtab()` or `$numtab()`).
reset_allvars = function() {
avail <- c(private$.ck_perturbed_vars(what = "countvars"), private$.ck_perturbed_vars(what = "numvars"))
if (length(avail) == 0) {
message("No perturbed variables available.")
return(invisible(self))
}
private$.reset_vars(vars = avail)
},
#' @description Return perturbation parameters of continuous variables
#' @return a named list in which each list-element contains the
#' active perturbation parameters for the specific continuous variable
#' defined by the list-name.
params_nums_get = function() {
return(private$.pert_params$nums)
},
#' @description set perturbation parameters for continuous variables.
#' @param val a perturbation object created with [ck_params_nums()]
#' @param v a character vector (or `NULL`); if `NULL` (the default), the
#' perturbation parameters provided in `val` are set for all continuous
#' variables; otherwise one may specify the names of the numeric variables for
#' which the parameters should be set.
#' @return A modified `cellkey_obj` object in which private slots were
#' updated for side-effects. These updated values are used by other
#' methods (e.g `$perturb()`).
params_nums_set = function(val, v = NULL) {
nv <- self$numvars()
if (length(nv) == 0) {
message("no numeric variables are available!")
return(invisible(NULL))
}
if (!inherits(val, "ck_params")) {
stop("Please create the input using `ck_params_nums()`", call. = FALSE)
}
if (!val$type %in% c("params_m_flex", "params_m_simple")) {
stop("Please create the input using `ck_params_nums()`", call. = FALSE)
}
if (is.null(v)) {
v <- nv
} else {
if (!is.character(v)) {
stop("Argument `v` needs to be a character vector.", call. = FALSE)
}
.check_avail(
v = v,
avail = nv,
msg = "Invalid variables specified in `v`:",
single_v = FALSE)
}
ex_params <- private$.pert_params$nums
for (curvar in v) {
if (private$.is_perturbed_numvar(curvar)) {
message("--> Variable ", shQuote(curvar), " was already perturbed: parameters are not updated.")
} else {
if (curvar %in% names(ex_params)) {
message("--> replacing perturbation parameters for variable ", shQuote(curvar))
} else {
message("--> setting perturbation parameters for variable ", shQuote(curvar))
}
ex_params[[curvar]] <- val
}
}
private$.pert_params$nums <- ex_params
return(invisible(self))
},
#' @description some aggregated summary statistics about perturbed variables
#' @return invisible `NULL`
summary = function() {
cli::cat_line(cli::boxx("Utility measures for perturbed count variables", padding = 0))
# perturbed countvars
cv <- private$.ck_perturbed_vars("countvars")
if (length(cv) == 0) {
cli::cat_line(cli::symbol$cross, " no count variables have been perturbed")
} else {
cli::cat_rule("Distribution statistics of perturbations")
info_cnts <- private$modifications("cnts")
cnt_info <- info_cnts[, as.list(get_distr_vals(pert)), by = "countvar"]
print(cnt_info)
# unweighted only!
cnt_measures <- lapply(cv, function(x, excl) {
self$measures_cnts(v = x)
})
names(cnt_measures) <- cv
cli::cat_line()
cli::cat_rule("Distance-based measures")
for (vv in names(cnt_measures)) {
cli::cat_line(cli::symbol$tick, " Variable: ", shQuote(vv), "\n")
print(cnt_measures[[vv]]$measures)
cli::cat_line()
}
}
numvars_available <- TRUE
if (numvars_available) {
cli::cat_line(cli::boxx("Utility measures for perturbed numerical variables", padding = 0))
nv <- private$.ck_perturbed_vars("numvars")
if (length(nv) == 0) {
cli::cat_line(cli::symbol$cross, " no numerical variables have been perturbed")
} else {
cli::cat_rule("Distribution statistics of perturbations")
rr <- self$numtab()
rr$pert_total <- rr$pws - rr$ws
num_info <- rr[, as.list(get_distr_vals(pert_total)), by = "vname"]
print(num_info)
}
}
return(invisible(NULL))
# return(invisible(
# list(cnt_info = cnt_info,
# cnt_measures = cnt_measures,
# num_info = num_info)
# ))
},
#' @description prints information about the current table
#' @return invisible `NULL`
print = function() {
cli::cat_rule("Table Information")
nr_dims <- ncol(private$.results$dims) - 1
nr_cells <- nrow(private$.results$dims)
dims <- paste(shQuote(private$.ck_vars("dimvars")), collapse = ", ")
cli::cat_line(cli::symbol$tick, " ", nr_cells, " cells in ", nr_dims, " dimensions (", dims, ")")
if (private$.is_weighted) {
cli::cat_line(cli::symbol$tick, " weights: yes")
} else {
cli::cat_line(cli::symbol$tick, " weights: no")
}
cli::cat_rule("Tabulated / Perturbed countvars")
countvars <- private$.varsdt[type == "countvars"]
for (i in seq_len(nrow(countvars))) {
v <- countvars$vname[i]
if (countvars$is_perturbed[i]) {
cli::cat_line(cli::symbol$checkbox_on, " ", shQuote(v), " (perturbed)")
} else {
cli::cat_line(cli::symbol$checkbox_off, " ", shQuote(v))
}
}
numvars <- private$.varsdt[type == "numvars"]
if (nrow(numvars) > 0) {
cli::cat_rule("Tabulated / Perturbed numvars")
for (i in seq_len(nrow(numvars))) {
v <- numvars$vname[i]
if (numvars$is_perturbed[i]) {
cli::cat_line(cli::symbol$checkbox_on, " ", shQuote(v), " (perturbed)")
} else {
cli::cat_line(cli::symbol$checkbox_off, " ", shQuote(v))
}
}
}
return(invisible(NULL))
}#,
# to delete
#everything=function() {
# list(
# prob = private$.prob,
# pert_params = private$.pert_params,
# results = private$.results,
# max_contr = private$.max_contributions)
#}
),
private = list(
.prob = NULL,
.max_contributions = NULL,
.varsdt = NULL,
.is_weighted = NULL,
.pert_params = list(),
.modifications = list(),
.results = list(),
.dupsinfo = NULL,
.h_info = NULL,
.is_initialized = FALSE,
# return important variables
.ck_vars = function(what) {
vname <- type <- NULL
vv <- private$.varsdt
if (!is.null(what)) {
stopifnot(length(what) == 1)
stopifnot(what %in% c("freqvar", "countvars", "numvars", "wvar", "dimvars", "rkeyvar"))
return(vv[type == what, vname])
}
return(vv)
},
# names of already perturbed count or continuous variables
.ck_perturbed_vars = function(what) {
stopifnot(what %in% c("countvars", "numvars"))
type <- is_perturbed <- vname <- NULL
private$.varsdt[type == what & is_perturbed == TRUE, vname]
},
# actually perturb count variables
.ck_perturb_cnts = function(v) {
lookup_freq <- function(ptab, cellkeys, cellvals) {
row_nr <- rep(-1, length(cellkeys))
pert_vals <- rep(0L, length(cellkeys))
symmetry <- max(ptab$i)
ptab$ids <- seq_len(nrow(ptab))
for (d in seq_len(symmetry)) {
if (d == symmetry) {
rkind <- cellvals >= d
} else {
rkind <- cellvals == d
}
if (sum(rkind) > 0) {
ii <- which(ptab$i == d)
v <- ptab$ub[ii]
ids <- ptab$ids[ii]
ck <- cellkeys[rkind]
diffs <- ptab$v[ii]
# row_ind
rowind <- sapply(1:sum(rkind), function(x) {
which.max(ck[x] < v)
})
pert_vals[rkind] <- as.integer(diffs[rowind])
row_nr[rkind] <- ids[rowind]
}
}
data.table(
row_nr = row_nr,
pert = pert_vals,
ckey = cellkeys
)
}
if (private$.is_perturbed_countvar(v)) {
message("Variable ", shQuote(v), " was already perturbed!")
return(invisible(self))
}
countvars <- private$.ck_vars("countvars")
numvars <- private$.ck_vars("numvars")
rkeyvar <- private$.ck_vars("rkeyvar")
freqvar <- private$.ck_vars("freqvar")
wvar <- private$.ck_vars("wvar")
dimvars <- private$.ck_vars("dimvars")
# perturbation parameters
params <- private$.pert_params$cnts[[v]]
if (is.null(params)) {
e <- c(
"No perturbation parameters found for variable", shQuote(v), "-",
"Please use method `$params_cnts_set().")
stop(paste(e, collapse = " "), call. = FALSE)
}
newtab <- copy(private$.results[[v]])
# compute average weights
col_wcavg <- gen_vnames(v, prefix = "wcavg")
col_n <- gen_vnames(v, prefix = "uwc")
col_wc <- gen_vnames(v, prefix = "wc")
wcavg <- newtab[[col_wc]] / newtab[[col_n]]
wcavg[is.nan(wcavg)] <- 0
newtab[[col_wcavg]] <- wcavg
# compute cell-keys for each column independently
col_ck <- gen_vnames(v, prefix = "ck")
# list with perturbation results for each countvar
col_pert <- gen_vnames(v, prefix = "pert")
# lookup perturbation values
mods <- lookup_freq(
ptab = params$params$ptable,
cellkeys = newtab[[col_ck]],
cellvals = newtab[[col_n]])
mods$countvar <- v
mods <- cbind(private$.results$dims[, dimvars, with = FALSE], mods)
newtab[[col_pert]] <- mods$pert
modifications <- rbind(private$.modifications$cnts, mods)
# compute perturbed unweighted counts
col_puwc <- gen_vnames(v, prefix = "puwc")
col_pwc <- gen_vnames(v, prefix = "pwc")
# compute perturbed unweighted counts
puwc <- newtab[[col_pert]] + newtab[[col_n]]
puwc[is.na(puwc)] <- 0
newtab[[col_puwc]] <- puwc
# compute perturbed weighted counts
pwc <- newtab[[col_puwc]] * newtab[[col_wcavg]]
pwc[is.na(pwc) | is.infinite(pwc)] <- 0
newtab[[col_pwc]] <- pwc
# restrict table to required columns
private$.results[[v]] <- newtab
private$.modifications$cnts <- modifications
# update vars
varsdt <- private$.varsdt
varsdt$is_perturbed[varsdt$vname == v] <- TRUE
private$.varsdt <- varsdt
private$.validate()
message("Count variable ", shQuote(v), " was perturbed.")
return(invisible(self))
},
# actually perturb continuous variables
.ck_perturb_nums = function(v) {
dig <- .ck_digits()
if (private$.is_perturbed_numvar(v)) {
message("Variable ", shQuote(v), " was already perturbed!")
return(invisible(self))
}
# parameters
params <- private$.pert_params$nums[[v]]$params
if (is.null(params)) {
e <- "Please set perturbation parameters using `$params_nums_set() first.`"
stop(e, call. = FALSE)
}
ck_log("parameters:")
ck_log("--> type: ", shQuote(params$type))
ck_log("--> top_k: ", params$top_k)
ck_log("--> same_key: ", params$same_key)
ck_log("--> use_zero_rkeys: ", params$use_zero_rkeys)
# for now; only flex-function is allowed as input!
mult_params <- params$mult_params
ck_log("--> epsilon: ", paste(params$mult_params$epsilon, collapse = ", "))
ck_log("--> mu_c: ", params$mu_c)
ck_log("--> separation: ", params$separation)
ck_log("--> m1_sq: ", round(params$m_fixed_sq, digits = dig))
ck_log("--> separation_point: ", round(params$zs, digit = dig))
ck_log("--> E: ", round(params$E, digits = dig))
ck_log("--> even_odd: ", params$even_odd)
cl <- class(params$mult_params)
if (cl == "params_m_simple") {
lookup_type <- "simple"
} else if (cl == "params_m_flex") {
lookup_type <- "flex"
} else {
stop("invalid perturbation parameters received.", call. = FALSE)
}
ck_log("--> multiplicator: ", shQuote(lookup_type))
# get maximum contributions for each cell and variable!
# restrict to current variable
max_contr <- lapply(private$.max_contributions, function(x) {
x[[v]]
})
# compute x_j values that are required input for the computation of
# the perturbation magnitutes m_j
w_sums <- lapply(max_contr, function(x) x$w_sum)
if (params$type == "top_contr") {
# we may get NAs -> need to ignore them later
x_vals <- lapply(max_contr, function(x) {
list(x = na.omit(x$w_vals[1:params$top_k]), even_odd = x$even_contributors)
})
} else if (params$type == "mean") {
x_vals <- lapply(max_contr, function(x) {
list(x = x$w_mean, even_odd = x$even_contributors)
})
} else if (params$type == "range") {
x_vals <- lapply(max_contr, function(x) {
list(x = x$w_spread, even_odd = x$even_contributors)
})
} else if (params$type == "sum") {
eo <- sapply(max_contr, function(x) {
x$even_contributors
})
x_vals <- lapply(names(w_sums), function(x) {
list(x = w_sums[[x]], even_odd = eo[[x]])
})
names(x_vals) <- names(w_sums)
}
if (!params$even_odd) {
# handle the "all"-case
x_vals <- lapply(x_vals, function(x) {
x$even_odd <- NA
x
})
}
# the table with cell_keys (non-zero cellkeys and weighted and unweighted sums)
newtab <- copy(private$.results[[v]])
dim_dt <- private$.results[["dims"]]
ck_log("selecting cell keys based on choice of argument `use_zero_rkey`")
if (!params$use_zero_rkeys) {
cellkeys <- newtab[[paste0("ck_nz_", v)]]
} else {
cellkeys <- newtab[[paste0("ck_", v)]]
}
cellvals <- newtab[[paste0("ws_", v)]]
# we compute top_k-scrambled cell keys
# required for argument `same_key`
ck_log("scrambling cell keys depending on argument `same_key`")
cellkeys <- parallel::mclapply(cellkeys, function(x) {
.scramble_ckeys(
ck = x,
top_k = params$top_k,
same_key = params$same_key)
}, mc.cores = .ck_cores()) # nolint
# we remove duplicated cells for now and add them later!
names(cellkeys) <- dim_dt[["strID"]]
index_nondup <- !private$.dupsinfo$is_bogus
cellkeys <- cellkeys[index_nondup]
cellvals <- cellvals[index_nondup]
prot_req <- private$.results[[v]]$special_protection[index_nondup]
lookup <- parallel::mclapply(x_vals, function(x) {
if (is.na(x$even_odd)) {
return(rep("all", length(x$x)))
} else if (isTRUE(x$even_odd)) {
return("even")
}
return("odd")
}, mc.cores = .ck_cores()) # nolint
if (lookup_type == "flex") {
fun <- .perturb_cell_flex
} else if (lookup_type == "simple") {
fun <- .perturb_cell_simple
}
res <- parallel::mclapply(seq_len(length(x_vals)), function(y) {
fun(
cv = cellvals[y],
x = x_vals[[y]]$x,
ck = cellkeys[[y]],
lookup = lookup[[y]],
prot_req = prot_req[y],
params = params)
}, mc.cores = .ck_cores()) # nolint
pert_result <- rbindlist(parallel::mclapply(seq_len(length(x_vals)), function(x) {
data.table(
cell_id = names(x_vals)[x],
ckey = cellkeys[[x]],
cv = res[[x]]$cv,
cv_pert = res[[x]]$cv_p,
pert = sum(res[[x]]$x_hats),
x_delta = res[[x]]$x_delta,
lookup = res[[x]]$lookup,
additional_protection = prot_req[x])
}, mc.cores = .ck_cores())) # nolint
ck_log("add rows for duplicated cells!")
index_nondup <- !private$.dupsinfo$is_bogus
pert_result <- merge(
x = dim_dt[index_nondup],
y = pert_result,
by.x = "strID",
by.y = "cell_id",
all.x = TRUE)
ck_log("updating results")
newtab <- private$.results[[v]]
# no duplicates
# nolint start
unq <- pert_result[!duplicated(strID), list(strID, pert, cv_pert)]
# nolint end
setnames(unq, c("strID", "pert_total", gen_vnames(v, prefix = "pws")))
# we merge the results back; duplicated cells have identical
# perturbation values (pws_{var} and pert_total)
# thus, we can easily merge
newtab[["strID"]] <- dim_dt[["strID"]]
newtab <- merge(newtab, unq, by = "strID", all.x = TRUE)
newtab[["strID"]] <- NULL
private$.results[[v]] <- newtab
ck_log("compute and add modifications")
# note: `ckey`: the first cell key (all others were scrambled)
mods <- copy(pert_result)
mods[["strID"]] <- mods$cv <- mods$cv_pert <- NULL
mods$numvar <- v
private$.modifications$nums <- rbind(private$.modifications$nums, mods)
# update vars
varsdt <- private$.varsdt
varsdt$is_perturbed[varsdt$vname == v] <- TRUE
private$.varsdt <- varsdt
private$.validate()
message("Numeric variable ", shQuote(v), " was perturbed.")
return(invisible(self))
},
# utility measures for count variables
.ck_utility_cnts = function(v, exclude_zeros) {
v <- tolower(v)
dt <- private$.results[[v]]
return(ck_cnt_measures(
orig = dt$uwc,
pert = dt$puwc,
exclude_zeros = exclude_zeros
))
},
# utility measures for numeric variables
.ck_utility_nums = function(v) {
stop(".ck_utility_nums() needs to be implemented", call. = FALSE)
},
# is `v` a perturbed countvar?
.is_perturbed_countvar = function(v) {
stopifnot(is_scalar_character(v))
v %in% private$.ck_perturbed_vars("countvars")
},
# is `v` a perturbed numvar?
.is_perturbed_numvar = function(v) {
stopifnot(is_scalar_character(v))
v %in% private$.ck_perturbed_vars("numvars")
},
# resets perturbed variables
.reset_vars = function(vars) {
for (v in vars) {
ck_log("removing perturbation results and parameters for ", shQuote(v))
if (private$.is_perturbed_countvar(v)) {
private$.results[[v]] <- private$.results[[v]][, 1:4]
private$.modifications$cnts <- private$.modifications$cnts[countvar != v]
private$.pert_params$cnts[[v]] <- NULL
} else {
private$.results[[v]] <- private$.results[[v]][, 1:5]
private$.modifications$nums <- private$.modifications$nums[numvar != v]
private$.pert_params$nums[[v]] <- NULL
}
private$.varsdt[vname == v, is_perturbed := FALSE]
}
},
# updates column `special_protection` after applying primary suppression
.update_supps = function(v, pat, rule) {
nr_ex_supp <- sum(private$.results[[v]]$special_protection)
index_new_supps <- which(pat)
if (length(index_new_supps) > 0) {
private$.results[[v]]$special_protection[index_new_supps] <- TRUE
}
nr_new_supps <- sum(private$.results[[v]]$special_protection)
s <- nr_new_supps - nr_ex_supp
message(rule, ": ", s, " new sensitive cells (incl. duplicates) found (total: ", nr_new_supps, ")")
},
# returns modification slot
# returns TRUE if we have a variable containing negative values
.has_negative_values = function(v) {
min(private$.prob@dataObj@rawData[[v]], na.rm = TRUE) < 0
},
modifications = function(type = NULL) {
if (is.null(type)) {
return(private$.modifications)
}
if (!is_scalar_character(type)) {
stop("Argument `type` needs to be a scalar characater.", call. = FALSE)
}
if (!type %in% c("cnts", "nums")) {
stop("Argument `type` must be either `cnts` or `nums`.", call. = FALSE)
}
return(private$.modifications[[type]])
},
.validate = function() {
if (!private$.is_initialized) {
return(invisible(TRUE))
}
# todo
return(invisible(TRUE))
}
)
)
#' @rdname cellkey_pkg
#' @name cellkey_pkg
#' @md
#' @title cellKey - an post-tabular perturbation method
#' @description This class allows to define statistical tables and perturb both
#' count and numerical variables.
#' @param x an object coercible to a `data.frame`
#' @param rkey either a column name within `x` referring to a variable containing record keys
#' or a single integer(ish) number > `5` that referns to the number of digits for record keys that
#' will be generated internally.
#' @param dims a list containing slots for each variable that should be
#' tabulated. Each slot consists should be created/modified using [sdcHierarchies::hier_create()],
#' [sdcHierarchies::hier_add()] and other functionality from package `sdcHierarchies`.
#' @param w (character) a scalar character referring to a variable in `x` holding sampling
#' weights. If `w` is `NULL` (the default), all weights are assumed to be `1`
#' @param countvars (character) an optional vector containing names of binary (0/1 coded)
#' variables withing `x` that should be included in the problem instance.
#' These variables can later be perturbed.
#' @param numvars (character) an optional vector of numerical variables that can later be tabulated.
#' @return A new `cellkey_obj` object. Such objects (internally) contain the fully computed
#' statistical tables given input microdata (`x`), the hierarchical definitionals (`dims`) as
#' well as the remaining inputs. Intermediate results are stored internally and can only be
#' modified / accessed via the exported public methods described below.
#' @export
#' @examples
#' \donttest{
#' x <- ck_create_testdata()
#'
#' # create some 0/1 variables that should be perturbed later
#' x[, cnt_females := ifelse(sex == "male", 0, 1)]
#' x[, cnt_males := ifelse(sex == "male", 1, 0)]
#' x[, cnt_highincome := ifelse(income >= 9000, 1, 0)]
#' # a variable with positive and negative contributions
#' x[, mixed := sample(-10:10, nrow(x), replace = TRUE)]
#'
#' # create record keys
#' x$rkey <- ck_generate_rkeys(dat = x)
#'
#' # define required inputs
#'
#' # hierarchy with some bogus codes
#' d_sex <- hier_create(root = "Total", nodes = c("male", "female"))
#' d_sex <- hier_add(d_sex, root = "female", "f")
#' d_sex <- hier_add(d_sex, root = "male", "m")
#'
#' d_age <- hier_create(root = "Total", nodes = paste0("age_group", 1:6))
#' d_age <- hier_add(d_age, root = "age_group1", "ag1a")
#' d_age <- hier_add(d_age, root = "age_group2", "ag2a")
#'
#' # define the cell key object
#' countvars <- c("cnt_females", "cnt_males", "cnt_highincome")
#' numvars <- c("expend", "income", "savings", "mixed")
#' tab <- ck_setup(
#' x = x,
#' rkey = "rkey",
#' dims = list(sex = d_sex, age = d_age),
#' w = "sampling_weight",
#' countvars = countvars,
#' numvars = numvars)
#'
#' # show some information about this table instance
#' tab$print() # identical with print(tab)
#'
#' # information about the hierarchies
#' tab$hierarchy_info()
#'
#' # which variables have been defined?
#' tab$allvars()
#'
#' # count variables
#' tab$cntvars()
#'
#' # continuous variables
#' tab$numvars()
#'
#' # create perturbation parameters for "total" variable and
#' # write to yaml-file
#'
#' # create a ptable using functionality from the ptable-pkg
#' f_yaml <- tempfile(fileext = ".yaml")
#' p_cnts1 <- ck_params_cnts(
#' ptab = ptable::pt_ex_cnts(),
#' path = f_yaml)
#'
#' # read parameters from yaml-file and set them for variable `"total"`
#' p_cnts1 <- ck_read_yaml(path = f_yaml)
#'
#' tab$params_cnts_set(val = p_cnts1, v = "total")
#'
#' # create alternative perturbation parameters by specifying parameters
#' para2 <- ptable::create_cnt_ptable(
#' D = 8, V = 3, js = 2, create = FALSE)
#'
#' p_cnts2 <- ck_params_cnts(ptab = para2)
#'
#' # use these ptable it for the remaining variables
#' tab$params_cnts_set(val = p_cnts2, v = countvars)
#'
#' # perturb a variable
#' tab$perturb(v = "total")
#'
#' # multiple variables can be perturbed as well
#' tab$perturb(v = c("cnt_males", "cnt_highincome"))
#'
#' # return weighted and unweighted results
#' tab$freqtab(v = c("total", "cnt_males"))
#'
#' # numerical variables (positive variables using flex-function)
#' # we also write the config to a yaml file
#' f_yaml <- tempfile(fileext = ".yaml")
#'
#' # create a ptable using functionality from the ptable-pkg
#' # a single ptable for all cells
#' ptab1 <- ptable::pt_ex_nums(parity = TRUE, separation = FALSE)
#'
#' # a single ptab for all cells except for very small ones
#' ptab2 <- ptable::pt_ex_nums(parity = TRUE, separation = TRUE)
#'
#' # different ptables for cells with even/odd number of contributors
#' # and very small cells
#' ptab3 <- ptable::pt_ex_nums(parity = FALSE, separation = TRUE)
#'
#' p_nums1 <- ck_params_nums(
#' ptab = ptab1,
#' type = "top_contr",
#' top_k = 3,
#' mult_params = ck_flexparams(
#' fp = 1000,
#' p = c(0.30, 0.03),
#' epsilon = c(1, 0.5, 0.2),
#' q = 3),
#' mu_c = 2,
#' same_key = FALSE,
#' use_zero_rkeys = FALSE,
#' path = f_yaml)
#'
#' # we read the parameters from the yaml-file
#' p_nums1 <- ck_read_yaml(path = f_yaml)
#'
#' # for variables with positive and negative values
#' p_nums2 <- ck_params_nums(
#' ptab = ptab2,
#' type = "top_contr",
#' top_k = 3,
#' mult_params = ck_flexparams(
#' fp = 1000,
#' p = c(0.15, 0.02),
#' epsilon = c(1, 0.4, 0.15),
#' q = 3),
#' mu_c = 2,
#' same_key = FALSE)
#'
#' # simple perturbation parameters (not using the flex-function approach)
#' p_nums3 <- ck_params_nums(
#' ptab = ptab3,
#' type = "mean",
#' mult_params = ck_simpleparams(p = 0.25),
#' mu_c = 2,
#' same_key = FALSE)
#'
#' # use `p_nums1` for all variables
#' tab$params_nums_set(p_nums1, c("savings", "income", "expend"))
#'
#' # use different parameters for variable `mixed`
#' tab$params_nums_set(p_nums2, v = "mixed")
#'
#' # identify sensitive cells to which extra protection (`mu_c`) is added.
#' tab$supp_p(v = "income", p = 85)
#' tab$supp_pq(v = "income", p = 85, q = 90)
#' tab$supp_nk(v = "income", n = 2, k = 90)
#' tab$supp_freq(v = "income", n = 14, weighted = FALSE)
#' tab$supp_val(v = "income", n = 10000, weighted = TRUE)
#' tab$supp_cells(
#' v = "income",
#' inp = data.frame(
#' sex = c("female", "female"),
#' "age" = c("age_group1", "age_group3")
#' )
#' )
#'
#' # perturb variables
#' tab$perturb(v = c("income", "savings"))
#'
#' # extract results
#' tab$numtab("income", mean_before_sum = TRUE)
#' tab$numtab("income", mean_before_sum = FALSE)
#' tab$numtab("savings")
#'
#' # results can be resetted, too
#' tab$reset_cntvars(v = "cnt_males")
#'
#' # we can then set other parameters and perturb again
#' tab$params_cnts_set(val = p_cnts1, v = "cnt_males")
#'
#' tab$perturb(v = "cnt_males")
#'
#' # write results to a .csv file
#' tab$freqtab(
#' v = c("total", "cnt_males"),
#' path = file.path(tempdir(), "outtab.csv")
#' )
#'
#' # show results containing weighted and unweighted results
#' tab$freqtab(v = c("total", "cnt_males"))
#'
#' # utility measures for a count variable
#' tab$measures_cnts(v = "total", exclude_zeros = TRUE)
#'
#' # modifications for perturbed count variables
#' tab$mod_cnts()
#'
#' # display a summary about utility measures
#' tab$summary()
#' }
ck_setup <- function(x, rkey, dims, w = NULL, countvars = NULL, numvars = NULL) {
ck_class$new(
x = x,
rkey = rkey,
dims = dims,
w = w,
countvars = countvars,
numvars = numvars)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.