ck_flexparams: Set parameters required to perturb numeric variables using a flex function

View source: R/ck_params_nums.R

ck_flexparams | R Documentation

Set parameters required to perturb numeric variables using a flex function

Description

ck_flexparams() allows defining a flex function that is used to look up the perturbation magnitudes (percentages) applied when perturbing continuous variables.

Usage

ck_flexparams(fp, p = c(0.25, 0.05), epsilon = 1, q = 3)

Arguments

fp

(numeric scalar); the point at which the noise coefficient function should reach its desired maximum (defined by the first element of p)

p

a numeric vector of length 2 where both elements specify a percentage. The first value refers to the desired maximum perturbation percentage for small cells (depending on fp) while the second element refers to the desired maximum perturbation percentage for large cells. Both values must be between 0 and 1 and need to be in descending order.

epsilon

a numeric vector in descending order with all values >= 0 and <= 1 and with the first element forced to equal 1. The length of this vector must match the number top_k specified in ck_params_nums() when creating parameters for type == "top_contr"; this is checked at runtime. This setting allows using different flex functions for the largest top_k contributors.

q

(numeric scalar); parameter of the flex function; q needs to be >= 1

Details

Details about the flex function can be found in Deliverable D4.2, Part I of the SGA "Open Source tools for perturbative confidentiality methods".

Value

an object suitable as input for ck_params_nums().

See Also

ck_simpleparams(), ck_params_nums()

Examples


# create the test data set used throughout this example
# NOTE(review): the `:=` syntax below suggests `x` is a data.table -- confirm
x <- ck_create_testdata()

# create some 0/1 indicator variables that should be perturbed later
# (`cnt_females` is 1 for every record whose `sex` is not "male")
x[, cnt_females := ifelse(sex == "male", 0, 1)]
x[, cnt_males := ifelse(sex == "male", 1, 0)]
x[, cnt_highincome := ifelse(income >= 9000, 1, 0)]
# a variable with positive and negative contributions
# (integers from -10 to 10, sampled with replacement; no seed is set here,
# so the values differ between runs)
x[, mixed := sample(-10:10, nrow(x), replace = TRUE)]

# create record keys and store them in column `rkey`
x$rkey <- ck_generate_rkeys(dat = x)

# define required inputs: one hierarchy per dimension of the table

# hierarchy for `sex` with some bogus codes ("f" and "m"), which are added
# with "female" and "male" as their respective `root`
d_sex <- hier_create(root = "Total", nodes = c("male", "female"))
d_sex <- hier_add(d_sex, root = "female", "f")
d_sex <- hier_add(d_sex, root = "male", "m")

# hierarchy for `age` with six age groups ("age_group1" to "age_group6");
# the extra codes "ag1a" and "ag2a" are added with the first two groups
# as their `root`
d_age <- hier_create(root = "Total", nodes = paste0("age_group", 1:6))
d_age <- hier_add(d_age, root = "age_group1", "ag1a")
d_age <- hier_add(d_age, root = "age_group2", "ag2a")

# define the cell key object
# `countvars` are the 0/1 indicator variables, `numvars` the continuous
# variables (including `mixed`, which has positive and negative values)
countvars <- c("cnt_females", "cnt_males", "cnt_highincome")
numvars <- c("expend", "income", "savings", "mixed")
# the table is spanned by the two hierarchies defined above (`sex` and `age`);
# `rkey` and `w` are passed as column names of `x`
tab <- ck_setup(
  x = x,
  rkey = "rkey",
  dims = list(sex = d_sex, age = d_age),
  w = "sampling_weight",
  countvars = countvars,
  numvars = numvars)

# show some information about this table instance
tab$print() # identical with print(tab)

# information about the hierarchies
tab$hierarchy_info()

# which variables have been defined?
tab$allvars()

# count variables
tab$cntvars()

# continuous variables
tab$numvars()

# create perturbation parameters for the "total" variable and
# write them to a yaml-file

# create a ptable using functionality from the ptable-pkg
f_yaml <- tempfile(fileext = ".yaml")
p_cnts1 <- ck_params_cnts(
  ptab = ptable::pt_ex_cnts(),
  path = f_yaml)

# read parameters from yaml-file and set them for variable `"total"`
# (this overwrites the `p_cnts1` object created above)
p_cnts1 <- ck_read_yaml(path = f_yaml)

tab$params_cnts_set(val = p_cnts1, v = "total")

# create alternative perturbation parameters by specifying parameters
para2 <- ptable::create_cnt_ptable(
  D = 8, V = 3, js = 2, create = FALSE)

p_cnts2 <- ck_params_cnts(ptab = para2)

# use this ptable for the remaining count variables (`countvars`)
tab$params_cnts_set(val = p_cnts2, v = countvars)

# perturb a variable
tab$perturb(v = "total")

# multiple variables can be perturbed as well
tab$perturb(v = c("cnt_males", "cnt_highincome"))

# return weighted and unweighted results
tab$freqtab(v = c("total", "cnt_males"))

# numerical variables (positive variables using flex-function)
# we also write the config to a yaml file
f_yaml <- tempfile(fileext = ".yaml")

# create ptables using functionality from the ptable-pkg
# a single ptable for all cells
ptab1 <- ptable::pt_ex_nums(parity = TRUE, separation = FALSE)

# a single ptable for all cells except for very small ones
ptab2 <- ptable::pt_ex_nums(parity = TRUE, separation = TRUE)

# different ptables for cells with even/odd number of contributors
# and very small cells
ptab3 <- ptable::pt_ex_nums(parity = FALSE, separation = TRUE)

# flex-function parameters based on the largest contributors;
# `epsilon` has three elements because its length must match `top_k = 3`
p_nums1 <- ck_params_nums(
  ptab = ptab1,
  type = "top_contr",
  top_k = 3,
  mult_params = ck_flexparams(
    fp = 1000,
    p = c(0.30, 0.03),
    epsilon = c(1, 0.5, 0.2),
    q = 3),
  mu_c = 2,
  same_key = FALSE,
  use_zero_rkeys = FALSE,
  path = f_yaml)

# we read the parameters from the yaml-file
# (this overwrites the `p_nums1` object created above)
p_nums1 <- ck_read_yaml(path = f_yaml)

# for variables with positive and negative values
# (no `path` is given here, so the result is only kept in `p_nums2`)
p_nums2 <- ck_params_nums(
  ptab = ptab2,
  type = "top_contr",
  top_k = 3,
  mult_params = ck_flexparams(
    fp = 1000,
    p = c(0.15, 0.02),
    epsilon = c(1, 0.4, 0.15),
    q = 3),
  mu_c = 2,
  same_key = FALSE)

# simple perturbation parameters (not using the flex-function approach)
p_nums3 <- ck_params_nums(
  ptab = ptab3,
  type = "mean",
  mult_params = ck_simpleparams(p = 0.25),
  mu_c = 2,
  same_key = FALSE)

# use `p_nums1` for the variables `savings`, `income` and `expend`
tab$params_nums_set(p_nums1, c("savings", "income", "expend"))

# use different parameters for variable `mixed`
tab$params_nums_set(p_nums2, v = "mixed")

# identify sensitive cells to which extra protection (`mu_c`) is added.
# several rules are applied to variable `income`; the last call marks the
# cells explicitly via a data.frame with one column per dimension
tab$supp_p(v = "income", p = 85)
tab$supp_pq(v = "income", p = 85, q = 90)
tab$supp_nk(v = "income", n = 2, k = 90)
tab$supp_freq(v = "income", n = 14, weighted = FALSE)
tab$supp_val(v = "income", n = 10000, weighted = TRUE)
tab$supp_cells(
  v = "income",
  inp = data.frame(
    sex = c("female", "female"),
    "age" = c("age_group1", "age_group3")
  )
)

# perturb variables
tab$perturb(v = c("income", "savings"))

# extract results
tab$numtab("income", mean_before_sum = TRUE)
tab$numtab("income", mean_before_sum = FALSE)
tab$numtab("savings")

# results can be reset, too
tab$reset_cntvars(v = "cnt_males")

# we can then set other parameters and perturb again
# (`cnt_males` now uses `p_cnts1` instead of `p_cnts2`)
tab$params_cnts_set(val = p_cnts1, v = "cnt_males")

tab$perturb(v = "cnt_males")

# write results to a .csv file
tab$freqtab(
  v = c("total", "cnt_males"),
  path = file.path(tempdir(), "outtab.csv")
)

# show results containing weighted and unweighted results
tab$freqtab(v = c("total", "cnt_males"))

# utility measures for a count variable
tab$measures_cnts(v = "total", exclude_zeros = TRUE)

# modifications for perturbed count variables
tab$mod_cnts()

# display a summary about utility measures
tab$summary()


sdcTools/cellKey documentation built on Dec. 5, 2023, 1:05 a.m.