tests/testthat/test_countvars.R

testthat::skip_on_cran()
context("Testing Frequency Tables")

set.seed(120, sample.kind = "Reject")
dat <- ck_create_testdata()

seed <- ck_create_seed_from_hash(dat)
test_that("check seed from hash generation", {
  expect_equal(seed, 1474937517)
})

dim_sex <- hier_create(root = "Total", nodes = c("male", "female"))
dim_age <- hier_create(root = "Total", paste0("age_group", 1:6))
dims <- list(sex = dim_sex, age = dim_age)
test_that("dims-hash is ok", {
  expect_identical(class(dims), "list")
  expect_identical(nrow(dim_sex), 3L)
  expect_identical(max(dim_sex$level), 2)
  expect_identical(nrow(dim_age), 7L)
  expect_identical(max(dim_age$level), 2)
})

## test generation of destatis rkeys
rk1 <- ck_generate_rkeys(dat = dat, nr_digits = 5)
rk2 <- ck_generate_rkeys(dat = dat, nr_digits = 5)
test_that("check rkey generation and seed is ok", {
  expect_identical(rk1, rk2)
  expect_identical(round(mean(rk1), digits = 3), 0.501)
})
dat$rec_key <- rk1
test_that("checking dimension and structure of generated testdata is ok", {
  expect_true(is.data.table(dat))
  expect_identical(round(mean(dat$sampling_weight), digits = 3), 59.719)
  expect_identical(round(mean(dat$household_weights), digits = 3), 21.834)

  expect_identical(nrow(dat), 4580L)
  expect_identical(ncol(dat), 16L)
  expect_identical(sum(dat$sex == "male"), 2296L)
})

## perturbation parameters for count variables
set.seed(120, sample.kind = "Reject")
suppressMessages(para <- ptable::create_cnt_ptable(
  D = 5,
  V = 3,
  js = 2,
  pstay = 0.5,
  optim = 1,
  mono = TRUE,
  create = FALSE))

params_cnts <- ck_params_cnts(ptab = ptable::create_ptable(params = para))
ptab <- params_cnts$params$ptable
test_that("ck_params_cnts() is ok", {
  expect_is(params_cnts, "ck_params")
  expect_identical(params_cnts$type, "cnts")
  expect_identical(dim(ptab), c(66L, 7L))
  expect_identical(max(ptab$i), 8)
})

# we need to load a custom data set because otherwise, the
# digest based checks don't work
test_that("checking perturbation parameters for counts", {
  expect_is(params_cnts, "ck_params")
  expect_equal(params_cnts$type, "cnts")
  dt <- params_cnts$params$ptable
  expect_is(dt, "data.table")
  expect_identical(dim(dt), c(66L, 7L))
  expect_identical(round(mean(dt$p), digits = 3), 0.136)
  expect_identical(round(mean(dt$lb), digits = 3), 0.562)
  expect_identical(round(mean(dt$ub), digits = 3), 0.698)
  expect_identical(round(mean(dt$v), digits = 3), 1.045)
})

countvars <- NULL
w <- "sampling_weight"
rkey <- "rec_key"

tab <- ck_setup(
  x = dat,
  rkey = "rec_key",
  dims = dims,
  w = w,
  countvars = countvars
)

# set perturbation parameters for all variables
tab$params_cnts_set(val = params_cnts, v = NULL)

expect_message(tab$perturb("total"))
expect_message(tab$perturb("total"), "Variable 'total' was already perturbed!")

res_freqtab <- tab$freqtab("total")
test_that("check ck_define_table() with already existing rec-keys", {
  expect_is(tab, "cellkey_obj")
  expect_identical(dim(res_freqtab), c(21L, 7L))
  expect_identical(res_freqtab$uwc[3], 1143)
  expect_identical(res_freqtab$puwc[3], 1147)
  expect_identical(round(mean(res_freqtab$pwc), digits = 3), 52096.24)
  expect_identical(round(mean(res_freqtab$puwc), digits = 3), 872.333)
})

dat$rec_key <- NULL
tab <- ck_setup(
  x = dat,
  rkey = 7,
  dims = dims,
  w = w,
  countvars = countvars)

expect_error(tab$perturb("total"))
tab$params_cnts_set(val = params_cnts, v = "total")
expect_message(tab$perturb("total"), "Count variable 'total' was perturbed.")
expect_message(tab$perturb("total"), "Variable 'total' was already perturbed!")

test_that("ck_define_table() with new record keys is ok", {
  expect_is(tab, "cellkey_obj")
  dt <- tab$freqtab("total")
  expect_identical(dt$uwc[3], 1143)
  expect_identical(dt$puwc[3], 1147)
  expect_identical(round(mean(dt$pwc), digits = 3), 52096.24)
  expect_identical(round(mean(dt$puwc), digits = 3), 872.333)
})

freqtab <- tab$freqtab("total")
test_that("weighted version of ck_perturb() is ok", {
  expect_identical(freqtab$uwc[3], 1143)
  expect_identical(freqtab$puwc[3], 1147)
  expect_identical(round(mean(freqtab$pwc), digits = 3), 52096.24)
  expect_identical(round(mean(freqtab$puwc), digits = 3), 872.333)

  dt <- tab$mod_cnts()
  expect_identical(dim(dt), c(21L, 6L))
  expect_identical(round(mean(dt$ckey), digits = 3), 0.456)
  expect_identical(round(mean(dt$pert), digits = 3), -0.048)
  expect_identical(range(dt$row_nr), c(40, 65))
})

mm <- tab$measures_cnts("Total")
test_that("ck_cnt_measures() [exclude_zeros = TRUE] is ok", {
  expect_identical(range(as.numeric(mm$overview$noise)), c(1, 7))
  expect_identical(range(as.numeric(mm$overview$cnt)), c(1, 9))
  expect_identical(round(mean(as.numeric(mm$overview$pct)), digits = 3), 0.143)

  expect_identical(range(as.numeric(mm$measures$d1)), c(0, 4))
  expect_identical(range(as.numeric(mm$measures$d2)), c(0, 0.429))
  expect_identical(range(as.numeric(mm$measures$d3)), c(0, 0.517))

  expect_identical(range(mm$cumdistr_d1$cnt), c(9L, 21L))
  expect_identical(round(range(mm$cumdistr_d1$pct), digits = 3), c(0.429, 1))

  expect_identical(range(mm$cumdistr_d2$cnt), c(19L, 21L))
  expect_identical(round(range(mm$cumdistr_d2$pct), digits = 3), c(0.905, 1))

  expect_identical(range(mm$cumdistr_d3$cnt), c(12L, 21L))
  expect_identical(round(range(mm$cumdistr_d3$pct), digits = 3), c(0.571, 1))
  expect_identical(mm$false_nonzero, 0L)
  expect_identical(mm$false_zero, 0L)
  expect_identical(mm$exclude_zeros, TRUE)
})

mm <- tab$measures_cnts("Total", exclude_zeros = FALSE)
test_that("ck_cnt_measures() [exclude_zeros = FALSE] is ok", {
  expect_identical(range(as.numeric(mm$overview$noise)), c(1, 7))
  expect_identical(range(as.numeric(mm$overview$cnt)), c(1, 9))
  expect_identical(round(mean(as.numeric(mm$overview$pct)), digits = 3), 0.143)

  expect_identical(range(as.numeric(mm$measures$d1)), c(0, 4))
  expect_identical(range(as.numeric(mm$measures$d2)), c(0, 0.429))
  expect_identical(range(as.numeric(mm$measures$d3)), c(0, 0.517))

  expect_identical(range(mm$cumdistr_d1$cnt), c(9L, 21L))
  expect_identical(round(range(mm$cumdistr_d1$pct), digits = 3), c(0.429, 1))

  expect_identical(range(mm$cumdistr_d2$cnt), c(19L, 21L))
  expect_identical(round(range(mm$cumdistr_d2$pct), digits = 3), c(0.905, 1))

  expect_identical(range(mm$cumdistr_d3$cnt), c(12L, 21L))
  expect_identical(round(range(mm$cumdistr_d3$pct), digits = 3), c(0.571, 1))
  expect_identical(mm$false_nonzero, 0L)
  expect_identical(mm$false_zero, 0L)
  expect_identical(mm$exclude_zeros, FALSE)
})

# no weights
tab <- ck_setup(
  x = dat,
  rkey = 7,
  dims = dims,
  w = NULL,
  countvars = countvars)

# set params
tab$params_cnts_set(params_cnts, v = NULL)

tab$perturb("total")
freqtab <- tab$freqtab("total")

test_that("checking unweighted version of perturb()", {
  expect_identical(dim(freqtab), c(21L, 7L))
  expect_identical(freqtab$uwc[3], 1143)
  expect_identical(freqtab$puwc[3], 1147)
  expect_identical(freqtab$pwc, freqtab$puwc)
  expect_identical(round(mean(freqtab$pwc), digits = 3), 872.333)

  dt <- tab$mod_cnts()
  expect_identical(dim(dt), c(21L, 6L))
  expect_identical(round(mean(dt$ckey), digits = 3), 0.456)
  expect_identical(round(mean(dt$pert), digits = 3), -0.048)
  expect_identical(range(dt$row_nr), c(40, 65))
})

context("Testing multiple countvars")
dat[, cnt_males := ifelse(sex == "male", 1, 0)]
dat[, cnt_highincome := ifelse(income >= 9000, 1, 0)]
countvars <- c("cnt_males", "cnt_highincome")

tab <- ck_setup(
  x = dat,
  rkey = 7,
  dims = dims,
  w = w,
  countvars = countvars)

# set params
tab$params_cnts_set(params_cnts, v = NULL)
tab$perturb(c("total", "cnt_males", "cnt_highincome"))
test_that("check tabulation of cnt_males is ok", {
  dt <- tab$freqtab("cnt_males")
  expect_identical(dt$uwc[3], 571)
  expect_identical(dt$puwc[3], 571)
  expect_identical(round(mean(dt$pwc), digits = 3), 25757.65)
  expect_identical(round(mean(dt$puwc), digits = 3), 437.048)

  expect_identical(round(range(dt$puwc), digits = 3), c(0, 2297))
  expect_identical(round(range(dt$pwc), digits = 3), c(0, 135387.941))

  mm <- tab$measures_cnts("cnt_males")
  expect_identical(round(range(mm$overview$pct), digits = 3), c(0.095, 0.810))
  expect_identical(round(range(mm$measures$d1), digits = 3), c(0, 4))
  expect_identical(round(range(mm$measures$d2), digits = 3), c(0, 0.048))
  expect_identical(round(range(mm$measures$d3), digits = 3), c(0, 0.221))
})

test_that("check tabulation of cnt_highincome is ok", {
  dt <- tab$freqtab("cnt_highincome")
  expect_identical(dt$uwc[3], 123)
  expect_identical(dt$puwc[3], 123)
  expect_identical(round(mean(dt$pwc), digits = 3), 5063.68)
  expect_identical(round(mean(dt$puwc), digits = 3), 84.286)

  expect_identical(round(range(dt$puwc), digits = 3), c(0, 444))
  expect_identical(round(range(dt$pwc), digits = 3), c(0, 26671.928))

  mm <- tab$measures_cnts("cnt_highincome")
  expect_identical(round(range(mm$overview$pct), digits = 3), c(0.048, 0.667))
  expect_identical(round(range(mm$measures$d1), digits = 3), c(0, 3))
  expect_identical(round(range(mm$measures$d2), digits = 3), c(0, 0.158))
  expect_identical(round(range(mm$measures$d3), digits = 3), c(0, 0.359))
})

test_that("check tabulation of multiple count variables is ok", {
  dt <- tab$freqtab(c("total", "cnt_males", "cnt_highincome"))
  expect_identical(dim(dt), c(63L, 7L))
  expect_identical(dt$uwc[3], 1143)
  expect_identical(dt$puwc[3], 1148)
  expect_identical(round(mean(dt$pwc), digits = 3), 27640.21)
  expect_identical(round(mean(dt$puwc), digits = 3), 464.587)
  expect_identical(round(range(dt$puwc), digits = 3), c(0, 4582))
  expect_identical(round(range(dt$pwc), digits = 3), c(0, 273633.438))

  mm <- tab$measures_cnts("cnt_highincome")
  expect_identical(round(range(mm$overview$pct), digits = 3), c(0.048, 0.667))
  expect_identical(round(range(mm$measures$d1), digits = 3), c(0, 3))
  expect_identical(round(range(mm$measures$d2), digits = 3), c(0, 0.158))
  expect_identical(round(range(mm$measures$d3), digits = 3), c(0, 0.359))
})
sdcTools/cellKey documentation built on Dec. 5, 2023, 1:05 a.m.