# tests/testthat/test-sensitive.R

# Checks the output of generate_fake_data() when `sensitive_strategy` is not
# passed: the id/email/phone columns must be present, must share no
# non-missing value with the input, and `amount`/`name` must keep their types.
test_that("sensitive columns are faked and do not leak", {
  set.seed(42)
  df <- data.frame(
    id     = 1001:1020,
    email  = c("alice@test.com", "bob@test.com", NA, rep("x@y.com", 17)),
    phone  = c("(415) 555-0101", "+1-202-555-0123", NA, rep("212-555-9999", 17)),
    name   = c("Ann", "Bob", "Cara", rep("Don", 17)),
    amount = c(10.5, 20.1, 15, rep(30, 17))
  )

  fake <- generate_fake_data(df, n = 12, seed = 1)

  # Guard against a vacuous pass: extracting a missing column yields NULL,
  # and intersect(x, NULL) is empty, so the leak checks below would succeed
  # even if the sensitive columns had been dropped instead of faked.
  sensitive_cols <- c("id", "email", "phone")
  expect_true(all(sensitive_cols %in% names(fake)))

  # No original sensitive values appear (ignore NAs).
  # `[[` is used instead of `$` so a column name is never partially matched.
  expect_length(
    intersect(na.omit(df[["email"]]), na.omit(fake[["email"]])),
    0
  )
  expect_length(
    intersect(na.omit(df[["phone"]]), na.omit(fake[["phone"]])),
    0
  )
  expect_length(intersect(df[["id"]], fake[["id"]]), 0)

  # Types preserved for non-sensitive columns
  expect_type(fake[["amount"]], "double")
  expect_type(fake[["name"]], "character")
})

# With sensitive_strategy = "drop", the generated data must contain none of
# the id/email/phone columns while still containing the plain column `x`.
test_that("sensitive_strategy='drop' removes those columns", {
  rows <- 1:5
  df <- data.frame(
    id    = rows,
    email = sprintf("x%02d@test.com", rows),
    phone = sprintf("555-%04d", rows),
    x     = rows
  )

  fake <- generate_fake_data(
    df,
    n = 10,
    sensitive_strategy = "drop",
    seed = 2
  )

  out_cols <- names(fake)
  dropped <- c("id", "email", "phone")
  expect_false(any(dropped %in% out_cols))
  expect_true("x" %in% out_cols)
})

# The share of missing values in the generated `email` column should stay
# close to the share in the input (strictly between 0.7 and 0.9).
test_that("NA rate roughly preserved for sensitive columns when faked", {
  set.seed(123)

  # 8 of the 10 input values are missing, i.e. an original NA rate of 0.8.
  emails <- c("a@test.com", NA, "b@test.com", rep(NA, 7))
  df <- data.frame(email = emails)

  fake <- generate_fake_data(df, n = 200, seed = 123)

  na_rate <- mean(is.na(fake$email))
  expect_gt(na_rate, 0.7)
  expect_lt(na_rate, 0.9)
})

# Try the FakeDataR package in your browser
#
# Any scripts or data that you put into this service are public.
#
# FakeDataR documentation built on Nov. 6, 2025, 1:15 a.m.