# inst/doc/privacy-and-validation.R

## ----setup, include=FALSE-----------------------------------------------------
# Default knitr chunk options applied to the chunks that follow.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  message  = FALSE,
  warning  = FALSE
)

# Seed the RNG: the example data below is built with sample() and runif().
set.seed(2)

## -----------------------------------------------------------------------------
library(FakeDataR)

# Toy input table with 50 rows: an integer id, templated email and phone
# strings, a randomly sampled department label and a random spend amount
# rounded to two decimals.
df <- data.frame(
  id = seq_len(50),
  email = sprintf("u%02d@x.com", seq_len(50)),
  phone = sprintf("555-01%02d", seq_len(50)),
  dept = sample(c("A", "B", "C"), size = 50, replace = TRUE),
  spend = round(runif(50, min = 10, max = 200), digits = 2),
  check.names = FALSE
)


# Auto-detect sensitive columns and replace them with fake values
# (strategy "fake", noted by the author as the default).
fake_low <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "low",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Auto-detect sensitive columns and remove them instead (strategy "drop").
fake_drop <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "medium",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop",
  normalize = TRUE
)

# Column names of each result, for side-by-side comparison.
names(fake_low)
names(fake_drop)

# Privacy metadata stored as attributes on the returned objects.
attr(fake_low,  "sensitive_columns")
attr(fake_drop, "dropped_columns")
attr(fake_low,  "name_map")



## -----------------------------------------------------------------------------
# Name the sensitive columns explicitly and switch auto-detection off.
fake_explicit <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = c("id", "email", "phone"),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Resulting column names and the columns recorded as sensitive.
names(fake_explicit)
attr(fake_explicit, "sensitive_columns")



## -----------------------------------------------------------------------------

# A broad, configurable set of regular-expression fragments. Each element is
# one alternative; they are intended to be OR-ed together and matched against
# column names.
sensitive_patterns <- c(
  # Direct identifiers and personal names
  "^id$",
  "employee[_-]?id",
  "user(name|[_-]?id)?$",
  "full[_-]?name",
  "first[_-]?name",
  "last[_-]?name",

  # Contact details
  "email|e-mail",
  "phone|tel|mobile",
  "fax",

  # Postal address and geolocation
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",

  # Government-issued identifiers (international sampling)
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul|)+a|cpf|pan\\b|tin\\b|ein\\b|pesel|nin\\b",

  # Licences and travel documents
  "passport|visa|license|licence|driver|dl\\b|vin|plate",

  # Banking and payment data
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",

  # Credentials, secrets and device identifiers
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid|android[_-]?id|idfa|gaid",

  # Medical and patient records
  "mrn|nhs|medicare|medicaid|patient|diagnosis",

  # Dates of birth
  "dob|date[_-]?of[_-]?birth|birth(day|date)",

  # Education
  "student[_-]?id"
)

# Join all alternatives into one pattern. The leading `(?i)` is the Perl-style
# inline flag for case-insensitive matching, so the match below is run with
# `perl = TRUE`.
rx <- paste0("(?i)(", paste(sensitive_patterns, collapse = "|"), ")")

# Column names of `df` that match at least one sensitive pattern.
sens_cols <- grep(rx, names(df), value = TRUE, perl = TRUE)
sens_cols

# Pass the detected columns explicitly, with auto-detection turned off.
# "email" is always included; unique() removes the duplicate when the pattern
# match already found it.
fake_custom_detect <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = unique(c(sens_cols, "email")),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
attr(fake_custom_detect, "sensitive_columns")


## -----------------------------------------------------------------------------
# Run validate_fake() on the original data and the low-privacy fake data,
# then show the first five entries of the result.
v1 <- validate_fake(df, fake_low)
head(v1, n = 5)

# Try the FakeDataR package in your browser

# Any scripts or data that you put into this service are public.

# FakeDataR documentation built on Nov. 6, 2025, 1:15 a.m.