# inst/doc/creating-realistic-data.R

## ----include = FALSE----------------------------------------------------------
# Chunk defaults for this vignette: collapse = TRUE and "#>" as the prefix
# of printed output.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(charlatan)

## -----------------------------------------------------------------------------
# setup
# Client for the en_US locale; the product and order chunks below draw their
# fake values from it.
fraudster_cl <- fraudster("en_US")
# Number of order rows generated in the orders chunk.
n <- 5
# Fix the RNG seed before any values are drawn (presumably so the vignette
# output is reproducible between runs).
set.seed(1235)

## -----------------------------------------------------------------------------
# Build the product catalogue: five kinds of shoes, two kinds of jeans and
# two kinds of dresses.
products <- data.frame(
  prefix = rep(c(1, 2, 5), times = c(5, 2, 2)),
  product_id = fraudster_cl$integer(n = 9, min = 1000, max = 9999),
  main_category = rep(c("Shoes", "Jeans", "Dresses"), times = c(5, 2, 2)),
  sub_category = c(
    # Shoes
    "Dress shoes", "Tennis shoes", "Boots", "Hiking boots",
    "Country & Western style boots",
    # Jeans
    "Regular fit", "Straight fit",
    # Dresses
    "Summer dress", "Evening gown"
  )
)
# Glue the category prefix in front of the random number, so the leading
# digit of every product_id identifies its main category.
# (With {dplyr} installed there are cleaner ways to write this.)
products$product_id <- as.integer(paste0(products$prefix, products$product_id))
products

## -----------------------------------------------------------------------------
# create orders
# Build `n` fake orders, one row per order. The column expressions draw their
# values one after another, so reordering them presumably changes which
# values each column receives for a given seed.
orders <- data.frame(
  order_id = fraudster_cl$integer(n = n, min = 10000, max = 90000),
  location_id = fraudster_cl$integer(n = n, min = 1, max = 5),
  # an integer between 1 and 9900 divided by 100, giving a two-decimal price
  price_paid = fraudster_cl$integer(n = n, min = 1, max = 9900) / 100,
  # sampled (with replacement) from the product ids generated above, so every
  # order refers to a product that exists in `products`
  product_id = sample(products$product_id, size = n, replace = TRUE),
  order_email = fraudster_cl$email(n = n),
  customer_name = fraudster_cl$name(n = n),
  shipping_address = fraudster_cl$address(n = n)
)

## -----------------------------------------------------------------------------
# Join every order to its product. `product_id` is the only column the two
# tables have in common, so it is spelled out as the join key.
example_transactions <- merge(orders, products, by = "product_id")
# Print the columns in an order that reads more naturally.
example_transactions[c(
  "order_id", "location_id", "product_id",
  "main_category", "sub_category", "price_paid",
  "customer_name", "order_email", "shipping_address"
)]

## -----------------------------------------------------------------------------
# setup the providers
# One provider object per kind of fake data; the trailing notes list what each
# one is used for in the health-record example below.
ap <- AddressProvider_en_US$new() # street address and postcode
pp <- PersonProvider_en_US$new() # first and last name
ip <- InternetProvider_en_US$new() # email and IPv4 address
lp <- LoremProvider_en_US$new() # filler word for the county name
SSNP <- SSNProvider_en_US$new() # social security number
dtp <- DateTimeProvider$new() # date of birth
np <- NumericsProvider$new() # record number and day offsets
pnp <- PhoneNumberProvider_en_US$new() # phone and fax number

# Re-seed the RNG before the record is generated (presumably so this example
# is reproducible independently of the chunks above).
set.seed(1235)

## -----------------------------------------------------------------------------
# Assemble one fictional health record as a named list; every field comes
# from one of the providers created above.
prot_health <- list(
  first_name = pp$first_name(),
  last_name = pp$last_name(),
  phone_number = pnp$render(),
  fax_number = pnp$render(),
  street = ap$street_address(),
  zipcode = ap$postcode(),
  email = ip$email(),
  # a lorem word with " county" appended, not a real county name
  county = paste0(lp$word(), " county"),
  SSN = SSNP$render(),
  # drawn between 1930-01-01 and 1990-12-31, then converted to a Date
  dob = as.Date(dtp$date_time_between("1930-01-01", "1990-12-31")),
  # I've decided record number is an integer between 10000 - 99999
  medical_record_number = np$integer(min = 10000, max = 99999),
  ip_address = ip$ipv4()
)
# Print the record.
prot_health

## -----------------------------------------------------------------------------
#' Generate a chronologically ordered series of medical events
#'
#' Draws `events` day offsets from the global `np` provider, sorts them, adds
#' them to `date_value` and labels every resulting date with an event type
#' sampled with replacement from `event_types`.
#'
#' @param date_value Start date the day offsets are added to, e.g. the result
#'   of `as.Date()`.
#' @param events Number of events (rows) to generate.
#' @param event_types Vector of event labels to sample from. A vector with a
#'   single element is used as that one label.
#' @param max_days Upper bound handed to `np$integer()` for the day offsets;
#'   the lower bound is 1. Defaults to 365.
#' @return A data.frame with one row per event and the columns `date`
#'   (ascending) and `event`.
gen_med_record <- function(date_value,
                           events = 4,
                           event_types = c(
                             "admission", "x-ray", "blood-test", "general exam"
                           ),
                           max_days = 365) {
  # `np` is not an argument: it is looked up in the calling environment and
  # must provide an `integer()` method.
  day_offsets <- sort(np$integer(events, min = 1, max = max_days))
  result <- data.frame(
    date = date_value + day_offsets
  )
  # Index by position instead of calling sample(event_types, ...) directly:
  # sample() treats a single number x as 1:x, which would invent labels the
  # caller never supplied.
  picked <- sample.int(length(event_types), size = nrow(result), replace = TRUE)
  result$event <- event_types[picked]
  result
}

# Five events counted from 1 March 2022.
result <- gen_med_record(as.Date("2022-03-01"), events = 5)
# Attach the patient's identifying fields from the health record to every
# event row.
result[["medical_record_number"]] <- prot_health[["medical_record_number"]]
result[["last_name"]] <- prot_health[["last_name"]]
result[["date_of_birth"]] <- prot_health[["dob"]]
result

# Try the charlatan package in your browser
#
# Any scripts or data that you put into this service are public.
#
# charlatan documentation built on Oct. 17, 2024, 9:06 a.m.