R/schema.R
In FakeDataR: Privacy-Preserving Synthetic Data for 'LLM' Workflows

# Internal helpers to build a scrubbed JSON schema
# (intentionally not exported)
#' @keywords internal
#' @noRd
.safe_minmax <- function(x) {
  mn <- suppressWarnings(min(x, na.rm = TRUE))
  mx <- suppressWarnings(max(x, na.rm = TRUE))
  if (!is.finite(mn)) mn <- NA
  if (!is.finite(mx)) mx <- NA
  list(min = mn, max = mx)
}

#' Build a safe schema from FAKE data
#'
#' We *only* summarize the FAKE data (never original),
#' so that no real values/labels leak. The goal is to give LLMs enough
#' structure (types, ranges, counts, missingness) to write good code.
#'
#' @param data  original data (unused here, kept for future hooks)
#' @param fake  fake data.frame generated by generate_fake_*()
#' @param level privacy preset (low/medium/high)
#' @return a nested list suitable for jsonlite::write_json()
#' @keywords internal
#' @noRd
.schema_from_data <- function(data, fake, level = c("low","medium","high")) {
  level <- match.arg(level)
  
  pkg <- "FakeDataR"
  ver <- tryCatch(as.character(utils::packageVersion(pkg)),
                  error = function(e) "0.0.0.9000")
  
  cols <- lapply(names(fake), function(nm) {
    col <- fake[[nm]]
    cls <- class(col)[1]
    na_prop <- mean(is.na(col))
    
    if (is.factor(col)) {
      # Use only counts, no label names
      out <- list(
        name = nm, type = "factor",
        n_levels = length(levels(col)),
        na_prop = na_prop
      )
      # (Optionally include blank proportion if factor had blanks)
      out$blank_prop <- mean(trimws(as.character(col)) == "", na.rm = TRUE)
      return(out)
      
    } else if (is.character(col)) {
      uniq <- unique(trimws(as.character(col)))
      uniq <- uniq[!is.na(uniq) & nzchar(uniq)]
      return(list(
        name = nm, type = "character",
        n_levels = length(uniq),
        na_prop = na_prop,
        blank_prop = mean(trimws(as.character(col)) == "", na.rm = TRUE)
      ))
      
    } else if (is.logical(col)) {
      return(list(
        name = nm, type = "logical",
        true_prop  = mean(col == TRUE,  na.rm = TRUE),
        false_prop = mean(col == FALSE, na.rm = TRUE),
        na_prop = na_prop
      ))
      
    } else if (inherits(col, "Date")) {
      mm <- .safe_minmax(col)
      # Keep ISO-formatted strings for stability
      return(list(
        name = nm, type = "Date",
        min = if (!is.na(mm$min)) as.character(mm$min) else NA,
        max = if (!is.na(mm$max)) as.character(mm$max) else NA,
        na_prop = na_prop
      ))
      
    } else if (inherits(col, "POSIXct")) {
      tz <- attr(col, "tzone")
      mm <- .safe_minmax(col)
      return(list(
        name = nm, type = "POSIXct",
        tzone = if (is.null(tz)) NA else tz,
        min = if (!is.na(mm$min)) as.character(mm$min) else NA,
        max = if (!is.na(mm$max)) as.character(mm$max) else NA,
        na_prop = na_prop
      ))
      
    } else if (is.integer(col)) {
      mm <- .safe_minmax(col)
      return(list(
        name = nm, type = "integer",
        min = mm$min, max = mm$max,
        na_prop = na_prop
      ))
      
    } else if (is.numeric(col)) {
      mm <- .safe_minmax(col)
      return(list(
        name = nm, type = "numeric",
        min = mm$min, max = mm$max,
        na_prop = na_prop
      ))
      
    } else {
      # Fallback: unknown column type, report class only
      return(list(
        name = nm, type = cls,
        na_prop = na_prop
      ))
    }
  })
  
  list(
    package = pkg,
    version = ver,
    privacy = level,
    n_rows = nrow(fake),
    n_cols = ncol(fake),
    columns = cols
  )
}