tests/testthat/helper-test-data.R

# Helper functions for creating realistic PNADC test data
# These functions ensure consistency between birthday and age variables

#' Create Realistic PNADC Test Data
#'
#' Generates synthetic PNADC data with internally consistent fields.
#' Age is correctly derived from birth date and survey date.
#'
#' @param n_quarters Number of quarters to generate (default 4)
#' @param n_upas Number of UPAs per quarter (default 10)
#' @param persons_per_household Average persons per household (default 3)
#' @param start_year Starting year (default 2023)
#' @param seed Random seed for reproducibility
#' @return data.table with realistic PNADC structure
create_realistic_pnadc <- function(n_quarters = 4,
                                    n_upas = 10,
                                    persons_per_household = 3,
                                    start_year = 2023L,
                                    seed = 42L) {
  set.seed(seed)

  # Generate quarter combinations
  quarters <- data.table::data.table(
    q = seq_len(n_quarters)
  )
  quarters[, `:=`(
    Ano = start_year + (q - 1L) %/% 4L,
    Trimestre = ((q - 1L) %% 4L) + 1L
  )]

  # Generate UPAs (each UPA has a panel group V1014)
  upas <- data.table::data.table(
    UPA = seq_len(n_upas),
    V1014 = sample(1:8, n_upas, replace = TRUE)  # Panel groups 1-8
  )

  # Generate households per UPA
  households_per_upa <- sample(2:5, n_upas, replace = TRUE)
  households <- data.table::rbindlist(lapply(seq_len(n_upas), function(u) {
    data.table::data.table(
      UPA = u,
      V1008 = seq_len(households_per_upa[u])
    )
  }))
  households <- merge(households, upas, by = "UPA")

  # Generate persons per household
  persons <- data.table::rbindlist(lapply(seq_len(nrow(households)), function(h) {
    n_persons <- max(1L, rpois(1, persons_per_household))
    hh <- households[h]
    data.table::data.table(
      UPA = hh$UPA,
      V1008 = hh$V1008,
      V1014 = hh$V1014,
      V2003 = seq_len(n_persons)
    )
  }))

  # Generate birth dates with realistic age distribution
  # PNADC covers all ages, with typical Brazilian population distribution
  n_persons <- nrow(persons)

  # Generate birth years (wider range for realistic age distribution)
  # Ages typically 0-100, median around 35
  birth_years <- start_year - sample(0:85, n_persons, replace = TRUE,
                                      prob = dnorm(0:85, mean = 35, sd = 20) + 0.01)
  birth_months <- sample(1:12, n_persons, replace = TRUE)
  birth_days <- sapply(birth_months, function(m) {
    max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
    sample(1:max_day, 1)
  })

  persons[, `:=`(
    V20082 = as.integer(birth_years),
    V20081 = as.integer(birth_months),
    V2008 = as.integer(birth_days)
  )]

  # Properly create the cross-product of persons and quarters
  dt <- data.table::rbindlist(lapply(seq_len(nrow(quarters)), function(q) {
    qdata <- quarters[q]
    persons_copy <- data.table::copy(persons)
    persons_copy[, `:=`(
      Ano = qdata$Ano,
      Trimestre = qdata$Trimestre
    )]
    persons_copy
  }))

  # Calculate age based on survey date
  # Survey is conducted in months corresponding to quarter
  # Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec
  # Assume interview at middle of quarter (2nd month)
  dt[, survey_month := (Trimestre - 1L) * 3L + 2L]

  # Calculate age correctly
  dt[, V2009 := {
    age <- Ano - V20082
    # Adjust if birthday hasn't occurred yet this year
    had_birthday <- (survey_month > V20081) |
                    (survey_month == V20081 & 15L >= V2008)  # Assume interview on 15th
    age <- age - as.integer(!had_birthday)
    pmax(0L, age)  # Ensure non-negative
  }]

  # Add some realistic NA values for unknown birthdays (~2%)
  unknown_idx <- sample(seq_len(nrow(dt)), size = ceiling(nrow(dt) * 0.02))
  dt[unknown_idx, `:=`(
    V2008 = 99L,    # PNADC code for unknown
    V20081 = 99L,
    V20082 = 9999L
  )]

  # Clean up
  dt[, survey_month := NULL]

  # Add weight and calibration columns (commonly needed for apply_periods tests)
  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  dt[, `:=`(
    V1028 = runif(.N, 500, 2000),
    UF = sample(valid_ufs, .N, replace = TRUE),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # Reorder columns to match PNADC structure
  data.table::setcolorder(dt, c("Ano", "Trimestre", "UPA", "V1008", "V1014", "V2003",
                                 "V2008", "V20081", "V20082", "V2009",
                                 "V1028", "UF", "posest", "posest_sxi"))

  dt
}


#' Create Minimal Valid PNADC Data
#'
#' Creates a minimal dataset that passes validation for testing.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with minimal required columns
create_minimal_pnadc <- function(n = 10L, year = 2023L, quarter = 1L) {
  # Generate birth dates
  birth_years <- year - sample(18:65, n, replace = TRUE)
  birth_months <- sample(1:12, n, replace = TRUE)
  birth_days <- sapply(birth_months, function(m) {
    max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
    sample(1:max_day, 1)
  })

  dt <- data.table::data.table(
    Ano = rep(year, n),
    Trimestre = rep(quarter, n),
    UPA = seq_len(n),
    V1008 = rep(1L, n),
    V1014 = sample(1:8, n, replace = TRUE),
    V2003 = rep(1L, n),
    V2008 = as.integer(birth_days),
    V20081 = as.integer(birth_months),
    V20082 = as.integer(birth_years)
  )

  # Calculate age correctly based on survey date
  # Assume interview at middle of quarter (2nd month)
  dt[, survey_month := (Trimestre - 1L) * 3L + 2L]
  dt[, V2009 := {
    age <- Ano - V20082
    # Adjust if birthday hasn't occurred yet this year
    had_birthday <- (survey_month > V20081) |
                    (survey_month == V20081 & 15L >= V2008)  # Assume interview on 15th
    age <- age - as.integer(!had_birthday)
    pmax(0L, age)  # Ensure non-negative
  }]
  dt[, survey_month := NULL]

  dt
}


#' Create PNADC Data with Calibration Columns
#'
#' Creates test data including columns needed for weight calibration.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with calibration-ready structure
create_pnadc_for_calibration <- function(n = 100L, year = 2023L, quarter = 1L) {
  dt <- create_minimal_pnadc(n, year, quarter)

  # Add calibration columns
  # Valid Brazilian UF codes: 11-17 (North), 21-29 (Northeast), 31-35 (Southeast), 41-43 (South), 50-53 (Central-West)
  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  dt[, `:=`(
    UF = sample(valid_ufs, n, replace = TRUE),  # Brazilian state codes
    V1028 = runif(n, 500, 2000),                # Quarterly weights
    V1032 = runif(n, 500, 2000),                # Annual weights
    posest = sample(1:500, n, replace = TRUE),
    posest_sxi = sample(100:999, n, replace = TRUE)
  )]

  dt
}


#' Create Stacked Multi-Quarter PNADC Data
#'
#' Creates data spanning multiple quarters for testing cross-quarter aggregation.
#'
#' @param n_quarters Number of quarters
#' @param start_year Starting year
#' @param n_upas Number of unique UPAs
#' @return data.table with stacked quarterly data
create_stacked_pnadc <- function(n_quarters = 8L, start_year = 2022L, n_upas = 20L) {
  create_realistic_pnadc(
    n_quarters = n_quarters,
    n_upas = n_upas,
    start_year = start_year
  )
}


#' Create Monthly Population Targets for Testing
#'
#' Creates mock monthly population targets matching SIDRA format.
#'
#' @param start_yyyymm Starting month (YYYYMM format)
#' @param n_months Number of months
#' @param base_pop Base population (in thousands)
#' @return data.table with monthly population targets
create_monthly_targets <- function(start_yyyymm = 202301L,
                                    n_months = 12L,
                                    base_pop = 215000) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L

  months <- data.table::data.table(
    month_num = seq_len(n_months)
  )
  months[, `:=`(
    year = start_year + (start_month + month_num - 2L) %/% 12L,
    month = ((start_month + month_num - 2L) %% 12L) + 1L
  )]
  months[, ref_month_yyyymm := year * 100L + month]

  # Add slight growth trend
  months[, m_populacao := base_pop * (1 + 0.001 * (month_num - 1))]

  months[, .(ref_month_yyyymm, m_populacao)]
}


#' Create Mock Rolling Quarter Data for Testing
#'
#' Creates synthetic rolling quarter data matching SIDRA format.
#' Useful for testing mensalize_sidra_series() without API calls.
#'
#' @param n_months Number of months to generate
#' @param series Character vector of series names to include
#' @param start_yyyymm Starting YYYYMM (default 201201)
#' @param base_values Named list of base values per series
#' @return data.table with anomesfinaltrimmovel, mesnotrim, and series columns
create_mock_rolling_quarters <- function(n_months = 36L,
                                          series = "popocup",
                                          start_yyyymm = 201201L,
                                          base_values = NULL) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L

  dt <- data.table::data.table(month_num = seq_len(n_months))
  dt[, `:=`(
    year = start_year + (start_month + month_num - 2L) %/% 12L,
    month = ((start_month + month_num - 2L) %% 12L) + 1L
  )]
  dt[, anomesfinaltrimmovel := year * 100L + month]
  dt[, mesnotrim := ((month - 1L) %% 3L) + 1L]

  # Generate series columns with realistic rolling quarter patterns
  for (s in series) {
    base <- if (!is.null(base_values) && s %in% names(base_values)) {
      base_values[[s]]
    } else {
      100000  # default base value
    }
    # Slight trend + seasonal pattern for realism
    dt[, (s) := base * (1 + 0.001 * (month_num - 1) +
                          0.005 * sin(2 * pi * month_num / 12))]
  }

  dt[, .(anomesfinaltrimmovel, mesnotrim,
         .SD), .SDcols = series]
}


#' Generate proper YYYYMM sequence
#'
#' @param start_yyyymm Starting YYYYMM integer
#' @param n Number of months
#' @return Integer vector of YYYYMM values
generate_yyyymm_seq <- function(start_yyyymm, n) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L
  vapply(seq_len(n), function(i) {
    total_months <- start_month + i - 2L
    y <- start_year + total_months %/% 12L
    m <- (total_months %% 12L) + 1L
    y * 100L + m
  }, integer(1))
}


#' Create mock monthly population targets for tests (avoids SIDRA fetch).
#'
#' Builds a target_totals data.table by aggregating V1028 weights of the
#' input PNADC dataset across quarters and expanding the quarterly sum to
#' three months. The result has the structure expected by
#' pnadc_apply_periods() for calibration_unit = "month". For "fortnight"
#' or "week", apply derive_fortnight_population() or
#' derive_weekly_population() to the output.
#'
#' Using V1028 sums ensures the mock is realistic relative to the
#' synthetic dataset (so calibration converges and test assertions are
#' meaningful), without making any HTTP request to SIDRA.
#'
#' @param data PNADC data.table with columns Ano, Trimestre, V1028.
#' @return data.table with columns ref_month_yyyymm and m_populacao.
create_mock_pop_targets <- function(data) {
  qtr_wsum <- data[, .(qtr_wsum = sum(V1028, na.rm = TRUE)),
                    by = .(Ano, Trimestre)]
  qtr_wsum[, {
    months <- (Trimestre - 1L) * 3L + 1:3
    data.table::data.table(
      ref_month_yyyymm = Ano * 100L + months,
      m_populacao = qtr_wsum / 1000
    )
  }, by = .(Ano, Trimestre)][, .(ref_month_yyyymm, m_populacao)]
}

Try the PNADCperiods package in your browser

Any scripts or data that you put into this service are public.

PNADCperiods documentation built on April 28, 2026, 9:07 a.m.