helper-test-data.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

# Helper functions for creating realistic PNADC test data
# These functions ensure consistency between birthday and age variables

#' Create Realistic PNADC Test Data
#'
#' Generates synthetic PNADC data with internally consistent fields.
#' Age is correctly derived from birth date and survey date.
#'
#' @param n_quarters Number of quarters to generate (default 4)
#' @param n_upas Number of UPAs per quarter (default 10)
#' @param persons_per_household Average persons per household (default 3)
#' @param start_year Starting year (default 2023)
#' @param seed Random seed for reproducibility
#' @return data.table with realistic PNADC structure
create_realistic_pnadc <- function(n_quarters = 4,
                                    n_upas = 10,
                                    persons_per_household = 3,
                                    start_year = 2023L,
                                    seed = 42L) {
  set.seed(seed)

  # Generate quarter combinations
  quarters <- data.table::data.table(
    q = seq_len(n_quarters)
  )
  quarters[, `:=`(
    Ano = start_year + (q - 1L) %/% 4L,
    Trimestre = ((q - 1L) %% 4L) + 1L
  )]

  # Generate UPAs (each UPA has a panel group V1014)
  upas <- data.table::data.table(
    UPA = seq_len(n_upas),
    V1014 = sample(1:8, n_upas, replace = TRUE)  # Panel groups 1-8
  )

  # Generate households per UPA
  households_per_upa <- sample(2:5, n_upas, replace = TRUE)
  households <- data.table::rbindlist(lapply(seq_len(n_upas), function(u) {
    data.table::data.table(
      UPA = u,
      V1008 = seq_len(households_per_upa[u])
    )
  }))
  households <- merge(households, upas, by = "UPA")

  # Generate persons per household
  persons <- data.table::rbindlist(lapply(seq_len(nrow(households)), function(h) {
    n_persons <- max(1L, rpois(1, persons_per_household))
    hh <- households[h]
    data.table::data.table(
      UPA = hh$UPA,
      V1008 = hh$V1008,
      V1014 = hh$V1014,
      V2003 = seq_len(n_persons)
    )
  }))

  # Generate birth dates with realistic age distribution
  # PNADC covers all ages, with typical Brazilian population distribution
  n_persons <- nrow(persons)

  # Generate birth years (wider range for realistic age distribution)
  # Ages typically 0-100, median around 35
  birth_years <- start_year - sample(0:85, n_persons, replace = TRUE,
                                      prob = dnorm(0:85, mean = 35, sd = 20) + 0.01)
  birth_months <- sample(1:12, n_persons, replace = TRUE)
  birth_days <- sapply(birth_months, function(m) {
    max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
    sample(1:max_day, 1)
  })

  persons[, `:=`(
    V20082 = as.integer(birth_years),
    V20081 = as.integer(birth_months),
    V2008 = as.integer(birth_days)
  )]

  # Properly create the cross-product of persons and quarters
  dt <- data.table::rbindlist(lapply(seq_len(nrow(quarters)), function(q) {
    qdata <- quarters[q]
    persons_copy <- data.table::copy(persons)
    persons_copy[, `:=`(
      Ano = qdata$Ano,
      Trimestre = qdata$Trimestre
    )]
    persons_copy
  }))

  # Calculate age based on survey date
  # Survey is conducted in months corresponding to quarter
  # Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec
  # Assume interview at middle of quarter (2nd month)
  dt[, survey_month := (Trimestre - 1L) * 3L + 2L]

  # Calculate age correctly
  dt[, V2009 := {
    age <- Ano - V20082
    # Adjust if birthday hasn't occurred yet this year
    had_birthday <- (survey_month > V20081) |
                    (survey_month == V20081 & 15L >= V2008)  # Assume interview on 15th
    age <- age - as.integer(!had_birthday)
    pmax(0L, age)  # Ensure non-negative
  }]

  # Add some realistic NA values for unknown birthdays (~2%)
  unknown_idx <- sample(seq_len(nrow(dt)), size = ceiling(nrow(dt) * 0.02))
  dt[unknown_idx, `:=`(
    V2008 = 99L,    # PNADC code for unknown
    V20081 = 99L,
    V20082 = 9999L
  )]

  # Clean up
  dt[, survey_month := NULL]

  # Add weight and calibration columns (commonly needed for apply_periods tests)
  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  dt[, `:=`(
    V1028 = runif(.N, 500, 2000),
    UF = sample(valid_ufs, .N, replace = TRUE),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # Reorder columns to match PNADC structure
  data.table::setcolorder(dt, c("Ano", "Trimestre", "UPA", "V1008", "V1014", "V2003",
                                 "V2008", "V20081", "V20082", "V2009",
                                 "V1028", "UF", "posest", "posest_sxi"))

  dt
}


#' Create Minimal Valid PNADC Data
#'
#' Creates a minimal dataset that passes validation for testing.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with minimal required columns
create_minimal_pnadc <- function(n = 10L, year = 2023L, quarter = 1L) {
  # Generate birth dates
  birth_years <- year - sample(18:65, n, replace = TRUE)
  birth_months <- sample(1:12, n, replace = TRUE)
  birth_days <- sapply(birth_months, function(m) {
    max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
    sample(1:max_day, 1)
  })

  dt <- data.table::data.table(
    Ano = rep(year, n),
    Trimestre = rep(quarter, n),
    UPA = seq_len(n),
    V1008 = rep(1L, n),
    V1014 = sample(1:8, n, replace = TRUE),
    V2003 = rep(1L, n),
    V2008 = as.integer(birth_days),
    V20081 = as.integer(birth_months),
    V20082 = as.integer(birth_years)
  )

  # Calculate age correctly based on survey date
  # Assume interview at middle of quarter (2nd month)
  dt[, survey_month := (Trimestre - 1L) * 3L + 2L]
  dt[, V2009 := {
    age <- Ano - V20082
    # Adjust if birthday hasn't occurred yet this year
    had_birthday <- (survey_month > V20081) |
                    (survey_month == V20081 & 15L >= V2008)  # Assume interview on 15th
    age <- age - as.integer(!had_birthday)
    pmax(0L, age)  # Ensure non-negative
  }]
  dt[, survey_month := NULL]

  dt
}


#' Create PNADC Data with Calibration Columns
#'
#' Creates test data including columns needed for weight calibration.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with calibration-ready structure
create_pnadc_for_calibration <- function(n = 100L, year = 2023L, quarter = 1L) {
  dt <- create_minimal_pnadc(n, year, quarter)

  # Add calibration columns
  # Valid Brazilian UF codes: 11-17 (North), 21-29 (Northeast), 31-35 (Southeast), 41-43 (South), 50-53 (Central-West)
  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  dt[, `:=`(
    UF = sample(valid_ufs, n, replace = TRUE),  # Brazilian state codes
    V1028 = runif(n, 500, 2000),                # Quarterly weights
    V1032 = runif(n, 500, 2000),                # Annual weights
    posest = sample(1:500, n, replace = TRUE),
    posest_sxi = sample(100:999, n, replace = TRUE)
  )]

  dt
}


#' Create Stacked Multi-Quarter PNADC Data
#'
#' Creates data spanning multiple quarters for testing cross-quarter aggregation.
#'
#' @param n_quarters Number of quarters
#' @param start_year Starting year
#' @param n_upas Number of unique UPAs
#' @return data.table with stacked quarterly data
create_stacked_pnadc <- function(n_quarters = 8L, start_year = 2022L, n_upas = 20L) {
  create_realistic_pnadc(
    n_quarters = n_quarters,
    n_upas = n_upas,
    start_year = start_year
  )
}


#' Create Monthly Population Targets for Testing
#'
#' Creates mock monthly population targets matching SIDRA format.
#'
#' @param start_yyyymm Starting month (YYYYMM format)
#' @param n_months Number of months
#' @param base_pop Base population (in thousands)
#' @return data.table with monthly population targets
create_monthly_targets <- function(start_yyyymm = 202301L,
                                    n_months = 12L,
                                    base_pop = 215000) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L

  months <- data.table::data.table(
    month_num = seq_len(n_months)
  )
  months[, `:=`(
    year = start_year + (start_month + month_num - 2L) %/% 12L,
    month = ((start_month + month_num - 2L) %% 12L) + 1L
  )]
  months[, ref_month_yyyymm := year * 100L + month]

  # Add slight growth trend
  months[, m_populacao := base_pop * (1 + 0.001 * (month_num - 1))]

  months[, .(ref_month_yyyymm, m_populacao)]
}


#' Create Mock Rolling Quarter Data for Testing
#'
#' Creates synthetic rolling quarter data matching SIDRA format.
#' Useful for testing mensalize_sidra_series() without API calls.
#'
#' @param n_months Number of months to generate
#' @param series Character vector of series names to include
#' @param start_yyyymm Starting YYYYMM (default 201201)
#' @param base_values Named list of base values per series
#' @return data.table with anomesfinaltrimmovel, mesnotrim, and series columns
create_mock_rolling_quarters <- function(n_months = 36L,
                                          series = "popocup",
                                          start_yyyymm = 201201L,
                                          base_values = NULL) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L

  dt <- data.table::data.table(month_num = seq_len(n_months))
  dt[, `:=`(
    year = start_year + (start_month + month_num - 2L) %/% 12L,
    month = ((start_month + month_num - 2L) %% 12L) + 1L
  )]
  dt[, anomesfinaltrimmovel := year * 100L + month]
  dt[, mesnotrim := ((month - 1L) %% 3L) + 1L]

  # Generate series columns with realistic rolling quarter patterns
  for (s in series) {
    base <- if (!is.null(base_values) && s %in% names(base_values)) {
      base_values[[s]]
    } else {
      100000  # default base value
    }
    # Slight trend + seasonal pattern for realism
    dt[, (s) := base * (1 + 0.001 * (month_num - 1) +
                          0.005 * sin(2 * pi * month_num / 12))]
  }

  dt[, .(anomesfinaltrimmovel, mesnotrim,
         .SD), .SDcols = series]
}


#' Generate proper YYYYMM sequence
#'
#' @param start_yyyymm Starting YYYYMM integer
#' @param n Number of months
#' @return Integer vector of YYYYMM values
generate_yyyymm_seq <- function(start_yyyymm, n) {
  start_year <- start_yyyymm %/% 100L
  start_month <- start_yyyymm %% 100L
  vapply(seq_len(n), function(i) {
    total_months <- start_month + i - 2L
    y <- start_year + total_months %/% 12L
    m <- (total_months %% 12L) + 1L
    y * 100L + m
  }, integer(1))
}


#' Create mock monthly population targets for tests (avoids SIDRA fetch).
#'
#' Builds a target_totals data.table by aggregating V1028 weights of the
#' input PNADC dataset across quarters and expanding the quarterly sum to
#' three months. The result has the structure expected by
#' pnadc_apply_periods() for calibration_unit = "month". For "fortnight"
#' or "week", apply derive_fortnight_population() or
#' derive_weekly_population() to the output.
#'
#' Using V1028 sums ensures the mock is realistic relative to the
#' synthetic dataset (so calibration converges and test assertions are
#' meaningful), without making any HTTP request to SIDRA.
#'
#' @param data PNADC data.table with columns Ano, Trimestre, V1028.
#' @return data.table with columns ref_month_yyyymm and m_populacao.
create_mock_pop_targets <- function(data) {
  qtr_wsum <- data[, .(qtr_wsum = sum(V1028, na.rm = TRUE)),
                    by = .(Ano, Trimestre)]
  qtr_wsum[, {
    months <- (Trimestre - 1L) * 3L + 1:3
    data.table::data.table(
      ref_month_yyyymm = Ano * 100L + months,
      m_populacao = qtr_wsum / 1000
    )
  }, by = .(Ano, Trimestre)][, .(ref_month_yyyymm, m_populacao)]
}
Any scripts or data that you put into this service are public.
PNADCperiods documentation built on April 28, 2026, 9:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PNADCperiods
Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/helper-test-data.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

Try the PNADCperiods package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PNADCperiods Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/helper-test-data.R In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

Try the PNADCperiods package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PNADCperiods
Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/helper-test-data.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data