Nothing
# Helper functions for creating realistic PNADC test data
# These functions ensure consistency between birthday and age variables
#' Create Realistic PNADC Test Data
#'
#' Generates synthetic PNADC data with internally consistent fields.
#' Age is correctly derived from birth date and survey date.
#'
#' @param n_quarters Number of quarters to generate (default 4)
#' @param n_upas Number of UPAs per quarter (default 10)
#' @param persons_per_household Average persons per household (default 3)
#' @param start_year Starting year (default 2023)
#' @param seed Random seed for reproducibility
#' @return data.table with realistic PNADC structure
create_realistic_pnadc <- function(n_quarters = 4,
n_upas = 10,
persons_per_household = 3,
start_year = 2023L,
seed = 42L) {
set.seed(seed)
# Generate quarter combinations
quarters <- data.table::data.table(
q = seq_len(n_quarters)
)
quarters[, `:=`(
Ano = start_year + (q - 1L) %/% 4L,
Trimestre = ((q - 1L) %% 4L) + 1L
)]
# Generate UPAs (each UPA has a panel group V1014)
upas <- data.table::data.table(
UPA = seq_len(n_upas),
V1014 = sample(1:8, n_upas, replace = TRUE) # Panel groups 1-8
)
# Generate households per UPA
households_per_upa <- sample(2:5, n_upas, replace = TRUE)
households <- data.table::rbindlist(lapply(seq_len(n_upas), function(u) {
data.table::data.table(
UPA = u,
V1008 = seq_len(households_per_upa[u])
)
}))
households <- merge(households, upas, by = "UPA")
# Generate persons per household
persons <- data.table::rbindlist(lapply(seq_len(nrow(households)), function(h) {
n_persons <- max(1L, rpois(1, persons_per_household))
hh <- households[h]
data.table::data.table(
UPA = hh$UPA,
V1008 = hh$V1008,
V1014 = hh$V1014,
V2003 = seq_len(n_persons)
)
}))
# Generate birth dates with realistic age distribution
# PNADC covers all ages, with typical Brazilian population distribution
n_persons <- nrow(persons)
# Generate birth years (wider range for realistic age distribution)
# Ages typically 0-100, median around 35
birth_years <- start_year - sample(0:85, n_persons, replace = TRUE,
prob = dnorm(0:85, mean = 35, sd = 20) + 0.01)
birth_months <- sample(1:12, n_persons, replace = TRUE)
birth_days <- sapply(birth_months, function(m) {
max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
sample(1:max_day, 1)
})
persons[, `:=`(
V20082 = as.integer(birth_years),
V20081 = as.integer(birth_months),
V2008 = as.integer(birth_days)
)]
# Properly create the cross-product of persons and quarters
dt <- data.table::rbindlist(lapply(seq_len(nrow(quarters)), function(q) {
qdata <- quarters[q]
persons_copy <- data.table::copy(persons)
persons_copy[, `:=`(
Ano = qdata$Ano,
Trimestre = qdata$Trimestre
)]
persons_copy
}))
# Calculate age based on survey date
# Survey is conducted in months corresponding to quarter
# Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec
# Assume interview at middle of quarter (2nd month)
dt[, survey_month := (Trimestre - 1L) * 3L + 2L]
# Calculate age correctly
dt[, V2009 := {
age <- Ano - V20082
# Adjust if birthday hasn't occurred yet this year
had_birthday <- (survey_month > V20081) |
(survey_month == V20081 & 15L >= V2008) # Assume interview on 15th
age <- age - as.integer(!had_birthday)
pmax(0L, age) # Ensure non-negative
}]
# Add some realistic NA values for unknown birthdays (~2%)
unknown_idx <- sample(seq_len(nrow(dt)), size = ceiling(nrow(dt) * 0.02))
dt[unknown_idx, `:=`(
V2008 = 99L, # PNADC code for unknown
V20081 = 99L,
V20082 = 9999L
)]
# Clean up
dt[, survey_month := NULL]
# Add weight and calibration columns (commonly needed for apply_periods tests)
valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
dt[, `:=`(
V1028 = runif(.N, 500, 2000),
UF = sample(valid_ufs, .N, replace = TRUE),
posest = sample(1:500, .N, replace = TRUE),
posest_sxi = sample(100:999, .N, replace = TRUE)
)]
# Reorder columns to match PNADC structure
data.table::setcolorder(dt, c("Ano", "Trimestre", "UPA", "V1008", "V1014", "V2003",
"V2008", "V20081", "V20082", "V2009",
"V1028", "UF", "posest", "posest_sxi"))
dt
}
#' Create Minimal Valid PNADC Data
#'
#' Creates a minimal dataset that passes validation for testing.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with minimal required columns
create_minimal_pnadc <- function(n = 10L, year = 2023L, quarter = 1L) {
# Generate birth dates
birth_years <- year - sample(18:65, n, replace = TRUE)
birth_months <- sample(1:12, n, replace = TRUE)
birth_days <- sapply(birth_months, function(m) {
max_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[m]
sample(1:max_day, 1)
})
dt <- data.table::data.table(
Ano = rep(year, n),
Trimestre = rep(quarter, n),
UPA = seq_len(n),
V1008 = rep(1L, n),
V1014 = sample(1:8, n, replace = TRUE),
V2003 = rep(1L, n),
V2008 = as.integer(birth_days),
V20081 = as.integer(birth_months),
V20082 = as.integer(birth_years)
)
# Calculate age correctly based on survey date
# Assume interview at middle of quarter (2nd month)
dt[, survey_month := (Trimestre - 1L) * 3L + 2L]
dt[, V2009 := {
age <- Ano - V20082
# Adjust if birthday hasn't occurred yet this year
had_birthday <- (survey_month > V20081) |
(survey_month == V20081 & 15L >= V2008) # Assume interview on 15th
age <- age - as.integer(!had_birthday)
pmax(0L, age) # Ensure non-negative
}]
dt[, survey_month := NULL]
dt
}
#' Create PNADC Data with Calibration Columns
#'
#' Creates test data including columns needed for weight calibration.
#'
#' @param n Number of observations
#' @param year Survey year
#' @param quarter Quarter (1-4)
#' @return data.table with calibration-ready structure
create_pnadc_for_calibration <- function(n = 100L, year = 2023L, quarter = 1L) {
dt <- create_minimal_pnadc(n, year, quarter)
# Add calibration columns
# Valid Brazilian UF codes: 11-17 (North), 21-29 (Northeast), 31-35 (Southeast), 41-43 (South), 50-53 (Central-West)
valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
dt[, `:=`(
UF = sample(valid_ufs, n, replace = TRUE), # Brazilian state codes
V1028 = runif(n, 500, 2000), # Quarterly weights
V1032 = runif(n, 500, 2000), # Annual weights
posest = sample(1:500, n, replace = TRUE),
posest_sxi = sample(100:999, n, replace = TRUE)
)]
dt
}
#' Create Stacked Multi-Quarter PNADC Data
#'
#' Creates data spanning multiple quarters for testing cross-quarter aggregation.
#'
#' @param n_quarters Number of quarters
#' @param start_year Starting year
#' @param n_upas Number of unique UPAs
#' @return data.table with stacked quarterly data
create_stacked_pnadc <- function(n_quarters = 8L, start_year = 2022L, n_upas = 20L) {
create_realistic_pnadc(
n_quarters = n_quarters,
n_upas = n_upas,
start_year = start_year
)
}
#' Create Monthly Population Targets for Testing
#'
#' Creates mock monthly population targets matching SIDRA format.
#'
#' @param start_yyyymm Starting month (YYYYMM format)
#' @param n_months Number of months
#' @param base_pop Base population (in thousands)
#' @return data.table with monthly population targets
create_monthly_targets <- function(start_yyyymm = 202301L,
n_months = 12L,
base_pop = 215000) {
start_year <- start_yyyymm %/% 100L
start_month <- start_yyyymm %% 100L
months <- data.table::data.table(
month_num = seq_len(n_months)
)
months[, `:=`(
year = start_year + (start_month + month_num - 2L) %/% 12L,
month = ((start_month + month_num - 2L) %% 12L) + 1L
)]
months[, ref_month_yyyymm := year * 100L + month]
# Add slight growth trend
months[, m_populacao := base_pop * (1 + 0.001 * (month_num - 1))]
months[, .(ref_month_yyyymm, m_populacao)]
}
#' Create Mock Rolling Quarter Data for Testing
#'
#' Creates synthetic rolling quarter data matching SIDRA format.
#' Useful for testing mensalize_sidra_series() without API calls.
#'
#' @param n_months Number of months to generate
#' @param series Character vector of series names to include
#' @param start_yyyymm Starting YYYYMM (default 201201)
#' @param base_values Named list of base values per series
#' @return data.table with anomesfinaltrimmovel, mesnotrim, and series columns
create_mock_rolling_quarters <- function(n_months = 36L,
series = "popocup",
start_yyyymm = 201201L,
base_values = NULL) {
start_year <- start_yyyymm %/% 100L
start_month <- start_yyyymm %% 100L
dt <- data.table::data.table(month_num = seq_len(n_months))
dt[, `:=`(
year = start_year + (start_month + month_num - 2L) %/% 12L,
month = ((start_month + month_num - 2L) %% 12L) + 1L
)]
dt[, anomesfinaltrimmovel := year * 100L + month]
dt[, mesnotrim := ((month - 1L) %% 3L) + 1L]
# Generate series columns with realistic rolling quarter patterns
for (s in series) {
base <- if (!is.null(base_values) && s %in% names(base_values)) {
base_values[[s]]
} else {
100000 # default base value
}
# Slight trend + seasonal pattern for realism
dt[, (s) := base * (1 + 0.001 * (month_num - 1) +
0.005 * sin(2 * pi * month_num / 12))]
}
dt[, .(anomesfinaltrimmovel, mesnotrim,
.SD), .SDcols = series]
}
#' Generate proper YYYYMM sequence
#'
#' @param start_yyyymm Starting YYYYMM integer
#' @param n Number of months
#' @return Integer vector of YYYYMM values
generate_yyyymm_seq <- function(start_yyyymm, n) {
start_year <- start_yyyymm %/% 100L
start_month <- start_yyyymm %% 100L
vapply(seq_len(n), function(i) {
total_months <- start_month + i - 2L
y <- start_year + total_months %/% 12L
m <- (total_months %% 12L) + 1L
y * 100L + m
}, integer(1))
}
#' Create mock monthly population targets for tests (avoids SIDRA fetch).
#'
#' Builds a target_totals data.table by aggregating V1028 weights of the
#' input PNADC dataset across quarters and expanding the quarterly sum to
#' three months. The result has the structure expected by
#' pnadc_apply_periods() for calibration_unit = "month". For "fortnight"
#' or "week", apply derive_fortnight_population() or
#' derive_weekly_population() to the output.
#'
#' Using V1028 sums ensures the mock is realistic relative to the
#' synthetic dataset (so calibration converges and test assertions are
#' meaningful), without making any HTTP request to SIDRA.
#'
#' @param data PNADC data.table with columns Ano, Trimestre, V1028.
#' @return data.table with columns ref_month_yyyymm and m_populacao.
create_mock_pop_targets <- function(data) {
qtr_wsum <- data[, .(qtr_wsum = sum(V1028, na.rm = TRUE)),
by = .(Ano, Trimestre)]
qtr_wsum[, {
months <- (Trimestre - 1L) * 3L + 1:3
data.table::data.table(
ref_month_yyyymm = Ano * 100L + months,
m_populacao = qtr_wsum / 1000
)
}, by = .(Ano, Trimestre)][, .(ref_month_yyyymm, m_populacao)]
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.