Nothing
#' @title Input Validation Utilities
#' @description Internal helper functions for validating PNADC input data.
#' @name utils-validation
#' @keywords internal
NULL
# ============================================================================
# IBGE PERIOD INVARIANT VALIDATION
# ============================================================================
#' Validate IBGE Period Crosswalk Invariants
#'
#' Ensures that period identification functions never produce invalid IBGE
#' calendar values. This function checks all invariants that must hold for
#' valid period assignments.
#'
#' @param crosswalk A data.table crosswalk with period columns
#' @param strict Logical. If TRUE (default), stops with an error on violations.
#' If FALSE, issues warnings and returns a validation report.
#' @param context Character string describing where validation is being called
#' from (for error messages). Default is "crosswalk".
#'
#' @return If \code{strict = TRUE}, returns invisibly if valid or stops with error.
#' If \code{strict = FALSE}, returns a list with:
#' \itemize{
#' \item \code{valid}: Logical indicating if all invariants passed
#' \item \code{violations}: Named list of violations found (empty if none)
#' }
#'
#' @details
#' The following IBGE calendar invariants are checked:
#' \itemize{
#' \item \strong{Valid ranges}: ref_week_in_quarter in [1,12] or NA,
#' ref_fortnight_in_quarter in [1,6] or NA, ref_month_in_quarter in [1,3] or NA
#' \item \strong{Nesting}: week requires fortnight, fortnight requires month
#' \item \strong{Fortnight-month consistency}: fortnights 1-2 belong to month 1,
#' 3-4 to month 2, 5-6 to month 3
#' \item \strong{Week-fortnight consistency}: weeks 1-2 belong to fortnight 1,
#' 3-4 to fortnight 2, etc.
#' }
#'
#' @keywords internal
#' @noRd
validate_period_invariants <- function(crosswalk, strict = TRUE, context = "crosswalk") {
violations <- list()
# -------------------------------------------------------------------------
# INVARIANT 1: Valid ranges
# -------------------------------------------------------------------------
# Check ref_month_in_quarter in [1,3] or NA
if ("ref_month_in_quarter" %in% names(crosswalk)) {
invalid_months <- crosswalk[!is.na(ref_month_in_quarter) &
(ref_month_in_quarter < 1L | ref_month_in_quarter > 3L)]
if (nrow(invalid_months) > 0) {
violations$invalid_month_values <- list(
count = nrow(invalid_months),
values = unique(invalid_months$ref_month_in_quarter),
message = sprintf(
"ref_month_in_quarter must be 1-3 or NA. Found %d rows with invalid values: %s",
nrow(invalid_months),
paste(unique(invalid_months$ref_month_in_quarter), collapse = ", ")
)
)
}
}
# Check ref_fortnight_in_quarter in [1,6] or NA
if ("ref_fortnight_in_quarter" %in% names(crosswalk)) {
invalid_fortnights <- crosswalk[!is.na(ref_fortnight_in_quarter) &
(ref_fortnight_in_quarter < 1L | ref_fortnight_in_quarter > 6L)]
if (nrow(invalid_fortnights) > 0) {
violations$invalid_fortnight_values <- list(
count = nrow(invalid_fortnights),
values = unique(invalid_fortnights$ref_fortnight_in_quarter),
message = sprintf(
"ref_fortnight_in_quarter must be 1-6 or NA. Found %d rows with invalid values: %s",
nrow(invalid_fortnights),
paste(unique(invalid_fortnights$ref_fortnight_in_quarter), collapse = ", ")
)
)
}
}
# Check ref_week_in_quarter in [1,12] or NA
if ("ref_week_in_quarter" %in% names(crosswalk)) {
invalid_weeks <- crosswalk[!is.na(ref_week_in_quarter) &
(ref_week_in_quarter < 1L | ref_week_in_quarter > 12L)]
if (nrow(invalid_weeks) > 0) {
violations$invalid_week_values <- list(
count = nrow(invalid_weeks),
values = unique(invalid_weeks$ref_week_in_quarter),
message = sprintf(
"ref_week_in_quarter must be 1-12 or NA. Found %d rows with invalid values: %s",
nrow(invalid_weeks),
paste(unique(invalid_weeks$ref_week_in_quarter), collapse = ", ")
)
)
}
}
# -------------------------------------------------------------------------
# INVARIANT 2: Nesting - week requires fortnight, fortnight requires month
# -------------------------------------------------------------------------
# Check: if week is determined, fortnight must be determined
if (all(c("ref_week_in_quarter", "ref_fortnight_in_quarter") %in% names(crosswalk))) {
week_without_fortnight <- crosswalk[!is.na(ref_week_in_quarter) &
is.na(ref_fortnight_in_quarter)]
if (nrow(week_without_fortnight) > 0) {
violations$week_without_fortnight <- list(
count = nrow(week_without_fortnight),
message = sprintf(
"Nesting violation: %d rows have week determined but fortnight is NA",
nrow(week_without_fortnight)
)
)
}
}
# Check: if fortnight is determined, month must be determined
if (all(c("ref_fortnight_in_quarter", "ref_month_in_quarter") %in% names(crosswalk))) {
fortnight_without_month <- crosswalk[!is.na(ref_fortnight_in_quarter) &
is.na(ref_month_in_quarter)]
if (nrow(fortnight_without_month) > 0) {
violations$fortnight_without_month <- list(
count = nrow(fortnight_without_month),
message = sprintf(
"Nesting violation: %d rows have fortnight determined but month is NA",
nrow(fortnight_without_month)
)
)
}
}
# -------------------------------------------------------------------------
# INVARIANT 3: Fortnight-month consistency
# Fortnights 1-2 belong to month 1, 3-4 to month 2, 5-6 to month 3
# -------------------------------------------------------------------------
if (all(c("ref_fortnight_in_quarter", "ref_month_in_quarter") %in% names(crosswalk))) {
# Calculate expected month from fortnight: ((fortnight - 1) %/% 2) + 1
fortnight_month_inconsistent <- crosswalk[
!is.na(ref_fortnight_in_quarter) & !is.na(ref_month_in_quarter) &
(((ref_fortnight_in_quarter - 1L) %/% 2L) + 1L) != ref_month_in_quarter
]
if (nrow(fortnight_month_inconsistent) > 0) {
violations$fortnight_month_mismatch <- list(
count = nrow(fortnight_month_inconsistent),
message = sprintf(
"Fortnight-month inconsistency: %d rows have fortnight that doesn't match month",
nrow(fortnight_month_inconsistent)
)
)
}
}
# -------------------------------------------------------------------------
# INVARIANT 4: Week-fortnight consistency
# Weeks 1-2 belong to fortnight 1, 3-4 to fortnight 2, etc.
# Formula: expected_fortnight = ((week - 1) %/% 2) + 1
# -------------------------------------------------------------------------
if (all(c("ref_week_in_quarter", "ref_fortnight_in_quarter") %in% names(crosswalk))) {
# Calculate expected fortnight from week
week_fortnight_inconsistent <- crosswalk[
!is.na(ref_week_in_quarter) & !is.na(ref_fortnight_in_quarter) &
(((ref_week_in_quarter - 1L) %/% 2L) + 1L) != ref_fortnight_in_quarter
]
if (nrow(week_fortnight_inconsistent) > 0) {
violations$week_fortnight_mismatch <- list(
count = nrow(week_fortnight_inconsistent),
message = sprintf(
"Week-fortnight inconsistency: %d rows have week that doesn't match fortnight",
nrow(week_fortnight_inconsistent)
)
)
}
}
# -------------------------------------------------------------------------
# INVARIANT 5: Week-month consistency (derived from above, but direct check)
# Weeks 1-4 belong to month 1, 5-8 to month 2, 9-12 to month 3
# -------------------------------------------------------------------------
if (all(c("ref_week_in_quarter", "ref_month_in_quarter") %in% names(crosswalk))) {
# Calculate expected month from week: ((week - 1) %/% 4) + 1
week_month_inconsistent <- crosswalk[
!is.na(ref_week_in_quarter) & !is.na(ref_month_in_quarter) &
(((ref_week_in_quarter - 1L) %/% 4L) + 1L) != ref_month_in_quarter
]
if (nrow(week_month_inconsistent) > 0) {
violations$week_month_mismatch <- list(
count = nrow(week_month_inconsistent),
message = sprintf(
"Week-month inconsistency: %d rows have week that doesn't match month",
nrow(week_month_inconsistent)
)
)
}
}
# -------------------------------------------------------------------------
# RETURN RESULT
# -------------------------------------------------------------------------
result <- list(
valid = length(violations) == 0,
violations = violations
)
if (strict && !result$valid) {
# Build error message
error_msgs <- vapply(violations, function(v) v$message, character(1))
msg <- paste0(
"IBGE period invariant violations in ", context, ":\n",
paste(" - ", error_msgs, collapse = "\n")
)
stop(msg, call. = FALSE)
}
if (!strict && !result$valid) {
# Issue warnings
for (violation in violations) {
warning(paste0(context, ": ", violation$message), call. = FALSE)
}
}
invisible(result)
}
#' Required Variables for Reference Period Identification
#'
#' Returns the minimum required column names for identifying reference periods
#' (month, fortnight, and week).
#'
#' @return Character vector of required column names
#' @keywords internal
#' @noRd
required_vars_ref_month <- function() {
# V1008 is required for week identification (household-level aggregation)
c("Ano", "Trimestre", "UPA", "V1008", "V1014", "V2008", "V20081", "V20082", "V2009")
}
#' Required Variables for Crosswalk Join Keys
#'
#' Returns the column names used as join keys in the output crosswalk.
#' The crosswalk is at household-quarter level (not person level),
#' so V2003 (person sequence) is NOT included.
#'
#' @return Character vector of join key column names
#' @keywords internal
#' @noRd
join_key_vars <- function() {
# Note: Crosswalk is at household level (V1008), not person level (V2003)
# All persons in a household share the same reference period
c("Ano", "Trimestre", "UPA", "V1008", "V1014")
}
#' Required Variables for Weight Calibration
#'
#' Returns the additional column names required for computing calibrated weights.
#'
#' Note: Weight calibration operates at **person level** (not household level)
#' because calibration cells are based on individual attributes like age (V2009).
#' Although the crosswalk assigns reference periods at household level (all
#' household members share the same reference period), each person receives
#' a potentially different calibrated weight based on their age group.
#'
#' @return Character vector of required column names
#' @keywords internal
#' @noRd
required_vars_weights <- function() {
c(
# Survey design - V1028 (quarterly) or V1032 (annual) weights
"V1028", "UF", "posest", "posest_sxi"
# V2009 (age) is already required for ref_month identification
# and is also used to define calibration cells (person-level)
)
}
#' Validate PNADC Input Data
#'
#' Checks that input data has required columns for the specified processing.
#'
#' @param data A data.frame or data.table with PNADC microdata
#' @param check_weights Logical. If TRUE, also check for weight-related variables.
#' @param stop_on_error Logical. If TRUE, stops with an error. If FALSE, returns
#' a validation report list.
#'
#' @return If \code{stop_on_error = TRUE}, returns invisibly if valid or stops with error.
#' If \code{stop_on_error = FALSE}, returns a list with:
#' \itemize{
#' \item \code{valid}: Logical indicating if data passed all validations
#' \item \code{issues}: Named list of validation issues found (empty if none)
#' \item \code{n_rows}: Number of rows in input data
#' \item \code{n_cols}: Number of columns in input data
#' \item \code{join_keys_available}: Character vector of available join key columns
#' }
#'
#' @details
#' The function performs the following validations:
#' \itemize{
#' \item Checks for required columns for reference period identification:
#' \code{Ano}, \code{Trimestre}, \code{UPA}, \code{V1008}, \code{V1014},
#' \code{V2008}, \code{V20081}, \code{V20082}, \code{V2009}
#' \item Validates year range (2012-2100 for PNADC coverage)
#' \item Validates quarter values (must be 1-4)
#' \item Validates birth day values (must be 1-31 or 99 for unknown)
#' \item Validates birth month values (must be 1-12 or 99 for unknown)
#' \item Warns about unusual ages (outside 0-130 range)
#' \item If \code{check_weights = TRUE}, also validates weight-related columns:
#' \code{V1028}, \code{UF}, \code{posest}, \code{posest_sxi}
#' }
#'
#' @seealso \code{\link{pnadc_identify_periods}} which calls this function
#' internally to validate input data.
#'
#' @examples
#' # Minimal valid data (all 9 required columns)
#' sample_data <- data.frame(
#' Ano = 2023L, Trimestre = 1L, UPA = 110000001L,
#' V1008 = 1L, V1014 = 1L,
#' V2008 = 15L, V20081 = 3L, V20082 = 1990L, V2009 = 33L
#' )
#' validate_pnadc(sample_data)
#'
#' # Data with missing columns returns issues (non-stop mode)
#' incomplete_data <- data.frame(Ano = 2023L, Trimestre = 1L)
#' result <- validate_pnadc(incomplete_data, stop_on_error = FALSE)
#' result$valid # FALSE
#' result$issues # lists missing columns
#'
#' @export
validate_pnadc <- function(data, check_weights = FALSE, stop_on_error = TRUE) {
checkmate::assert_data_frame(data, min.rows = 1)
issues <- list()
# Check required columns for reference month
required <- required_vars_ref_month()
missing <- setdiff(required, names(data))
if (length(missing) > 0) {
issues$missing_ref_month <- missing
}
# Check weight-related columns if requested
if (check_weights) {
required_wt <- required_vars_weights()
missing_wt <- setdiff(required_wt, names(data))
if (length(missing_wt) > 0) {
issues$missing_weights <- missing_wt
}
}
# Check join key columns (V1008, V2003 may be optional in some cases)
join_keys <- join_key_vars()
join_available <- intersect(join_keys, names(data))
# Validate data types and ranges
# Note: PNADC data may have character columns that need conversion
if ("Ano" %in% names(data)) {
years <- suppressWarnings(as.integer(unique(data$Ano)))
years <- years[!is.na(years)]
invalid_years <- years[years < 2012 | years > 2100]
if (length(invalid_years) > 0) {
issues$invalid_years <- invalid_years
}
}
if ("Trimestre" %in% names(data)) {
quarters <- suppressWarnings(as.integer(unique(data$Trimestre)))
quarters <- quarters[!is.na(quarters)]
invalid_quarters <- quarters[!quarters %in% 1:4]
if (length(invalid_quarters) > 0) {
issues$invalid_quarters <- invalid_quarters
}
}
if ("V2008" %in% names(data)) {
days <- suppressWarnings(as.integer(unique(data$V2008)))
# Exclude NA and special codes (99 = unknown)
days <- days[!is.na(days) & days != 99]
invalid_days <- days[days < 1 | days > 31]
if (length(invalid_days) > 0) {
issues$invalid_birth_days <- invalid_days
}
}
if ("V20081" %in% names(data)) {
months <- suppressWarnings(as.integer(unique(data$V20081)))
# Exclude NA and special codes (99 = unknown)
months <- months[!is.na(months) & months != 99]
invalid_months <- months[months < 1 | months > 12]
if (length(invalid_months) > 0) {
issues$invalid_birth_months <- invalid_months
}
}
if ("V2009" %in% names(data)) {
ages <- suppressWarnings(as.numeric(unique(data$V2009)))
ages <- ages[!is.na(ages)]
# Allow ages up to 130 (rare but possible; higher values are likely data errors)
# These extreme values will be handled gracefully during processing
invalid_ages <- ages[ages < 0 | ages > 130]
if (length(invalid_ages) > 0) {
issues$warning_ages <- paste("Unusual ages found:", paste(head(invalid_ages, 5), collapse = ", "),
"- these will be processed but may affect results")
}
}
# Prepare result
result <- list(
valid = length(issues) == 0 || all(grepl("^warning", names(issues))),
issues = issues,
n_rows = nrow(data),
n_cols = ncol(data),
join_keys_available = join_available
)
if (stop_on_error && !result$valid) {
error_issues <- issues[!grepl("^warning", names(issues))]
msg <- paste0(
"PNADC data validation failed:\n",
paste(
vapply(names(error_issues), function(nm) {
paste0(" - ", nm, ": ", paste(error_issues[[nm]], collapse = ", "))
}, character(1)),
collapse = "\n"
)
)
stop(msg, call. = FALSE)
}
if (stop_on_error) {
# Print warnings if any
warnings <- issues[grepl("^warning", names(issues))]
if (length(warnings) > 0) {
for (nm in names(warnings)) {
warning(warnings[[nm]], call. = FALSE)
}
}
invisible(result)
} else {
result
}
}
#' Validate Monthly Totals Data
#'
#' Checks that monthly totals data has required structure.
#'
#' @param monthly_totals A data.frame with monthly population totals
#' @param stop_on_error Logical. If TRUE, stops with an error.
#'
#' @return Validation result or stops with error
#' @keywords internal
#' @noRd
validate_monthly_totals <- function(monthly_totals, stop_on_error = TRUE) {
checkmate::assert_data_frame(monthly_totals, min.rows = 1)
issues <- list()
# Check for required column (either ref_month_yyyymm or anomesexato)
has_yyyymm <- "ref_month_yyyymm" %in% names(monthly_totals)
has_anomesexato <- "anomesexato" %in% names(monthly_totals)
if (!has_yyyymm && !has_anomesexato) {
issues$missing_date <- "Need either 'ref_month_yyyymm' or 'anomesexato' column"
}
# Check for population column
has_pop <- "m_populacao" %in% names(monthly_totals)
if (!has_pop) {
issues$missing_population <- "Need 'm_populacao' column with monthly population totals"
}
if (stop_on_error && length(issues) > 0) {
msg <- paste0(
"Monthly totals validation failed:\n",
paste(
vapply(names(issues), function(nm) {
paste0(" - ", nm, ": ", issues[[nm]])
}, character(1)),
collapse = "\n"
)
)
stop(msg, call. = FALSE)
}
invisible(length(issues) == 0)
}
#' Check for Data.table
#'
#' Ensures data is a data.table, converting if necessary.
#'
#' @param data A data.frame or data.table
#' @param copy Logical. If TRUE, always returns a copy. If FALSE, may return
#' the original object if already a data.table.
#' @return data.table
#' @keywords internal
#' @noRd
ensure_data_table <- function(data, copy = FALSE) {
if (data.table::is.data.table(data)) {
if (copy) {
data.table::copy(data)
} else {
data
}
} else {
data.table::as.data.table(data)
}
}
#' Subset Data to Required Columns and Copy
#'
#' Instead of copying the entire data.table (potentially 50+ columns),
#' this function first subsets to only the required columns, then copies.
#'
#' @param data A data.frame or data.table
#' @param required_cols Character vector of required column names
#' @param optional_cols Character vector of optional column names (included if present)
#' @return data.table with only the specified columns (always a copy)
#' @keywords internal
#' @noRd
subset_and_copy <- function(data, required_cols, optional_cols = NULL) {
# Check required columns exist
missing <- setdiff(required_cols, names(data))
if (length(missing) > 0) {
stop("Missing required columns: ", paste(missing, collapse = ", "), call. = FALSE)
}
# Build column list: required + available optional
cols_to_keep <- required_cols
if (!is.null(optional_cols)) {
cols_to_keep <- unique(c(cols_to_keep, intersect(optional_cols, names(data))))
}
# Subset and copy in one operation
# Use .SDcols to avoid R CMD check NOTE about ..cols_to_keep
if (data.table::is.data.table(data)) {
data.table::copy(data[, .SD, .SDcols = cols_to_keep])
} else {
data.table::as.data.table(data[, cols_to_keep, drop = FALSE])
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.