Nothing
#' @title Fetch Monthly Population from SIDRA
#' @description Functions to download and transform IBGE's population estimates
#' from SIDRA API for use in monthly weight calibration.
#' @name fetch-sidra-population
#' @keywords internal
NULL
# ============================================================================
# NOTE: Caching infrastructure is in fetch-sidra-series.R (.sidra_cache)
# This file uses the unified cache via .is_cache_valid(), .get_cache(), .set_cache()
# ============================================================================
#' Fetch Monthly Population from SIDRA
#'
#' Downloads population estimates from IBGE SIDRA API (table 6022) and
#' transforms from moving-quarter to exact monthly values.
#'
#' @param start_yyyymm Integer. First month to include (YYYYMM format).
#' If NULL, returns all available months.
#' @param end_yyyymm Integer. Last month to include (YYYYMM format).
#' If NULL, returns all available months.
#' @param verbose Logical. Print progress messages? Default TRUE.
#' @param use_cache Logical. If TRUE, uses cached data if available and not
#' expired. Default FALSE (always fetch fresh data for consistency).
#' Set to TRUE for faster repeated calls during development.
#' @param cache_max_age_hours Numeric. Maximum cache age in hours before
#' automatic expiration when use_cache=TRUE. Default 24 hours.
#'
#' @return A data.table with columns:
#' \itemize{
#' \item \code{ref_month_yyyymm}: Integer in YYYYMM format
#' \item \code{m_populacao}: Monthly population in thousands
#' }
#' Returns \code{NULL} invisibly with an informative message if the SIDRA
#' API is unreachable (per CRAN policy on Internet resources).
#'
#' @details
#' SIDRA table 6022 provides moving-quarter population estimates. Each value
#' represents the 3-month average centered on the middle month. For example,
#' the value for code 201203 (quarter ending March 2012) represents the
#' population for February 2012.
#'
#' This function:
#' \enumerate{
#' \item Fetches raw moving-quarter data from SIDRA
#' \item Transforms to exact monthly values by aligning with middle months
#' \item Extrapolates boundary months (first and last) using quadratic regression
#' }
#'
#' The extrapolation uses quadratic regression on population differences to
#' estimate the first month (Jan 2012) and the most recent month.
#'
#' @section Dependencies:
#' This function requires the \code{sidrar} package for API access.
#' Install with: \code{install.packages("sidrar")}
#'
#' @examples
#' \donttest{
#' pop <- fetch_monthly_population()
#'
#' pop <- fetch_monthly_population(201301, 201912)
#' }
#'
#' @seealso \code{\link{pnadc_apply_periods}} which uses this function when
#' \code{calibrate = TRUE}
#'
#' @export
fetch_monthly_population <- function(start_yyyymm = NULL,
end_yyyymm = NULL,
verbose = TRUE,
use_cache = FALSE,
cache_max_age_hours = 24) {
# OPTIMIZATION: Check cache first to avoid repeated API calls
if (use_cache && .is_cache_valid("population", max_age_hours = cache_max_age_hours)) {
if (verbose) message(" Using cached population data...")
dt <- .get_cache("population")
# Filter to requested date range if specified
if (!is.null(start_yyyymm)) {
dt <- dt[ref_month_yyyymm >= start_yyyymm]
}
if (!is.null(end_yyyymm)) {
dt <- dt[ref_month_yyyymm <= end_yyyymm]
}
if (verbose) {
message(" Population data: ",
min(dt$ref_month_yyyymm), " to ", max(dt$ref_month_yyyymm),
" (", nrow(dt), " months, from cache)")
}
return(dt)
}
# Check for sidrar package
if (!requireNamespace("sidrar", quietly = TRUE)) {
stop(
"Package 'sidrar' is required for fetching population from SIDRA.\n",
"Install with: install.packages('sidrar')",
call. = FALSE
)
}
if (verbose) message(" Fetching population from SIDRA API (table 6022)...")
# Fetch from SIDRA
# Table 6022: Population estimates from PNADC
# Variable 606: Population (in thousands)
# n1/all: National level
# p/all: All periods
#
# CRAN policy: Internet resources must fail gracefully with an informative
# message (no warning, no error). Return NULL invisibly so callers can
# detect failure without check()-time errors.
raw <- tryCatch({
# suppressMessages to hide sidrar's "All others arguments are desconsidered when 'api' is informed"
suppressMessages(sidrar::get_sidra(api = "/t/6022/n1/all/v/606/p/all"))
}, error = function(e) {
message(
"fetch_monthly_population: failed to fetch from SIDRA API. ",
"Check internet connection or try again later. ",
"Error: ", conditionMessage(e)
)
NULL
})
if (is.null(raw)) {
return(invisible(NULL))
}
if (verbose) message(" Transforming moving-quarter to exact months...")
# Convert to data.table and extract relevant columns
dt <- data.table::as.data.table(raw)
# The column name may vary; find the moving quarter code column.
# CRAN policy: if the upstream SIDRA response schema has changed (no
# matching column), fail gracefully with a message rather than stop().
code_col <- grep("Trimestre.*vel.*digo|trimestre.*vel.*digo",
names(dt), value = TRUE, ignore.case = TRUE)
if (length(code_col) == 0) {
# Try alternative pattern
code_col <- grep("M.*vel.*C.*digo", names(dt), value = TRUE, ignore.case = TRUE)
}
if (length(code_col) == 0) {
message(
"fetch_monthly_population: SIDRA response schema appears to have ",
"changed (could not find moving quarter code column). Returning NULL."
)
return(invisible(NULL))
}
code_col <- code_col[1]
dt <- dt[, .(
anomesfinaltrimmovel = as.integer(get(code_col)),
populacao = as.numeric(Valor)
)]
# Remove any rows with NA codes
dt <- dt[!is.na(anomesfinaltrimmovel)]
# Sort by moving quarter code
data.table::setorder(dt, anomesfinaltrimmovel)
# Transform moving quarter to exact month
dt <- transform_moving_quarter_to_monthly(dt, verbose = verbose)
# Apply quadratic extrapolation for boundary months
if (verbose) message(" Extrapolating boundary months...")
dt <- extrapolate_boundary_months(dt)
# Rename to standard output column name
data.table::setnames(dt, "anomesexato", "ref_month_yyyymm")
# Keep only final columns
dt <- dt[, .(ref_month_yyyymm, m_populacao)]
# OPTIMIZATION: Store FULL unfiltered data in cache for future calls
if (use_cache) {
.set_cache("population", dt)
}
# Apply date filters AFTER caching (cache stores full data, then we filter)
if (!is.null(start_yyyymm)) {
dt <- dt[ref_month_yyyymm >= start_yyyymm]
}
if (!is.null(end_yyyymm)) {
dt <- dt[ref_month_yyyymm <= end_yyyymm]
}
if (verbose) {
message(" Population data: ",
min(dt$ref_month_yyyymm), " to ", max(dt$ref_month_yyyymm),
" (", nrow(dt), " months)")
}
dt
}
#' Transform Moving Quarter to Monthly Population
#'
#' Internal function that transforms SIDRA moving-quarter population values
#' to exact monthly values.
#'
#' @param dt data.table with columns anomesfinaltrimmovel and populacao
#' @param verbose Logical. Print messages?
#' @return data.table with columns anomesexato and m_populacao
#' @keywords internal
#' @noRd
transform_moving_quarter_to_monthly <- function(dt, verbose = FALSE) {
# The moving quarter code (e.g., 201203) represents the ENDING month
# of a 3-month window. The population value is for the MIDDLE month.
#
# Example:
# Code 201203 = Jan+Feb+Mar window = Feb 2012 population
# Code 201204 = Feb+Mar+Apr window = Mar 2012 population
#
# To get exact monthly values:
# 1. Add dummy rows for the first two months (Jan and Feb of first year)
# 2. Sort by date
# 3. Shift: m_populacao[n] = populacao[n+1]
#
# After this shift:
# anomesexato=201201 gets populacao from row with code=201202 -> NA
# anomesexato=201202 gets populacao from row with code=201203 -> Feb value
# anomesexato=201203 gets populacao from row with code=201204 -> Mar value
# Get the first moving quarter code (e.g., 201203)
first_code <- min(dt$anomesfinaltrimmovel)
year_start <- first_code %/% 100L
# Add two dummy rows for Jan and Feb of the first year
# These will have populacao = NA (need extrapolation later)
dummy_rows <- data.table::data.table(
anomesfinaltrimmovel = c(year_start * 100L + 1L, year_start * 100L + 2L),
populacao = NA_real_
)
dt <- data.table::rbindlist(list(dummy_rows, dt))
# Create anomesexato (exact month) = same as anomesfinaltrimmovel for now
dt[, anomesexato := anomesfinaltrimmovel]
# Sort by exact month
data.table::setorder(dt, anomesexato)
# Shift: m_populacao gets value from NEXT row's populacao
# This aligns the moving quarter value with its middle month
# shift(x, n=1, type="lead") gets the value from the next row
dt[, m_populacao := data.table::shift(populacao, n = 1L, type = "lead")]
# Clean up
dt[, populacao := NULL]
dt[, anomesfinaltrimmovel := NULL]
dt
}
#' Extrapolate Boundary Months Using Quadratic Regression
#'
#' Internal function that fills in NA values for the first and last months
#' of the population series using quadratic extrapolation on differences.
#'
#' @param dt data.table with columns anomesexato and m_populacao
#' @return data.table with extrapolated values for boundary months
#' @keywords internal
#' @noRd
extrapolate_boundary_months <- function(dt) {
n <- nrow(dt)
# Create row indices for regression
dt[, row_num := .I]
dt[, row_num2 := row_num^2]
# First differences of population
dt[, d_pop := m_populacao - data.table::shift(m_populacao, 1L)]
# Extrapolate FIRST month (row 1) using quadratic regression on first 26 observations
# This fills in Jan 2012 which has no moving quarter centered on it
if (n >= 26 && is.na(dt[1, m_populacao])) {
# Fit quadratic model on first 26 rows (excluding row 1 which has NA d_pop)
fit_data <- dt[2:26]
if (sum(!is.na(fit_data$d_pop)) >= 3) {
fit_start <- stats::lm(d_pop ~ row_num + row_num2, data = fit_data)
predicted_diff_2 <- stats::predict(fit_start, newdata = dt[2])
# Extrapolate: value[1] = value[2] - predicted_diff[2]
dt[1, m_populacao := dt[2, m_populacao] - round(predicted_diff_2)]
}
}
# Extrapolate LAST month (row n) using quadratic regression on last 25 observations
# This fills in the final month which has no moving quarter centered on it
if (n >= 25 && is.na(dt[n, m_populacao])) {
# Fit quadratic model on last 25 rows (excluding row n which has NA)
start_row <- max(1, n - 24)
fit_data <- dt[start_row:(n-1)]
if (sum(!is.na(fit_data$d_pop)) >= 3) {
fit_end <- stats::lm(d_pop ~ row_num + row_num2, data = fit_data)
predicted_diff_n <- stats::predict(fit_end, newdata = dt[n])
# Extrapolate: value[n] = value[n-1] + predicted_diff[n]
dt[n, m_populacao := dt[n-1, m_populacao] + round(predicted_diff_n)]
}
}
# Clean up temporary columns
dt[, c("row_num", "row_num2", "d_pop") := NULL]
dt
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.