test-integration-pipeline.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

# Integration tests for end-to-end pipelines
# These tests verify the complete workflow: identify → apply → calibrate

# =============================================================================
# BASIC END-TO-END PIPELINE TESTS
# =============================================================================

test_that("full pipeline works end-to-end with monthly calibration", {
  # 1. Setup: Create realistic multi-quarter data
  set.seed(900)
  data <- create_realistic_pnadc(n_quarters = 4, n_upas = 30)

  # Add calibration columns
  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Full pipeline
  # Step 1: Identify periods
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  # Step 2: Apply and calibrate (mock targets, no SIDRA)
  result <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = create_mock_pop_targets(data),
    smooth = FALSE,
    verbose = FALSE
  )

  # 3. Verify: Basic structure
  expect_true(is.data.frame(result))
  expect_equal(nrow(result), nrow(data))
  expect_true("weight_monthly" %in% names(result))
  expect_true("determined_month" %in% names(result))

  # 4. Verify: Weights are valid (not sum preservation)
  determined <- result[determined_month == TRUE]
  expect_true(all(determined$weight_monthly > 0),
              label = "Pipeline produces positive weights")
  expect_false(any(is.na(determined$weight_monthly)),
               label = "No NA weights for determined observations")

  # 5. Verify: Determination rates are reasonable
  # With 4 quarters, expect >70% month determination
  det_rate <- result[, mean(determined_month, na.rm = TRUE)]
  expect_true(det_rate > 0.70,
              label = paste0("Month determination rate (", round(det_rate, 3),
                             ") should be >70% with 4 quarters"))
})


test_that("full pipeline works with fortnight calibration", {
  # 1. Setup: Create data
  set.seed(901)
  data <- create_realistic_pnadc(n_quarters = 4, n_upas = 25)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Full pipeline with fortnight (mock targets, no SIDRA)
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  result <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "fortnight",
    target_totals = PNADCperiods:::derive_fortnight_population(
      create_mock_pop_targets(data)
    ),
    smooth = FALSE,
    verbose = FALSE
  )

  # 3. Verify: Structure
  expect_true("weight_fortnight" %in% names(result))
  expect_true("determined_fortnight" %in% names(result))

  # 4. Verify: Each fortnight's weights sum to monthly total
  # Use custom targets scaled to test data's quarterly V1028 sum so calibration converges
  qtr_wsum <- data[, .(qtr_wsum = sum(V1028, na.rm = TRUE)),
                  by = .(Ano, Trimestre)]
  targets <- qtr_wsum[, {
    months <- (Trimestre - 1L) * 3L + 1:3
    data.table::data.table(
      ref_month_yyyymm = Ano * 100L + months,
      m_populacao = qtr_wsum / 1000
    )
  }, by = .(Ano, Trimestre)][, .(ref_month_yyyymm, m_populacao)]

  fn_targets <- PNADCperiods:::derive_fortnight_population(targets)
  result_fn <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "fortnight",
    target_totals = fn_targets,
    smooth = FALSE,
    verbose = FALSE
  )
  result_mo <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = targets,
    smooth = FALSE,
    verbose = FALSE
  )

  t_fortnight <- result_fn[!is.na(ref_fortnight_in_month), .(
    N_fn = sum(weight_fortnight, na.rm = TRUE)
  ), by = .(Ano, ref_month_in_year, ref_fortnight_in_month)]

  t_month <- result_mo[!is.na(ref_month_in_quarter), .(
    N_mo = sum(weight_monthly, na.rm = TRUE)
  ), by = .(Ano, ref_month_in_year)]

  chk <- merge(t_fortnight, t_month,
               by = c("Ano", "ref_month_in_year"))

  if (nrow(chk) > 0) {
    expect_equal(chk$N_fn, chk$N_mo, tolerance = 0.01,
                 label = "Fortnight weights sum to monthly total")
  }
})


test_that("full pipeline with smoothing produces consistent results", {
  # 1. Setup: Create data with enough quarters for smoothing
  set.seed(902)
  data <- create_realistic_pnadc(n_quarters = 6, n_upas = 25)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Pipeline with smoothing (mock targets, no SIDRA)
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  result <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = create_mock_pop_targets(data),
    smooth = TRUE,
    verbose = FALSE
  )

  # 3. Verify: Smoothed weights are valid
  determined <- result[determined_month == TRUE]
  expect_true(all(determined$weight_monthly > 0),
              label = "Smoothed weights are positive")
  expect_false(any(is.na(determined$weight_monthly)),
               label = "No NA values after smoothing")
})


# =============================================================================
# EXPERIMENTAL STRATEGIES INTEGRATION
# =============================================================================

test_that("pipeline with experimental strategies maintains invariants", {
  # 1. Setup: Create data
  set.seed(903)
  data <- create_realistic_pnadc(n_quarters = 4, n_upas = 25)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Strict identification
  crosswalk_strict <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE)

  # 3. Execute: Experimental strategies
  crosswalk_exp <- pnadc_experimental_periods(
    crosswalk_strict,
    strategy = "both",
    confidence_threshold = 0.9,
    verbose = FALSE
  )

  # 4. Execute: Apply calibration with experimental crosswalk
  #    (mock targets, no SIDRA)
  result <- pnadc_apply_periods(
    data, crosswalk_exp,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = create_mock_pop_targets(data),
    verbose = FALSE
  )

  # 5. Verify: Weights are valid (experimental strategies)
  determined <- result[determined_month == TRUE]
  expect_true(all(determined$weight_monthly > 0),
              label = "Experimental pipeline produces positive weights")
  expect_false(any(is.na(determined$weight_monthly)),
               label = "No NA weights for determined observations")

  # 6. Verify: Experimental improved determination rate
  strict_rate <- crosswalk_strict[, mean(determined_month, na.rm = TRUE)]
  exp_rate <- crosswalk_exp[, mean(determined_month, na.rm = TRUE)]

  expect_true(exp_rate >= strict_rate,
              label = paste0("Experimental (", round(exp_rate, 3),
                             ") >= strict (", round(strict_rate, 3), ")"))
})


# =============================================================================
# MULTI-YEAR PIPELINE TESTS
# =============================================================================

test_that("pipeline works with multi-year data (year anchor)", {
  # 1. Setup: Create 2-year data (8 quarters)
  set.seed(904)
  data <- create_realistic_pnadc(n_quarters = 8, n_upas = 25, start_year = 2022)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Pipeline with year anchor (mock targets, no SIDRA)
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  result <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "year",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = create_mock_pop_targets(data),
    verbose = FALSE
  )

  # 3. Verify: Weights are valid
  determined <- result[determined_month == TRUE]
  expect_true(all(determined$weight_monthly > 0),
              label = "Multi-year pipeline produces positive weights")
  expect_false(any(is.na(determined$weight_monthly)),
               label = "No NA weights for determined observations")

  # 4. Verify: High determination rate with 8 quarters
  det_rate <- result[, mean(determined_month, na.rm = TRUE)]
  expect_true(det_rate > 0.85,
              label = paste0("8-quarter determination (", round(det_rate, 3),
                             ") should be >85%"))
})


# =============================================================================
# ERROR HANDLING IN PIPELINE
# =============================================================================

test_that("pipeline handles missing weight columns gracefully", {
  # 1. Setup: Create data then remove V1028 to simulate missing weight column
  set.seed(905)
  data <- create_realistic_pnadc(n_quarters = 2, n_upas = 10)
  data[, V1028 := NULL]  # Remove weight column to test error handling

  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  # 2. Verify: Should error when requesting calibration without weight columns
  expect_error(
    pnadc_apply_periods(
      data, crosswalk,
      weight_var = "V1028",  # Column doesn't exist anymore
      anchor = "quarter",
      calibrate = TRUE,
      calibration_unit = "month",
      verbose = FALSE
    ),
    "V1028|weight_var|missing",
    label = "Missing weight column should error"
  )
})


test_that("pipeline works without calibration", {
  # 1. Setup: Create data with V1028 column but won't calibrate
  set.seed(906)
  data <- create_realistic_pnadc(n_quarters = 2, n_upas = 10)

  # Add V1028 column (required even when not calibrating)
  data[, V1028 := runif(.N, 500, 2000)]

  # 2. Execute: Pipeline without calibration
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)

  result <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = FALSE,  # No calibration
    verbose = FALSE
  )

  # 3. Verify: Basic structure without calibrated weight columns
  expect_true(is.data.frame(result))
  expect_false("weight_monthly" %in% names(result))
  expect_true("determined_month" %in% names(result))
  expect_true("ref_month_in_quarter" %in% names(result))
  expect_true("V1028" %in% names(result))  # Original weight column preserved
})


# =============================================================================
# KEEP_ALL PARAMETER INTEGRATION
# =============================================================================

test_that("pipeline with keep_all=FALSE filters correctly", {
  # 1. Setup: Create data
  set.seed(907)
  data <- create_realistic_pnadc(n_quarters = 2, n_upas = 15)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Pipeline with keep_all=TRUE (mock targets, no SIDRA)
  crosswalk <- pnadc_identify_periods(data, verbose = FALSE)
  mock_targets <- create_mock_pop_targets(data)

  result_all <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = mock_targets,
    keep_all = TRUE,
    verbose = FALSE
  )

  # 3. Execute: Pipeline with keep_all=FALSE
  result_filtered <- pnadc_apply_periods(
    data, crosswalk,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = mock_targets,
    keep_all = FALSE,
    verbose = FALSE
  )

  # 4. Verify: keep_all=FALSE returns fewer rows
  expect_true(nrow(result_filtered) <= nrow(result_all))

  # 5. Verify: All rows in filtered result are determined
  expect_true(all(result_filtered$determined_month == TRUE, na.rm = TRUE))

  # 6. Verify: No NA weights in filtered result
  expect_true(all(!is.na(result_filtered$weight_monthly)))
})


# =============================================================================
# CONSISTENCY CHECKS
# =============================================================================

test_that("pipeline produces consistent results with same data", {
  # 1. Setup: Create data
  set.seed(908)
  data <- create_realistic_pnadc(n_quarters = 3, n_upas = 20)

  valid_ufs <- c(11:17, 21:29, 31:35, 41:43, 50:53)
  data[, `:=`(
    UF = sample(valid_ufs, .N, replace = TRUE),
    V1028 = runif(.N, 500, 2000),
    posest = sample(1:500, .N, replace = TRUE),
    posest_sxi = sample(100:999, .N, replace = TRUE)
  )]

  # 2. Execute: Pipeline twice with same data (mock targets, no SIDRA)
  mock_targets <- create_mock_pop_targets(data)
  crosswalk1 <- pnadc_identify_periods(data, verbose = FALSE)
  result1 <- pnadc_apply_periods(
    data, crosswalk1,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = mock_targets,
    verbose = FALSE
  )

  crosswalk2 <- pnadc_identify_periods(data, verbose = FALSE)
  result2 <- pnadc_apply_periods(
    data, crosswalk2,
    weight_var = "V1028",
    anchor = "quarter",
    calibrate = TRUE,
    calibration_unit = "month",
    target_totals = mock_targets,
    verbose = FALSE
  )

  # 3. Verify: Results should be identical
  expect_equal(result1[determined_month == TRUE]$weight_monthly,
               result2[determined_month == TRUE]$weight_monthly,
               tolerance = 1e-10,
               label = "Pipeline should produce consistent results")

  expect_equal(crosswalk1$determined_month,
               crosswalk2$determined_month,
               label = "Determination should be consistent")
})
Any scripts or data that you put into this service are public.
PNADCperiods documentation built on April 28, 2026, 9:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PNADCperiods
Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/test-integration-pipeline.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

Try the PNADCperiods package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PNADCperiods Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/test-integration-pipeline.R In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data

Try the PNADCperiods package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PNADCperiods
Identify Reference Periods in Brazil's PNADC Survey Data

tests/testthat/test-integration-pipeline.R
In PNADCperiods: Identify Reference Periods in Brazil's PNADC Survey Data