heaping-intro.R
In heaping: Correction of Heaping on Individual Level

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----load-package-------------------------------------------------------------
library(heaping)

## ----create-example-data------------------------------------------------------
# Generate realistic age data using a log-normal distribution
# These parameters produce a realistic age distribution
set.seed(123)
age_true <- rlnorm(10000, meanlog = 2.466869, sdlog = 1.652772)
age_true <- round(age_true[age_true < 93])

# Create heaped version where 27% of respondents round their age to multiples of 5
# This heap_ratio is typical for survey data with moderate heaping
heap_ratio <- 0.27
n <- length(age_true)
indices_to_heap <- sample(n, round(heap_ratio * n))
age_heaped <- age_true
age_heaped[indices_to_heap] <- round(age_heaped[indices_to_heap] / 5) * 5

## ----visualize-heaping, fig.cap="Comparison of true ages vs heaped ages"------
oldpar <- par(mfrow = c(1, 2))

# Define breaks to cover the full range
age_breaks <- seq(0, max(age_true, age_heaped) + 1, by = 1)

# True ages
hist(age_true, breaks = age_breaks, col = "steelblue",
     main = "True Ages (No Heaping)", xlab = "Age",
     border = "white")

# Heaped ages
hist(age_heaped, breaks = age_breaks, col = "coral",
     main = "Heaped Ages", xlab = "Age",
     border = "white")
par(oldpar)

## ----whipple-index------------------------------------------------------------
# Standard Whipple index (100 = no heaping, 500 = maximum heaping)
whipple(age_true, method = "standard")
whipple(age_heaped, method = "standard")

# Modified Whipple index (0 = no heaping, 1 = maximum heaping)
whipple(age_true, method = "modified")
whipple(age_heaped, method = "modified")

## ----myers-index--------------------------------------------------------------
# Myers' index (0 = no heaping, 90 = maximum)
myers(age_true)
myers(age_heaped)

## ----bachi-index--------------------------------------------------------------
# Bachi's index (0 = no heaping, 90 = maximum)
bachi(age_true)
bachi(age_heaped)

## ----noumbissi-index----------------------------------------------------------
# Noumbissi's index for digit 0 (1.0 = no heaping)
noumbissi(age_true, digit = 0)
noumbissi(age_heaped, digit = 0)

# Noumbissi's index for digit 5
noumbissi(age_true, digit = 5)
noumbissi(age_heaped, digit = 5)

## ----all-indices--------------------------------------------------------------
# Calculate all indices
indices_true <- heaping_indices(age_true)
indices_heaped <- heaping_indices(age_heaped)

# Compare
data.frame(
  Index = names(indices_true),
  True = round(unlist(indices_true), 3),
  Heaped = round(unlist(indices_heaped), 3)
)

## ----weighted-indices---------------------------------------------------------
# Create example weights
weights <- runif(length(age_heaped), 0.5, 2)

# Weighted Whipple index
whipple(age_heaped, weight = weights)

## ----basic-correction---------------------------------------------------------
# Correct heaping using log-normal distribution
age_corrected <- correctHeaps(age_heaped,
                              heaps = "5year",  # heaps at 0, 5, 10, 15, ...
                              method = "lnorm",
                              seed = 42)        # for reproducibility

# Compare Whipple indices
c(Original = whipple(age_true),
  Heaped = whipple(age_heaped),
  Corrected = whipple(age_corrected))

## ----visualize-correction, fig.cap="Before and after heaping correction"------
oldpar <- par(mfrow = c(1, 2))

hist(age_heaped, breaks = age_breaks, col = "coral",
     main = "Before Correction", xlab = "Age", border = "white")

hist(age_corrected, breaks = age_breaks, col = "forestgreen",
     main = "After Correction", xlab = "Age", border = "white")

par(oldpar)

## ----correction-methods-------------------------------------------------------
# Log-normal (default) - good for right-skewed data like age
age_lnorm <- correctHeaps(age_heaped, method = "lnorm", seed = 42)

# Normal - good for symmetric data
age_norm <- correctHeaps(age_heaped, method = "norm", seed = 42)

# Uniform - simplest approach, baseline comparison
age_unif <- correctHeaps(age_heaped, method = "unif", seed = 42)

# Kernel - nonparametric, adapts to local data shape
age_kernel <- correctHeaps(age_heaped, method = "kernel", seed = 42)

# Compare results
data.frame(
  Method = c("Original", "Heaped", "Log-normal", "Normal", "Uniform", "Kernel"),
  Whipple = c(whipple(age_true), whipple(age_heaped),
              whipple(age_lnorm), whipple(age_norm),
              whipple(age_unif), whipple(age_kernel))
)

## ----verbose-output-----------------------------------------------------------
result <- correctHeaps(age_heaped, heaps = "5year",
                       method = "lnorm", seed = 42, verbose = TRUE)

# Number of values changed
result$n_changed

# Changes by heap age
head(result$changes_by_heap, 10)

# Heaping ratios (ratio > 1 indicates heaping)
head(result$ratios, 10)

## ----10year-heaping-----------------------------------------------------------
# Create data with 10-year heaping
age_heap10 <- age_true
heapers10 <- sample(c(TRUE, FALSE), length(age_true),
                    replace = TRUE, prob = c(0.3, 0.7))
age_heap10[heapers10] <- round(age_heap10[heapers10] / 10) * 10

# Correct 10-year heaping
age_corrected10 <- correctHeaps(age_heap10, heaps = "10year",
                                method = "lnorm", seed = 42)

c(Heaped = whipple(age_heap10),
  Corrected = whipple(age_corrected10))

## ----custom-heaps-------------------------------------------------------------
# Heaping at specific ages (e.g., 18, 21, 30, 40, 50, 65)
custom_positions <- c(18, 21, 30, 40, 50, 65)
age_custom <- correctHeaps(age_heaped,
                           heaps = custom_positions,
                           method = "lnorm", seed = 42)

## ----single-heap--------------------------------------------------------------
# Add artificial heap at age 40
age_with_40heap <- c(age_true, rep(40, 500))

# Correct only the heap at 40
age_fixed_40 <- correctSingleHeap(age_with_40heap,
                                   heap = 40,
                                   before = 3,  # range: 37-43
                                   after = 3,
                                   method = "lnorm",
                                   seed = 42)

# Check the counts at age 40
c(Before = sum(age_with_40heap == 40),
  After = sum(age_fixed_40 == 40))

## ----model-based-setup, message=FALSE-----------------------------------------
# Create a dataset with correlated variables following the paper's approach
set.seed(123)

# Generate age from log-normal distribution
age <- rlnorm(10000, meanlog = 2.466869, sdlog = 1.652772)
age <- round(age[age < 93 & age >= 18])
n <- length(age)

# Simulate covariates correlated with age
age_scaled <- scale(age)

# Income as a function of age with noise
income <- exp(3 + 0.5 * age_scaled + rnorm(n, 0, 0.5))

# Marital status influenced by age
marital_status <- ifelse(plogis(-3 + 0.05 * age_scaled + rnorm(n, 0, 0.5)) > 0.5,
                         "married", "single")

# Education level with age-dependent probabilities
get_education_prob <- function(a) {
  if (a < 25) return(c(0.5, 0.35, 0.15))
  else if (a < 40) return(c(0.3, 0.45, 0.25))
  else return(c(0.2, 0.4, 0.4))
}
education <- sapply(age, function(a) {
  sample(c("High School", "Bachelor", "Master"), 1, prob = get_education_prob(a))
})

data_example <- data.frame(
  age = age,
  income = income,
  marital_status = marital_status,
  education = education
)

# Introduce heaping with 27% heap ratio (as in the paper)
heap_ratio <- 0.27
indices_to_heap <- sample(n, round(heap_ratio * n))
data_example$age_heaped <- data_example$age
data_example$age_heaped[indices_to_heap] <- round(data_example$age_heaped[indices_to_heap] / 5) * 5

## ----model-based-correction, message=FALSE, warning=FALSE---------------------
# Model-based correction using income, education and marital status
if (requireNamespace("ranger", quietly = TRUE) &&
    requireNamespace("VIM", quietly = TRUE)) {

  # Also apply simple correction for comparison
  data_example$age_simple <- correctHeaps(
    data_example$age_heaped,
    heaps = "5year",
    method = "lnorm",
    seed = 42
  )

  # Model-based correction using covariates
  data_example$age_corrected <- correctHeaps(
    data_example$age_heaped,
    heaps = "5year",
    method = "lnorm",
    model = age_heaped ~ income + marital_status + education,
    dataModel = data_example,
    seed = 42
  )

  # Compare correlations with log(income)
  log_income <- log(data_example$income)
  cor_original <- cor(data_example$age, log_income)
  cor_heaped <- cor(data_example$age_heaped, log_income)
  cor_simple <- cor(data_example$age_simple, log_income)
  cor_corrected <- cor(data_example$age_corrected, log_income)

  cat("Correlation of age with log(income):\n")
  cat("  Original (true):", round(cor_original, 4), "\n")
  cat("  Heaped:", round(cor_heaped, 4), "\n")
  cat("  Simple correction:", round(cor_simple, 4), "\n")
  cat("  Model-based correction:", round(cor_corrected, 4), "\n")
}

## ----model-based-visualization, fig.cap="Comparison of heaped ages and model-based corrected ages", fig.height=6----
if (requireNamespace("ranger", quietly = TRUE) &&
    requireNamespace("VIM", quietly = TRUE)) {

  oldpar <- par(mfrow = c(2, 2))

  # Age distributions
  age_breaks <- seq(min(data_example$age) - 1, max(data_example$age) + 1, by = 1)
  hist(data_example$age_heaped, breaks = age_breaks, col = "coral",
       main = "Heaped Ages", xlab = "Age", border = "white")

  hist(data_example$age_corrected, breaks = age_breaks, col = "forestgreen",
       main = "Model-Based Corrected Ages", xlab = "Age", border = "white")

  # Age vs log(Income) relationships
  log_income <- log(data_example$income)
  plot(data_example$age_heaped, log_income,
       pch = 16, col = adjustcolor("coral", 0.3), cex = 0.5,
       main = "Heaped: Age vs log(Income)",
       xlab = "Age", ylab = "log(Income)")
  abline(lm(log_income ~ data_example$age_heaped), col = "darkred", lwd = 2)

  plot(data_example$age_corrected, log_income,
       pch = 16, col = adjustcolor("forestgreen", 0.3), cex = 0.5,
       main = "Corrected: Age vs log(Income)",
       xlab = "Age", ylab = "log(Income)")
  abline(lm(log_income ~ data_example$age_corrected), col = "darkgreen", lwd = 2)

  par(oldpar)
}

## ----multiple-imputation------------------------------------------------------
# Create 5 corrected datasets with different seeds
m <- 5
corrected_datasets <- lapply(1:m, function(i) {
  correctHeaps(age_heaped, heaps = "5year",
               method = "lnorm", seed = i * 100)
})

# Calculate Whipple index for each
whipple_values <- sapply(corrected_datasets, whipple)

cat("Whipple indices across imputations:\n")
cat("  Mean:", round(mean(whipple_values), 2), "\n")
cat("  SD:", round(sd(whipple_values), 2), "\n")
cat("  Range:", round(min(whipple_values), 2), "-",
    round(max(whipple_values), 2), "\n")

## ----fixed-observations-------------------------------------------------------
# Assume first 100 observations are verified
verified_indices <- 1:100

age_protected <- correctHeaps(age_heaped,
                              heaps = "5year",
                              method = "lnorm",
                              fixed = verified_indices,  # don't change these
                              seed = 42)

# Verify protected observations unchanged
all(age_heaped[verified_indices] == age_protected[verified_indices])

## ----sprague-example----------------------------------------------------------
# Example: population counts in 5-year groups
pop_5year <- c(
  1971990, 2095820, 2157190, 2094110, 2116580,  # 0-4, 5-9, ..., 20-24
  2003840, 1785690, 1502990, 1214170, 796934,   # 25-29, ..., 45-49
  627551, 530305, 488014, 364498, 259029,       # 50-54, ..., 70-74
  158047, 125941                                 # 75-79, 80+
)

# Disaggregate to single years
pop_single <- sprague(pop_5year)

# First 15 ages
head(pop_single, 15)

# Total is preserved
c(Sum_5year = sum(pop_5year), Sum_single = sum(pop_single))

## ----old-age-indices----------------------------------------------------------
# Create old-age data with heaping
set.seed(42)
old_ages <- c(sample(85:105, 3000, replace = TRUE),
              rep(c(90, 95, 100), each = 200))  # heaping at round ages

# Coale-Li index (designed for ages 60+)
coale_li(old_ages, digit = 0, ageMin = 85)

# Jdanov index (for very old ages like 95, 100, 105)
jdanov(old_ages, Agei = c(95, 100, 105))

# Kannisto index (for a single old age)
kannisto(old_ages, Agei = 90)
kannisto(old_ages, Agei = 95)