In HughParsonage/CRIMpp: Model Retirement Incomes.

knitr::opts_chunk$set(
  collapse = TRUE,
  width = 100,
  fig.width = 9,
  fig.height = 9,
  cache.extra = list(sessionInfo(), format(Sys.Date(), "%Y-%m-%d")),
  comment = "#>"
)

This package outlines the formation of lifetime incomes as used in the retirement income models.

Outline

The distribution of lifetime income of individuals is calculated by:

Selecting a relevant subset of the longitudinal sample of the HILDA survey
For each individual in this subset, calculate their average income between 2005 and 2009 and their average income between 2010 and 2014.
Determine the income percentile of each individual in each period.
Construct the observed transition matrix of income for each age transition from 25-29 to 30-34 and from 55-59 to 60-64.
Smooth these matrix using a tensor product smooth. It may be more appropriate to use a different 2D kernel smoother, but this is the one that was chosen.
Apply these matrices across each age to end up with a machine_widget

# Set working directory to source file location
# This script:
#   - writes several transition tables to disk (through the source() line below)
#   - creates objects
#     * machine_widget (every row is a simulation's instance of a person;
#       the columns marked with .1 are the path that person took in that simulation)
#     * lifetime_pachinko that outputs for every Percentile the average lifetime_AWOTE across all simulations

These are the global variables. That is, only consider HILDA individuals aged between 30 and 70 in the sample and perform 10,000 simulations to obtain our percentiles.

# 1000 is pretty good (100,000 individuals)
N_SIMULATIONS <- 1000
OLDEST.AGE <- 65L
YOUNGEST.AGE <- 30L
# tbl_dt [100 x 2]
#    Percentile lifetime_AWOTE
#         <int>          <dbl>
# 1           1       13.50444
# 2           2       14.17248
# 3           3       14.56721
# 4           4       15.00996
# 5           5       15.38401
# 6           6       15.93115
# 7           7       16.49369
# 8           8       16.91389
# 9           9       17.61472
# 10         10       17.99585
# # ... with 90 more rows

N_SIMULATIONS <- as.integer(N_SIMULATIONS)
stopifnot(N_SIMULATIONS >= 1)
randz <- runif(100 * N_SIMULATIONS)

library(hutils)
library(readr)
library(ggplot2)
library(scales)
library(grattanCharts)
library(viridis)
library(magrittr)
library(mgcv)


library(data.table)
if (requireNamespace("hildaData", quietly = TRUE) &&
    requireNamespace("hildaData2016", quietly = TRUE)) {
  library(hildaData)
  library(hildaData2016)
} else {
  stop("package:hildaData is a local package, containing the raw Combined_* .dta files as data.tables.\n",
       "Either obtain this package from Hugh Parsonage or run", "\n",
       "\t",
       "Combined_n140c <- as.data.table(read.dta('path/to/your/Wave14/Combinedfile.dta'));", "\n",
       "\t",
       "Combined_m130c <- as.data.table(read.dta('path/to/your/Wave13/Combinedfile.dta'));", "\n",
       "\t", "...", "\n",
       "etc.")
}
library(hildaExtra)
library(grattan)

awote_by_year <-
  data.table(Year = 2017:2001,
             AWOTE = c(1568,
                       1532,
                       1499,
                       1477,
                       1437,
                       1396,
                       1330.1,
                       1275.2,
                       1226.8,
                       1158,
                       1098.6,
                       1044.1,
                       1011.8,
                       965,
                       929.1,
                       882.1,
                       842.6)) %>%
  .[, AWOTE_annual := 52.5 * AWOTE]

get_hilda_xwaveid_age_income <- function(year){
  wave <- letters[year - 2000]
  if (year == 2016L) {
    NAME <- "Combined_p160c"
  } else if (exists("Combined_n150c", where = "package:hildaData")) {
    NAME <- paste0("Combined_", wave, "150c")
  } else {
    if (year == 2014){
      NAME <- "Combined_n140c"
    } else {
      NAME <- paste0("Combined_", wave, "130c")
    }
  }

  get(NAME) %>%
    as.data.table %>%
    .[,.SD,
      .SDcols = c("xwaveid",
                  paste0(wave, "tifefp"), # income
                  paste0(wave, "hgage"),  # age
                  paste0(wave, "esbrd"),  # employment-status
                  paste0(wave, "wscmg")    # earnings from wage
      )] %>%
    setnames(paste0(wave, "tifefp"), "Income") %>%
    setnames(paste0(wave, "hgage"), "Age") %>%
    setnames(paste0(wave, "esbrd"), "Employment_status") %>%
    setnames(paste0(wave, "wscmg"), "Weekly_salary_wage_from_main_job") %>%
    .[, lapply(.SD, make_negatives_NA)] %>%
    .[, Year := get("year", mode = "numeric")]
}

if (exists("Combined_n150c", where = "package:hildaData")) {
  # alias
  Combined_n140c <- as.data.table(Combined_n150c)
}

xwaveid_by_lnwte <-
  Combined_n140c %>%
  as.data.table %>%
  .[, .(xwaveid, WEIGHT = nlnwte)] %>%
  .[WEIGHT > 0] %>%
  setkey(xwaveid) %>%
  unique(by = "xwaveid")

income_age_by_xwaveid_year <-
  lapply(2001:2016, get_hilda_xwaveid_age_income) %>%
  rbindlist(use.names = TRUE) %>%
  setkey(xwaveid) %>%
  .[xwaveid_by_lnwte, on = "xwaveid"] %>%
  .[, Income_percentile := weighted_ntile(Income, WEIGHT, n = 100), keyby = "Year"] %>%
  .[, Income_next_year := shift(Income, type = "lead"), by = "xwaveid"] %>%
  .[, Income_percentile_next_year := shift(Income_percentile, type = "lead"),
    by = "xwaveid"] %>%
  .[, DOB := Year - Age] %>%
  .[]

prob_year_on_year <-
  income_age_by_xwaveid_year %>%
  .[between(DOB, 2014L - OLDEST.AGE, 2010L - YOUNGEST.AGE)] %>%
  .[Weekly_salary_wage_from_main_job > 38L] %>%
  .[, Age_group := age_grouper(Age, interval = 10, min_age = 25, max_age = 75)] %>%
  .[, .(n_persons = sum(WEIGHT)),
    keyby = .(Age_group, Income_percentile, Income_percentile_next_year)] %>%
  .[, prob := n_persons / sum(n_persons), keyby = .(Age_group, Income_percentile)] %>%
  .[]

Age_group_2005_by_xwaveid <-
  income_age_by_xwaveid_year %>%
  .[Year == 2005L] %>%
  .[, .(Age_group_2005 = age_grouper(Age, interval = 5, min_age = 25, max_age = 75)),
    keyby = "xwaveid"]

## How much do we lose by restricting to only those present in at
## least one year in 2005-2009 and 2010-2014.

ALL <- function(...) {
  Reduce(`&&`, list(...))
}

InScope_by_xwaveid <-
  income_age_by_xwaveid_year %>%
  .[Weekly_salary_wage_from_main_job > 100L] %>%
  .[Employment_status == "[1] Employed"] %>%
  .[between(DOB, 2014L - OLDEST.AGE, 2010L - YOUNGEST.AGE)] %>%
  .[Year >= 2005L] %>%
  .[,
    .(in_scope = ALL(any(Year <= 2009L),
                     any(Year > 2009L),
                     .N >= 3L)),
    keyby = "xwaveid"]



# Must be at least 60% (of workers at least 30)
stopifnot(mean(InScope_by_xwaveid$in_scope) > 0.6)

income_AWOTE_by_age_percentile <-
  income_age_by_xwaveid_year %>%
  InScope_by_xwaveid[., on = "xwaveid", nomatch=0L] %>%
  .[(in_scope)] %>%
  .[Year >= 2005L] %>%
  awote_by_year[., on = "Year", nomatch=0L] %>%
  .[, Income_per_AWOTE_annual := Income / AWOTE_annual] %>%
  .[,
    .(AWOTE_annual_average = mean(Income_per_AWOTE_annual)),
    keyby = .(xwaveid, First_year_range = Year <= 2009)] %>%
  setkey(xwaveid) %>%
  xwaveid_by_lnwte[., on = "xwaveid", nomatch=0L] %>%
  Age_group_2005_by_xwaveid[., on = "xwaveid", nomatch=0L] %>%
  .[, AWOTE_annual_percentile := weighted_ntile(AWOTE_annual_average,
                                                weights = WEIGHT,
                                                n = 100),
    keyby = .(Age_group_2005, First_year_range)] %>%
  .[, Year_range := if_else(First_year_range, "Decile_ante_2009", "Decile_post_2009"),
    keyby = .(Age_group_2005, First_year_range)] %>%
  .[, .(xwaveid,
        Age_group_2005,
        Year_range,
        AWOTE_annual_percentile,
        AWOTE_annual_average)] %>%
  setkey(xwaveid) %>%
  xwaveid_by_lnwte[., on = "xwaveid", nomatch=0L] %>%
  .[, .(n_persons = sum(WEIGHT),
        AWOTE_average = weighted.mean(AWOTE_annual_average, WEIGHT)),
    keyby = .(Age_group_2005, Year_range, AWOTE_annual_percentile)]

# income_AWOTE_by_age_percentile %>%
#   write_csv("income-AWOTE-by-age-percentile.csv")

prob_5year_on_year <-
  income_age_by_xwaveid_year %>%
  InScope_by_xwaveid[., on = "xwaveid", nomatch=0L] %>%
  awote_by_year[., on = "Year", nomatch=0L] %>%
  .[Year >= 2005L] %>%
  .[(in_scope)] %>%
  .[, Income_per_AWOTE_annual := Income / AWOTE_annual] %>%
  .[,
    .(AWOTE_annual_average = mean(Income_per_AWOTE_annual)),
    keyby = .(xwaveid, First_year_range = Year <= 2009)] %>%
  setkey(xwaveid) %>%
  xwaveid_by_lnwte[., on = "xwaveid", nomatch=0L] %>%
  Age_group_2005_by_xwaveid[., on = "xwaveid", nomatch=0L] %>%
  .[,
    AWOTE_annual_decile := weighted_ntile(AWOTE_annual_average,
                                          weights = WEIGHT,
                                          n = 10),
    keyby = .(Age_group_2005, First_year_range)] %>%
  .[, Year_range := if_else(First_year_range,
                            "Decile_ante_2009",
                            "Decile_post_2009")] %>%
  .[, .(xwaveid, Age_group_2005, Year_range, AWOTE_annual_decile)] %>%
  dcast.data.table(xwaveid + Age_group_2005 ~  Year_range,
                   value.var = "AWOTE_annual_decile") %>%
  setkey(xwaveid) %>%
  xwaveid_by_lnwte[., on = "xwaveid", nomatch=0L] %>%
  .[, .(n_persons = sum(WEIGHT)),
    keyby = .(Age_group_2005, Decile_ante_2009, Decile_post_2009)] %>%
  .[, prob := n_persons / sum(n_persons),
    keyby = .(Age_group_2005, Decile_ante_2009)] %>%
  .[]

prob_5year_on_year_percentile <-
  income_age_by_xwaveid_year %>%
  InScope_by_xwaveid[., on = "xwaveid", nomatch=0L] %>%
  .[(in_scope)] %>%
  .[Year >= 2005L] %>%
  awote_by_year[., on = "Year", nomatch=0L] %>%
  .[, Income_per_AWOTE_annual := Income / AWOTE_annual] %>%
  .[, .(AWOTE_annual_average = mean(Income_per_AWOTE_annual)),
    keyby = .(xwaveid, First_year_range = Year <= 2009)] %>%
  setkey(xwaveid) %>%
  merge(xwaveid_by_lnwte) %>%
  merge(Age_group_2005_by_xwaveid) %>%
  .[, Year_range := if_else(First_year_range, "Percentile_ante_2009", "Percentile_post_2009")] %>%
  .[, AWOTE_annual_percentile := weighted_ntile(AWOTE_annual_average,
                                                weights = WEIGHT,
                                                n = 100),
    keyby = .(Age_group_2005, Year_range)] %>%
  .[, .(xwaveid, Age_group_2005, Year_range, AWOTE_annual_percentile)] %>%
  dcast.data.table(xwaveid + Age_group_2005 ~  Year_range,
                   value.var = "AWOTE_annual_percentile") %>%
  setkey(xwaveid) %>%
  xwaveid_by_lnwte[., on = "xwaveid", nomatch=0L] %>%
  .[, .(n_persons = sum(WEIGHT)),
    keyby = .(Age_group_2005,
              Percentile_ante_2009 = factor(Percentile_ante_2009),
              Percentile_post_2009 = factor(Percentile_post_2009))] %>%
  .[, prob := n_persons / sum(n_persons),
    keyby = .(Age_group_2005, Percentile_ante_2009)] %>%
  .[]

provide.dir("age-transition-matrices")

nan_to_zero <- function(x) {
  if (is.numeric(x)){
    x[is.nan(x)] <- 0
  }
  x
}

write_transition_matrix <- function(age){
  prob_5year_on_year_percentile[Age_group_2005 == age] %>%
    dcast.data.table(Percentile_ante_2009 ~ Percentile_post_2009,
                     fun = mean,
                     drop = FALSE,
                     fill = NaN,
                     value.var = "prob") %>%
    .[, lapply(.SD, nan_to_zero)]
}

lapply(unique(prob_5year_on_year_percentile$Age_group_2005), write_transition_matrix)
matrix_2529 <- as.matrix.data.frame(read.csv("age-transition-matrices/25-29-transition-matrix.csv"))[,-1]
matrix_3034 <- as.matrix.data.frame(read.csv("age-transition-matrices/30-34-transition-matrix.csv"))[,-1]
matrix_3539 <- as.matrix.data.frame(read.csv("age-transition-matrices/35-39-transition-matrix.csv"))[,-1]
matrix_4044 <- as.matrix.data.frame(read.csv("age-transition-matrices/40-44-transition-matrix.csv"))[,-1]
matrix_4549 <- as.matrix.data.frame(read.csv("age-transition-matrices/45-49-transition-matrix.csv"))[,-1]
matrix_5054 <- as.matrix.data.frame(read.csv("age-transition-matrices/50-54-transition-matrix.csv"))[,-1]
matrix_5559 <- as.matrix.data.frame(read.csv("age-transition-matrices/55-59-transition-matrix.csv"))[,-1]
matrix_6064 <- as.matrix.data.frame(read.csv("age-transition-matrices/60-64-transition-matrix.csv"))[,-1]

# {matrix_2529 %*%
#     matrix_3034 %*%
#     matrix_3539 %*%
#     matrix_4044 %*%
#     matrix_4549 %*%
#     matrix_5054 %*%
#     matrix_5559 %*%
#     matrix_6064} %>%
#   as.data.table %>%
#   fwrite("transition-matrix-25-60-ages.csv")

{out <- matrix_2529 +
    matrix_3034 +
    matrix_3539 +
    matrix_4044 +
    matrix_4549 +
    matrix_5054 +
    matrix_5559
    # matrix_6064
  out / rowSums(out)
  } %>%
  as.data.table %>%
  write_csv("transition-matrix-average-all-ages.csv")

prob_5year_on_year_percentile %>%
  group_by(Percentile_pre_2009, Percentile_post_2009) %>%
  summarise(prob = weighted.mean(prob, n_persons)) %>%
  dcast.data.table(Percentile_pre_2009 ~ Percentile_post_2009, fun = mean, value.var = "prob") %>%
  write_csv("transition-matrix-all-ages.csv")

nan_to_zero <- function(x) {
  if (is.numeric(x)){
    x[is.nan(x)] <- 0
  }
  x
}

get_transition_table <- function(age_range, filename){
  if (missing(age_range)) {
    age.range <- gsub("^.*([0-9]{2}.[0-9]{2}).transition.matrix\\.csv$", "\\1", filename)

    fread(filename, header = TRUE) %>%
      melt.data.table(id.vars = "Percentile_pre_2009",
                      variable.name = "Percentile_post_2009",
                      value.name = "prob") %>%
      arrange(Percentile_pre_2009) %>%
      group_by(Percentile_pre_2009) %>%
      mutate(cumprob = order_by(Percentile_post_2009, cumsum(prob))) %>%
      setkey(Percentile_pre_2009, cumprob) %>%
      mutate(age_range = age.range)
  } else {
    prob_5year_on_year_percentile[Age_group_2005 == age_range] %>%
      .[, Percentile_ante_2009 := factor(Percentile_ante_2009, levels = 1:100)] %>%
      .[, Percentile_post_2009 := factor(Percentile_post_2009, levels = 1:100)] %>%
      # dcast + melt = expand.grid (so that zero probabilities )
      dcast.data.table(Percentile_ante_2009 ~ Percentile_post_2009,
                       fun = mean,
                       drop = FALSE,
                       fill = NaN,
                       value.var = "prob") %>%
      .[, lapply(.SD, nan_to_zero)] %>%
      melt.data.table(id.vars = "Percentile_ante_2009",
                      variable.name = "Percentile_post_2009",
                      value.name = "prob") %>%
      .[, Percentile_ante_2009 := as.integer(Percentile_ante_2009)] %>%
      .[, Percentile_post_2009 := as.integer(Percentile_post_2009)] %>%
      setkey(Percentile_ante_2009) %>%
      .[, cumprob := cumsum(prob), keyby = "Percentile_ante_2009"] %>%
      setkey(Percentile_ante_2009, cumprob) %>%
      .[]
  }
}

transition_tables_all <-
  lapply(unique(income_AWOTE_by_age_percentile$Age), get_transition_table) %>%
  rbindlist(use.names = TRUE, fill = TRUE)

Tensor product smooth

A tensor product smooth was used to convert the spikes in the sparse transition matrices to a smoothed version, to reflect the underlying distribution. Intuitively, each nonzero value in observed transition matrix is a spike on a square board. The tensor product smooth is the surface that would appear if a somewhat segmented sheet were loosely placed over the spikes on the board. Higher values of (k) mean the sheet is 'tighter'.

An example of a the tensor product smooth applies to some dummy data is in the following figure. For the model, we used (k = 10).

cache_validate <- function(filename, expr, verbose = TRUE) {
  new.sha1 <- digest::sha1(deparse(substitute(expr)))

  cache_valid <-
    file.exists(filename) &&
    as.double(difftime(Sys.time(),
                       file.mtime(filename),
                       units = "days")) <= 1L

  if (!cache_valid) {
    return(list(FALSE, new = new.sha1))
  } else {
    a.sha1 <- paste0(tools::file_path_sans_ext(filename), ".sha1")
    if (!file.exists(a.sha1)) {
      return(list(FALSE, new = new.sha1))
    }
    a.sha1 <- paste0(tools::file_path_sans_ext(filename), ".sha1")
    if (verbose) {
      cat("\n", readChar(a.sha1, 40L), "\n",
          digest::sha1(deparse(substitute(expr))), "\n")
    }
    if (!file.exists(a.sha1)) {
      return(list(FALSE, new = new.sha1))

    }
    if (identical(new.sha1 <- digest::sha1(deparse(substitute(expr))),
                  old.sha1 <- readChar(a.sha1, 40L))) {
      list(TRUE, new = new.sha1, old = old.sha1)
    } else {
      list(FALSE, new = new.sha1, old = old.sha1)
    }
  }
}

use_cache_for <- function(filename, expr, validate.cache = cache_validate(filename, expr)) {
  hutils::provide.dir(dirname(filename))
  if (file.exists(filename) && validate.cache[[1]]) {
    switch(tools::file_ext(filename),
           "fst" = fst::read_fst(filename, as.data.table = TRUE),
           "csv" = fread(filename),
           "tsv" = fread(filename, sep = "\t"),
           stop("Invalid file extension:", tools::file_ext(filename)))
  } else {
    res <- expr
    switch(tools::file_ext(filename),
           "fst" = fst::write_fst(res, filename),
           "csv" = fwrite(res, filename),
           "tsv" = fwrite(res, sep = "\t"),
           warning("Invalid file extension:", tools::file_ext(filename), ". No cache written."))

    writeChar(validate.cache[["new"]],
              paste0(tools::file_path_sans_ext(filename), ".sha1"),
              nchar = 40L)
    return(res)
  }

}

dat <- use_cache_for("Pachinko-cache/dat.fst", {
set.seed(4)
dat <- CJ(x = seq(from = 0, to = 1, by = 0.005),
          y = seq(from = 0, to = 1, by = 0.005))
dat[, z := 0]
dat[, z1 := 0]
dat[sample.int(nrow(dat), size = nrow(dat) / 50), z1 := 1]
dat[x %between% c(0.7, 0.85) & ((x + y) %between% c(0.9, 1.05)), z := 1.25]
dat[x %between% c(0.2, 0.7) & (1.5 * x - y^2 < 0.5), z := 1]
dat[y %between% c(0.75, 0.8), z := 1]
dat[z > 0, z := z1 * z]
dat[x %between% c(0.35, 0.45) & y > 0.9, z := 0]
# gam_dat <- gam(z ~ te(x, y, k = c(10, 10)), data = dat)
dat[, predicted05 := predict(gam(z ~ te(x, y, k = c(5, 5)), data = dat), newdata = dat)]
dat[, predicted10 := predict(gam(z ~ te(x, y, k = c(10, 10)), data = dat), newdata = dat)]
dat[, predicted30 := predict(gam(z ~ te(x, y, k = c(30, 30)), data = dat), newdata = dat)]
})
dat %>%
  melt(measure.vars = c("z", "predicted05", "predicted10", "predicted30")) %>%
  .[, value := (value - min(value)) / max(value - min(value)), keyby = "variable"] %>%
  .[variable != "z", variable := paste("k", "=", sub("predicted", "", variable))] %>%
  ggplot(aes(x, y, fill = value)) +
  geom_tile() + 
  scale_fill_viridis() +
  theme_dark(base_size = 14) + 
  theme(legend.position = "none") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  facet_wrap(~variable, nrow = 2)

smooth_matrix <- function(age_range) {
  stopifnot(length(age_range) == 1L)

  gam.1 <-
    prob_5year_on_year_percentile %>%
    .[Age_group_2005 == age_range]
  # k = 10 for less smooth data
  gam(prob ~ te(Percentile_ante_2009, Percentile_post_2009), k = c(10, 10), data = .)
  out <- matrix(pmax(fitted(gam.1), 0), ncol = 100)
  out / rowSums(out)
}

smooth_table <- function(age_range) {
  gam.1 <-
    get_transition_table(age_range) %>%
    # weight2rows("n_persons") %>%
    # k = 10 for less smooth data, somewhat computationally intensive than lower k
    # This is a tensor product smooth.
    gam(prob ~ te(Percentile_ante_2009, Percentile_post_2009, k = c(10, 10)),
        data = .)

  CJ(Percentile_ante_2009 = 1:100,
     Percentile_post_2009 = 1:100) %>%
    # gam will predict some values as (very slightly) negative.
    # We need to normalize them anyhow, so just force nonnegative.
    .[, prob := pmax(predict(gam.1, newdata = .), 0)] %>%
    .[, prob := prob / sum(prob), by = Percentile_ante_2009] %>%
    .[, cumprob := cumsum(shift(prob, n = 1, type =  "lag", fill = 0)),
      keyby = Percentile_ante_2009] %>%
    .[, .(Percentile_ante = Percentile_ante_2009,
          Percentile_post = Percentile_post_2009,
          cumprob)] %>%
    setkey(Percentile_ante, cumprob)
}

input <-
  CJ(Percentile_ante = 1:100,
     simulation = 1:N_SIMULATIONS) %>%
  .[, cumprob := randz]

setkey(input, Percentile_ante, cumprob)

copy_column <- function(.data, new_name, column_copied){
  .data2 <- copy(.data)
  .data2[[new_name]] <- .data[[column_copied]]
  as.data.table(.data2)
}

age_the_simulation <- function(input.table, five_year_group = 2){
  if (OLDEST.AGE < 70L && five_year_group == 8L) {
    # Avoid 'Can not cast empty data.table'
    return(input.table) # since not ageing that far
  }
  Ages <- c("25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64")
  age_range_fm <- Ages[five_year_group - 1]
  agerangefm <- sub("-", "", age_range_fm, fixed = TRUE)
  age_range_to <- Ages[five_year_group]
  agerangeto <- sub("-", "", age_range_to, fixed = TRUE)
  smooth_table(age_range_to)[input.table, roll = Inf] %>%
    setnames("Percentile_post", paste0("Percentile_", agerangeto)) %>%
    setnames("Percentile_ante", paste0("Percentile_", agerangefm)) %>%
    copy_column("Percentile_ante", paste0("Percentile_", agerangefm)) %>%
    # re roll the die
    .[, cumprob := runif(.N)] %>%
    setkey(Percentile_ante, cumprob)
}

machine_widget <- 
  if (YOUNGEST.AGE == 30) {
    input %>%
      setkey(Percentile_ante, cumprob) %>%
      age_the_simulation(2) %>%
      age_the_simulation(3) %>%
      age_the_simulation(4) %>%
      age_the_simulation(5) %>%
      age_the_simulation(6) %>%
      age_the_simulation(7) %>%
      age_the_simulation(8) %>%
      .[]
  } else if (YOUNGEST.AGE == 40) {
    input %>%
      setkey(Percentile_ante, cumprob) %>%
      age_the_simulation(4) %>%
      age_the_simulation(5) %>%
      age_the_simulation(6) %>%
      age_the_simulation(7) %>%
      age_the_simulation(8) %>%
      .[]
  } else if (YOUNGEST.AGE == 50) {
    input %>%
      setkey(Percentile_ante, cumprob) %>%
      age_the_simulation(6) %>%
      age_the_simulation(7) %>%
      age_the_simulation(8) %>%
      .[]
  }

awote_by_age <-
  income_AWOTE_by_age_percentile[Year_range == "Decile_post_2009",
                                 .(Age = Age_group_2005,
                                   Percentile = AWOTE_annual_percentile,
                                   AWOTE_average)] %>%
  setkey(Age, Percentile)  %>%
  # Ensure all percentiles are available
  # (Otherwise the benchmark might be non-monotonic.)
  {
    dot <- .
    gridExpanded <- CJ(Age = unique(dot$Age),
                       Percentile = 1:100)

    setkey(gridExpanded, Age, Percentile)

    dot[gridExpanded, on = c("Age", "Percentile")] %>%
      .[, AWOTE_average := if_else(is.na(AWOTE_average) & Percentile %in% c(1, 100),
                                   if_else(Percentile == 100,
                                           max(AWOTE_average, na.rm = TRUE),
                                           0),
                                   AWOTE_average), keyby = "Age"] %>%
      .[, AWOTE_average := zoo::na.approx(AWOTE_average, na.rm = FALSE), keyby = "Age"]
  } %>%
  # .[Age != "25-29"] %>%

  # Magic numbers from Will
  .[, AWOTE_average := if_else(Age %ein% c("55-59",
                                           if (OLDEST.AGE > 65L) "60-64"),
                               if_else(Percentile < 95,
                                       pminC(AWOTE_average, 1.56),
                                       if_else(Percentile < 99, 
                                               1.97,
                                               3.69)),
                               AWOTE_average)]



stopifnot(nrow(awote_by_age) == 100 * (OLDEST.AGE %/% 5 - YOUNGEST.AGE %/% 5))

lifetime_income_benchmark <-
  awote_by_age %>%
  setkey(Age, Percentile) %>%
  .[, .(lifetime_AWOTE = sum(AWOTE_average)), keyby = "Percentile"] %>%
  .[, lifetime_AWOTE := 5 * lifetime_AWOTE]

lifetime_pachinko_density <-
  machine_widget %>%
  .[, .SD, .SDcols = c("simulation", 
                       switch(as.character(YOUNGEST.AGE),
                              "30" = "Percentile_2529",
                              "40" = "Percentile_3539",
                              "50" = "Percentile_4549",
                              stop("Unanticipated YOUNGEST.AGE")),
                       grep(".1", names(.), value = TRUE),
                       switch(as.character(OLDEST.AGE),
                              "70" = "Percentile_6064", 
                              "65" = "Percentile_5559",
                              stop("Unanticipated OLDEST.AGE")))] %>%
  setnames(old = grep(".1$", names(.), value = TRUE),
           new = gsub(".1$", "", names(.)[grep(".1$", names(.), value = FALSE)])) %>%
  .[, id := .I] %>%
  melt.data.table(id.vars = c("id", "simulation"), variable.name = "Age", value.name = "Percentile") %>%
  .[, Age := gsub("Percentile_(..)(..)", "\\1-\\2", Age)] %>%
  setkeyv(c("Age", "Percentile")) %>%
  merge(awote_by_age)

lifetime_income_pachinko <-
  lifetime_pachinko_density[,
                            .(lifetime_AWOTE = sum(AWOTE_average) * 5,
                              Percentile = first(Percentile)),
                            keyby = "id"] %>%
  .[, .(lifetime_AWOTE = mean(lifetime_AWOTE)), keyby = "Percentile"] %>%
  .[, lifetime_AWOTE := round(lifetime_AWOTE, 1)] %T>%
  fwrite(sprintf("../inst/extdata/lifetime_income_pachinko-Age%s.csv", YOUNGEST.AGE)) %>%
  .[]

Result

The average lifetime income is comparable to the average income if we assumed individuals remained in the same percentile throughout their lives.

merge(lifetime_income_benchmark, lifetime_income_pachinko, by = "Percentile") %$%
  t.test(lifetime_AWOTE.x, lifetime_AWOTE.y)

The difference is most pronounced at the tails of the distribution:

rbindlist(list("benchmark" = lifetime_income_benchmark,
               "pachinko"  = lifetime_income_pachinko),
          id = "group") %>%
  ggplot(aes(x = Percentile, y = lifetime_AWOTE, group = group, color = group)) +
  geom_line() +
  scale_y_continuous(name = "Lifetime income / AWOTE",
                     sec.axis = sec_axis(trans = ~.*awote_by_year[which.max(Year)][["AWOTE_annual"]],
                                         name = "Lifetime income",
                                         label = grattan_dollar)) + 
  theme(legend.position = c(0.1, 0.9),
        legend.justification = c(0, 1))

for (the_percentile in c(10, 25, 50, 95)) {
  the_percentile_plot <-
    switch(as.character(YOUNGEST.AGE), 
           "30" = {
             machine_widget[Percentile_2529 %in% the_percentile,
                            .SD,
                            .SDcols = grep("^((Percentile_[0-9]{4}\\.1)|(.*(6064|2529)))$",
                                           names(machine_widget)),
                            key = "simulation"]
           },
           "40" = {
             machine_widget[Percentile_3539 %in% the_percentile,
                            .SD,
                            .SDcols = grep("^((Percentile_[0-9]{4}\\.1)|(.*(6064|3539)))$",
                                           names(machine_widget)),
                            key = "simulation"]
           },
           "50" = {
             machine_widget[Percentile_4549 %in% the_percentile,
                            .SD,
                            .SDcols = grep("^((Percentile_[0-9]{4}\\.1)|(.*(6064|4549)))$",
                                           names(machine_widget)),
                            key = "simulation"]
           }) %>%
    melt.data.table(id.vars = "simulation") %>%
    .[, variable := sub("^Percentile_(..)(..).*$", "\\1-\\2", variable)] %>%
    .[order(-variable)] %>%
    .[, variable := factor(variable,
                           levels = rev(unique(.$variable)),
                           ordered = TRUE)] %>%
    ggplot(aes(x = value, fill = variable)) +
    geom_bar(width = 1,
             color = grey(level = 0.75),
             aes(y = ..prop..)) +
    facet_wrap( ~ variable, scales = "free_y", ncol = 1) +
    theme_dark()  +
    theme(legend.position = "none") +
    scale_y_continuous(labels = percent, breaks = c(0, 0.02, 0.04, 0.06, 0.08))

  print(the_percentile_plot)
}