Test binary allocation queue with Lalonde training dataset. There are 722 observations, 297 in the treatment group, 425 in the control group.

Already completed optimal ranking analysis and regressions for wage and employment. Here, I combine the results together and generate some joint graphs analyzing:

  1. The relationship between expected probability of employment and wage ($A_i$) without training, and the expected return to training for employment and wage ($\alpha_i$).
  2. For each individual, does their optimal allocation ranking change under the Rawlsian to Utilitarian planner? For wage vs employment based rankings
  3. Who are those ranked in top 10 to receive? Along the spectrum
  4. Resource Equivalent Variation along the spectrum

Load Dependencies

rm(list = ls(all.names = TRUE))
options(knitr.duplicate.label = 'allow')
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(stringr)
library(broom)
library(haven)
library(ggplot2)
library(forcats)

library(REconTools)
library(PrjOptiAlloc)

library(knitr)
library(kableExtra)

bl_save_img = FALSE

Merge the Wage and Employmennt Rsults

Generate four categories by initial height and mother's education levels combinations.

spt_img_save <- '../_img/'
spt_img_save_draft <- 'C:/Users/fan/Documents/Dropbox (UH-ECON)/repos/HgtOptiAlloDraft/_img/'
# Load Data
data(df_opt_lalonde_training)
data(df_opt_lalonde_training_wage)
data(df_opt_lalonde_training_employ)
data(df_opt_lalonde_training_empage)

# dfj, dataframe joint
dfj <- df_opt_lalonde_training_employ %>%
  left_join(df_opt_lalonde_training_wage, by = 'id') %>%
  left_join(df_opt_lalonde_training_empage, by = 'id') %>%
  left_join(df_opt_lalonde_training %>% rename(id=X) %>%
              select(id, re74, re75))

# drop the .y variables, clean the .x out
dfj <- dfj %>% select(-contains(".y"), -contains(".x"))
# %>%
#           rename_at(vars(ends_with(".x")), funs(str_replace(., ".x", "")))

# order and organize variables
dfj <- dfj %>% select(id,
                      starts_with("A_"), starts_with("alpha_"), starts_with("beta_"),
                      contains("rank"), contains("rho"),
                      everything())

# treatment column to numeric
dfj$trt <- as.numeric(dfj$trt) - 1

# Summarize
# str(dfj)
summary(dfj)

Compute REV

Alternative Allocations

The observed random allocation, number of spots available.

ar_bin_observed <- dfj %>% pull(trt)
it_total_spots <- sum(ar_bin_observed)
print(it_total_spots)

There are 297 training spots.

A simple allocation rule is to just give training spots to those who had the lowest levels of wage in the initial year. This kind of simple initial condition based allocation rule is often used in practice.

There happen to be 289 individuals with zero wages in 1975. So the Naive allocation could provision to these individuals plus 8 additional individuals with the lowest non-zero wages in 1975.

# 1975 mean wage
dfj %>% group_by(trt) %>%
  summarize_at(vars(contains('re')),
               .funs = list(mean = ~mean(.)))

# 1974 Wage the number of zeros:
re75_n297 <- dfj %>% arrange(re75) %>% filter(row_number() <= it_total_spots) %>% select(id, re75)
REconTools::ff_summ_percentiles(re75_n297)
# The 297 lowest wages in 1975
dfj <- dfj %>% arrange(re75) %>%
  mutate(re75_zero = case_when(re75 == 0 ~ 1, re75 != 0 ~ 0)) %>%
  mutate(re75_rank = row_number()) %>%
  mutate(alloc_naive = case_when(re75_rank <= it_total_spots ~ 1,
                                 TRUE ~ 0))

REconTools::ff_summ_percentiles(dfj %>% select(re75_zero, re75_rank))

Compute REV

Compute Resource Equivalent Variations. Assume here that i use here the same ar_rho vector as used in emloyment and wage regressions.

Note that for each rho, the optimal ranking is different, and there are different optimal rankings for wage and employment outcomes.

Prepare for Various Inputs.

# Plann Preference Array
ar_rho = c(-100, -0.001,  0.95)
ar_rho <- 1 - (10^(c(seq(-2,2, length.out=30))))
ar_rho <- unique(ar_rho)

# Sort all Individuals
dfj <- dfj %>% arrange(id)

# Alternative Allocationc
ar_bin_observed <- dfj %>% pull(trt)
ar_bin_alloc_naive <- dfj %>% pull(alloc_naive)

# A and alpha Parameter
ar_A_wage <- dfj %>% pull(A_i_wage)
ar_A_employ <- dfj %>% pull(A_i_employ)
ar_A_empage <- dfj %>% pull(A_i_empage)

ar_alpha_wage <- dfj %>% pull(alpha_i_wage)
ar_alpha_employ <- dfj %>% pull(alpha_i_employ)
ar_alpha_empage <- dfj %>% pull(alpha_i_empage)

# Relative Preference Weights
ar_beta_wage <- dfj %>% pull(beta_i_wage)
ar_beta_employ <- dfj %>% pull(beta_i_employ)
ar_beta_empage <- dfj %>% pull(beta_i_empage)

Solve for Resource Equivalent Variations:

# REV Results Storage
ar_rev_wage_v_rand <- rep(0, length(ar_rho))
ar_rev_employ_v_rand <- rep(0, length(ar_rho))
ar_rev_empage <- rep(0, length(ar_rho))
ar_rev_employ_v_re75rank <- rep(0, length(ar_rho))

# Loop Over
for (it_rho_ctr in seq(1, length(ar_rho))) {

  fl_rho <- ar_rho[it_rho_ctr]

  svr_rho_employ  <- paste0('rho_c', it_rho_ctr, '_rk_employ')
  svr_rho_wage <- paste0('rho_c', it_rho_ctr, '_rk_wage')
  svr_rho_empage <- paste0('rho_c', it_rho_ctr, '_rk_empage')

  ar_queue_optimal_wage <- dfj %>% pull(svr_rho_wage)
  ar_queue_optimal_employ <- dfj %>% pull(svr_rho_employ)
  ar_queue_optimal_empage <- dfj %>% pull(svr_rho_empage)

  # 1. REV of Optimal All X Wage Regression relative to random observed
  ar_rev_wage_v_rand[it_rho_ctr] <-
    ffp_opt_sobin_rev(ar_queue_optimal_wage, ar_bin_observed,
                      ar_A_wage, ar_alpha_wage, ar_beta_wage,
                      fl_rho)

  # 2. REV of Optimal Age X only Employment Regression relative to random observed
  ar_rev_empage[it_rho_ctr] <-
    ffp_opt_sobin_rev(ar_queue_optimal_empage, ar_bin_observed,
                      ar_A_empage, ar_alpha_empage, ar_beta_empage,
                      fl_rho)

  # 3. REV of Optimal All X Employment Regression relative to random observed
  ar_rev_employ_v_rand[it_rho_ctr] <-
    ffp_opt_sobin_rev(ar_queue_optimal_employ, ar_bin_observed,
                      ar_A_employ, ar_alpha_employ, ar_beta_employ,
                      fl_rho)

  # 4. REV of Optimal All X Employment Regression relative to NAIVE ALLOCATION by Initial
  ar_rev_employ_v_re75rank[it_rho_ctr] <-
    ffp_opt_sobin_rev(ar_queue_optimal_employ, ar_bin_alloc_naive,
                      ar_A_employ, ar_alpha_employ, ar_beta_employ,
                      fl_rho)
}

# Print results
REconTools::ff_sup_lst2str(round(ar_rev_wage_v_rand,2))
REconTools::ff_sup_lst2str(round(ar_rev_employ_v_rand,2))
REconTools::ff_sup_lst2str(round(ar_rev_empage,2))
REconTools::ff_sup_lst2str(round(ar_rev_employ_v_re75rank,2))

Combine REV results to one dataframe:

# combine results
mt_combine <- cbind(ar_rho,
                    ar_rev_wage_v_rand*100,
                    ar_rev_employ_v_rand*100,
                    ar_rev_empage*100,
                    ar_rev_employ_v_re75rank*100)
colnames(mt_combine) <- c('rho',
                          'rev_expected_wage',
                          'rev_employment_prob',
                          'rev_employage_prob',
                          'rev_employ_v_re75rank')
tb_rev_wage_employ <- as_tibble(mt_combine) %>% rowid_to_column(var = "eval")
# Transform x-scale to 1-rho
tb_rev_wage_employ <- tb_rev_wage_employ %>% mutate(one_minus_rho = 1 - rho)

# Reshape to long
tb_rev_wage_employ_long <- tb_rev_wage_employ %>%
  pivot_longer(
    cols = starts_with('rev'),
    names_to = c('Outcome'),
    names_pattern = paste0('rev', "_(.*)"),
    values_to = 'rev'
  )

print(round(tb_rev_wage_employ %>%
              mutate(rand_v_init = rev_employment_prob/rev_employ_v_re75rank),2))

Draw the REV graph. Following this graph code.

# x-labels
x.labels <- c('λ=0.99', 'λ=0.90', 'λ=0', 'λ=-10', 'λ=-100')
x.breaks <- c(0.01, 0.10, 1, 10, 100)

# title line 2
# title_line1 <- sprintf("Percentage of Training Spots Misallocated, NSW Lalonde (AER, 1986)")
# title_line2 <- sprintf("REV (Resource Equivalent Variation) Along Planner Spectrum")

st_title <- sprintf(paste0('How much Fewer Resources are Needed (Shares) to Achieve the Same Welfare'))
title_line1 <- sprintf("Compare alternative allocations to optimal allocations given observables and estimates")
title_line2 <- sprintf("Solid Red Line: train 297 random NSW treatment individuals vs. optimally allocating 297 spots")
title_line3 <- sprintf("Dashed Blue Line: train 297 lowest baseline wage individuals vs. optimally allocating 297 spots")

# Relabel Variable
Outcome_levels <- c("random (RCT)" = "employment_prob", "baseline wage" = "employ_v_re75rank")
tb_rev_wage_employ_long_fig <- tb_rev_wage_employ_long %>%
  filter(Outcome == 'employment_prob' | Outcome == 'employ_v_re75rank') %>%
  mutate(Outcome = as_factor(Outcome)) %>%
  mutate(Outcome = fct_recode(Outcome, !!!Outcome_levels))

# Graph Results--Draw
line.plot <- tb_rev_wage_employ_long_fig %>%
  ggplot(aes(x=one_minus_rho, y=rev,
             group=Outcome,
             colour=Outcome,
             linetype=Outcome,
             shape=Outcome)) +
  geom_line() +
  geom_point() +
  # geom_vline(xintercept=c(1), linetype="dotted") +
  labs(title = st_title,
       subtitle = paste0(title_line1, '\n', title_line2, '\n', title_line3),
       x = 'log10 Rescale of λ, Log10(1-λ)\nλ=1 Utilitarian (Maximize Average), λ=-infty Rawlsian (Maximize Minimum)',
       y = paste0('100 x REV (Resource Equivalent Variations)'),
       caption = 'Based on a logistic regression of the employment effects of a training RCT. Data: Lalonde (AER, 1986).') +
  scale_x_continuous(trans='log10', labels = x.labels, breaks = x.breaks) +
  theme_bw(base_size=8) +
  ylim(0, 100)
# +
#   guides(colour=FALSE)


# Labeling
line.plot$labels$linetype <- "REV\nOptimal\nvs.\nAlternatives"
line.plot$labels$colour <- "REV\nOptimal\nvs.\nAlternatives"
line.plot$labels$shape <- "REV\nOptimal\nvs.\nAlternatives"

# Print
print(line.plot)

if (bl_save_img) {
  snm_cnts <- 'Lalonde_employ_rev.png'
  png(paste0(spt_img_save, snm_cnts),
      width = 135, height = 96, units='mm', res = 300, pointsize=7)
  print(line.plot)
  dev.off()
  png(paste0(spt_img_save_draft, snm_cnts),
      width = 135, height = 96, units='mm', res = 300,
      pointsize=5)
  print(line.plot)
  dev.off()
}

Analysis of A and alpha

Generate some tables where the distributions of $A$ and $\alpha$ are compared.

Scatter Plots

# Binary Marginal Effects and Prediction without Binary
ggplot.A.alpha <- function(df, svr_alpha = 'alpha_i', svr_A = "A_i",
                           slb_title = 'A_i and alpha_i (red)'){
  scatter <- ggplot(df, aes(x=!!sym(svr_A))) +
    geom_point(aes(y=!!sym(svr_alpha)), size=4, shape=4, color="red") +
    labs(title = paste0(slb_title),
         x = 'A_i = Prob Employ no Training',
         y = 'alpha_i = Effect of Training of Prob Employ',
         caption = paste0('Logit Regression on training. Control for age, educ, race. Age <= 23 or > 23 interaction.')) +
    theme_bw()
  return(scatter)
}

# Plot over multiple
ggplot.A.alpha(df = dfj,
               svr_alpha = 'alpha_i_wage', svr_A = "A_i_wage",
               slb_title = 'A_i and alpha_i, wage')

ggplot.A.alpha(df = dfj,
               svr_alpha = 'alpha_i_employ', svr_A = "A_i_employ",
               slb_title = 'Employment A_i and alpha_i Joint Distribution')

Histogram Plots

What is the distribution of A given alpha. This is relevant in the linear regression case

# Keep only relevant columns, and reshape data
dfj_hist <- dfj %>% select(id, A_i_wage, alpha_i_wage)
dfj_hist$alpha_i_wage <- factor(dfj_hist$alpha_i_wage)
dfj_hist$alpha_i_wage <- factor(dfj_hist$alpha_i_wage, labels = c('Age <= 23\nalpha_i=$454', 'Age > 23\nalpha_i=$1071'))


title_line1 <- sprintf("Wage A_i Distribution (Linear Wage), NSW Training Lalonde (AER, 1986)")
title_line2 <- sprintf("By Two Unique Levels of alpha_i: for Age <= 23 and for Age > 23")

# Graph
dfj_hist %>% ggplot(aes(x=A_i_wage, fill=alpha_i_wage)) +
  geom_density( color="#e9ecef", alpha=0.6, position = 'identity') +
  scale_fill_manual(values=c("#69b3a2", "#404080")) +
  labs(fill="") %>%
  labs(title = paste0(title_line1, '\n', title_line2),
       x = 'A_i = Expected Wage no Training (in thousands, 1979)',
       y = 'density',
       caption = paste0('Linear Wage Regression on training. Control for age, educ, race. Age <= 23 or > 23 interaction.'))

Min and Max Rank Change Across rho

The max calculated by ffp_opt_anlyz_rhgin_bin is the top rank, small in number. The min calculated by the function is the lowest ranked, largest number.

Histogram Plots and Table highest Rank Reached

Looked highest rank reached for each (highest rank (smallest number ever reached)).

# Generate min and max gaps
dfj_highest <- dfj %>% select(id, rank_max_wage, rank_max_employ)

# Wide to long
st_gap_prefix <- 'rank_max'
dfj_highest_long <- dfj_highest %>%
  pivot_longer(
    cols = starts_with(st_gap_prefix),
    names_to = c('employvswwage'),
    names_pattern = paste0(st_gap_prefix, "_(.*)"),
    values_to = 'highestrank'
  )

# Rank change to categories
dfj_highest_long <- dfj_highest_long %>%
  mutate(highestrank_grp =
           case_when(highestrank == 1 ~ "Top A 1",
                     highestrank <= 10  & highestrank > 1 ~ "Top B 10",
                     highestrank <= 50 & highestrank > 10 ~ "Top C 11 to 50",
                     highestrank <= 100 & highestrank > 50 ~ "Top D 51 to 100",
                     highestrank <= 297 & highestrank > 100 ~ "Top E 101 to 297",
                     highestrank > 297 ~ "Top F > 297"))

# Graph
dfj_highest_long %>% ggplot(aes(x=highestrank, fill=employvswwage)) +
  geom_histogram(color="#e9ecef", alpha=0.6, position = 'identity') +
  scale_fill_manual(values=c("#69b3a2", "#404080")) +
  labs(fill="")

# Table
dfj_highest_long %>% group_by(employvswwage, highestrank_grp) %>%
  summarize(count = n())

Histogram Plots and Table Min and Max Change

From the wage and employment analysis, each generates min and max rank

  1. generate rank min max gap for wage and employment: min minus max because min number is actually the larger number (lower rank)
  2. reshape wide to long, gap one variable, wage vs employment categorical
  3. show table summary statistics differences
  4. show graph differences
# Generate min and max gaps
dfj_gap <- dfj %>% mutate(rank_gap_wage = rank_min_wage - rank_max_wage,
                          rank_gap_employ = rank_min_employ - rank_max_employ) %>%
  select(id, rank_gap_wage, rank_gap_employ)

# Wide to long
st_gap_prefix <- 'rank_gap'
dfj_gap_long <- dfj_gap %>%
  pivot_longer(
    cols = starts_with(st_gap_prefix),
    names_to = c('employvswwage'),
    names_pattern = paste0(st_gap_prefix, "_(.*)"),
    values_to = 'rank_gap'
  )

# Rank change to categories
dfj_gap_long <- dfj_gap_long %>%
  mutate(rank_gap_grp =
           case_when(rank_gap == 0 ~ "change A no change",
                     rank_gap <= 50  & rank_gap > 0 ~ "change B 50 positions",
                     rank_gap <= 100 & rank_gap > 50 ~ "change C 51 to 100",
                     rank_gap <= 200 & rank_gap > 100 ~ "change D 101 to 200",
                     rank_gap <= 400 & rank_gap > 200 ~ "change E 201 to 400",
                     rank_gap > 400 ~ "change F more than 401"))

# Graph
dfj_gap_long %>% ggplot(aes(x=rank_gap, fill=employvswwage)) +
  geom_histogram(color="#e9ecef", alpha=0.6, position = 'identity') +
  scale_fill_manual(values=c("#69b3a2", "#404080")) +
  labs(fill="")

# Table
dfj_gap_long %>% group_by(employvswwage, rank_gap_grp) %>%
  summarize(count = n())

Table of Top 10 Individuals

#
# # Calculat the maximum rank reached by each from all the rhos we have
# # The difference between this and the other max, this computes across all rhos, employment and wage
#
# dfj_top10 <- dfj %>%
#     select(contains("rho_")) %>%
#     reduce(pmin) %>%
#     mutate(dfj, max_rank_rhos = .)
#
# dfj_top10 <- dfj_top10 %>% filter(max_rank_rhos <= 10) %>%
#   select(id, age, educ, black, race,
#          rho_c1_rk_employ, rho_c16_rk_employ, rho_c30_rk_employ,
#          rho_c1_rk_wage, rho_c16_rk_wage, rho_c30_rk_wage) %>%
#   rename(emp_util = rho_c1_rk_employ,
#          emp_CD   = rho_c16_rk_employ,
#          emp_rawl = rho_c30_rk_employ,
#          wag_util = rho_c1_rk_wage,
#          wag_CD   = rho_c16_rk_wage,
#          wag_rawl = rho_c30_rk_wage,
#          ) %>%
#   arrange(age, educ, race)
#
# dim(dfj_top10)
#
# # Graphing Library
# library(kableExtra)
# # Load Data
# dt <- mtcars[1:5, 1:6]
# # Generate latex string variable
# st_out_tex <- kable(dfj_top10, "latex")
# print(st_out_tex)
# # File out
# # fileConn <- file("./../../_file/tex/tex_sample_a_tab.tex")
# fileConn <- file("C:/users/fan/HgtOptiAlloDraft/_tab/Lalonde_wage_employ_top10.tex")
# writeLines(st_out_tex, fileConn)
# close(fileConn)
#
# # Display results here
# dfj_top10 %>%
#     kable() %>%
#     kable_styling(bootstrap_options = c("striped", "hover", "responsive"))


FanWangEcon/PrjOptiAlloc documentation built on Jan. 25, 2022, 6:55 a.m.