Linear wage regression estimation of A and alpha from the Lalonde Training Dataset (722 Observations). Solve for optimal binary allocation queues. Test binary allocation queue with Lalonde training dataset. There are 722 observations, 297 in the treatment group, 425 in the control group. Following Lalonde, regressions are in terms of wage levels (not log wage)

Load Dependencies

rm(list = ls(all.names = TRUE))
options(knitr.duplicate.label = 'allow')
library(dplyr)
library(tidyr)
library(tibble)
library(forcats)
library(stringr)
library(broom)
library(ggplot2)
library(REconTools)

library(PrjOptiAlloc)

library(knitr)
library(kableExtra)

bl_save_rda = FALSE
bl_save_img = FALSE

Load Data and Generate New Variables

Generate four categories by initial height and mother's education levels combinations.

# Dataset
data(df_opt_lalonde_training)

# Add a binary variable for if there are wage in year 1975
dft <- df_opt_lalonde_training %>% 
  mutate(re75_zero = case_when(re75 <= 1 ~ 1, 
                               TRUE ~ 0)) %>%
  select(trt, re75_zero, everything())

# summary
REconTools::ff_summ_percentiles(dft)
# summary treated only
REconTools::ff_summ_percentiles(dft %>% filter(trt == 1))
REconTools::ff_summ_percentiles(dft %>% filter(trt == 0))
# within 297 treated, 111 (37%) had zero wage in 1975
# within 425 untreated, 178 (42%) had zero wage in 1975
# Load Data
dft <- dft %>% mutate(id = X) %>%
           select(-X) %>%
           select(id, everything())
# Summarize
str(dft)
summary(dft)

# Generate Employment 0 or 1 status Variable, and non-zero wages variable
dft <- dft %>% mutate(emp78 =
                      case_when(re78 <= 0 ~ 0,
                                TRUE ~ 1)) %>%
               mutate(emp75 =
                      case_when(re75 <= 0 ~ 0,
                                TRUE ~ 1)) %>%
               mutate(emp74 =
                      case_when(re74 <= 0 ~ 0,
                                TRUE ~ 1))

# Generate combine black + hispanic status
# 0 = white, 1 = black, 2 = hispanics
dft <- dft %>%
    mutate(race =
             case_when(black == 1 ~ 1,
                       hisp == 1 ~ 2,
                       TRUE ~ 0))

dft <- dft %>%
    mutate(age_m2 =
             case_when(age <= 23 ~ 1,
                       age >  23~ 2)) %>%
    mutate(age_m3 =
             case_when(age <= 20 ~ 1,
                       age > 20 & age <= 26 ~ 2,
                       age > 26 ~ 3))



# filter(re78 != 0) %>%
# mutate(re78_log = log(re78))

# Exclude zeros
# when this is on, both linear and log linear results exclude wage = 0
# dft <- dft %>%
#     filter(re78 > 0)

# Generate Discrete Version of continuous variables
# dft <- dft %>%
#     mutate(momwgtLowHigh = cut(lwt,
#                                breaks=c(-Inf, 129, Inf),
#                                labels=c("LW","HW"))) %>%
#     mutate(mombirthage = cut(age,
#                                breaks=c(-Inf, 24, Inf),
#                                labels=c("young","older")))

Regression with Data and Construct Input Arrays

Tabulate and Summarize Averages

What is the average difference in wage between treatment and control, do they match what is reported in Lalonde (1986)?

# Summarize average for all variables grouping by treatment status
# re78 is significantly different
round(t(dft %>% group_by(trt) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)
round(t(dft %>% group_by(marr, age_m2) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

# Summarize by finer sub groups: RACE
# big increase for black, but not for other group
round(t(dft %>% group_by(trt, race) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

# Summarize by finer sub groups: MARRIAGE
# big increase for black, but not for other group
round(t(dft %>% group_by(trt, marr) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

# Summarize by finer sub groups: AGE GROUPS
round(t(dft %>% group_by(trt, age_m2) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

round(t(dft %>% group_by(trt, age_m3) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

round(t(dft %>% group_by(trt, marr, age_m2) %>%
        summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)

Tabulate

# Tabulate groups, how many in each group, enough for group heterogeneity in effects?
dft %>%
  group_by(trt, marr) %>%
  summarize(freq = n()) %>%
  pivot_wider(names_from = trt, values_from = freq)

# Tabulate groups, how many in each group, enough for group heterogeneity in effects?
dft %>%
  group_by(trt, age_m2, marr) %>%
  summarize(freq = n()) %>%
  pivot_wider(names_from = trt, values_from = freq)

Regression Testing

# As noted in Lalonde (1986), functional form assumptions do not matter much
# Dummies, treatment effect average about 801 dollars
summary(lm(re78 ~ factor(age)
                  + factor(educ)
                  + factor(race)
                  + factor(marr)
                  + factor(nodeg)
                  + factor(trt) - 1,
                  data = dft))

# cts Controls and treatment effects about 761 dollars
summary(lm(re78 ~  age + I(age^2) +
                   educ + I(educ^2) +
                  + factor(race)
                  + factor(marr)
                  + factor(nodeg)
                  + factor(trt) - 1,
                  data = dft))

# Treatment interactions by marriage status, 476 unmarried, vs 2216 for married
summary(lm(re78 ~  age + I(age^2) +
                   educ + I(educ^2) +
                  + factor(race)
                  + factor(marr)
                  + factor(nodeg)
                  + factor(marr):factor(trt) - 1,
                  data = dft))

# Treatment interactions by marriage status, 453 vs 1070 each age group
summary(lm(re78 ~  age + I(age^2) + factor(age_m2) +
                   educ + I(educ^2) +
                  + factor(race)
                  + factor(marr)
                  + factor(nodeg)
                  + factor(age_m2):factor(trt) - 1,
                  data = dft))

# Treatment interactions by marriage status, greater effect for older married + younger married
summary(lm(re78 ~  age + I(age^2) + factor(age_m2) +
                   educ + I(educ^2) +
                  + factor(race)
                  + factor(marr)
                  + factor(nodeg)
                  + factor(marr):factor(age_m2):factor(trt) - 1,
                  data = dft))

Regress Wage on Training Status

Linear Binary Problem

# Store Regression Results
mt_model <- model.matrix( ~ age + I(age^2) + factor(age_m2) +
                            educ + I(educ^2) +
                          + factor(race)
                          + factor(marr)
                          + factor(nodeg)
                          + factor(re75_zero)
                          + factor(age_m2):factor(trt),
                          data = dft)
rs_wage_on_trt = lm(re78 ~ mt_model - 1, data = dft)
print(summary(rs_wage_on_trt))
rs_wage_on_trt_tidy = tidy(rs_wage_on_trt)
rs_wage_on_trt_tidy

Construct Input Arrays $A_i$ and $\alpha_i$

Linear Binary Regression

Multiply coefficient vector by covariate matrix to generate A vector that is child/individual specific.

# Estimates Table
head(rs_wage_on_trt_tidy, 6)
# Covariates
head(mt_model, 5)

# Covariates coefficients from regression (including constant)
ar_fl_cova_esti <- as.matrix(rs_wage_on_trt_tidy %>% filter(!str_detect(term, 'trt')) %>% select(estimate))
ar_fl_main_esti <- as.matrix(rs_wage_on_trt_tidy %>% filter(str_detect(term, 'trt')) %>% select(estimate))
head(ar_fl_cova_esti, 5)
head(ar_fl_main_esti, 5)

# Select Matrix subcomponents
mt_cova <- as.matrix(as_tibble(mt_model) %>% select(-contains("trt")))
mt_intr <- model.matrix(~ factor(marr) - 1, data = dft)

# Generate A_i, use mt_cova_wth_const
ar_A_m <- mt_cova %*% ar_fl_cova_esti
head(ar_A_m, 5)

# Generate alpha_i
ar_alpha_m <- mt_intr %*% ar_fl_main_esti
head(ar_alpha_m, 5)

Individual Weight

# Child Weight
ar_beta_m <- rep(1/length(ar_A_m), times=length(ar_A_m))

Matrix with Required Inputs for Allocation

Linear Binary Matrix

# Initate Dataframe that will store all estimates and optimal allocation relevant information
# combine identifying key information along with estimation A, alpha results
# note that we only need indi.id as key
mt_opti <- cbind(ar_alpha_m, ar_A_m, ar_beta_m)
ar_st_varnames <- c('alpha', 'A', 'beta')
df_esti_alpha_A_beta <- as_tibble(mt_opti) %>% rename_all(~c(ar_st_varnames))
tb_key_alpha_A_beta <- bind_cols(dft, df_esti_alpha_A_beta) %>%
              select(one_of(c('id', 'trt', 'age', 'educ', 'race', 'marr', 'nodeg', 're78',
                              ar_st_varnames)))

# Rescale A and alpha to deal more easily with large powers
tb_key_alpha_A_beta <- tb_key_alpha_A_beta %>%
                          mutate(alpha = alpha/1000, A = A/1000)

# Need to only include the smokers here
# tb_key_alpha_A_beta <- tb_key_alpha_A_beta %>% filter(trt == 0)

# Unique beta, A, and alpha check
tb_opti_unique <- tb_key_alpha_A_beta %>% group_by(!!!syms(ar_st_varnames)) %>%
                    arrange(!!!syms(ar_st_varnames)) %>%
                    summarise(n_obs_group=n())

# Show cars
head(tb_key_alpha_A_beta, 32)

Optimal Linear Allocations

svr_inpalc <- 'opti_alloc_queue'

# Child Count
it_obs = dim(tb_opti_unique)[1]

# Vector of Planner Preference
ar_rho <- c(-100, 0.8)
ar_rho <- c(-50, -25, -10)
ar_rho <- c(-100, -5, -1, 0.1, 0.6, 0.8)
# ar_rho <- c(seq(-200, -100, length.out=5), seq(-100, -25, length.out=5), seq(-25, -5, length.out=5), seq(-5, -1, length.out=5), seq(-1, -0.01, length.out=5), seq(0.01, 0.25, length.out=5), seq(0.25, 0.90, length.out=5))
# ar_rho <- c(-100, -5, -1, 0.1, 0.6, 0.99)
# ar_rho <- c(-20, -1, 0.05, 0.9)
# ar_rho <- c(-50, -40, -30, -20, -15, -10, -7.5, -5,-3,-2,-1)
# ar_rho = c(-100, -0.001,  0.95)
ar_rho = c(-100, -0.001,  0.95)
ar_rho <- c(seq(-200, -100, length.out=5), seq(-100, -25, length.out=5), seq(-25, -5, length.out=5), seq(-5, -1, length.out=5), seq(-1, -0.01, length.out=5), seq(0.01, 0.25, length.out=5), seq(0.25, 0.90, length.out=5))
ar_rho <- 1 - (10^(c(seq(-2,2, length.out=30))))
ar_rho <- unique(ar_rho)

svr_rho_val <- 'rho_val'
ls_bin_solu_all_rhos <-
  ffp_opt_anlyz_rhgin_bin(tb_key_alpha_A_beta,
                          svr_id_i = 'id',
                          svr_A_i = 'A', svr_alpha_i = 'alpha', svr_beta_i = 'beta',
                          ar_rho = ar_rho, 
                          svr_rho = 'rho', svr_rho_val = svr_rho_val,
                          svr_inpalc = svr_inpalc,
                          svr_expout = 'opti_exp_outcome')

df_all_rho <- ls_bin_solu_all_rhos$df_all_rho
df_all_rho_long <- ls_bin_solu_all_rhos$df_all_rho_long

# How many people have different ranks across rhos
it_how_many_vary_rank <- sum(df_all_rho$rank_max - df_all_rho$rank_min)
it_how_many_vary_rank

Save Results

# Change Variable names so that this can becombined with logit file later
df_all_rho <- df_all_rho %>% rename_at(vars(starts_with("rho_")), funs(str_replace(., "rk", "rk_wage")))
df_all_rho <- df_all_rho %>% rename(A_i_wage = A, alpha_i_wage = alpha, beta_i_wage = beta,
                                    rank_min_wage = rank_min, rank_max_wage = rank_max, avg_rank_wage = avg_rank)

# Save File
if (bl_save_rda) {
  df_opt_lalonde_training_wage <- df_all_rho
  usethis::use_data(df_opt_lalonde_training_wage, df_opt_lalonde_training_wage, overwrite = TRUE)
}

Change in Rank along rho

# get rank when wage rho = 1
df_all_rho_rho_c1 <- df_all_rho %>% select(id, rho_c1_rk_wage)

# Merge
df_all_rho_long_more <- df_all_rho_long %>% mutate(rho = as.numeric(rho)) %>%
                      left_join(df_all_rho_rho_c1, by='id')

# Select subset to graph
df_rank_graph <- df_all_rho_long_more %>%
                    mutate(id = factor(id)) %>%
                    filter((id == 167) |  # utilitarian rank = 1
                           (id == 175) |  # utilitarian rank = 101
                           (id == 3710)  |  # utilitarian rank = 202
                           (id == 2151) |  # utilitarian rank = 306
                           (id == 27)|  # utilitarian rank = 405
                           (id == 58) |  # utilitarian rank = 503
                           (id == 184) |  # utilitarian rank = 603
                           (id == 1481)   # utilitarian rank = 701
                             ) %>%
                    mutate(one_minus_rho = 1 - !!sym(svr_rho_val)) %>%
                    mutate(rho_c1_rk_wage = factor(rho_c1_rk_wage))

df_rank_graph$rho_c1_rk_wage

df_rank_graph$id <- factor(df_rank_graph$rho_c1_rk_wage,
                           labels = c('Rank= 1 when λ=0.99', #200
                                      'Rank= 101 when λ=0.99', #110
                                      'Rank= 201 when λ=0.99', #95
                                      'Rank= 301 when λ=0.99', #217
                                      'Rank= 401 when λ=0.99', #274
                                      'Rank= 501 when λ=0.99', #101
                                      'Rank= 601 when λ=0.99', #131
                                      'Rank= 701 when λ=0.99' #3610
                                       ))

# x-labels
x.labels <- c('λ=0.99', 'λ=0.90', 'λ=0', 'λ=-10', 'λ=-100')
x.breaks <- c(0.01, 0.10, 1, 10, 100)

# title line 2
title_line1 <- sprintf("LINEAR WAGE: Optimal Tageting Queue, NSW Lalonde (AER, 1986)")
title_line2 <- sprintf("How Do Allocation Rankings (Targeting Queue) Shift with λ?")

# Graph Results--Draw
line.plot <- df_rank_graph %>%
  ggplot(aes(x=one_minus_rho, y=!!sym(svr_inpalc),
             group=fct_rev(id),
             colour=fct_rev(id),
             size=2)) +
  # geom_line(aes(linetype =fct_rev(id)), size=0.75) +
  geom_line(size=0.75) +
  geom_vline(xintercept=c(1), linetype="dotted") +
  labs(title = paste0(title_line1, '\n', title_line2),
       x = 'log10 Rescale of λ, Log10(λ)\nλ=1 Utilitarian, λ=-infty Rawlsian',
       y = 'Optimal Targeting Queue Rank (1=highest)',
       caption = 'Training RCT with 297 treated, 425 untreated. Optimal Binary Targeting Queue.') +
  scale_x_continuous(trans='log10', labels = x.labels, breaks = x.breaks) +
  theme_bw()

print(line.plot)

Bump Plot for Optimal Binary Allocations

# tb_opti_alloc_all_rho_long %>%
#   ggplot(aes(x = rho, y = rank, group = id)) +
#     geom_line(aes(color = race, alpha = 1), size = 2) +
#     geom_point(aes(color = race, alpha = 1), size = 4) +
#     scale_x_discrete(expand = c(0.85,0))+
#     scale_y_reverse(breaks = 1:nrow(tb_opti_alloc_all_rho_long))+
#     theme(legend.position = "none") +
#     labs(x = "Equality vs Efficiency",
#          y = "Rank",
#          title = "Binary Allocation Rank, which untrained to receive training first") +
#     ffy_opt_ghthm_dk() +
#     geom_text(data =tb_opti_alloc_all_rho,aes(y=rho_c1_rk,x=0.6,label=id),hjust="right")


FanWangEcon/PrjOptiAlloc documentation built on Jan. 25, 2022, 6:55 a.m.