Linear wage regression estimation of A and alpha from the Lalonde Training Dataset (722 Observations). Solve for optimal binary allocation queues. Test binary allocation queue with Lalonde training dataset. There are 722 observations, 297 in the treatment group, 425 in the control group. Following Lalonde, regressions are in terms of wage levels (not log wage)
rm(list = ls(all.names = TRUE)) options(knitr.duplicate.label = 'allow')
library(dplyr) library(tidyr) library(tibble) library(forcats) library(stringr) library(broom) library(ggplot2) library(REconTools) library(PrjOptiAlloc) library(knitr) library(kableExtra) bl_save_rda = FALSE bl_save_img = FALSE
Generate four categories by initial height and mother's education levels combinations.
# Dataset data(df_opt_lalonde_training) # Add a binary variable for if there are wage in year 1975 dft <- df_opt_lalonde_training %>% mutate(re75_zero = case_when(re75 <= 1 ~ 1, TRUE ~ 0)) %>% select(trt, re75_zero, everything()) # summary REconTools::ff_summ_percentiles(dft) # summary treated only REconTools::ff_summ_percentiles(dft %>% filter(trt == 1)) REconTools::ff_summ_percentiles(dft %>% filter(trt == 0)) # within 297 treated, 111 (37%) had zero wage in 1975 # within 425 untreated, 178 (42%) had zero wage in 1975
# Load Data dft <- dft %>% mutate(id = X) %>% select(-X) %>% select(id, everything()) # Summarize str(dft) summary(dft) # Generate Employment 0 or 1 status Variable, and non-zero wages variable dft <- dft %>% mutate(emp78 = case_when(re78 <= 0 ~ 0, TRUE ~ 1)) %>% mutate(emp75 = case_when(re75 <= 0 ~ 0, TRUE ~ 1)) %>% mutate(emp74 = case_when(re74 <= 0 ~ 0, TRUE ~ 1)) # Generate combine black + hispanic status # 0 = white, 1 = black, 2 = hispanics dft <- dft %>% mutate(race = case_when(black == 1 ~ 1, hisp == 1 ~ 2, TRUE ~ 0)) dft <- dft %>% mutate(age_m2 = case_when(age <= 23 ~ 1, age > 23~ 2)) %>% mutate(age_m3 = case_when(age <= 20 ~ 1, age > 20 & age <= 26 ~ 2, age > 26 ~ 3)) # filter(re78 != 0) %>% # mutate(re78_log = log(re78)) # Exclude zeros # when this is on, both linear and log linear results exclude wage = 0 # dft <- dft %>% # filter(re78 > 0) # Generate Discrete Version of continuous variables # dft <- dft %>% # mutate(momwgtLowHigh = cut(lwt, # breaks=c(-Inf, 129, Inf), # labels=c("LW","HW"))) %>% # mutate(mombirthage = cut(age, # breaks=c(-Inf, 24, Inf), # labels=c("young","older")))
What is the average difference in wage between treatment and control, do they match what is reported in Lalonde (1986)?
# Summarize average for all variables grouping by treatment status # re78 is significantly different round(t(dft %>% group_by(trt) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) round(t(dft %>% group_by(marr, age_m2) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) # Summarize by finer sub groups: RACE # big increase for black, but not for other group round(t(dft %>% group_by(trt, race) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) # Summarize by finer sub groups: MARRIAGE # big increase for black, but not for other group round(t(dft %>% group_by(trt, marr) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) # Summarize by finer sub groups: AGE GROUPS round(t(dft %>% group_by(trt, age_m2) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) round(t(dft %>% group_by(trt, age_m3) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3) round(t(dft %>% group_by(trt, marr, age_m2) %>% summarise_if(is.numeric, funs(mean = mean), na.rm = TRUE)), digits=3)
# Tabulate groups, how many in each group, enough for group heterogeneity in effects? dft %>% group_by(trt, marr) %>% summarize(freq = n()) %>% pivot_wider(names_from = trt, values_from = freq) # Tabulate groups, how many in each group, enough for group heterogeneity in effects? dft %>% group_by(trt, age_m2, marr) %>% summarize(freq = n()) %>% pivot_wider(names_from = trt, values_from = freq)
# As noted in Lalonde (1986), functional form assumptions do not matter much # Dummies, treatment effect average about 801 dollars summary(lm(re78 ~ factor(age) + factor(educ) + factor(race) + factor(marr) + factor(nodeg) + factor(trt) - 1, data = dft)) # cts Controls and treatment effects about 761 dollars summary(lm(re78 ~ age + I(age^2) + educ + I(educ^2) + + factor(race) + factor(marr) + factor(nodeg) + factor(trt) - 1, data = dft)) # Treatment interactions by marriage status, 476 unmarried, vs 2216 for married summary(lm(re78 ~ age + I(age^2) + educ + I(educ^2) + + factor(race) + factor(marr) + factor(nodeg) + factor(marr):factor(trt) - 1, data = dft)) # Treatment interactions by marriage status, 453 vs 1070 each age group summary(lm(re78 ~ age + I(age^2) + factor(age_m2) + educ + I(educ^2) + + factor(race) + factor(marr) + factor(nodeg) + factor(age_m2):factor(trt) - 1, data = dft)) # Treatment interactions by marriage status, greater effect for older married + younger married summary(lm(re78 ~ age + I(age^2) + factor(age_m2) + educ + I(educ^2) + + factor(race) + factor(marr) + factor(nodeg) + factor(marr):factor(age_m2):factor(trt) - 1, data = dft))
# Store Regression Results mt_model <- model.matrix( ~ age + I(age^2) + factor(age_m2) + educ + I(educ^2) + + factor(race) + factor(marr) + factor(nodeg) + factor(re75_zero) + factor(age_m2):factor(trt), data = dft) rs_wage_on_trt = lm(re78 ~ mt_model - 1, data = dft) print(summary(rs_wage_on_trt)) rs_wage_on_trt_tidy = tidy(rs_wage_on_trt) rs_wage_on_trt_tidy
Multiply coefficient vector by covariate matrix to generate A vector that is child/individual specific.
# Estimates Table head(rs_wage_on_trt_tidy, 6) # Covariates head(mt_model, 5) # Covariates coefficients from regression (including constant) ar_fl_cova_esti <- as.matrix(rs_wage_on_trt_tidy %>% filter(!str_detect(term, 'trt')) %>% select(estimate)) ar_fl_main_esti <- as.matrix(rs_wage_on_trt_tidy %>% filter(str_detect(term, 'trt')) %>% select(estimate)) head(ar_fl_cova_esti, 5) head(ar_fl_main_esti, 5) # Select Matrix subcomponents mt_cova <- as.matrix(as_tibble(mt_model) %>% select(-contains("trt"))) mt_intr <- model.matrix(~ factor(marr) - 1, data = dft) # Generate A_i, use mt_cova_wth_const ar_A_m <- mt_cova %*% ar_fl_cova_esti head(ar_A_m, 5) # Generate alpha_i ar_alpha_m <- mt_intr %*% ar_fl_main_esti head(ar_alpha_m, 5)
# Child Weight ar_beta_m <- rep(1/length(ar_A_m), times=length(ar_A_m))
# Initate Dataframe that will store all estimates and optimal allocation relevant information # combine identifying key information along with estimation A, alpha results # note that we only need indi.id as key mt_opti <- cbind(ar_alpha_m, ar_A_m, ar_beta_m) ar_st_varnames <- c('alpha', 'A', 'beta') df_esti_alpha_A_beta <- as_tibble(mt_opti) %>% rename_all(~c(ar_st_varnames)) tb_key_alpha_A_beta <- bind_cols(dft, df_esti_alpha_A_beta) %>% select(one_of(c('id', 'trt', 'age', 'educ', 'race', 'marr', 'nodeg', 're78', ar_st_varnames))) # Rescale A and alpha to deal more easily with large powers tb_key_alpha_A_beta <- tb_key_alpha_A_beta %>% mutate(alpha = alpha/1000, A = A/1000) # Need to only include the smokers here # tb_key_alpha_A_beta <- tb_key_alpha_A_beta %>% filter(trt == 0) # Unique beta, A, and alpha check tb_opti_unique <- tb_key_alpha_A_beta %>% group_by(!!!syms(ar_st_varnames)) %>% arrange(!!!syms(ar_st_varnames)) %>% summarise(n_obs_group=n()) # Show cars head(tb_key_alpha_A_beta, 32)
svr_inpalc <- 'opti_alloc_queue' # Child Count it_obs = dim(tb_opti_unique)[1] # Vector of Planner Preference ar_rho <- c(-100, 0.8) ar_rho <- c(-50, -25, -10) ar_rho <- c(-100, -5, -1, 0.1, 0.6, 0.8) # ar_rho <- c(seq(-200, -100, length.out=5), seq(-100, -25, length.out=5), seq(-25, -5, length.out=5), seq(-5, -1, length.out=5), seq(-1, -0.01, length.out=5), seq(0.01, 0.25, length.out=5), seq(0.25, 0.90, length.out=5)) # ar_rho <- c(-100, -5, -1, 0.1, 0.6, 0.99) # ar_rho <- c(-20, -1, 0.05, 0.9) # ar_rho <- c(-50, -40, -30, -20, -15, -10, -7.5, -5,-3,-2,-1) # ar_rho = c(-100, -0.001, 0.95) ar_rho = c(-100, -0.001, 0.95) ar_rho <- c(seq(-200, -100, length.out=5), seq(-100, -25, length.out=5), seq(-25, -5, length.out=5), seq(-5, -1, length.out=5), seq(-1, -0.01, length.out=5), seq(0.01, 0.25, length.out=5), seq(0.25, 0.90, length.out=5)) ar_rho <- 1 - (10^(c(seq(-2,2, length.out=30)))) ar_rho <- unique(ar_rho) svr_rho_val <- 'rho_val' ls_bin_solu_all_rhos <- ffp_opt_anlyz_rhgin_bin(tb_key_alpha_A_beta, svr_id_i = 'id', svr_A_i = 'A', svr_alpha_i = 'alpha', svr_beta_i = 'beta', ar_rho = ar_rho, svr_rho = 'rho', svr_rho_val = svr_rho_val, svr_inpalc = svr_inpalc, svr_expout = 'opti_exp_outcome') df_all_rho <- ls_bin_solu_all_rhos$df_all_rho df_all_rho_long <- ls_bin_solu_all_rhos$df_all_rho_long # How many people have different ranks across rhos it_how_many_vary_rank <- sum(df_all_rho$rank_max - df_all_rho$rank_min) it_how_many_vary_rank
# Change Variable names so that this can becombined with logit file later df_all_rho <- df_all_rho %>% rename_at(vars(starts_with("rho_")), funs(str_replace(., "rk", "rk_wage"))) df_all_rho <- df_all_rho %>% rename(A_i_wage = A, alpha_i_wage = alpha, beta_i_wage = beta, rank_min_wage = rank_min, rank_max_wage = rank_max, avg_rank_wage = avg_rank) # Save File if (bl_save_rda) { df_opt_lalonde_training_wage <- df_all_rho usethis::use_data(df_opt_lalonde_training_wage, df_opt_lalonde_training_wage, overwrite = TRUE) }
# get rank when wage rho = 1 df_all_rho_rho_c1 <- df_all_rho %>% select(id, rho_c1_rk_wage) # Merge df_all_rho_long_more <- df_all_rho_long %>% mutate(rho = as.numeric(rho)) %>% left_join(df_all_rho_rho_c1, by='id') # Select subset to graph df_rank_graph <- df_all_rho_long_more %>% mutate(id = factor(id)) %>% filter((id == 167) | # utilitarian rank = 1 (id == 175) | # utilitarian rank = 101 (id == 3710) | # utilitarian rank = 202 (id == 2151) | # utilitarian rank = 306 (id == 27)| # utilitarian rank = 405 (id == 58) | # utilitarian rank = 503 (id == 184) | # utilitarian rank = 603 (id == 1481) # utilitarian rank = 701 ) %>% mutate(one_minus_rho = 1 - !!sym(svr_rho_val)) %>% mutate(rho_c1_rk_wage = factor(rho_c1_rk_wage)) df_rank_graph$rho_c1_rk_wage df_rank_graph$id <- factor(df_rank_graph$rho_c1_rk_wage, labels = c('Rank= 1 when λ=0.99', #200 'Rank= 101 when λ=0.99', #110 'Rank= 201 when λ=0.99', #95 'Rank= 301 when λ=0.99', #217 'Rank= 401 when λ=0.99', #274 'Rank= 501 when λ=0.99', #101 'Rank= 601 when λ=0.99', #131 'Rank= 701 when λ=0.99' #3610 )) # x-labels x.labels <- c('λ=0.99', 'λ=0.90', 'λ=0', 'λ=-10', 'λ=-100') x.breaks <- c(0.01, 0.10, 1, 10, 100) # title line 2 title_line1 <- sprintf("LINEAR WAGE: Optimal Tageting Queue, NSW Lalonde (AER, 1986)") title_line2 <- sprintf("How Do Allocation Rankings (Targeting Queue) Shift with λ?") # Graph Results--Draw line.plot <- df_rank_graph %>% ggplot(aes(x=one_minus_rho, y=!!sym(svr_inpalc), group=fct_rev(id), colour=fct_rev(id), size=2)) + # geom_line(aes(linetype =fct_rev(id)), size=0.75) + geom_line(size=0.75) + geom_vline(xintercept=c(1), linetype="dotted") + labs(title = paste0(title_line1, '\n', title_line2), x = 'log10 Rescale of λ, Log10(λ)\nλ=1 Utilitarian, λ=-infty Rawlsian', y = 'Optimal Targeting Queue Rank (1=highest)', caption = 'Training RCT with 297 treated, 425 untreated. Optimal Binary Targeting Queue.') + scale_x_continuous(trans='log10', labels = x.labels, breaks = x.breaks) + theme_bw() print(line.plot)
# tb_opti_alloc_all_rho_long %>% # ggplot(aes(x = rho, y = rank, group = id)) + # geom_line(aes(color = race, alpha = 1), size = 2) + # geom_point(aes(color = race, alpha = 1), size = 4) + # scale_x_discrete(expand = c(0.85,0))+ # scale_y_reverse(breaks = 1:nrow(tb_opti_alloc_all_rho_long))+ # theme(legend.position = "none") + # labs(x = "Equality vs Efficiency", # y = "Rank", # title = "Binary Allocation Rank, which untrained to receive training first") + # ffy_opt_ghthm_dk() + # geom_text(data =tb_opti_alloc_all_rho,aes(y=rho_c1_rk,x=0.6,label=id),hjust="right")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.