library(haven) library(tidyr) library(dplyr) library(ggplot2) library(pforeach) library(stRatification)
The data set is downloadable from this page https://ideas.repec.org/p/boc/bocins/webstar.html. if you are interested in the definition of each column, see the following link http://wise.xmu.edu.cn/course/gecon/star_Description.pdf.
data <- read_dta("http://fmwww.bc.edu/ec-p/data/stockwatson/star_sw.dta")
as a result we get n = 3784, which is slightly larger than the sample size in the paper.
data <- data %>% filter(rak != 1) %>% mutate(math_score = tmathssk, small_class = sck, female = ifelse(boy == 1, 0, 1), black = ifelse(black == 1, 1, 0), free_lunch = freelunk, school_attended = schidkn %>% as.factor() ) %>% select(math_score, small_class, female, black, free_lunch, school_attended) %>% na.omit()
regsd <- data %>% filter(small_class == 0) %>% summarise(regular_sd = sd(math_score)) data <- data %>% mutate(math_score_stand = math_score/regsd$regular_sd) hist(data$math_score_stand)
reg_nonadj <- lm(data = data, formula = math_score_stand ~ small_class) summary(reg_nonadj)
reg_adj <- lm(data = data, formula = math_score_stand ~ small_class + female + black + free_lunch + school_attended) summary(reg_adj)
ate_nonadj <- data.frame(coef = summary(reg_nonadj)$coef["small_class","Estimate"], stder = summary(reg_nonadj)$coef["small_class","Std. Error"], adjusted = "non_adjusted", orig = "reproduce") ate_adj <- data.frame(coef = summary(reg_adj)$coef["small_class","Estimate"], stder = summary(reg_adj)$coef["small_class","Std. Error"], adjusted = "adjusted", orig = "reproduce") ate_ori <- data.frame(coef = c(0.1659, 0.1892), stder = c(0.0329, 0.0294), adjusted = c("non_adjusted", "adjusted"), orig = "original") panel_a <- rbind(ate_nonadj, ate_adj, ate_ori) panel_a %>% ggplot(aes(y = coef, x = adjusted, fill = orig)) + geom_bar(position = position_dodge(), stat = "identity", color = "black") + geom_errorbar(data = panel_a, aes(ymin = coef - 1.96*stder, ymax = coef + 1.96*stder, width = 0.2), position = position_dodge(.9)) + xlab("adjusted/unadjusted estimation of ATE") + ylab("Estimated ATE in dummy regression") + ggtitle("Reproduction of Table 2, PanelA in Abadie et al. (2013)") + theme(plot.title = element_text(hjust = 0.5))
fs_result_unadjusted <- fullsample_stratification(data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), adjusted = F, ntilen = 3)
fs_result_adjusted <- fullsample_stratification(data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), adjusted = T, ntilen = 3)
fsbst_result_unadjusted <- fullsample_stratification_bst(R = 1000, data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), adjusted = F, ntilen = 3, parallel = T) fsbst_se_unadjusted <- fsbst_result_unadjusted %>% group_by(nt, r) %>% summarise(ATE = mean(ATE)) %>% group_by(nt) %>% summarise(stder = sd(ATE)) fsbst_result_adjusted <- fullsample_stratification_bst(R = 1000, data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), adjusted = T, ntilen = 3, parallel = T) fsbst_se_adjusted <- fsbst_result_adjusted %>% group_by(nt, r) %>% summarise(ATE = mean(ATE)) %>% group_by(nt) %>% summarise(stder = sd(ATE))
ss_result_unadjusted <- sample_splitting_estimation(data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), M = 100, ntilen = 3, adjusted = F)
ss_result_adjusted <- sample_splitting_estimation(data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), M = 100, ntilen = 3, adjusted = T)
This process may take about 1hour, so enjoy your coffee. I'm looking for some help to improve the speed of this process.
ssbst_result_adjusted <- sample_splitting_stratification_bst(R = 1000, data = data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), M = 100, ntilen = 3, adjusted = T, parallel = T) ssbst_se_adjusted <- ssbst_result_adjusted %>% group_by(nt,r) %>% summarise(ATE = mean(ATE)) %>% group_by(nt) %>% summarise(stder = sd(ATE)) ssbst_result_unadjusted <- sample_splitting_stratification_bst(R = 1000, data = data, Y = "math_score_stand", treatment = "small_class", Xvar = c("female","black","free_lunch","school_attended"), M = 100, ntilen = 3, adjusted = F, parallel = T) ssbst_se_unadjusted <- ssbst_result_unadjusted %>% group_by(nt,r) %>% summarise(ATE = mean(ATE)) %>% group_by(nt) %>% summarise(stder = sd(ATE))
fs_res <- fs_result_adjusted %>% select(-stder) %>% inner_join(fsbst_se_adjusted, by = "nt") %>% mutate(method = "fs_adjusted") %>% rbind(fs_result_unadjusted %>% select(-stder) %>% inner_join(fsbst_se_unadjusted, by = "nt") %>% mutate(method = "fs_unadjusted")) %>% select(-M) ss_res <- ss_result_unadjusted %>% group_by(nt) %>% summarise(ATE = mean(ATE)) %>% inner_join(ssbst_se_unadjusted, by = "nt") %>% mutate(method = "ss_unadjusted") %>% rbind(ss_result_adjusted %>% group_by(nt) %>% summarise(ATE = mean(ATE)) %>% inner_join(ssbst_se_unadjusted, by = "nt") %>% mutate(method = "ss_adjusted")) plot_dataset <- rbind(fs_res, ss_res) %>% mutate(origin = "reproduction") plot_dataset %>% ggplot(aes(y = ATE, x = nt, fill = method)) + geom_bar(position = position_dodge(), stat="identity", color = "black") + geom_errorbar(data = plot_dataset, aes(ymin = ATE - 1.96*stder, ymax = ATE + 1.96*stder, width = 0.2), position = position_dodge(.9)) + xlab("ntile of predicted math score(1 = low, 3 = high)") + ylab("Estimated ATE in dummy regression") + ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") + theme(plot.title = element_text(hjust = 0.5))
comp_dataset <- data.frame( nt = rep(paste("ntile", 1:3, sep = "_"), 4), ATE = c(0.3705, 0.2688, -0.133, 0.3908, 0.3023, -0.1242, 0.3152, 0.2617, -0.052, 0.3130, 0.3005, -0.0374), stder = c(0.0521, 0.0655, 0.0636, 0.0509, 0.0678, 0.0614, 0.0467, 0.0505, 0.0567, 0.0459, 0.0526, 0.0552), method = c(rep("fs_unadjusted", 3), rep("fs_adjusted", 3), rep("ss_unadjusted", 3), rep("ss_adjusted", 3)), origin = "original" ) %>% rbind(plot_dataset) comp_dataset %>% ggplot(aes(y = ATE, x = nt, fill = method)) + geom_bar(position = position_dodge(), stat="identity", color = "black") + geom_errorbar(data = comp_dataset, aes(ymin = ATE - 1.96*stder, ymax = ATE + 1.96*stder, width = 0.2), position = position_dodge(.9)) + xlab("ntile of predicted math score(1 = low, 3 = high)") + ylab("Estimated ATE in dummy regression") + ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") + theme(plot.title = element_text(hjust = 0.5)) + facet_grid(~ origin)
dataset is not exactly the same with the dataset used in the paper.(we have 20 more sample some how)
sample splitting estimation is fluctuate a little bit, since its based on sampling.
a. low and middle group have positive and significant effect in any case, and estimation method may not change the result largely.
b. high group have negative and significant effect in the case where full sample stratification is employed, and no significant effect was confirmed in the sample splitting estimation.
c. By confirming the gap between the result of high and low group, high group have negative bias and low group have positive bias in full sample estimation.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.