library(haven)
library(tidyr)
library(dplyr)
library(ggplot2)
library(pforeach)
library(stRatification)

Setup

Download & Read STAR dataset

The data set is downloadable from this page https://ideas.repec.org/p/boc/bocins/webstar.html. if you are interested in the definition of each column, see the following link http://wise.xmu.edu.cn/course/gecon/star_Description.pdf.

data <- read_dta("http://fmwww.bc.edu/ec-p/data/stockwatson/star_sw.dta")

mutate column names into more close to the expression in the paper

as a result we get n = 3784, which is slightly larger than the sample size in the paper.

data <- data %>%
  filter(rak != 1) %>%
  mutate(math_score = tmathssk,
         small_class = sck,
         female = ifelse(boy == 1, 0, 1),
         black = ifelse(black == 1, 1, 0),
         free_lunch = freelunk,
         school_attended = schidkn %>% as.factor()
  ) %>%
  select(math_score, small_class, female, black, free_lunch, school_attended)  %>% na.omit()

get the standard deviation inthe regular class, then calculate stardardized math score

regsd <- data %>% filter(small_class == 0) %>% summarise(regular_sd = sd(math_score))
data <- data %>% mutate(math_score_stand = math_score/regsd$regular_sd)
hist(data$math_score_stand)

Reproduction of Table.2 in Alberto(2013)

Panel A: Average treatment effect

unadjusted

reg_nonadj <- lm(data = data, formula = math_score_stand ~ small_class)
summary(reg_nonadj)

adjusted

reg_adj <- lm(data = data, formula = math_score_stand ~ small_class + female + black + free_lunch + school_attended)
summary(reg_adj)

ate_nonadj <- data.frame(coef = summary(reg_nonadj)$coef["small_class","Estimate"], 
                         stder = summary(reg_nonadj)$coef["small_class","Std. Error"], 
                         adjusted = "non_adjusted",
                         orig = "reproduce")

ate_adj <- data.frame(coef = summary(reg_adj)$coef["small_class","Estimate"], 
                         stder = summary(reg_adj)$coef["small_class","Std. Error"], 
                         adjusted = "adjusted",
                         orig = "reproduce")

ate_ori <- data.frame(coef = c(0.1659, 0.1892),
                      stder = c(0.0329, 0.0294),
                      adjusted = c("non_adjusted", "adjusted"),
                         orig = "original")

panel_a <- rbind(ate_nonadj, ate_adj, ate_ori)

panel_a %>%
  ggplot(aes(y = coef, x = adjusted, fill = orig)) + 
  geom_bar(position = position_dodge(),
           stat = "identity", color = "black") +
  geom_errorbar(data = panel_a,
                aes(ymin = coef - 1.96*stder,
                    ymax = coef + 1.96*stder,
                width = 0.2),
                position = position_dodge(.9)) +
  xlab("adjusted/unadjusted estimation of ATE") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2, PanelA in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5))

Panel B: Average treatment effect by predicted outcome group

fullsample and unadjusted

fs_result_unadjusted <- fullsample_stratification(data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = F,
                                                  ntilen = 3)

fullsample and adjusted

fs_result_adjusted <- fullsample_stratification(data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = T,
                                                  ntilen = 3)

calculate standard error(fullsample)

fsbst_result_unadjusted <- fullsample_stratification_bst(R = 1000,
                                                data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = F,
                                                  ntilen = 3,
                                                parallel = T)

fsbst_se_unadjusted <- fsbst_result_unadjusted %>%
  group_by(nt, r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>% 
  summarise(stder = sd(ATE))


fsbst_result_adjusted <- fullsample_stratification_bst(R = 1000,
                                                data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = T,
                                                  ntilen = 3,
                                                parallel = T)

fsbst_se_adjusted <- fsbst_result_adjusted %>%
  group_by(nt, r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>% 
  summarise(stder = sd(ATE))

samplesplit and unadjusted

ss_result_unadjusted <- sample_splitting_estimation(data,
                                                    Y = "math_score_stand",
                                                    treatment = "small_class",
                                                    Xvar = c("female","black","free_lunch","school_attended"),
                                                    M = 100,
                                                    ntilen = 3,
                                                    adjusted = F)

samplesplit and adjusted

ss_result_adjusted <- sample_splitting_estimation(data,
                                                    Y = "math_score_stand",
                                                    treatment = "small_class",
                                                    Xvar = c("female","black","free_lunch","school_attended"),
                                                    M = 100,
                                                    ntilen = 3,
                                                    adjusted = T)

calculate standard error(samplesplit)

This process may take about 1hour, so enjoy your coffee. I'm looking for some help to improve the speed of this process.

ssbst_result_adjusted <- sample_splitting_stratification_bst(R = 1000,
                                                             data = data,
                                                             Y = "math_score_stand",
                                                             treatment = "small_class",
                                                             Xvar = c("female","black","free_lunch","school_attended"),
                                                             M = 100,
                                                             ntilen = 3,
                                                             adjusted = T,
                                                             parallel = T)

ssbst_se_adjusted <- ssbst_result_adjusted  %>% 
  group_by(nt,r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>%
  summarise(stder = sd(ATE))

ssbst_result_unadjusted <- sample_splitting_stratification_bst(R = 1000,
                                                               data = data,
                                                               Y = "math_score_stand",
                                                               treatment = "small_class",
                                                               Xvar = c("female","black","free_lunch","school_attended"),
                                                               M = 100,
                                                               ntilen = 3,
                                                               adjusted = F,
                                                               parallel = T)

ssbst_se_unadjusted <- ssbst_result_unadjusted  %>% 
  group_by(nt,r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>%
  summarise(stder = sd(ATE))

plot table_2 reproduction

fs_res <- fs_result_adjusted %>%
  select(-stder) %>%
  inner_join(fsbst_se_adjusted, by = "nt") %>%
  mutate(method = "fs_adjusted") %>%
  rbind(fs_result_unadjusted %>%
          select(-stder) %>%
  inner_join(fsbst_se_unadjusted, by = "nt") %>%
          mutate(method = "fs_unadjusted")) %>%
  select(-M)

ss_res <- ss_result_unadjusted %>% 
  group_by(nt) %>%
  summarise(ATE = mean(ATE)) %>%
  inner_join(ssbst_se_unadjusted, by = "nt") %>%
  mutate(method = "ss_unadjusted") %>%
  rbind(ss_result_adjusted %>% 
  group_by(nt) %>%
  summarise(ATE = mean(ATE)) %>%
  inner_join(ssbst_se_unadjusted, by = "nt") %>%
    mutate(method = "ss_adjusted"))

plot_dataset <- rbind(fs_res, ss_res) %>% mutate(origin = "reproduction")

plot_dataset %>%
  ggplot(aes(y = ATE, x = nt, fill = method)) + 
  geom_bar(position = position_dodge(), 
           stat="identity", color = "black") +
  geom_errorbar(data = plot_dataset,
                aes(ymin = ATE - 1.96*stder,
                    ymax = ATE + 1.96*stder,
                    width = 0.2),
                    position = position_dodge(.9)) +
  xlab("ntile of predicted math score(1 = low, 3 = high)") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5))

plot the comparison

comp_dataset <- data.frame(
  nt = rep(paste("ntile", 1:3, sep = "_"), 4),
  ATE = c(0.3705, 0.2688, -0.133, 0.3908, 0.3023, -0.1242,
          0.3152, 0.2617, -0.052, 0.3130, 0.3005, -0.0374),
  stder = c(0.0521, 0.0655, 0.0636, 0.0509, 0.0678, 0.0614,
            0.0467, 0.0505, 0.0567, 0.0459, 0.0526, 0.0552),
  method = c(rep("fs_unadjusted", 3), rep("fs_adjusted", 3),
             rep("ss_unadjusted", 3), rep("ss_adjusted", 3)),
  origin = "original"
) %>%
  rbind(plot_dataset)


comp_dataset %>%
  ggplot(aes(y = ATE, x = nt, fill = method)) + 
  geom_bar(position = position_dodge(), 
           stat="identity", color = "black") +
  geom_errorbar(data = comp_dataset,
                aes(ymin = ATE - 1.96*stder,
                    ymax = ATE + 1.96*stder,
                    width = 0.2),
                    position = position_dodge(.9)) +
  xlab("ntile of predicted math score(1 = low, 3 = high)") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(~ origin)

The above result is not exactly the same with the original table2.

This may be caused by following two reasons.

  1. dataset is not exactly the same with the dataset used in the paper.(we have 20 more sample some how)

  2. sample splitting estimation is fluctuate a little bit, since its based on sampling.

However, we are able to confirm the conclusion does not change from this difference.

From this table we are able to conclude three things.

a. low and middle group have positive and significant effect in any case, and estimation method may not change the result largely.

b. high group have negative and significant effect in the case where full sample stratification is employed, and no significant effect was confirmed in the sample splitting estimation.

c. By confirming the gap between the result of high and low group, high group have negative bias and low group have positive bias in full sample estimation.



yasui-salmon/stRatification documentation built on May 4, 2019, 2:31 p.m.