In yasui-salmon/stRatification: estimate the treatment effect in subgroup of dataset

library(haven)
library(tidyr)
library(dplyr)
library(ggplot2)
library(pforeach)
library(stRatification)

Setup

Download & Read STAR dataset

The data set is downloadable from this page https://ideas.repec.org/p/boc/bocins/webstar.html. if you are interested in the definition of each column, see the following link http://wise.xmu.edu.cn/course/gecon/star_Description.pdf.

data <- read_dta("http://fmwww.bc.edu/ec-p/data/stockwatson/star_sw.dta")

mutate column names into more close to the expression in the paper

as a result we get n = 3784, which is slightly larger than the sample size in the paper.

data <- data %>%
  filter(rak != 1) %>%
  mutate(math_score = tmathssk,
         small_class = sck,
         female = ifelse(boy == 1, 0, 1),
         black = ifelse(black == 1, 1, 0),
         free_lunch = freelunk,
         school_attended = schidkn %>% as.factor()
  ) %>%
  select(math_score, small_class, female, black, free_lunch, school_attended)  %>% na.omit()

get the standard deviation inthe regular class, then calculate stardardized math score

regsd <- data %>% filter(small_class == 0) %>% summarise(regular_sd = sd(math_score))
data <- data %>% mutate(math_score_stand = math_score/regsd$regular_sd)
hist(data$math_score_stand)

Reproduction of Table.2 in Alberto(2013)

Panel A: Average treatment effect

unadjusted

reg_nonadj <- lm(data = data, formula = math_score_stand ~ small_class)
summary(reg_nonadj)

adjusted

reg_adj <- lm(data = data, formula = math_score_stand ~ small_class + female + black + free_lunch + school_attended)
summary(reg_adj)

ate_nonadj <- data.frame(coef = summary(reg_nonadj)$coef["small_class","Estimate"], 
                         stder = summary(reg_nonadj)$coef["small_class","Std. Error"], 
                         adjusted = "non_adjusted",
                         orig = "reproduce")

ate_adj <- data.frame(coef = summary(reg_adj)$coef["small_class","Estimate"], 
                         stder = summary(reg_adj)$coef["small_class","Std. Error"], 
                         adjusted = "adjusted",
                         orig = "reproduce")

ate_ori <- data.frame(coef = c(0.1659, 0.1892),
                      stder = c(0.0329, 0.0294),
                      adjusted = c("non_adjusted", "adjusted"),
                         orig = "original")

panel_a <- rbind(ate_nonadj, ate_adj, ate_ori)

panel_a %>%
  ggplot(aes(y = coef, x = adjusted, fill = orig)) + 
  geom_bar(position = position_dodge(),
           stat = "identity", color = "black") +
  geom_errorbar(data = panel_a,
                aes(ymin = coef - 1.96*stder,
                    ymax = coef + 1.96*stder,
                width = 0.2),
                position = position_dodge(.9)) +
  xlab("adjusted/unadjusted estimation of ATE") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2, PanelA in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5))

Panel B: Average treatment effect by predicted outcome group

fullsample and unadjusted

fs_result_unadjusted <- fullsample_stratification(data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = F,
                                                  ntilen = 3)

fullsample and adjusted

fs_result_adjusted <- fullsample_stratification(data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = T,
                                                  ntilen = 3)

calculate standard error(fullsample)

fsbst_result_unadjusted <- fullsample_stratification_bst(R = 1000,
                                                data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = F,
                                                  ntilen = 3,
                                                parallel = T)

fsbst_se_unadjusted <- fsbst_result_unadjusted %>%
  group_by(nt, r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>% 
  summarise(stder = sd(ATE))


fsbst_result_adjusted <- fullsample_stratification_bst(R = 1000,
                                                data,
                                                  Y = "math_score_stand",
                                                  treatment = "small_class",
                                                  Xvar = c("female","black","free_lunch","school_attended"),
                                                  adjusted = T,
                                                  ntilen = 3,
                                                parallel = T)

fsbst_se_adjusted <- fsbst_result_adjusted %>%
  group_by(nt, r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>% 
  summarise(stder = sd(ATE))

samplesplit and unadjusted

ss_result_unadjusted <- sample_splitting_estimation(data,
                                                    Y = "math_score_stand",
                                                    treatment = "small_class",
                                                    Xvar = c("female","black","free_lunch","school_attended"),
                                                    M = 100,
                                                    ntilen = 3,
                                                    adjusted = F)

samplesplit and adjusted

ss_result_adjusted <- sample_splitting_estimation(data,
                                                    Y = "math_score_stand",
                                                    treatment = "small_class",
                                                    Xvar = c("female","black","free_lunch","school_attended"),
                                                    M = 100,
                                                    ntilen = 3,
                                                    adjusted = T)

calculate standard error(samplesplit)

This process may take about 1hour, so enjoy your coffee. I'm looking for some help to improve the speed of this process.

ssbst_result_adjusted <- sample_splitting_stratification_bst(R = 1000,
                                                             data = data,
                                                             Y = "math_score_stand",
                                                             treatment = "small_class",
                                                             Xvar = c("female","black","free_lunch","school_attended"),
                                                             M = 100,
                                                             ntilen = 3,
                                                             adjusted = T,
                                                             parallel = T)

ssbst_se_adjusted <- ssbst_result_adjusted  %>% 
  group_by(nt,r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>%
  summarise(stder = sd(ATE))

ssbst_result_unadjusted <- sample_splitting_stratification_bst(R = 1000,
                                                               data = data,
                                                               Y = "math_score_stand",
                                                               treatment = "small_class",
                                                               Xvar = c("female","black","free_lunch","school_attended"),
                                                               M = 100,
                                                               ntilen = 3,
                                                               adjusted = F,
                                                               parallel = T)

ssbst_se_unadjusted <- ssbst_result_unadjusted  %>% 
  group_by(nt,r) %>%
  summarise(ATE = mean(ATE)) %>%
  group_by(nt) %>%
  summarise(stder = sd(ATE))

plot table_2 reproduction

fs_res <- fs_result_adjusted %>%
  select(-stder) %>%
  inner_join(fsbst_se_adjusted, by = "nt") %>%
  mutate(method = "fs_adjusted") %>%
  rbind(fs_result_unadjusted %>%
          select(-stder) %>%
  inner_join(fsbst_se_unadjusted, by = "nt") %>%
          mutate(method = "fs_unadjusted")) %>%
  select(-M)

ss_res <- ss_result_unadjusted %>% 
  group_by(nt) %>%
  summarise(ATE = mean(ATE)) %>%
  inner_join(ssbst_se_unadjusted, by = "nt") %>%
  mutate(method = "ss_unadjusted") %>%
  rbind(ss_result_adjusted %>% 
  group_by(nt) %>%
  summarise(ATE = mean(ATE)) %>%
  inner_join(ssbst_se_unadjusted, by = "nt") %>%
    mutate(method = "ss_adjusted"))

plot_dataset <- rbind(fs_res, ss_res) %>% mutate(origin = "reproduction")

plot_dataset %>%
  ggplot(aes(y = ATE, x = nt, fill = method)) + 
  geom_bar(position = position_dodge(), 
           stat="identity", color = "black") +
  geom_errorbar(data = plot_dataset,
                aes(ymin = ATE - 1.96*stder,
                    ymax = ATE + 1.96*stder,
                    width = 0.2),
                    position = position_dodge(.9)) +
  xlab("ntile of predicted math score(1 = low, 3 = high)") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5))

plot the comparison

comp_dataset <- data.frame(
  nt = rep(paste("ntile", 1:3, sep = "_"), 4),
  ATE = c(0.3705, 0.2688, -0.133, 0.3908, 0.3023, -0.1242,
          0.3152, 0.2617, -0.052, 0.3130, 0.3005, -0.0374),
  stder = c(0.0521, 0.0655, 0.0636, 0.0509, 0.0678, 0.0614,
            0.0467, 0.0505, 0.0567, 0.0459, 0.0526, 0.0552),
  method = c(rep("fs_unadjusted", 3), rep("fs_adjusted", 3),
             rep("ss_unadjusted", 3), rep("ss_adjusted", 3)),
  origin = "original"
) %>%
  rbind(plot_dataset)


comp_dataset %>%
  ggplot(aes(y = ATE, x = nt, fill = method)) + 
  geom_bar(position = position_dodge(), 
           stat="identity", color = "black") +
  geom_errorbar(data = comp_dataset,
                aes(ymin = ATE - 1.96*stder,
                    ymax = ATE + 1.96*stder,
                    width = 0.2),
                    position = position_dodge(.9)) +
  xlab("ntile of predicted math score(1 = low, 3 = high)") +
  ylab("Estimated ATE in dummy regression") +
  ggtitle("Reproduction of Table 2 in Abadie et al. (2013)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_grid(~ origin)

The above result is not exactly the same with the original table2.

This may be caused by following two reasons.

dataset is not exactly the same with the dataset used in the paper.(we have 20 more sample some how)
sample splitting estimation is fluctuate a little bit, since its based on sampling.

However, we are able to confirm the conclusion does not change from this difference.

From this table we are able to conclude three things.

a. low and middle group have positive and significant effect in any case, and estimation method may not change the result largely.

b. high group have negative and significant effect in the case where full sample stratification is employed, and no significant effect was confirmed in the sample splitting estimation.

c. By confirming the gap between the result of high and low group, high group have negative bias and low group have positive bias in full sample estimation.

yasui-salmon/stRatification documentation built on May 4, 2019, 2:31 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

yasui-salmon/stRatification
estimate the treatment effect in subgroup of dataset

In yasui-salmon/stRatification: estimate the treatment effect in subgroup of dataset

Setup

Download & Read STAR dataset

mutate column names into more close to the expression in the paper

get the standard deviation inthe regular class, then calculate stardardized math score

Reproduction of Table.2 in Alberto(2013)

Panel A: Average treatment effect

unadjusted

adjusted

Panel B: Average treatment effect by predicted outcome group

fullsample and unadjusted

fullsample and adjusted

calculate standard error(fullsample)

samplesplit and unadjusted

samplesplit and adjusted

calculate standard error(samplesplit)

plot table_2 reproduction

plot the comparison

The above result is not exactly the same with the original table2.

This may be caused by following two reasons.

However, we are able to confirm the conclusion does not change from this difference.

From this table we are able to conclude three things.

R Package Documentation

Browse R Packages

We want your feedback!

yasui-salmon/stRatification estimate the treatment effect in subgroup of dataset

In yasui-salmon/stRatification: estimate the treatment effect in subgroup of dataset

Setup

Download & Read STAR dataset

mutate column names into more close to the expression in the paper

get the standard deviation inthe regular class, then calculate stardardized math score

Reproduction of Table.2 in Alberto(2013)

Panel A: Average treatment effect

unadjusted

adjusted

Panel B: Average treatment effect by predicted outcome group

fullsample and unadjusted

fullsample and adjusted

calculate standard error(fullsample)

samplesplit and unadjusted

samplesplit and adjusted

calculate standard error(samplesplit)

plot table_2 reproduction

plot the comparison

The above result is not exactly the same with the original table2.

This may be caused by following two reasons.

However, we are able to confirm the conclusion does not change from this difference.

From this table we are able to conclude three things.

R Package Documentation

Browse R Packages

We want your feedback!

yasui-salmon/stRatification
estimate the treatment effect in subgroup of dataset