# ======================================================================
# Dinkelman (2011, AER) - Effects of Rural Electrification on Employment
# Based on the data used to construct Table 4 of the paper
# ======================================================================
# Fetch the replication archive. mode = "wb" forces a binary transfer so the
# zip is never mangled by text-mode newline translation.
download.file("https://www.aeaweb.org/aer/data/dec2011/20080791_data.zip",
              "./data-raw/dinkelman.zip", mode = "wb")
# Extract only the matched census file, flattening the archive's directories
unzip("./data-raw/dinkelman.zip",
      files = "20080791_dataset/data/matched_censusdata.dta",
      exdir = "./data-raw", junkpaths = TRUE)
dinkelman <- haven::read_dta("./data-raw/matched_censusdata.dta")
# Delete the archive with unlink() rather than shelling out to `rm`, which
# does not exist on a stock Windows installation.
unlink("./data-raw/dinkelman.zip")
# Keep only areas with at least 100 adults in both years (rows where the
# flag is missing are dropped as well)
in_large_area <- dinkelman$largeareas == 1
dinkelman <- dinkelman[in_large_area & !is.na(in_large_area), ]
rm(in_large_area)

# Variables to retain, grouped by the role they play in the analysis
treatment <- "T"                           # ESKOM (electrification) Project? (Dummy)
outcomes <- c("d_prop_emp_f",              # Change in female employment rate
              "d_prop_emp_m")              # Change in male employment rate
instrument <- "mean_grad_new"              # Land gradient
fixed_effect <- "dccode0"                  # District index (factor)
controls <- c("kms_to_subs0", "baseline_hhdens0", "base_hhpovrate0",
              "prop_head_f_a0", "sexratio0_a", "prop_indianwhite0",
              "kms_to_road0", "kms_to_town0", "prop_matric_m0",
              "prop_matric_f0", "d_prop_waterclose", "d_prop_flush")
dinkelman <- dinkelman[, c(treatment, outcomes, instrument, fixed_effect,
                           controls)]
rm(treatment, outcomes, instrument, fixed_effect, controls)

# The treatment column is called "T", which clashes with the shorthand for
# TRUE, so it is stored under the name "escom" instead
names(dinkelman)[names(dinkelman) == "T"] <- "escom"

# Divide selected variables by 10 to match the magnitudes in the paper
tenths <- c("mean_grad_new", "kms_to_subs0", "baseline_hhdens0",
            "kms_to_road0", "kms_to_town0")
dinkelman[tenths] <- lapply(dinkelman[tenths], function(column) column / 10)
rm(tenths)

# Binary version of the instrument: 1 is used as the cutoff (~= median and
# FAO's definition of strongly sloping)
dinkelman$steep <- dinkelman$mean_grad_new > 1

# Swap the district factor dccode0 for an equivalent set of dummy variables.
# The intercept column is dropped, so the first factor level acts as the
# baseline (F21 according to the original note). The factor must be named
# `district` because the dummy column names are derived from it.
district <- as.factor(dinkelman$dccode0)
dinkelman$dccode0 <- NULL
dummies <- model.matrix(~ district)[, -1]
dinkelman <- cbind(dinkelman, dummies)
rm(district, dummies)

devtools::use_data(dinkelman, overwrite = TRUE)
rm(dinkelman)
# ======================================================================
# Angrist et al (2002, AER) - Vouchers for Private Schooling
# Data needed to replicate selected results from Table 7
# ======================================================================
# The URL carries no file extension, so download.file() cannot infer that the
# payload is binary; without mode = "wb" the SAS file is corrupted on Windows.
download.file("http://economics.mit.edu/files/1395", "./data-raw/tab7.sas7bdat",
              mode = "wb")
angrist <- haven::read_sas("./data-raw/tab7.sas7bdat")
# Convert variable names to lowercase
names(angrist) <- tolower(names(angrist))
# Construct highest grade completed outcome: start from the three
# grade-completion indicators
grades <- angrist[, c("finish6", "finish7", "finish8")]
# Map one row of (finish6, finish7, finish8) indicators to the highest grade
# completed.
#
# x: vector of three 0/1 indicators, in the order finish6, finish7, finish8.
#
# Returns 6, 7 or 8 when x equals (1,0,0), (1,1,0) or (1,1,1) respectively,
# and 5 for every other fully determined pattern. When missing values make
# it impossible to decide whether x matches one of those patterns the result
# is NA_real_; previously such rows aborted with "missing value where
# TRUE/FALSE needed". Inputs that used to succeed give the same result.
g <- function(x) {
  patterns <- list(c(1, 0, 0), c(1, 1, 0), c(1, 1, 1))
  # all() yields NA when no comparison is FALSE but at least one is NA
  hits <- vapply(patterns, function(p) all(x == p), logical(1))
  if (anyNA(hits)) {
    return(NA_real_)
  }
  # At most one pattern can match; its position gives grades above 5
  if (any(hits)) 5 + which(hits) else 5
}
# Apply g to every row of the completion indicators and store the result
# directly on the data set
angrist[["highestGrade"]] <- apply(grades, MARGIN = 1, FUN = g)
rm(grades, g)
# Construct total repetitions outcome: start from the three repetition columns
rept <- angrist[c("rept6", "rept7", "rept8")]
# Total repetitions for one row of (rept6, rept7, rept8).
#
# x: vector of length three. One is subtracted from the second and third
# entries before summing; missing entries are ignored, so the result is 0
# when every entry is missing.
f <- function(x) {
  sum(x[1], x[2] - 1, x[3] - 1, na.rm = TRUE)
}
# Row-wise total repetitions, stored directly on the data set
angrist$totalRepeats <- apply(rept, 1, f)
rm(f, rept)
# In their SAS file, Angrist et al restrict their sample using the following
# condition, on which they do not comment in either the paper or the code.
# It is not innocuous, since it rules out individuals for whom we are not in
# fact missing the variables needed to carry out some of the regressions in
# Table 7. Moreover, some of these variables on which they exclude do not even
# appear to be used in the paper. Nevertheless, we will restrict our sample as
# they do so that our results are comparable.
required <- c("scyfnsh", "finish6", "rept6", "totalRepeats", "svy", "inschl",
              "finish7", "vouch0", "prsch_c", "finish8", "prscha_2",
              "totscyrs", "rept")
# Note: totalRepeats is computed with na.rm = TRUE and so is never missing;
# it is kept in the list only to mirror the authors' condition.
inSample <- with(angrist, (bog95smp == 1) | (bog97smp == 1) | (jam93smp == 1))
keepObs <- complete.cases(angrist[required]) & inSample
# A missing sample indicator can make keepObs NA. Used as a logical row index
# an NA inserts an all-NA row; which() treats NA as "do not keep" instead.
angrist <- angrist[which(keepObs), ]
rm(required, inSample, keepObs)
# Some of the dummies mentioned by Angrist et al are left out because they
# are collinear and are automatically dropped when running OLS or IV.
# The excluded ones are:
#   strata6, stratams, dbogota, d1993, d1997, dmonth12
# These are the ones that R's lm function automatically excludes when it runs
# the OLS, reduced form, and first stage regressions.
covariates <- c("age",                      # The only covariate that isn't a dummy
                "svy", "hsvisit", "djamundi", "phone", "sex2", "d1995",
                paste0("strata", 1:5),      # strata1, ..., strata5
                paste0("dmonth", 1:11))     # dmonth1, ..., dmonth11
regressor <- "usesch"
instrument <- "vouch0"
outcomes <- c("highestGrade", "totalRepeats")
# Indicators of different sample-city years
samples <- c("bog95smp",                    # Bogota 1995
             "bog97smp",                    # Bogota 1997
             "jam93smp",                    # Jamundi 1993
             "prscha_1",                    # Started 6th grade in private school
             "prscha_2",                    # Started 7th grade in private school
             "prsch_c")                     # Currently in private school
angrist <- angrist[c(covariates, regressor, instrument, outcomes, samples)]
rm(covariates, regressor, instrument, outcomes, samples)
# Note: there is one individual in the dataset whose sex is unknown. Although
# Angrist et al do not make this clear, the rule they use to exclude
# observations in their SAS file already drops this individual. Thus, although
# they include a dummy for sex being missing (sex_miss) in their regressions,
# this is just being dropped by SAS since it's the same for everyone.
# Accordingly, it is not stored here.
devtools::use_data(angrist, overwrite = TRUE)
rm(angrist)
# ======================================================================
# Gelbach (2002, AER) - Public Schooling for Young Children
# Based on data used in Tables 6 and 7
# ======================================================================
# A Stata .dta file is binary, but ".dta" is not among the extensions for
# which download.file() switches to binary mode on its own, so request
# mode = "wb" explicitly to avoid corrupting the file on Windows.
download.file("http://gelbach.law.upenn.edu/phil/interact.dta",
              "./data-raw/interact.dta", mode = "wb")
gelbach <- haven::read_dta("./data-raw/interact.dta")
# Restrict the sample to mothers whose youngest child is 5 years old
# (rows with a missing value of `youngest` are dropped as well)
youngest_is_five <- gelbach$youngest == 5
gelbach <- gelbach[youngest_is_five & !is.na(youngest_is_five), ]
rm(youngest_is_five)

# Keep only the variables used in Tables 6 and 7, grouped by role
controls <- c("num612",    # Number of own children in household aged 6-12
              "num1317",   # Number of own children in household aged 13-17
              "numge18",   # Number of own children in household aged >= 18
              "othrlt18",  # Number of other household members aged < 18
              "othrge18",  # Number of other household members aged >= 18
              "grade",     # Mother's years of education
              "white",     # White? (dummy variable)
              "centcity",  # Live in central city? (dummy variable)
              "age",       # Age of mother
              "age2")      # Squared age of mother
fixed_effects <- c("stater",   # State of residence
                   "stateb")   # State of birth
instruments <- c("qtr1",       # Born in Quarter I? (dummy variable)
                 "quarter")    # Quarter of birth
regressor <- "public"          # Attend public school? (dummy)
outcomes <- c("hours",         # hours worked last week
              "weeksw79",      # weeks worked in 1979
              "salary")        # wage and salary income in 1979
gelbach <- gelbach[c(controls, fixed_effects, instruments, regressor,
                     outcomes)]
rm(controls, fixed_effects, instruments, regressor, outcomes)

devtools::use_data(gelbach, overwrite = TRUE)
rm(gelbach)
# ===========================================================================
# Burde and Linden (2013, AEJ Applied) - Bringing Education to Afghan Girls
# The paper does not report IV. We use the controls from Table 2.
# ===========================================================================
# Fetch the replication archive. mode = "wb" forces a binary transfer so the
# zip is never mangled by text-mode newline translation.
download.file("https://www.aeaweb.org/aej/app/data/2012-0252_data.zip",
              "./data-raw/afghan.zip", mode = "wb")
# Extract only the anonymized data file, flattening the archive's directories
unzip("./data-raw/afghan.zip",
      files = "Data_20120252 2015-06-15/afghanistan_anonymized_data.dta",
      exdir = "./data-raw", junkpaths = TRUE)
afghan <- haven::read_dta("./data-raw/afghanistan_anonymized_data.dta")
# Delete the archive with unlink() rather than shelling out to `rm`, which
# does not exist on a stock Windows installation.
unlink("./data-raw/afghan.zip")
# Remove outliers following the authors' STATA code: within each survey wave
# (f07 and s08), an observed household is an outlier if it reports more than
# 20 members, more than 10 jeribs, or more than 50 sheep.
extreme_f07 <- afghan$f07_num_ppl_hh_cnt > 20 |
  afghan$f07_jeribs_cnt > 10 |
  afghan$f07_num_sheep_cnt > 50
extreme_s08 <- afghan$s08_num_ppl_hh_cnt > 20 |
  afghan$s08_jeribs_cnt > 10 |
  afghan$s08_num_sheep_cnt > 50
outlier <- (extreme_f07 & afghan$f07_observed == 1) |
  (extreme_s08 & afghan$s08_observed == 1)
rm(extreme_f07, extreme_s08)
# Logical (not which-based) indexing is intentional: an undetermined outlier
# status yields an all-NA row here, which na.omit() removes further down.
afghan <- afghan[!outlier, ]

# Roles of the variables selected below:
#   treatment            -> instrument ("buildschool"):
#                           living in a village that gets a school
#   s08_formal_school    -> treatment ("enrolled"):
#                           enrolled in formal school, Spring 2008
#   s08_both_norma_total -> outcome ("testscore"):
#                           total normalized test score, Spring 2008
afghan <- data.frame(
  enrolled = afghan$s08_formal_school,
  testscore = afghan$s08_both_norma_total,
  buildschool = afghan$treatment,
  # c = as.factor(afghan$clustercode),  # only needed for cluster SE
  headchild = afghan$s08_heads_child_cnt,
  female = afghan$s08_girls_cnt,
  age = afghan$s08_age_cnt,
  yrsvill = afghan$s08_duration_village_cnt,
  farsi = afghan$s08_farsi_cnt,
  tajik = afghan$s08_tajik_cnt,
  farmers = afghan$s08_farmer_cnt,
  agehead = afghan$s08_age_head_cnt,
  educhead = afghan$s08_yrs_ed_head_cnt,
  nhh = afghan$s08_num_ppl_hh_cnt,
  land = afghan$s08_jeribs_cnt,
  sheep = afghan$s08_num_sheep_cnt,
  distschool = afghan$s08_nearest_scl,
  chagcharan = afghan$chagcharan  # District dummy; not mentioned in paper!
)
# Drop every row that has a missing value in any retained column
afghan <- na.omit(afghan)
# Only look at girls (no missing values remain after na.omit)
afghan <- afghan[afghan$female == 1, ]
devtools::use_data(afghan, overwrite = TRUE)
rm(afghan)
# ===========================================================================
# Courtemanche, Tchernis & Ukert (2015) - The Effect of Smoking on Obesity
# The paper uses confidential data from the Lung Health Data. Our example is
# based on summary statistics from this dataset. The outcome is body mass
# index (BMI), the instrument is encouragement to quit smoking, and the
# observed treatment is self-reported smoking status: 1 = quit. We examine
# the five-year horizon.
# ===========================================================================
# Published summary statistics, one row per (quit, program) cell, listed in
# the order in which the synthetic draws are generated.
cells <- data.frame(
  quit    = c(0, 0, 1, 1),
  program = c(0, 1, 0, 1),
  n       = c(1357, 2018, 451, 1620),      # cell sizes
  mean    = c(25.89, 26.09, 28.48, 28.28), # mean BMI
  sd      = c(4.42, 4.40, 4.61, 4.44)      # standard deviation of BMI
)
# The easiest way to work with summary statistics is to generate random normal
# data that matches them exactly (empirical = TRUE). Since our method of
# incorporating sampling uncertainty only uses these same summary statistics,
# the precise distribution of the data is irrelevant; this is merely a
# convenient way to feed the summary statistics into the existing code.
set.seed(1234)
# Map() visits the cells in row order, so the random number stream is
# consumed in the sequence (0,0), (0,1), (1,0), (1,1).
draws <- Map(
  function(size, centre, spread) {
    MASS::mvrnorm(size, centre, spread^2, empirical = TRUE)
  },
  cells$n, cells$mean, cells$sd
)
smoking <- data.frame(
  BMI = unlist(draws, use.names = FALSE),
  quit = rep(cells$quit, times = cells$n),
  program = rep(cells$program, times = cells$n)
)
rm(cells, draws)
devtools::use_data(smoking, overwrite = TRUE)
rm(smoking)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.