Nothing
#' Borusyak, Jaravel, and Spiess (2021) Estimator
#'
#' @description
#' Treatment effect estimation and pre-trend testing in staggered adoption
#' diff-in-diff designs with an imputation approach of Borusyak, Jaravel, and
#' Spiess (2021)
#'
#' @details
#' The imputation-based estimator is a method of calculating treatment effects
#' in a difference-in-differences framework. The method estimates a model for
#' Y(0) using untreated/not-yet-treated observations and predicts Y(0) for the
#' treated observations hat(Y_it(0)). The difference between treated and
#' predicted untreated outcomes Y_it(1) - hat(Y_it(0)) serves as an estimate
#' for the treatment effect for unit i in period t. These are then averaged to
#' form average treatment effects for groups of {it}.
#'
#' @import fixest
#' @import data.table
#'
#' @param data A `data.frame`
#' @param yname String. Variable name for outcome. Use `fixest` c() syntax
#' for multiple lhs.
#' @param idname String. Variable name for unique unit id.
#' @param gname String. Variable name for unit-specific date of treatment
#' (never-treated should be zero or `NA`).
#' @param tname String. Variable name for calendar period.
#' @param first_stage Formula for Y(0).
#' Formula following \code{\link[fixest:feols]{fixest::feols}}.
#' Fixed effects specified after "`|`".
#' If not specified, then just unit and time fixed effects will be used.
#' @param wname String. Variable name for estimation weights of observations.
#' This is used in estimating Y(0) and also augments treatment effect weights.
#' @param wtr Character vector of treatment weight names
#' (see horizon for standard static and event-study weights)
#' @param horizon Integer vector of event_time or `TRUE`. This only applies if `wtr` is left
#' as `NULL`. if specified, weighted averages/sums of treatment effects will be
#' reported for each of these horizons separately (i.e. tau0 for the treatment
#' period, tau1 for one period after treatment, etc.).
#' If `TRUE`, all horizons are used.
#' If `wtr` and `horizon` are null, then the static treatment effect is calculated.
#' @param pretrends Integer vector or `TRUE`. Which pretrends to estimate.
#' If `TRUE`, all `pretrends` are used.
#' @param cluster_var String. Varaible name for clustering groups. If not
#' supplied, then `idname` is used as default.
#'
#' @return A `data.frame` containing treatment effect term, estimate, standard
#' error and confidence interval. This is in `tidy` format.
#'
#' @export
#'
#' @section Examples:
#'
#'
#' Load example dataset which has two treatment groups and homogeneous treatment effects
#'
#' ```{r, comment = "#>", collapse = TRUE}
#' # Load Example Dataset
#' data("df_hom", package="did2s")
#' ```
#' ### Static TWFE
#'
#' You can run a static TWFE fixed effect model for a simple treatment indicator
#' ```{r, comment = "#>", collapse = TRUE}
#' did_imputation(data = df_hom, yname = "dep_var", gname = "g",
#' tname = "year", idname = "unit")
#' ```
#'
#' ### Event Study
#'
#' Or you can use relative-treatment indicators to estimate an event study estimate
#' ```{r, comment = "#>", collapse = TRUE}
#' did_imputation(data = df_hom, yname = "dep_var", gname = "g",
#' tname = "year", idname = "unit", horizon=TRUE)
#' ```
#'
#' ### Example from Cheng and Hoekstra (2013)
#'
#' Here's an example using data from Cheng and Hoekstra (2013)
#'
#' ```{r, comment = "#>", collapse = TRUE}
#' # Castle Data
#' castle <- haven::read_dta("https://github.com/scunning1975/mixtape/raw/master/castle.dta")
#'
#' did_imputation(data = castle, yname = "c(l_homicide, l_assault)", gname = "effyear",
#' first_stage = ~ 0 | sid + year,
#' tname = "year", idname = "sid")
#' ```
#'
did_imputation <- function(data, yname, gname, tname, idname, first_stage = NULL,
wname = NULL, wtr = NULL, horizon = NULL,
pretrends = NULL, cluster_var = NULL) {
# Global variables
zz000treat <- zz000event_time <- zz000weight <- zz000adj <- NULL
lhs <- term <- NULL
# Set-up Parameters ------------------------------------------------------------
# Extract first stage vars from formula
if (is.null(first_stage)) {
first_stage <- glue::glue("0 | {idname} + {tname}")
} else if (inherits(first_stage, "formula")) {
first_stage <- as.character(first_stage)[[2]]
}
# Formula for fitting the first stage
formula <- stats::as.formula(glue::glue("{yname} ~ {first_stage}"))
# dummy estimation to extract needed variables
fixest_env <- feols(formula, data, lean = T, only.env = T, warn = F, notes = F)
# extract lhs vars (allows fixest style multiple lhs specification)
yvars <- fixest_env$lhs_names
# extract fe vars
fevars <- fixest_env$fixef_vars %>%
stringr::str_extract_all("\\w+") %>%
unlist()
# extract rhs vars
rhsvars <- fixest_env$linear.params %>%
stringr::str_split("(?<!:):(?!:)") %>%
unlist() %>%
stringr::str_replace("::.*", "") %>%
unique()
# make local copy of data, convert to data.table
needed_vars <- unique(c(yvars, gname, tname, idname, wname, wtr, rhsvars, fevars, cluster_var))
data <- copy(as.data.frame(data)[, needed_vars]) %>% setDT()
rm(fixest_env)
setDT(data)
# Treatment indicator
data[, zz000treat := 1 * (.SD[[tname]] >= .SD[[gname]]) * (.SD[[gname]] > 0)]
# if g is NA
data[is.na(zz000treat), zz000treat := 0]
# Create event time
data[, zz000event_time := ifelse(
is.na(.SD[[gname]]) | .SD[[gname]] == 0 | .SD[[gname]] == Inf,
-Inf,
as.numeric(.SD[[tname]] - .SD[[gname]])
)]
# Get list of event_time
event_time <- unique(data[["zz000event_time"]][is.finite(data$zz000event_time)])
# horizon/allhorizon options
if (is.null(wtr)) {
# event-study
if (!is.null(horizon)) {
# create event time weights
# allhorizons
if (all(horizon == TRUE)) horizon <- event_time
# Create wtr of horizons
wtr <- paste0("zz000wtr", event_time[event_time >= 0])
purrr::walk2(
event_time[event_time >= 0], wtr,
function(e, v) data[, (v) := dplyr::if_else(is.na(zz000event_time), 0, 1 * (zz000event_time == e))]
)
} else {
wtr <- "zz000wtrtreat"
data[, (wtr) := 1 * (zz000treat == 1)]
}
}
# Weights specified or not
if (is.null(wname)) {
data[, zz000weight := 1]
} else {
data[, zz000weight := .SD[[wname]]]
}
# First Stage estimate ---------------------------------------------------------
# Estimate Y(0) using untreated observations
first_stage_est <- fixest::feols(formula,
se = "standard",
data[zz000treat == 0, ],
weights = ~zz000weight,
warn = FALSE, notes = FALSE
)
# Residualize outcome variable(s)
if (length(yvars) == 1) {
data[, (paste("zz000adj", yvars, sep = "_")) := .SD[[yname]] - stats::predict(first_stage_est, newdata = data)]
} else {
data[,
(paste("zz000adj", yvars, sep = "_")) := purrr::imap(.SD, function(x, y) {
x - stats::predict(first_stage_est[lhs = y], newdata = data)
}),
.SDcols = yvars
]
}
# drop anything with missing values of the residualized outcome
todrop <- apply(
is.na(data[, paste("zz000adj", yvars, sep = "_"), with = F]),
MARGIN = 1,
FUN = any
)
data <- data[!todrop, ]
data[, (wtr) := purrr::map(.SD, ~ . * zz000weight), .SDcols = wtr] # Multiply treatment weights * weights vector
data[is.na(zz000weight), (wtr) := 0]
data[, (wtr) := purrr::map(.SD, ~ . / sum(.)), .SDcols = wtr] # Normalize
# Point estimate for wtr
ests <- yvars %>%
purrr::set_names(yvars) %>%
purrr::map(function(y) {
data[
,
zz000adj := .SD[[paste("zz000adj", y, sep = "_")]]
][
zz000treat == 1,
purrr::map(.SD, ~ sum(. * zz000adj)),
.SDcols = wtr
]
}) %>%
rbindlist(idcol = "lhs")
# Standard Errors --------------------------------------------------------------
if (length(yvars) == 1) {
Z <- sparse_model_matrix(data, first_stage_est)
} else {
Z <- sparse_model_matrix(data, first_stage_est[[1]])
}
# Equation (6) of Borusyak et. al. 2021
# - Z (Z_0' Z_0)^{-1} Z_1' wtr_1
v_star <- make_V_star(
(Z * data[, zz000weight]),
(Z * data[, zz000weight])[data$zz000treat == 0, ],
(Z * data[, zz000weight])[data$zz000treat == 1, ],
Matrix::Matrix(as.matrix(data[zz000treat == 1, wtr, with = F]), sparse = TRUE)
)
# fix v_it^* = w for treated observations
v_star[data$zz000treat == 1, ] <- as.matrix(data[zz000treat == 1, wtr, with = F])
# If no cluster_var, then use idname
if (is.null(cluster_var)) cluster_var <- idname
ses <- yvars %>%
purrr::set_names(yvars) %>%
purrr::map(function(y) {
se_inner(
data[, zz000adj := .SD[[paste("zz000adj", y, sep = "_")]]],
v_star, wtr, cluster_var, gname
)
}) %>%
rbindlist(idcol = "lhs")
# Pre-event Estimates ----------------------------------------------------------
if (!is.null(pretrends)) {
if (all(pretrends == TRUE)) {
pre_formula <- stats::as.formula(glue::glue("{yname} ~ i(zz000event_time) + {first_stage}"))
} else {
if (all(pretrends %in% event_time)) {
pre_formula <- stats::as.formula(
glue::glue("{yname} ~ i(zz000event_time, keep = c({paste(pretrends, collapse = ', ')})) + {first_stage}")
)
} else {
stop(glue::glue("Pretrends not found in event_time. Event_time has values {event_time}"))
}
}
pre_est <- fixest::feols(pre_formula, data[zz000treat == 0, ], weights = ~zz000weight, warn = FALSE, notes = FALSE)
}
# Create dataframe of results in tidy format -----------------------------------
ests <- data.table::melt(
ests,
id.vars = "lhs", variable.name = "term", value.name = "estimate"
)
ses <- data.table::melt(
ses,
id.vars = "lhs", variable.name = "term", value.name = "std.error"
)
out <- ests[ses, on = list(lhs, term)]
out$term <- as.character(stringr::str_replace(out$term, "zz000wtr", ""))
out$conf.low <- out$estimate - 1.96 * out$std.error
out$conf.high <- out$estimate + 1.96 * out$std.error
if (!is.null(pretrends)) {
if (length(yvars) == 1) {
pre_out <- broom::tidy(pre_est)
pre_out$lhs <- yvars
} else {
pre_out <- pre_est %>%
purrr::map(broom::tidy) %>%
dplyr::bind_rows(.id = "lhs")
}
pre_out$term <- stringr::str_remove(pre_out$term, "zz000event_time::")
pre_out$term <- as.character(pre_out$term)
pre_out$conf.low <- pre_out$estimate - 1.96 * pre_out$std.error
pre_out$conf.high <- pre_out$estimate + 1.96 * pre_out$std.error
pre_out <- pre_out[, c("lhs", "term", "estimate", "std.error", "conf.low", "conf.high")]
out <- dplyr::bind_rows(pre_out, out)
}
return(dplyr::as_tibble(out))
}
se_inner <- function(data, v_star, wtr, cluster, gname) {
zz000adj <- zz000treat <- NULL
# Calculate v_it^* = - Z (Z_0' Z_0)^{-1} Z_1' * w_1
vcols <- paste0("zz000v", seq_along(wtr))
tcols <- paste0("zz000tau_et", seq_along(wtr))
data[, (vcols) := split(v_star, col(v_star))]
# Equation (10) of Borusyak et. al. 2021
# Calculate tau_it - \bar{\tau}_{et}
data[,
(tcols) := purrr::map(.SD, function(x) sum(x^2 * zz000adj) / sum(x^2) * zz000treat),
by = c(gname, "zz000event_time"),
.SDcols = vcols
]
# Recenter tau by \bar{\tau}_{et}
data[, (tcols) := purrr::map(.SD, function(x) zz000adj - tidyr::replace_na(x, 0)), .SDcols = tcols]
# Equation (8)
# Calculate variance of estimate
result <- data[, purrr::map2(vcols, tcols, function(x,y) sum(.SD[[x]] * .SD[[y]])^2),
by = cluster
]
result = result[,
purrr::map(.SD, function(x) sqrt(sum(x))), .SDcols = paste0("V", seq_along(wtr))
]
setnames(result, wtr)
data[, (c(vcols, tcols)) := NULL]
return(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.