Nothing
##' @title Estimate a changes-in-changes model with multiple periods and cohorts
##'
##' @description Calculates a changes-in-changes model as in Athey and Imbens (2006)
##' for multiple periods and cohorts.
##'
##' @param yvar Dependent variable.
##' @param gvar Group variable. Can be either a string (e.g., "first_treated")
##' or an expression (e.g., first_treated). In a staggered treatment setting,
##' the group variable typically denotes treatment cohort.
##' @param tvar Time variable. Can be a string (e.g., "year") or an expression
##' (e.g., year).
##' @param ivar Individual Index variable. Can be a string (e.g., "country") or an
##' expression (e.g., country). Only needed to check cohort sizes.
##' @param dat The data set.
##' @param myProbs Quantiles that the quantile treatment effects should be calculated for.
##' @param nMin Minimum observations per groups. Small groups are deleted.
##' @param boot Bootstrap. Resampling is done over the entire data set ("normal"),
##' but might be weighted by period-cohort size ("weighted"). If you do not want
##' to calculate standard error, set boot = "no".
##' @param nReps Number of bootstrap replications.
##' @param weight_n0 Weight for the aggregation of the CDFs in the control group.
##' `n1` uses cohort sizes (Alternative: `n0`).
##' @param weight_n1 Weight for the aggregation of the CDFs in the treatment group.
##' `n1` uses cohort sizes (Alternative: `n0`).
##' @param quant_algo Quantile algorithm (see Wikipedia for definitions).
##' @param es Event Study (Logical). If TRUE, a quantile treatment effect is estimated
##' for each event-period.
##' @param n_digits Rounding the dependent variable before aggregating the empirical CDFs
##' reduces the size of the imputation grid. This can significantly reduce the amount
##' of RAM used in large data sets and improve running time, while reducing
##' precision (Use with caution).
##' @param periods_es Periods of the event study.
##' @param save_to_temp Logical. If TRUE, results are temporarily saved. This reduces the
##' RAM needed, but increases running time.
##' @param progress_bar Whether progress bar should be printed (select "void" for no
##' progress bar or "cli" for another type of bar).
##' @param nCores Number of cores used. If set > 1, bootstrapping will run in parallel.
##' @return An `ecic` object.
##' @references
##' Athey, Susan and Guido W. Imbens (2006). \cite{Identification and Inference in
##' Nonlinear Difference-in-Differences Models}.
##' \doi{10.1111/j.1468-0262.2006.00668.x}
##' @examples
##' # Example 1. Using the small mpdta data in the did package
##' data(dat, package = "ecic")
##' dat = dat[dat$first.treat <= 1983 & dat$countyreal <= 1000,] # small data for fast run
##'
##' mod_res =
##' summary(
##' ecic(
##' yvar = lemp, # dependent variable
##' gvar = first.treat, # group indicator
##' tvar = year, # time indicator
##' ivar = countyreal, # unit ID
##' dat = dat, # dataset
##' boot = "normal", # bootstrap proceduce ("no", "normal", or "weighted")
##' nReps = 3 # number of bootstrap runs
##' )
##' )
##'
##' # Basic Plot
##' ecic_plot(mod_res)
##'
##' \donttest{
##' # Example 2. Load some larger sample data
##' data(dat, package = "ecic")
##'
##' # Estimate a basic model with the package's sample data
##' mod_res =
##' summary(
##' ecic(
##' yvar = lemp, # dependent variable
##' gvar = first.treat, # group indicator
##' tvar = year, # time indicator
##' ivar = countyreal, # unit ID
##' dat = dat, # dataset
##' boot = "weighted", # bootstrap proceduce ("no", "normal", or "weighted")
##' nReps = 20 # number of bootstrap runs
##' )
##' )
##'
##' # Basic Plot
##' ecic_plot(mod_res)
##'
##' # Example 3. An Event-Study Example
##' mod_res =
##' summary(
##' ecic(
##' es = TRUE, # aggregate for every event period
##' yvar = lemp, # dependent variable
##' gvar = first.treat, # group indicator
##' tvar = year, # time indicator
##' ivar = countyreal, # unit ID
##' dat = dat, # dataset
##' boot = "weighted", # bootstrap proceduce ("no", "normal", or "weighted")
##' nReps = 20 # number of bootstrap runs
##' )
##' )
##'
##' # Plots
##' ecic_plot(mod_res) # aggregated in one plot
##' ecic_plot(mod_res, es_type = "for_quantiles") # individually for every quantile
##' ecic_plot(mod_res, es_type = "for_periods") # individually for every period
##' }
##' @importFrom stats aggregate quantile sd
##' @import future
##' @import furrr
##' @import progress
##' @export
ecic = function(
yvar = NULL,
gvar = NULL,
tvar = NULL,
ivar = NULL,
dat = NULL,
myProbs = seq(.1, .9, .1),
nMin = 40,
boot = c("weighted", "normal", "no"),
nReps = 10,
weight_n0 = c("n1", "n0"),
weight_n1 = c("n1", "n0"),
quant_algo = 1,
es = FALSE,
n_digits = NULL,
periods_es = NULL,
save_to_temp = FALSE,
progress_bar = c("progress", "void", "cli"),
nCores = 1
)
{
#-----------------------------------------------------------------------------
# Check Input Arguments
boot = match.arg(boot)
weight_n0 = match.arg(weight_n0)
weight_n1 = match.arg(weight_n1)
treat = NULL
progress_bar = match.arg(progress_bar)
if (is.null(dat)) stop("A non-NULL `dat` argument is required.")
# Prepare String Inputs for Variables
nl = as.list(seq_along(dat))
names(nl) = names(dat)
yvar = eval(substitute(yvar), nl, parent.frame())
if (is.numeric(yvar)) yvar = names(dat)[yvar]
tvar = eval(substitute(tvar), nl, parent.frame())
if (is.numeric(tvar)) tvar = names(dat)[tvar]
gvar = eval(substitute(gvar), nl, parent.frame())
if (is.numeric(gvar)) gvar = names(dat)[gvar]
ivar = eval(substitute(ivar), nl, parent.frame())
if (is.numeric(ivar)) ivar = names(dat)[ivar]
# Check Validity of inputs
if (is.null(gvar)) stop("A non-NULL `gvar` argument is required.")
if (is.null(tvar)) stop("A non-NULL `tvar` argument is required.")
if (is.null(ivar)) stop("A non-NULL `ivar` argument is required.")
if (is.null(yvar)) stop("A non-NULL `yvar` argument is required.")
if (!tvar %in% names(dat)) stop(paste0("`", tvar, "` is not a variable in the data set."))
if (!gvar %in% names(dat)) stop(paste0("`", gvar, "` is not a variable in the data set."))
if (!ivar %in% names(dat)) stop(paste0("`", ivar, "` is not a variable in the data set."))
if (!yvar %in% names(dat)) stop(paste0("`", yvar, "` is not a variable in the data set."))
if (!is.logical(es)) stop("`es` must be logical.")
if (!is.logical(save_to_temp)) stop("`save_to_temp` must be logical.")
if (!quant_algo %in% 1:9) stop("Invalid quantile algorithm.")
if (!is.numeric(nMin)) stop("`nMin` must be numerical.")
if (!is.numeric(nReps)) stop("`nReps` must be numerical.")
if (!is.numeric(nCores)) stop("`nCores` must be numerical.")
if (!is.numeric(myProbs)) stop("`myProbs` must be numerical.")
if (!is.null(n_digits)) if (!is.numeric(n_digits)) stop("`n_digits` must be numerical.")
# Check bootstrap
if (boot == "no") boot = NULL
nReps = as.integer(nReps)
if (! nReps > 0) stop("nReps must be a positive integer.")
if (is.null(boot) & nReps != 1){
warning("nReps > 1 but bootstrap is deactivated. nReps is set to 1.")
nReps = 1
}
# for saving to disk
if (save_to_temp == TRUE) {
temp_dir = tempdir() # set same tempdir() across cores in parallel
random_num = sample(1e5:5e6, 1) # add a big random number to file names
temp_files = list.files( # list all files in temp_dir that have the number
temp_dir, pattern = as.character(random_num)
)
while (length(temp_files)!= 0) { # check that no temp-files contain the number
random_num = sample(1e5:5e6, 1)
temp_files = list.files( # re-check
temp_dir, pattern = as.character(random_num)
)
}
}
#-----------------------------------------------------------------------------
# Prepare the data
dat = subset(dat, get(gvar) %in% unique(dat[[tvar]])) # exclude never-treated units
dat = dat[,c(yvar, gvar, tvar, ivar)] # keep relevant vars (lower RAM in parallel)
# star tvar and gvar at zero
first_period = min(dat[[tvar]], na.rm = TRUE)
last_cohort = max(dat[[gvar]], na.rm = TRUE) - first_period
dat[[tvar]] = dat[[tvar]]-(first_period-1)
dat[[gvar]] = dat[[gvar]]-(first_period-1)
# calculate group / cohort sizes
group_sizes = stats::aggregate(stats::as.formula(paste(yvar, " ~ ", gvar)),
data = dat[!duplicated(dat[, ivar]), ], FUN = length)
names(group_sizes)[names(group_sizes) == yvar] = "N"
# check number of too small groups / cohorts and exclude them
diffGroup = sum(group_sizes$N < nMin)
if (diffGroup != 0) {
skip_cohorts = group_sizes[group_sizes$N < nMin, gvar] # find the small cohorts
dat = dat[!dat[[gvar]] %in% skip_cohorts, ] # delete too small cohorts
warning(paste0("You have ", diffGroup, " (", round(100 * diffGroup / nrow(group_sizes)),
"%) too small groups (less than ", nMin, " observations). They were dropped."))
}
if (diffGroup == nrow(group_sizes)) stop("All treated cohorts are too small (you can adjust `nMin` with caution).")
# identify the treatment cohorts
list_periods = sort(unique(dat[[tvar]]))
list_cohorts = sort(unique(dat[[gvar]]))
qte_cohort = list_cohorts[-length(list_cohorts)] # omit last g (no comparison group)
qte_cohort = qte_cohort[qte_cohort != 1] # omit first g (no pre-period)
if(length(qte_cohort) == 0) stop("Not enough cohorts / groups in the data set!")
# Print settings
if (is.null(boot)) {
message(paste0("Estimating a changes-in-changes model for ", length(unique(dat[[gvar]])) - 1,
" groups and ", nrow(dat), " observations. No standard errors computed."))
} else {
message(paste0("Estimating a changes-in-changes model for ", length(unique(dat[[gvar]])) - 1,
" groups and ", nrow(dat), " observations with ", nReps, " (", boot, ") bootstrap replications."))
}
# max event time QTEs can be calculated for
periods_es = length(
list_periods[which(list_periods == qte_cohort[1]):(length(list_periods)-1)]
) - 1
################################################################################
# Calculate all 2-by-2 CIC combinations
future::plan(future::multisession, workers = nCores, gc = TRUE)
progressr::handlers(progress_bar) # choose whether to print output
# Calculate bootstrap for all possible 2x2 combinations
my_fcn <- function(xs) {
p <- progressr::progressor(along = xs)
return(
furrr::future_map(xs, function(j) {
n1 = n0 = vector()
y1 = y0 = name_runs = vector("list")
# resampling for bootstrapping
if ( !is.null(boot) ) {
if (boot == "weighted") {
cell_sizes = stats::aggregate(stats::as.formula(paste(yvar, " ~ ", gvar, "+", tvar)),
data = dat, FUN = length) # count cohort-period cells
names(cell_sizes)[names(cell_sizes) == yvar] = "N"
dat = merge(dat, cell_sizes, all.x = TRUE)
dat = dat[!is.na(dat$N) & dat$N > 0,] # additional check (weights have to be positive)
data_boot = dat[sample(1:nrow(dat), size = nrow(dat), replace = TRUE, prob = dat$N), ]
} else if (boot == "normal") {
data_boot = dat[sample(1:nrow(dat), size = nrow(dat), replace = TRUE), ]
}
} else {
data_boot = dat
}
# 1) treated cohorts ----
i = 1 # start the counter for the inner loop
for (qteCohort in qte_cohort) {
# 2) comparison groups ----
pre_cohort = list_cohorts[(which(list_cohorts == qteCohort)+1):length(list_cohorts)]
#pre_cohort = pre_cohort[pre_cohort - qteCohort >= 4]
for (preCohort in pre_cohort) {
# 3) post-treatment periods ----
qte_year = list_periods[which(list_periods == qteCohort):(which(list_periods == last_cohort))]
qte_year = qte_year[qte_year < preCohort] # control has to be untreated
# for event study: only calculate periods you're interested in
if (es == TRUE) qte_year = qte_year[qte_year - qteCohort <= periods_es]
for (qteYear in qte_year) {
# 4) pre-treatment comparison periods ----
pre_year = list_periods[list_periods < qteCohort] # both have to be untreated in this period
for (preYear in pre_year) {
# prepare the data for this loop
data_loop = subset(data_boot, get(gvar) %in% c(qteCohort, preCohort) & get(tvar) %in% c(qteYear, preYear))
data_loop$treat = ifelse(data_loop[[gvar]] == qteCohort, 1, 0) # add a treatment dummy
# catch empty groups
nrow_treat = nrow(subset(data_loop, treat == 1))
nrow_control = nrow(subset(data_loop, treat == 0))
# save the combinations (cohort / year) of this run
name_runs[[i]] = data.frame(i, qteCohort, preCohort, qteYear, preYear)
# save the group sizes for the weighting
n1[i] = nrow_treat
n0[i] = nrow_control
#-------------------------------------------------------------------
# catch empty groups
if (nrow_treat < nMin){
warning(paste0("Skipped a period-cohort group in bootstrap run ", j, " (too small treatment group)"))
next
}
if (nrow_control < nMin){
warning(paste0("Skipped a period-cohort group in bootstrap run ", j, " (too small treatment group)"))
next
}
#-------------------------------------------------------------------
# Y(1)
y1[[i]] = stats::ecdf(subset(data_loop, treat == 1 & get(tvar) == qteYear)[[yvar]])
# Y(0): Construct the counterfactual
y0[[i]] = stats::ecdf(
stats::quantile(subset(data_loop, treat == 0 & get(tvar) == qteYear)[[yvar]],
probs = stats::ecdf(subset(data_loop, treat == 0 & get(tvar) == preYear)[[yvar]]) (
subset(data_loop, treat == 1 & get(tvar) == preYear)[[yvar]]
), type = quant_algo
)
)
#-------------------------------------------------------------------
i = i + 1 # update counter
}
}
}
}
############################################################################
# Aggregate Results for 1 bootstrap run
# collapse
name_runs = cbind(do.call(rbind, name_runs), n1, n0) # specifications of the runs
# check that not all runs were empty
if (nrow(name_runs[name_runs$n1 >= nMin & name_runs$n0 >= nMin,]) == 0 ) {
stop("There was no non-empty cohort-group combination (all combinations were too small, check \"nMin\").", call. = F)
}
# prepare imputation values
if (!is.null(n_digits)) {
values_to_impute = sort(unique( round( dat[[yvar]], digits = n_digits)) )
} else {
values_to_impute = sort(unique( dat[[yvar]] ))
}
#-----------------------------------------------------------------------------
# impute Y(0)
y0_imp = lapply(y0, function(ecdf_temp) {
ecdf_temp(values_to_impute)
})
# bind rows into a matrix
y0_imp = as.matrix(do.call(rbind, y0_imp))
# aggregate all 2x2-Y(0) (weighted by cohort size)
test0 = data.frame(values_to_impute, value = colSums(y0_imp * (n1/sum(n1))) )
rm(y0_imp)
# get the quantiles of interest
y0_quant = do.call(rbind, lapply(myProbs, function(r) {
test0$diff = test0$value - r
test0 = subset(test0, diff >= 0)
test0[which.min(test0$diff),]
}))
y0_quant = y0_quant[, !(colnames(y0_quant) == "diff")]
rm(test0)
gc()
#---------------------------------------------------------------------------
if (es == FALSE) { # average QTE
# impute Y(1)
y1_imp = lapply(y1, function(ecdf_temp) {
ecdf_temp(values_to_impute)
})
# bind rows into a matrix
y1_imp = as.matrix(do.call(rbind, y1_imp))
# aggregate all 2x2-Y(0) (weighted by cohort size)
test1 = data.frame(values_to_impute, value = colSums(y1_imp * (n1/sum(n1))) )
rm(y1_imp)
# get the quantiles of interest
y1_quant = do.call(rbind, lapply(myProbs, function(r) {
test1$diff = test1$value - r
test1 = subset(test1, diff >= 0)
test1[which.min(test1$diff),]
}))
y1_quant = y1_quant[, !(colnames(y1_quant) == "diff")]
rm(test1)
gc()
# compute the QTE
myQuant = data.frame(
perc = myProbs,
values = y1_quant$values_to_impute - y0_quant$values_to_impute # from CDFs
)
#-------------------------------------------------------------------------
} else { # "event study"
# impute Y(1)
y1_imp = lapply(y1, function(ecdf_temp) {
ecdf_temp(values_to_impute)
})
# check event study settings
name_runs$diff = name_runs$qteYear - name_runs$qteCohort
max_es = max(name_runs[["diff"]])
if (periods_es > max_es) {
periods_es = max_es
warning(paste0("Bootstrap run ", j, ": Only ", periods_es,
" post-treatment periods can be calculated (plus contemporaneous)."))
}
myQuant = lapply(0:periods_es, function(e) { # time-after-treat
weights_temp = n1[subset(name_runs, diff == e)[["i"]]] # weights for this event time
y1_agg = colSums(
as.matrix(do.call(rbind, y1_imp[subset(name_runs, diff == e)[["i"]]]
)) * (weights_temp / sum(weights_temp)) )
test1 = data.frame(values_to_impute, value = y1_agg)
rm(y1_agg)
y1_quant = do.call(rbind, lapply(myProbs, function(r) {
test1$diff = test1$value - r
test1 = subset(test1, diff >= 0)
test1 = test1[which.min(test1$diff), ]
}))
y1_quant = y1_quant[, !(colnames(y1_quant) == "diff")]
# compute the QTE
quant_temp = data.frame(
perc = myProbs,
values = y1_quant$values_to_impute - y0_quant$values_to_impute # from CDFs
)
return(quant_temp)
})
rm(y1_imp)
gc()
}
#---------------------------------------------------------------------------
# update progress bar
p()
# save to disk (saver, but slower)
if (save_to_temp == TRUE) {
tmp_quant = tempfile(paste0(pattern = "myQuant_", random_num, "_", j, "_"), fileext = ".rds", tmpdir = temp_dir)
tmp_name = tempfile(paste0(pattern = "name_runs_", random_num, "_", j, "_"), fileext = ".rds", tmpdir = temp_dir)
saveRDS(myQuant, file = tmp_quant)
saveRDS(name_runs, file = tmp_name)
return(j)
} else { # just work in the RAM (output lost if crash and RAM may be too small)
return(list(coefs = myQuant, name_runs = name_runs))
}
},
.options = furrr::furrr_options(seed = 123)
)
)
}
reg = progressr::with_progress(my_fcn(1:nReps))
future::plan(future::sequential)
##############################################################################
# post-loop: combine the outputs files
if(save_to_temp == TRUE) {
list_files = list.files(path = temp_dir, pattern = paste0("myQuant_", random_num))
reg = lapply(1:nReps, function(j){
list(
coefs = readRDS(paste0(temp_dir, "\\", list.files(
path = temp_dir, pattern = paste0("myQuant_", random_num, "_", j, "_")))),
name_runs = readRDS(paste0(temp_dir, "\\", list.files(
path = temp_dir, pattern = paste0("name_runs_", random_num, "_", j, "_"))))
)
})
}
# post-loop: Overload class and new attributes (for post-estimation) ----
if(es == TRUE) {
periods_es = max(lengths(lapply(reg, "[[", 1))-1) # substact contemporary
} else {
periods_es = NA
}
class(reg) = c("ecic")
attr(reg, "ecic") = list(
myProbs = myProbs,
es = es,
periods_es = periods_es
)
return(reg)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.