test-jel_replication.R
In did: Treatment Effects with Multiple Periods and Groups

# Replication tests for the Callaway & Sant'Anna JEL article
# Data source: https://github.com/pedrohcgs/JEL-DiD
# These tests verify that `did` package results match the published article.
# Tests are skipped if the JEL data file is not available.


# Helper: resolve JEL data path, downloading from GitHub if needed (only called after skip_on_cran)
get_jel_data_path <- function() {
  path <- Sys.getenv("JEL_DID_DATA_PATH",
                     unset = file.path(path.expand("~"), "JEL-DiD", "data", "county_mortality_data.csv"))
  if (file.exists(path)) return(path)

  path <- file.path(tempdir(), "county_mortality_data.csv")
  if (file.exists(path) && file.info(path)$size > 0) return(path)

  download_ok <- FALSE
  tryCatch({
    res <- download.file(
      "https://raw.githubusercontent.com/pedrohcgs/JEL-DiD/main/data/county_mortality_data.csv",
      destfile = path,
      quiet = TRUE
    )
    download_ok <- identical(res, 0L)
  }, error = function(e) {
    # download failed; skip_if_not will handle missing file
  })

  # Remove empty/corrupt files so file.exists() correctly triggers skip
  if (!download_ok || !file.exists(path) || file.info(path)$size <= 0) {
    if (file.exists(path)) unlink(path)
  }
  path
}

# Helper to load and clean JEL data
load_jel_data <- function(path, filter_2xt = TRUE) {
  mydata <- read.csv(path, stringsAsFactors = FALSE)
  mydata$state <- substr(mydata$county, nchar(mydata$county) - 1, nchar(mydata$county))

  # Drop DC and pre-2014 adoption states
  mydata <- mydata[!(mydata$state %in% c("DC", "DE", "MA", "NY", "VT")), ]

  if (filter_2xt) {
    # For 2x2 and 2xT: keep only 2014-adopters and never-adopters
    mydata <- mydata[mydata$yaca == 2014 | is.na(mydata$yaca) | mydata$yaca > 2019, ]
  }

  # Compute covariates
  mydata$perc_white <- mydata$population_20_64_white / mydata$population_20_64 * 100
  mydata$perc_hispanic <- mydata$population_20_64_hispanic / mydata$population_20_64 * 100
  mydata$perc_female <- mydata$population_20_64_female / mydata$population_20_64 * 100
  mydata$unemp_rate <- mydata$unemp_rate * 100
  mydata$median_income <- mydata$median_income / 1000

  # Keep only needed columns
  keep_cols <- c("county_code", "year", "population_20_64", "yaca", "crude_rate_20_64",
                 "perc_female", "perc_white", "perc_hispanic", "unemp_rate", "poverty_rate", "median_income")
  mydata <- mydata[, keep_cols]

  # Drop rows with NA in non-yaca columns
  non_yaca <- setdiff(keep_cols, "yaca")
  mydata <- mydata[complete.cases(mydata[, non_yaca]), ]

  # Keep counties with data in both 2013 and 2014
  county_year_counts <- tapply(mydata$year %in% c(2013, 2014), mydata$county_code, sum)
  valid_counties <- names(county_year_counts[county_year_counts == 2])
  mydata <- mydata[mydata$county_code %in% valid_counties, ]

  # Keep counties with all 11 years of data
  county_counts <- table(mydata$county_code)
  valid_counties <- names(county_counts[county_counts == 11])
  mydata <- mydata[mydata$county_code %in% valid_counties, ]

  mydata
}


# ===========================================================================
# Table 7: 2x2 CS-DiD with Covariates (Sant'Anna-Zhao)
# ===========================================================================
test_that("JEL Table 7: 2x2 CS-DiD point estimates match", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = TRUE)

  # Make 2x2 dataset (2013-2014 only)
  short_data <- mydata[mydata$year %in% c(2013, 2014), ]
  short_data$treat_year <- ifelse(!is.na(short_data$yaca) & short_data$yaca == 2014, 2014, 0)
  short_data$county_code <- as.numeric(short_data$county_code)

  # Population weights from 2013
  wt_2013 <- short_data[short_data$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  short_data <- merge(short_data, wt_2013, by = "county_code")

  covs_formula <- ~perc_female + perc_white + perc_hispanic + unemp_rate + poverty_rate + median_income

  # Expected point estimates from JEL article (Table 7)
  expected <- list(
    reg_unweighted = -1.6154372119,
    ipw_unweighted = -0.8585625501,
    dr_unweighted  = -1.2256473242,
    reg_weighted   = -3.4592200594,
    ipw_weighted   = -3.8416966846,
    dr_weighted    = -3.7561045985
  )

  for (method in c("reg", "ipw", "dr")) {
    for (wt_info in list(list(name = NULL, label = "unweighted"),
                         list(name = "set_wt", label = "weighted"))) {
      res <- suppressWarnings(att_gt(
        yname = "crude_rate_20_64", tname = "year", idname = "county_code",
        gname = "treat_year", xformla = covs_formula,
        data = short_data, panel = TRUE, control_group = "nevertreated",
        bstrap = FALSE, est_method = method, weightsname = wt_info$name,
        base_period = "universal"
      ))
      agg <- suppressWarnings(aggte(res, na.rm = TRUE, bstrap = FALSE))

      key <- paste0(method, "_", wt_info$label)
      expect_equal(agg$overall.att, expected[[key]], tolerance = 1e-6,
                   label = paste("Table 7:", key))
    }
  }
})


# ===========================================================================
# 2xT Event Study: No covariates, weighted, reg, nevertreated
# ===========================================================================
test_that("JEL 2xT: event study ATT(g,t) point estimates match", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = TRUE)
  mydata$treat_year <- ifelse(!is.na(mydata$yaca) & mydata$yaca == 2014, 2014, 0)
  mydata$county_code <- as.numeric(mydata$county_code)

  wt_2013 <- mydata[mydata$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  mydata <- merge(mydata, wt_2013, by = "county_code")

  mod <- att_gt(
    yname = "crude_rate_20_64", tname = "year", idname = "county_code",
    gname = "treat_year", xformla = NULL, data = mydata, panel = TRUE,
    control_group = "nevertreated", bstrap = FALSE,
    est_method = "reg", weightsname = "set_wt", base_period = "universal"
  )

  # Expected ATT(g,t) for g=2014
  expected_att <- c(
    4.1292044043,   # t=2009
    -0.5016807242,  # t=2010
    2.7531791360,   # t=2011
    2.7804626426,   # t=2012
    0.0,            # t=2013 (base period)
    -2.5628745138,  # t=2014
    -1.6973291127,  # t=2015
    0.2189009815,   # t=2016
    -0.8133358354,  # t=2017
    -1.1532954495,  # t=2018
    1.7866564429    # t=2019
  )

  expect_equal(mod$att, expected_att, tolerance = 1e-6)

  # Dynamic aggregation
  es <- suppressWarnings(aggte(mod, type = "dynamic", bstrap = FALSE))
  expect_equal(es$att.egt, expected_att, tolerance = 1e-6)

  # Overall ATT for e in {0,5}
  agg <- suppressWarnings(aggte(mod, type = "dynamic", min_e = 0, max_e = 5, bstrap = FALSE))
  expect_equal(agg$overall.att, -0.7035462478, tolerance = 1e-6)
})


# ===========================================================================
# 2xT with covariates: 3 estimation methods
# ===========================================================================
test_that("JEL 2xT: event study with covariates matches across methods", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = TRUE)
  mydata$treat_year <- ifelse(!is.na(mydata$yaca) & mydata$yaca == 2014, 2014, 0)
  mydata$county_code <- as.numeric(mydata$county_code)

  wt_2013 <- mydata[mydata$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  mydata <- merge(mydata, wt_2013, by = "county_code")

  covs_formula <- ~perc_female + perc_white + perc_hispanic + unemp_rate + poverty_rate + median_income

  for (method in c("reg", "ipw", "dr")) {
    res <- att_gt(
      yname = "crude_rate_20_64", tname = "year", idname = "county_code",
      gname = "treat_year", xformla = covs_formula,
      data = mydata, panel = TRUE, control_group = "nevertreated",
      bstrap = FALSE, est_method = method, weightsname = "set_wt",
      base_period = "universal"
    )

    es <- suppressWarnings(aggte(res, type = "dynamic", na.rm = TRUE, bstrap = FALSE))

    # Base period (e = -1) should be exactly 0
    base_idx <- which(es$egt == -1)
    expect_equal(es$att.egt[base_idx], 0, label = paste("2xT covs", method, "e=-1"))

    # All estimates should be finite
    expect_true(all(is.finite(es$att.egt[!is.na(es$att.egt)])),
                label = paste("2xT covs", method, "finite"))
  }
})


# ===========================================================================
# GxT: No covariates, weighted, notyettreated
# ===========================================================================
test_that("JEL GxT: staggered event study without covariates matches", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = FALSE)  # GxT keeps all groups
  mydata$treat_year <- ifelse(!is.na(mydata$yaca) & mydata$yaca <= 2019, mydata$yaca, 0)
  mydata$county_code <- as.numeric(mydata$county_code)

  wt_2013 <- mydata[mydata$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  mydata <- merge(mydata, wt_2013, by = "county_code")

  mod <- att_gt(
    yname = "crude_rate_20_64", tname = "year", idname = "county_code",
    gname = "treat_year", xformla = NULL, data = mydata, panel = TRUE,
    control_group = "notyettreated", bstrap = FALSE,
    weightsname = "set_wt", base_period = "universal"
  )

  # Dynamic aggregation
  es <- suppressWarnings(aggte(mod, type = "dynamic", bstrap = FALSE))

  # Expected dynamic ATT at key event times
  expected_dynamic <- c(
    `e=-5`  =  2.2186783578,
    `e=-4`  =  0.8579225109,
    `e=-3`  =  1.9161499399,
    `e=-2`  =  2.5644742860,
    `e=-1`  =  0.0,
    `e=0`   = -1.6545648988,
    `e=1`   = -0.2616435324,
    `e=2`   =  1.7055625922,
    `e=3`   = -0.5405232028,
    `e=4`   = -0.5148819184,
    `e=5`   =  1.7866564429
  )

  # Match e = -5 to e = 5
  for (e_val in -5:5) {
    idx <- which(es$egt == e_val)
    key <- paste0("e=", e_val)
    expect_equal(es$att.egt[idx], expected_dynamic[[key]], tolerance = 1e-6,
                 label = paste("GxT no covs", key))
  }

  # Overall ATT for e in {0,5}
  agg <- suppressWarnings(aggte(mod, type = "dynamic", min_e = 0, max_e = 5, bstrap = FALSE))
  expect_equal(agg$overall.att, 0.0867675805, tolerance = 1e-6,
               label = "GxT no covs overall ATT (e=0:5)")
})


# ===========================================================================
# GxT: With covariates, DR, weighted, notyettreated
# ===========================================================================
test_that("JEL GxT: staggered event study with DR covariates matches", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = FALSE)
  mydata$treat_year <- ifelse(!is.na(mydata$yaca) & mydata$yaca <= 2019, mydata$yaca, 0)
  mydata$county_code <- as.numeric(mydata$county_code)

  wt_2013 <- mydata[mydata$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  mydata <- merge(mydata, wt_2013, by = "county_code")

  covs_formula <- ~perc_female + perc_white + perc_hispanic + unemp_rate + poverty_rate + median_income

  mod <- suppressWarnings(att_gt(
    yname = "crude_rate_20_64", tname = "year", idname = "county_code",
    gname = "treat_year", xformla = covs_formula,
    data = mydata, panel = TRUE, control_group = "notyettreated",
    bstrap = FALSE, est_method = "dr", weightsname = "set_wt",
    base_period = "universal"
  ))

  # Overall ATT for e in {0,5}
  agg <- suppressWarnings(aggte(mod, type = "dynamic", min_e = 0, max_e = 5, bstrap = FALSE))
  expect_equal(agg$overall.att, -2.2469982988, tolerance = 1e-6,
               label = "GxT DR covs overall ATT (e=0:5)")

  # Dynamic aggregation at key event times
  es <- suppressWarnings(aggte(mod, type = "dynamic", bstrap = FALSE))

  expected_dynamic <- c(
    `e=-5`  =  2.6684811691,
    `e=-4`  =  2.1333836537,
    `e=-3`  =  2.8574389179,
    `e=-2`  =  2.9129673584,
    `e=-1`  =  0.0,
    `e=0`   = -1.4929553032,
    `e=1`   = -1.9693922262,
    `e=2`   = -2.7250659699,
    `e=3`   = -5.0556625460,
    `e=4`   = -4.7161630370,
    `e=5`   =  2.4772492894
  )

  for (e_val in -5:5) {
    idx <- which(es$egt == e_val)
    key <- paste0("e=", e_val)
    expect_equal(es$att.egt[idx], expected_dynamic[[key]], tolerance = 1e-6,
                 label = paste("GxT DR covs", key))
  }
})


# ===========================================================================
# Consistency: faster_mode should match regular mode for all JEL analyses
# ===========================================================================
test_that("JEL: faster_mode matches regular mode", {
  skip_on_cran()
  jel_data_path <- get_jel_data_path()
  skip_if_not(file.exists(jel_data_path), "JEL-DiD data not available")

  mydata <- load_jel_data(jel_data_path, filter_2xt = TRUE)

  # 2x2 data
  short_data <- mydata[mydata$year %in% c(2013, 2014), ]
  short_data$treat_year <- ifelse(!is.na(short_data$yaca) & short_data$yaca == 2014, 2014, 0)
  short_data$county_code <- as.numeric(short_data$county_code)

  wt_2013 <- short_data[short_data$year == 2013, c("county_code", "population_20_64")]
  names(wt_2013)[2] <- "set_wt"
  short_data <- merge(short_data, wt_2013, by = "county_code")

  covs_formula <- ~perc_female + perc_white + perc_hispanic + unemp_rate + poverty_rate + median_income

  # Test DR weighted
  res_slow <- suppressWarnings(att_gt(
    yname = "crude_rate_20_64", tname = "year", idname = "county_code",
    gname = "treat_year", xformla = covs_formula,
    data = short_data, panel = TRUE, control_group = "nevertreated",
    bstrap = FALSE, est_method = "dr", weightsname = "set_wt",
    base_period = "universal", faster_mode = FALSE
  ))
  res_fast <- suppressWarnings(att_gt(
    yname = "crude_rate_20_64", tname = "year", idname = "county_code",
    gname = "treat_year", xformla = covs_formula,
    data = short_data, panel = TRUE, control_group = "nevertreated",
    bstrap = FALSE, est_method = "dr", weightsname = "set_wt",
    base_period = "universal", faster_mode = TRUE
  ))

  expect_equal(res_slow$att, res_fast$att, tolerance = 1e-10,
               label = "2x2 DR weighted: ATTs match")

  agg_slow <- suppressWarnings(aggte(res_slow, na.rm = TRUE, bstrap = FALSE))
  agg_fast <- suppressWarnings(aggte(res_fast, na.rm = TRUE, bstrap = FALSE))
  expect_equal(agg_slow$overall.att, agg_fast$overall.att, tolerance = 1e-10,
               label = "2x2 DR weighted: aggregate ATT matches")
})
Any scripts or data that you put into this service are public.
did documentation built on June 13, 2026, 5:07 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
did
Treatment Effects with Multiple Periods and Groups

tests/testthat/test-jel_replication.R
In did: Treatment Effects with Multiple Periods and Groups

Try the did package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

did Treatment Effects with Multiple Periods and Groups

tests/testthat/test-jel_replication.R In did: Treatment Effects with Multiple Periods and Groups

Try the did package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

did
Treatment Effects with Multiple Periods and Groups

tests/testthat/test-jel_replication.R
In did: Treatment Effects with Multiple Periods and Groups