R/event_study.R
In fixes: Tools for Creating and Visualizing Fixed-Effects Event Study Models

Documented in plot_es run_es

#' Run Event Study with Fixed Effects
#'
#' This function performs an event study using fixed effects regression based on a panel dataset.
#' It generates dummy variables for each lead and lag period relative to the treatment timing,
#' applies optional covariates and fixed effects, and estimates the model using `fixest::feols`.
#'
#' @param data A data frame containing the panel dataset.
#' @param outcome The outcome variable, specified unquoted. You may use a raw variable name
#' (e.g., `y`) or a transformation (e.g., `log(y)`).
#' @param treatment The treatment indicator (unquoted). Can be binary numeric (`0/1`) or logical (`TRUE/FALSE`). Typically equals 1 (or `TRUE`) in and after the treated period, 0 otherwise.
#' @param time The time variable (unquoted). Used to calculate the relative timing.
#' @param staggered Logical. If `TRUE`, allows treatment timing to vary across units. Requires `timing` to be a column name. Default is `FALSE`.
#' @param timing The time period when the treatment occurs. If `staggered = FALSE`, must be a single numeric value (e.g., `2005`). If `staggered = TRUE`, must be an unquoted variable name representing the treatment timing for each unit.
#' If `time_transform = TRUE`, specify `timing` as an integer corresponding to the transformed time index within each unit (e.g., 5 for the fifth time point).
#' @param lead_range Number of pre-treatment periods to include as leads (e.g., 5 = `lead5`, `lead4`, ..., `lead1`). If `NULL`, the function will automatically determine the maximum possible lead across all units.
#' @param lag_range Number of post-treatment periods to include as lags (e.g., 3 = `lag0`, `lag1`, `lag2`, `lag3`). If `NULL`, the function will automatically determine the maximum possible lag across all units.
#' @param covariates Optional covariates to include in the regression. Must be supplied as a one-sided formula (e.g., `~ x1 + x2`).
#' @param fe Fixed effects to control for unobserved heterogeneity. Must be a one-sided formula (e.g., `~ id + year`).
#' @param cluster Clustering specification for robust standard errors. Accepts either:
#' - a character vector of column names (e.g., `c("id", "year")`), or
#' - a one-sided formula (e.g., `~ id + year` or `~ id^year`).
#' Cluster variables are internally re-evaluated after filtering for the estimation window.
#' @param weights Optional observation weights. Must be supplied as a one-sided formula (e.g., `~ popwt`). If `NULL`, unweighted regression is performed.
#' @param baseline The relative time (e.g., `-1`) to use as the reference period.
#' The corresponding dummy variable will be excluded from the regression and added manually to the results with estimate 0.
#' Must lie within the specified `lead_range` and `lag_range`. If not, an error will be thrown.
#' @param interval The interval between time periods. Use `1` for annual data (default), `5` for 5-year intervals, etc.
#' @param time_transform Logical. If TRUE, the time variable will be converted to a unit-level sequence (1, 2, 3, ...) based on its order within each unit. Useful for panel data with non-continuous time variables. Default is FALSE.
#' @param unit The unit (individual) identifier for panel data. Required when `time_transform = TRUE`. Must be an unquoted variable name (e.g., `id`).
#'
#' @return A tibble with the event study regression results, including:
#' - `term`: Name of the lead or lag dummy variable
#' - `estimate`: Coefficient estimate
#' - `std.error`: Standard error
#' - `statistic`: t-statistic
#' - `p.value`: p-value
#' - `conf_high`: Upper bound of 95% confidence interval
#' - `conf_low`: Lower bound of 95% confidence interval
#' - `relative_time`: Time scaled relative to the treatment
#' - `is_baseline`: Logical indicator for the baseline term (equals `TRUE` only for the excluded dummy)
#'
#' @details
#' This function is intended for difference-in-differences or event study designs with panel data.
#' It automatically:
#' - Computes relative time: \code{(time - timing) / interval}
#' - Generates dummy variables for specified leads and lags
#' - Removes the baseline term from estimation and appends it back post-estimation
#' - Uses \code{fixest::feols()} for fast and flexible estimation
#'
#' Both fixed effects and clustering are fully supported. Observation weights can be specified using the `weights` argument.
#'
#' If `time_transform = TRUE`, the time variable is internally replaced with a unit-level sequence
#' (e.g., 1, 2, 3, ...) based on its order within each unit (as specified by the `unit` argument).
#' This is useful when the time variable is irregular (e.g., Date-type data or monthly data with gaps).
#' Note that in this case, the `timing` argument must be specified based on the transformed index
#' (e.g., 5 corresponds to the fifth time point in the sorted order within each unit).
#'
#' @examples
#' \dontrun{
#' # Assume df is a panel dataset with variables: id, year, y, treat, x1, x2, var1, var2, popwt
#'
#' # Minimal example without covariates
#' run_es(
#'   data       = df,
#'   outcome    = y,
#'   treatment  = treat,
#'   time       = year,
#'   timing     = 2005,
#'   lead_range = 2,
#'   lag_range  = 2,
#'   fe         = ~ id + year,
#'   cluster    = ~ id,
#'   baseline   = -1
#' )
#'
#' # Weighted regression
#' run_es(
#'   data       = df,
#'   outcome    = y,
#'   treatment  = treat,
#'   time       = year,
#'   timing     = 2005,
#'   lead_range = 2,
#'   lag_range  = 2,
#'   covariates = ~ x1 + x2,
#'   fe         = ~ id + year,
#'   cluster    = ~ id,
#'   weights    = ~ wt,
#'   baseline   = -1
#' )
#'
#' # Example with staggered treatment timing
#' # Suppose `treat_time` indicates the year each unit was treated
#' run_es(
#'   data       = df,
#'   outcome    = y,
#'   treatment  = is_treated,
#'   time       = year,
#'   staggered  = TRUE,
#'   timing     = treat_time,  # a variable with treatment years per unit
#'   lead_range = 3,
#'   lag_range  = 4,
#'   fe         = ~ id + year,
#'   cluster    = ~ id,
#'   baseline   = -1
#' )
#' }
#' @importFrom stats as.formula
#' @export
run_es <- function(data,
                   outcome,
                   treatment,
                   time,
                   staggered = FALSE,
                   timing,
                   lead_range = NULL,
                   lag_range = NULL,
                   covariates = NULL,
                   fe,
                   cluster = NULL,
                   weights = NULL,
                   baseline = -1,
                   interval = 1,
                   time_transform = FALSE,
                   unit = NULL) {

  # ---- 0. Helper function: generate lead/lag term name ----
  get_term_name <- function(i) {
    if (i < 0) paste0("lead", abs(i)) else paste0("lag", i)
  }

  # ---- 1. Helper function: process cluster argument ----
  process_cluster <- function(cluster, data) {
    if (is.null(cluster)) return(NULL)
    if (inherits(cluster, "formula")) {
      cl_vars <- all.vars(cluster)
      missing_vars <- cl_vars[!cl_vars %in% colnames(data)]
      if (length(missing_vars) > 0) {
        stop("The following cluster variables from the formula are not in data: ",
             paste(missing_vars, collapse = ", "))
      }
      return(cluster)
    } else if (is.character(cluster)) {
      missing_vars <- cluster[!cluster %in% colnames(data)]
      if (length(missing_vars) > 0) {
        stop("The following cluster variables are not in data: ",
             paste(missing_vars, collapse = ", "))
      }
      return(cluster)
    } else {
      stop("Invalid type for cluster argument. Please supply either a formula or character vector.")
    }
  }

  # ---- 2. Process outcome, treatment, time ----
  outcome_expr <- rlang::enexpr(outcome)
  treatment_sym <- rlang::ensym(treatment)
  time_sym <- rlang::ensym(time)

  if (rlang::is_symbol(outcome_expr)) {
    outcome_chr <- rlang::as_string(outcome_expr)
    if (!outcome_chr %in% colnames(data)) {
      stop("The specified outcome ('", outcome_chr, "') does not exist in the dataframe.")
    }
  } else if (!rlang::is_call(outcome_expr)) {
    stop("The specified outcome must be either a column name or a function call (e.g., log(variable)).")
  }

  treatment_chr <- rlang::as_string(treatment_sym)
  if (!treatment_chr %in% colnames(data)) {
    stop("The specified treatment ('", treatment_chr, "') does not exist in the dataframe.")
  }

  time_chr <- rlang::as_string(time_sym)
  if (!time_chr %in% colnames(data)) {
    stop("The specified time ('", time_chr, "') does not exist in the dataframe.")
  }

  if (inherits(data[[time_chr]], "Date")) {
    data[[time_chr]] <- as.numeric(data[[time_chr]])
  }

  if (!is.numeric(data[[time_chr]])) {
    stop("The `time` variable must be either numeric or Date. Please convert it before passing to `run_es()`.")
  }

  # ---- 3. Process timing ----
  if (staggered && time_transform) {
    stop("The combination of staggered = TRUE and time_transform = TRUE is not currently supported.")
  }

  timing_sym <- NULL

  if (staggered) {
    timing_sym <- rlang::ensym(timing)
    timing_chr <- rlang::as_string(timing_sym)
    if (!timing_chr %in% colnames(data)) {
      stop("The specified timing variable ('", timing_chr, "') does not exist in the dataframe.")
    }
  } else {
    timing_sym <- rlang::enexpr(timing)
    if (!is.numeric(eval(timing_sym))) {
      stop("When staggered = FALSE, timing must be a single numeric value.")
    }
  }

  # ---- 4. Process time_transform ----
  if (time_transform) {
    if (missing(unit)) {
      stop("When time_transform = TRUE, you must specify the `unit` argument (e.g., unit = id).")
    }

    id_sym <- rlang::ensym(unit)
    id_chr <- rlang::as_string(id_sym)
    if (!id_chr %in% colnames(data)) {
      stop("The specified unit variable ('", id_chr, "') does not exist in the dataframe.")
    }

    data <- data |>
      dplyr::group_by(!!id_sym) |>
      dplyr::arrange(!!time_sym, .by_group = TRUE) |>
      dplyr::mutate(time_index = dplyr::row_number()) |>
      dplyr::ungroup()

    time_sym <- rlang::sym("time_index")
  }

  if (!time_transform && !is.null(unit)) {
    warning("The `unit` argument is ignored because `time_transform = FALSE`.")
  }

  # ---- 5. Process covariates ----
  cov_text <- ""
  if (!rlang::quo_is_null(rlang::enquo(covariates))) {
    if (!inherits(covariates, "formula")) {
      stop("Invalid input for covariates. Please supply a one-sided formula (e.g., ~ x1 + log(x2)).")
    }
    cov_rhs <- rlang::f_rhs(covariates)
    cov_text <- rlang::expr_text(cov_rhs)
    cov_vars <- all.vars(cov_rhs)
    missing_cov <- cov_vars[!cov_vars %in% colnames(data)]
    if (length(missing_cov) > 0) {
      stop("Missing covariates: ", paste(missing_cov, collapse = ", "))
    }
  }

  # ---- 6. Process fixed effects (fe) ----
  if (!inherits(fe, "formula")) {
    stop("Invalid input for fe. Please supply a one-sided formula (e.g., ~ id + year).")
  }

  fe_rhs <- rlang::f_rhs(fe)
  fe_vars <- all.vars(fe_rhs)
  missing_fes <- fe_vars[!fe_vars %in% colnames(data)]
  if (length(missing_fes) > 0) {
    stop("The specified fixed effects variable(s) (", paste(missing_fes, collapse = ", "), ") do not exist in the dataframe.")
  }

  fe_text <- paste(fe_vars, collapse = "+")

  # ---- 7. Process cluster if provided ----
  if (!is.null(cluster)) {
    cluster <- process_cluster(cluster, data)
  }

  # ---- 8. Create relative time ----
  data <- data |>
    dplyr::mutate(
      relative_time = if (staggered)
        (!!time_sym - !!timing_sym) / interval
      else
        (!!time_sym - timing) / interval
    )

  # ---- 9. Auto set lead/lag range if NULL ----
  if (is.null(lead_range) || is.null(lag_range)) {
    min_rt <- floor(min(data$relative_time, na.rm = TRUE))
    max_rt <- ceiling(max(data$relative_time, na.rm = TRUE))
    if (is.null(lead_range)) lead_range <- abs(min_rt)
    if (is.null(lag_range))  lag_range  <- max_rt
  }

  # ---- 10. Check that baseline is within range ----
  if (baseline < -lead_range || baseline > lag_range) {
    stop("The specified baseline (", baseline,
         ") is outside the range defined by lead_range (", -lead_range,
         ") and lag_range (", lag_range, ").")
  }

  # ---- 11. Check for existing columns to be overwritten ----
  existing_terms <- vapply(seq(-lead_range, lag_range), get_term_name, character(1))
  overwritten <- existing_terms[existing_terms %in% names(data)]
  if (length(overwritten) > 0) {
    warning("The following columns already exist in the data and will be overwritten: ",
            paste(overwritten, collapse = ", "))
  }

  # ---- 12. Create lead and lag dummy variables ----
  for (i in seq(-lead_range, lag_range, by = 1)) {
    col_name <- get_term_name(i)
    data <- data |>
      dplyr::mutate(
        !!col_name := dplyr::if_else(
          !is.na(relative_time) &
            !!treatment_sym &
            round(relative_time, 5) == i, 1L, 0L
        )
      )
  }

  # ---- 13. Construct and estimate the regression model ----
  outcome_expr_text <- rlang::expr_deparse(outcome_expr)

  all_terms <- c(
    paste0("lead", seq(lead_range, 1)),
    paste0("lag", seq(0, lag_range))
  )
  baseline_term <- get_term_name(baseline)
  included_terms <- setdiff(all_terms, baseline_term)

  RHS_formula <- paste(included_terms, collapse = "+")
  if (cov_text != "") {
    RHS_formula <- paste0(RHS_formula, "+", cov_text)
  }

  formula_string <- paste0(outcome_expr_text, " ~ ", RHS_formula, " | ", fe_text)
  model_formula <- as.formula(formula_string)

  # ---- 14. Estimate model (with optional weights) ----
  if (!is.null(weights)) {
    weights_formula <- rlang::enexpr(weights)
    model <- fixest::feols(model_formula, data = data, cluster = cluster, weights = weights_formula)
  } else {
    model <- fixest::feols(model_formula, data = data, cluster = cluster)
  }

  # ---- 15. Format the results ----
  result <- broom::tidy(model)

  baseline_row <- tibble::tibble(
    term = baseline_term,
    estimate = 0,
    std.error = 0,
    statistic = NA_real_,
    p.value = NA_real_
  )

  full_order <- c(
    paste0("lead", seq(lead_range, 1)),
    paste0("lag", seq(0, lag_range))
  )
  result <- result |>
    dplyr::bind_rows(baseline_row) |>
    dplyr::mutate(term = factor(term, levels = full_order)) |>
    dplyr::arrange(term) |>
    dplyr::mutate(
      conf_high = estimate + 1.96 * std.error,
      conf_low  = estimate - 1.96 * std.error
    ) |>
    dplyr::filter(!is.na(term))

  # ---- 16. Attach relative_time mapping ----
  rel_times <- c(seq(-lead_range, -1), seq(0, lag_range)) * interval
  rel_map <- tibble::tibble(
    term = factor(full_order, levels = full_order),
    relative_time = rel_times
  )
  result <- result |>
    dplyr::left_join(rel_map, by = "term") |>
    dplyr::mutate(is_baseline = (term == baseline_term))

  return(result)
}


#' Plot Event Study Results
#'
#' This function creates a plot for event study results using `ggplot2`.
#' Users can choose between ribbon-style confidence intervals or error bars
#' to visualize the estimates and their uncertainty.
#'
#' @param data A dataframe containing the results from the `run_es` function.
#' The dataframe must include the following columns:
#' - `relative_time`: The scaled time relative to the treatment.
#' - `estimate`: The estimated coefficients.
#' - `conf_low`: The lower bound of the 95% confidence interval.
#' - `conf_high`: The upper bound of the 95% confidence interval.
#' - `std.error`: The standard errors (required if `type = "errorbar"`).
#' @param type The type of confidence interval visualization: "ribbon" (default) or "errorbar".
#' - "ribbon": Shaded area representing the confidence intervals.
#' - "errorbar": Vertical error bars for each estimate.
#' @param vline_val The x-intercept for the vertical reference line (default: 0).
#' Typically represents the time of treatment.
#' @param vline_color The color of the vertical reference line (default: "#000").
#' @param hline_val The y-intercept for the horizontal reference line (default: 0).
#' Usually represents the null effect line.
#' @param hline_color The color of the horizontal reference line (default: "#000").
#' @param linewidth The width of the lines in the plot (default: 1).
#' @param pointsize The size of the points for the estimates (default: 2).
#' @param alpha The transparency level for the ribbon (default: 0.2).
#' @param barwidth The width of the error bars (default: 0.2).
#' @param color The color of the lines and points (default: "#B25D91FF").
#' @param fill The fill color for the ribbon (default: "#B25D91FF").
#' @return A ggplot object displaying the event study results. The plot includes:
#' - A line connecting the estimates over relative time.
#' - Points for the estimated coefficients.
#' - Either ribbon-style confidence intervals or error bars, depending on `type`.
#' - Vertical and horizontal reference lines for better interpretability.
#' @details
#' This function provides a flexible visualization tool for event study results.
#' Users can customize the appearance of the plot by adjusting the parameters
#' for line styles, point sizes, colors, and confidence interval types.
#'
#' **Column Requirements**:
#' The input dataframe (`data`) must include:
#' - `relative_time`: A numeric column for the time relative to the treatment.
#' - `estimate`: The estimated coefficients for each relative time.
#' - `conf_low` and `conf_high`: The bounds of the confidence intervals.
#' - `std.error`: The standard errors (only required if `type = "errorbar"`).
#'
#' **Type Options**:
#' - `"ribbon"`: A shaded area to represent the confidence intervals.
#' - `"errorbar"`: Error bars for each point. Standard errors (`std.error`) are required.
#'
#' @note If `type = "errorbar"`, ensure that the `std.error` column is present in the input dataframe.
#' Missing values in the `std.error` column for any term will result in incomplete confidence intervals.
#'
#' @examples
#' \dontrun{
#' # Run event study
#' event_study <- run_es(
#'   data       = df,
#'   outcome    = y,
#'   treatment  = is_treated,
#'   time       = year,
#'   timing     = 2005,
#'   lead_range = 5,              # Corresponds to years 2000-2004 (relative time: -5 to -1)
#'   lag_range  = 4,              # Corresponds to years 2006-2009 (relative time: 1 to 4)
#'   fe         = firm_id + year,
#'   cluster    = "state_id",
#'   baseline   = -1,
#'   interval   = 1
#' )
#'
#' # Basic plot
#' plot_es(event_study)
#'
#' # Use error bars instead of ribbon confidence intervals
#' plot_es(event_study, type = "errorbar")
#'
#' # Adjust vertical reference line
#' plot_es(event_study, type = "errorbar", vline_val = -0.5)
#'
#' # Customize axis breaks and title
#' library(ggplot2)
#' plot_es(event_study, type = "errorbar") +
#'   ggplot2::scale_x_continuous(breaks = seq(-5, 4, by = 1)) +
#'   ggplot2::ggtitle("Result of Event Study")
#' }
#'
#' @export
plot_es <- function(data,
                    type = "ribbon",
                    vline_val = 0,
                    vline_color = "#000",
                    hline_val = 0,
                    hline_color = "#000",
                    linewidth = 1,
                    pointsize = 2,
                    alpha = .2,
                    barwidth = .2,
                    color = "#B25D91FF",
                    fill = "#B25D91FF") {
  # Validate the type of confidence interval visualization
  if (!type %in% c("ribbon", "errorbar")) {
    stop("Invalid type. Please choose 'ribbon' or 'errorbar'.")
  }

  # Create the base plot with vertical and horizontal reference lines
  base_plot <-
    ggplot2::ggplot(
      data = data,
      ggplot2::aes(x = relative_time, y = estimate, group = 1)
    ) +
    ggplot2::geom_vline(
      xintercept = vline_val,
      linetype = "dashed",
      color = vline_color
    ) +
    ggplot2::geom_hline(
      yintercept = hline_val,
      linetype = "dashed",
      color = hline_color
    ) +
    ggplot2::geom_point(
      size = pointsize,
      color = color
    ) +
    ggplot2::scale_x_continuous(
      breaks = seq(floor(min(data$relative_time, na.rm = TRUE)),
                   ceiling(max(data$relative_time, na.rm = TRUE)),
                   by = 1)
    ) +
    ggplot2::labs(
      x = "Relative Time to Treatment",
      y = "Estimate and 95% Confidence Interval"
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid.minor = ggplot2::element_blank()
    )

  # Add ribbon-style confidence intervals
  if (type == "ribbon") {
    base_plot <- base_plot +
      ggplot2::geom_ribbon(
        ggplot2::aes(ymin = conf_low, ymax = conf_high),
        fill = fill,
        alpha = alpha
      ) +
      ggplot2::geom_line(
        linewidth = linewidth,
        color = color
      )
  }
  # Add error bars for confidence intervals
  else if (type == "errorbar") {
    # If we have the "is_baseline" column, set the baseline row's std.error to NA
    # so that the error bars disappear for the baseline row.
    if ("is_baseline" %in% colnames(data)) {
      data <- data |>
        dplyr::mutate(
          std.error = dplyr::if_else(is_baseline, NA_real_, std.error),
          conf_high = estimate + 1.96 * std.error,
          conf_low  = estimate - 1.96 * std.error
        )
    } else {
      # Fallback if is_baseline doesn't exist (no changes)
      data <- data |>
        dplyr::mutate(
          conf_high = estimate + 1.96 * std.error,
          conf_low  = estimate - 1.96 * std.error
        )
    }

    base_plot <- base_plot +
      ggplot2::geom_errorbar(
        data = data,
        ggplot2::aes(ymin = conf_low, ymax = conf_high),
        color = color,
        width = barwidth,
        linewidth = linewidth
      )
  }

  return(base_plot)
}