R/sc_cic.R

Defines functions print.sc_cic .fit_sc sc_cic

Documented in .fit_sc sc_cic

#' Synthetic Control Changes-in-Changes Estimator
#'
#' Combines synthetic control methods with the Changes-in-Changes estimator.
#' First constructs a synthetic control unit from donor units using elastic
#' net regularization, then applies the CIC estimator using the synthetic
#' control as the comparison group.
#'
#' @param y_treated Numeric vector. Outcome for the treated unit across all
#'   time periods (pre and post).
#' @param y_donors Numeric matrix. Outcomes for donor units, with rows as
#'   time periods (matching \code{y_treated}) and columns as donor units.
#' @param treatment_period Integer. The index (row number) of the first
#'   treatment period. Periods 1 to \code{treatment_period - 1} are
#'   pre-treatment.
#' @param alpha Elastic net mixing parameter. \code{alpha = 1} (default) is
#'   lasso; \code{alpha = 0} is ridge.
#' @param boot Logical. Compute bootstrap standard errors. Default \code{TRUE}.
#'   The bootstrap re-estimates the elastic net weights in every iteration
#'   to account for first-stage estimation uncertainty.
#' @param boot_iters Integer. Number of bootstrap iterations. Default 500.
#' @param seed Integer or \code{NULL}. Random seed for reproducibility.
#'
#' @return An object of class \code{"sc_cic"} inheriting from \code{"cic"},
#'   with components:
#'   \item{tau}{The SC-CIC average treatment effect estimate.}
#'   \item{se}{Bootstrap standard error (if \code{boot = TRUE}). Note:
#'     analytic standard errors from Athey and Imbens (2006) Theorem 5.1
#'     are \emph{not} provided for \code{sc_cic}, because they do not
#'     account for first-stage synthetic control estimation uncertainty.
#'     Use \code{\link{cic}} directly if analytic SEs are needed for a
#'     pre-specified control group.}
#'   \item{z}{z-statistic (bootstrap-based).}
#'   \item{pval}{Two-sided p-value (bootstrap-based).}
#'   \item{boot_se}{Same as \code{se} (for compatibility with \code{cic}).}
#'   \item{tau_did}{The SC-DID estimate for comparison.}
#'   \item{sc_weights}{Named vector of synthetic control weights (including
#'     intercept).}
#'   \item{sc_fitted}{Synthetic control outcome across all time periods.}
#'   \item{donors_selected}{Names of donor units with nonzero weights.}
#'   \item{pre_fit_rmse}{Root mean squared error of pre-treatment fit.}
#'
#' @details
#' The procedure works in two steps:
#'
#' \strong{Step 1: Synthetic Control Construction.}
#' In the pre-treatment period, the treated unit's outcome is regressed on
#' the donor units' outcomes using elastic net (via \code{\link[glmnet]{cv.glmnet}}).
#' This yields a sparse set of weights that construct a synthetic control unit
#' as a weighted combination of donors.
#'
#' \strong{Step 2: CIC Estimation.}
#' The CIC estimator is applied with the synthetic control as the "control
#' group" and the treated unit as the "treatment group."
#'
#' \strong{Inference.}
#' Because the synthetic control is an estimated object, the analytic
#' asymptotic variance of Athey and Imbens (2006) does not directly apply.
#' Instead, \code{sc_cic} provides bootstrap standard errors that
#' re-estimate the elastic net weights in each bootstrap iteration,
#' thereby accounting for first-stage estimation uncertainty. The bootstrap
#' resamples time periods (with replacement) within the pre-treatment and
#' post-treatment windows separately, preserving the panel structure.
#'
#' @references
#' Athey, S. and Imbens, G. W. (2006). Identification and Inference in
#' Nonlinear Difference-in-Differences Models. \emph{Econometrica},
#' 74(2), 431--497.
#'
#' Abadie, A., Diamond, A., and Hainmueller, J. (2010). Synthetic Control
#' Methods for Comparative Case Studies. \emph{Journal of the American
#' Statistical Association}, 105(490), 493--505.
#'
#' @examples
#' \donttest{
#' # Basque Country example
#' if (requireNamespace("Synth", quietly = TRUE)) {
#'   data("basque", package = "Synth")
#'   gdp <- reshape(basque[, c("regionno", "year", "gdpcap")],
#'                  idvar = "year", timevar = "regionno", direction = "wide")
#'   y_treated <- gdp[, "gdpcap.17"]
#'   donors <- as.matrix(gdp[, grep("gdpcap\\.", names(gdp))])
#'   donors <- donors[, !colnames(donors) %in% c("gdpcap.17", "gdpcap.1")]
#'   valid <- complete.cases(y_treated, donors)
#'   result <- sc_cic(y_treated[valid], donors[valid, ],
#'                    treatment_period = 16, seed = 42)
#'   print(result)
#' }
#' }
#'
#' @export
sc_cic <- function(y_treated, y_donors, treatment_period,
                   alpha = 1, boot = TRUE,
                   boot_iters = 500L, seed = NULL) {

  # Input validation
  stopifnot(is.numeric(y_treated))
  if (!is.matrix(y_donors)) y_donors <- as.matrix(y_donors)
  stopifnot(nrow(y_donors) == length(y_treated))
  stopifnot(treatment_period > 1, treatment_period <= length(y_treated))

  T_total <- length(y_treated)
  pre_idx <- seq_len(treatment_period - 1)
  post_idx <- seq(treatment_period, T_total)
  T_pre <- length(pre_idx)
  T_post <- length(post_idx)

  # Remove donors with NAs in pre-period
  good_cols <- colSums(is.na(y_donors[pre_idx, , drop = FALSE])) == 0
  y_donors_clean <- y_donors[, good_cols, drop = FALSE]

  # Step 1: Construct synthetic control via elastic net
  sc_result <- .fit_sc(y_treated, y_donors_clean, pre_idx, post_idx, alpha, seed)

  # Step 2: Apply CIC (no analytic SE — not valid for two-stage estimator)
  cic_result <- cic(sc_result$y_00, sc_result$y_01,
                    sc_result$y_10, sc_result$y_11,
                    se = FALSE, boot = FALSE)

  # Pre-treatment fit diagnostics
  pre_fit_rmse <- sqrt(mean((sc_result$y_10 - sc_result$y_00)^2))

  # Distributional alignment check (KS test on pre-treatment SC vs treated)
  ks_result <- tryCatch(
    stats::ks.test(sc_result$y_00, sc_result$y_10),
    error = function(e) NULL
  )
  ks_pval <- if (!is.null(ks_result)) ks_result$p.value else NA_real_
  if (!is.na(ks_pval) && ks_pval < 0.10) {
    warning("Pre-treatment distributional mismatch detected (KS test p = ",
            round(ks_pval, 3), "). ",
            "The synthetic control matches the treated unit's mean but not ",
            "its distribution. CIC estimates may be unreliable. ",
            "Inspect plot_qq() for details.",
            call. = FALSE)
  }

  # Effective number of donors (non-zero weights excluding intercept)
  n_donors_eff <- sum(sc_result$weights[-1] != 0)

  # Bootstrap: re-estimate SC weights in each iteration
  boot_se_val <- NA_real_
  boot_taus <- NULL
  if (boot) {
    if (!is.null(seed)) set.seed(seed)
    boot_taus <- replicate(boot_iters, {
      # Resample pre-treatment periods (with replacement)
      b_pre <- sample(pre_idx, T_pre, replace = TRUE)
      # Resample post-treatment periods (with replacement)
      b_post <- sample(post_idx, T_post, replace = TRUE)

      # Re-estimate SC weights on resampled pre-treatment data
      b_sc <- tryCatch({
        .fit_sc(y_treated, y_donors_clean, b_pre, b_post, alpha, seed = NULL)
      }, error = function(e) NULL)

      if (is.null(b_sc)) return(NA_real_)

      # Compute CIC tau on resampled data
      b_cic <- tryCatch({
        cic(b_sc$y_00, b_sc$y_01, b_sc$y_10, b_sc$y_11,
            se = FALSE, boot = FALSE)$tau
      }, error = function(e) NA_real_)

      b_cic
    })

    boot_taus <- boot_taus[!is.na(boot_taus)]
    if (length(boot_taus) > 10) {
      boot_se_val <- stats::sd(boot_taus)
    }
  }

  # Assemble result
  se_val <- boot_se_val
  z_val <- if (!is.na(se_val) && se_val > 0) cic_result$tau / se_val else NA_real_
  pval <- if (!is.na(z_val)) 2 * stats::pnorm(-abs(z_val)) else NA_real_

  nonzero_idx <- which(sc_result$weights != 0)

  structure(list(
    tau = cic_result$tau,
    se = se_val,
    z = z_val,
    pval = pval,
    counterfactual_mean = cic_result$counterfactual_mean,
    tau_did = cic_result$tau_did,
    N = cic_result$N,
    n = cic_result$n,
    boot_se = boot_se_val,
    boot_taus = boot_taus,
    V_components = NULL,
    ecdfs = cic_result$ecdfs,
    cf_values = cic_result$cf_values,
    sc_weights = sc_result$weights,
    sc_fitted = sc_result$fitted,
    donors_selected = names(sc_result$weights)[nonzero_idx],
    pre_fit_rmse = pre_fit_rmse,
    ks_pval = ks_pval,
    n_donors_eff = n_donors_eff,
    cv_fit = sc_result$cv_fit,
    y_treated = y_treated,
    treatment_period = treatment_period
  ), class = c("sc_cic", "cic"))
}

#' Fit synthetic control via elastic net (internal)
#' @keywords internal
.fit_sc <- function(y_treated, y_donors, pre_idx, post_idx, alpha, seed) {
  X_pre <- y_donors[pre_idx, , drop = FALSE]
  y_pre <- y_treated[pre_idx]

  if (!is.null(seed)) set.seed(seed)
  cv_fit <- glmnet::cv.glmnet(X_pre, y_pre, alpha = alpha)
  coefs <- stats::coef(cv_fit, s = "lambda.min")
  w <- as.numeric(coefs)
  names(w) <- c("intercept", colnames(y_donors))

  # Construct SC for all periods
  X_all <- cbind(1, y_donors)
  X_all[is.na(X_all)] <- 0
  fitted <- as.numeric(X_all %*% w)

  list(
    weights = w,
    fitted = fitted,
    cv_fit = cv_fit,
    y_00 = fitted[pre_idx],
    y_01 = fitted[post_idx],
    y_10 = y_treated[pre_idx],
    y_11 = y_treated[post_idx]
  )
}

#' @export
print.sc_cic <- function(x, digits = 4, ...) {
  cat("\nSynthetic Control + Changes-in-Changes Estimator\n")
  cat(strrep("-", 50), "\n")
  cat("SC donors selected:", x$n_donors_eff, "\n")
  cat("Pre-treatment RMSE:", formatC(x$pre_fit_rmse, digits = digits, format = "f"), "\n")
  if (!is.na(x$ks_pval)) {
    cat("Pre-treatment KS p: ", formatC(x$ks_pval, digits = 3, format = "f"),
        if (x$ks_pval < 0.10) " [WARNING: distributional mismatch]" else " [OK]",
        "\n", sep = "")
  }
  cat(strrep("-", 50), "\n")
  cat("  tau^CIC  =", formatC(x$tau, digits = digits, format = "f"), "\n")
  cat("  tau^DID  =", formatC(x$tau_did, digits = digits, format = "f"), "\n")
  if (!is.na(x$se)) {
    cat("  boot SE  =", formatC(x$se, digits = digits, format = "f"), "\n")
    cat("  z-stat   =", formatC(x$z, digits = digits, format = "f"), "\n")
    cat("  p-value  =", format.pval(x$pval, digits = digits), "\n")
  }
  cat("  N        =", x$N,
      sprintf("(%d/%d/%d/%d)", x$n[1], x$n[2], x$n[3], x$n[4]), "\n")
  invisible(x)
}

Try the sccic package in your browser

Any scripts or data that you put into this service are public.

sccic documentation built on April 10, 2026, 5:07 p.m.