R/cic.R
In sccic: Synthetic Control Changes-in-Changes Estimator

Documented in cic

#' Changes-in-Changes Estimator
#'
#' Implements the Changes-in-Changes (CIC) estimator of Athey and Imbens (2006)
#' for the average treatment effect on the treated in a two-group, two-period
#' difference-in-differences setting.
#'
#' @param y_00 Numeric vector. Outcomes for the control group in the
#'   pre-treatment period.
#' @param y_01 Numeric vector. Outcomes for the control group in the
#'   post-treatment period.
#' @param y_10 Numeric vector. Outcomes for the treated group in the
#'   pre-treatment period.
#' @param y_11 Numeric vector. Outcomes for the treated group in the
#'   post-treatment period.
#' @param se Logical. If \code{TRUE} (default), compute analytic standard
#'   errors using the asymptotic variance from Theorem 5.1 of Athey and
#'   Imbens (2006). Ignored (with a message) when \code{discrete = TRUE},
#'   because Theorem 5.1 is derived under the continuous-distribution
#'   assumption.
#' @param boot Logical. If \code{TRUE}, also compute bootstrap standard
#'   errors. Default is \code{FALSE}.
#' @param boot_iters Integer. Number of bootstrap iterations. Default 500.
#' @param seed Integer or \code{NULL}. Random seed for bootstrap.
#' @param discrete Logical. If \code{FALSE} (default), use the continuous
#'   CIC estimator (Theorem~3.1 of Athey and Imbens 2006), which applies
#'   \eqn{F^{-1}_{01}(F_{00}(y_{10,i}))} to each pre-treatment treated
#'   observation. If \code{TRUE}, use the discrete CIC estimator
#'   (Theorem~4.1), which integrates the counterfactual over the quantile
#'   band \eqn{[F_{00}(y^-), F_{00}(y)]} to handle mass points in the
#'   outcome distribution. Use \code{discrete = TRUE} when the outcome
#'   takes a small number of distinct values (e.g., integer counts).
#'
#' @return An object of class \code{"cic"} containing:
#'   \item{tau}{The CIC average treatment effect estimate.}
#'   \item{se}{Analytic standard error (if \code{se = TRUE}).}
#'   \item{z}{z-statistic.}
#'   \item{pval}{Two-sided p-value.}
#'   \item{counterfactual_mean}{Mean of the counterfactual distribution.}
#'   \item{tau_did}{The standard DID estimate for comparison.}
#'   \item{N}{Total sample size.}
#'   \item{n}{Named vector of group sample sizes.}
#'   \item{boot_se}{Bootstrap standard error (if \code{boot = TRUE}).}
#'   \item{ecdfs}{List of empirical CDF objects for each group.}
#'
#' @details
#' The CIC estimator constructs a counterfactual distribution for the treated
#' group in the post-treatment period by applying the transformation:
#' \deqn{Y^{N,CIC}_{11} = F^{-1}_{Y,01}(F_{Y,00}(Y_{10}))}
#'
#' The average treatment effect is then:
#' \deqn{\hat{\tau}^{CIC} = \frac{1}{N_{11}} \sum Y_{11,i}
#'   - \frac{1}{N_{10}} \sum F^{-1}_{Y,01}(F_{Y,00}(Y_{10,i}))}
#'
#' The analytic variance follows Theorem 5.1 of Athey and Imbens (2006):
#' \deqn{Var(\sqrt{N} \hat{\tau}^{CIC}) = V^p/\alpha_{00} + V^q/\alpha_{01}
#'   + V^r/\alpha_{10} + V^s/\alpha_{11}}
#'
#' @references
#' Athey, S. and Imbens, G. W. (2006). Identification and Inference in
#' Nonlinear Difference-in-Differences Models. \emph{Econometrica},
#' 74(2), 431--497. \doi{10.1111/j.1468-0262.2006.00668.x}
#'
#' @examples
#' # Workers' compensation example (Meyer, Viscusi, and Durbin 1995)
#' if (requireNamespace("wooldridge", quietly = TRUE)) {
#'   data("injury", package = "wooldridge")
#'   result <- cic(
#'     y_00 = injury$ldurat[injury$highearn == 0 & injury$afchnge == 0],
#'     y_01 = injury$ldurat[injury$highearn == 0 & injury$afchnge == 1],
#'     y_10 = injury$ldurat[injury$highearn == 1 & injury$afchnge == 0],
#'     y_11 = injury$ldurat[injury$highearn == 1 & injury$afchnge == 1]
#'   )
#'   print(result)
#' }
#'
#' @export
cic <- function(y_00, y_01, y_10, y_11,
                se = TRUE, boot = FALSE, boot_iters = 500L, seed = NULL,
                discrete = FALSE) {

  # Input validation
  stopifnot(is.numeric(y_00), is.numeric(y_01),
            is.numeric(y_10), is.numeric(y_11))
  stopifnot(length(y_00) > 0, length(y_01) > 0,
            length(y_10) > 0, length(y_11) > 0)

  # Build empirical CDFs
  ec_00 <- make_ecdf(y_00)
  ec_01 <- make_ecdf(y_01)
  ec_10 <- make_ecdf(y_10)
  ec_11 <- make_ecdf(y_11)

  N_00 <- ec_00$n; N_01 <- ec_01$n
  N_10 <- ec_10$n; N_11 <- ec_11$n
  N <- N_00 + N_01 + N_10 + N_11

  # Counterfactual values via continuous or discrete CIC formula
  cf_values <- compute_cic_cf(y_10, ec_00, ec_01, discrete = discrete)

  counterfactual_mean <- mean(cf_values)
  tau <- mean(y_11) - counterfactual_mean

  # Standard DID for comparison
  tau_did <- (mean(y_11) - mean(y_10)) - (mean(y_01) - mean(y_00))

  # Analytic standard errors (Theorem 5.1, continuous case only)
  se_val <- NA_real_
  z_val <- NA_real_
  pval <- NA_real_
  V_components <- NULL

  if (discrete && se) {
    message("Analytic standard errors (Theorem 5.1) assume a continuous ",
            "outcome distribution and are not valid when discrete = TRUE. ",
            "Use boot = TRUE for inference.")
    se <- FALSE
  }

  if (se) {
    ecdfs <- list(ec_00 = ec_00, ec_01 = ec_01,
                  ec_10 = ec_10, ec_11 = ec_11)
    V_components <- compute_variance_components(
      y_00, y_01, y_10, y_11, ecdfs)
    V_p <- V_components$V_p
    V_q <- V_components$V_q
    V_r <- V_components$V_r
    V_s <- V_components$V_s

    a_00 <- N_00 / N; a_01 <- N_01 / N
    a_10 <- N_10 / N; a_11 <- N_11 / N
    avar <- V_p / a_00 + V_q / a_01 + V_r / a_10 + V_s / a_11
    se_val <- sqrt(avar) / sqrt(N)
    z_val <- tau / se_val
    pval <- 2 * stats::pnorm(-abs(z_val))
  }

  # Bootstrap
  boot_se_val <- NA_real_
  if (boot) {
    if (!is.null(seed)) set.seed(seed)
    boot_taus <- replicate(boot_iters, {
      b00 <- sample(y_00, N_00, replace = TRUE)
      b01 <- sample(y_01, N_01, replace = TRUE)
      b10 <- sample(y_10, N_10, replace = TRUE)
      b11 <- sample(y_11, N_11, replace = TRUE)
      bec_00 <- make_ecdf(b00)
      bec_01 <- make_ecdf(b01)
      bcf <- compute_cic_cf(b10, bec_00, bec_01, discrete = discrete)
      mean(b11) - mean(bcf)
    })
    boot_se_val <- stats::sd(boot_taus)
  }

  structure(list(
    tau = tau,
    se = se_val,
    z = z_val,
    pval = pval,
    counterfactual_mean = counterfactual_mean,
    tau_did = tau_did,
    N = N,
    n = c(N_00 = N_00, N_01 = N_01, N_10 = N_10, N_11 = N_11),
    boot_se = boot_se_val,
    V_components = V_components,
    ecdfs = list(ec_00 = ec_00, ec_01 = ec_01,
                 ec_10 = ec_10, ec_11 = ec_11),
    cf_values = cf_values,
    discrete = discrete
  ), class = "cic")
}

#' @export
print.cic <- function(x, digits = 4, ...) {
  estimator_label <- if (isTRUE(x$discrete)) {
    "discrete (Theorem 4.1)"
  } else {
    "continuous (Theorem 3.1)"
  }
  cat("\nChanges-in-Changes Estimator\n")
  cat(strrep("-", 40), "\n")
  cat("  Estimator:", estimator_label, "\n")
  cat("  tau^CIC  =", formatC(x$tau, digits = digits, format = "f"), "\n")
  cat("  tau^DID  =", formatC(x$tau_did, digits = digits, format = "f"), "\n")
  if (!is.na(x$se)) {
    cat("  SE       =", formatC(x$se, digits = digits, format = "f"), "\n")
    cat("  z-stat   =", formatC(x$z, digits = digits, format = "f"), "\n")
    cat("  p-value  =", format.pval(x$pval, digits = digits), "\n")
  }
  if (!is.na(x$boot_se)) {
    cat("  boot SE  =", formatC(x$boot_se, digits = digits, format = "f"), "\n")
  }
  cat("  N        =", x$N,
      sprintf("(%d/%d/%d/%d)", x$n[1], x$n[2], x$n[3], x$n[4]), "\n")
  invisible(x)
}

#' @export
summary.cic <- function(object, ...) {
  print.cic(object, ...)
}