R/simulate.R
In sccic: Synthetic Control Changes-in-Changes Estimator

Documented in .run_cs_sim .run_sc_sim simulate_sccic summarize_simulation

#' Simulation Study for SC-CIC
#'
#' Generates data under controlled DGPs and evaluates SC-CIC performance.
#'
#' @param n_sims Integer. Number of simulation replications.
#' @param T_pre Integer. Number of pre-treatment periods.
#' @param T_post Integer. Number of post-treatment periods.
#' @param J Integer. Number of donor units.
#' @param tau_true Numeric. True average treatment effect.
#' @param dgp Character. Data generating process. See Details.
#' @param alpha Elastic net mixing parameter for SC construction.
#' @param boot_iters Integer. Bootstrap iterations per simulation.
#' @param seed Integer. Random seed.
#' @param verbose Logical. Print progress.
#'
#' @return A data frame with simulation results.
#'
#' @details
#' Four DGPs are available, designed to test different aspects of SC-CIC:
#'
#' \strong{DGP 1: \code{"linear"} — Baseline.}
#' Outcomes are linear in a common factor and unit-specific loadings.
#' DID is correctly specified. CIC matches DID. SC fits well.
#' Purpose: verify the method works in the easy case.
#'
#' \strong{DGP 2: \code{"nonlinear"} — CIC advantage.}
#' Cross-sectional DGP (not SC). N observations per cell.
#' Control and treated have different distributions of unobservables.
#' The production function is nonlinear and changes over time.
#' DID is biased due to the nonlinear distributional shift; CIC is correct.
#' Purpose: demonstrate the advantage of CIC over DID.
#' Note: this tests \code{cic()}, not \code{sc_cic()}.
#'
#' \strong{DGP 3: \code{"sc_good"} — SC with good distributional fit.}
#' The treated unit is a true sparse combination of donors plus noise.
#' SC recovers the weights well; the distributional dynamics are similar.
#' Purpose: show SC-CIC works when SC fit is good.
#'
#' \strong{DGP 4: \code{"sc_bad"} — SC with mean-only fit.}
#' The SC matches the treated mean, but donors have much lower variance
#' than the treated unit. The distributional transport is wrong.
#' Purpose: show SC-CIC fails when distributional assumptions are violated.
#'
#' @examples
#' # Quick example (runs in seconds)
#' r <- simulate_sccic(n_sims = 2, dgp = "nonlinear", tau_true = 1, boot_iters = 5, verbose = FALSE)
#' summarize_simulation(r, tau_true = 1)
#'
#' \donttest{
#' # Full simulation
#' r <- simulate_sccic(n_sims = 200, dgp = "nonlinear", tau_true = 1)
#' summarize_simulation(r, tau_true = 1)
#' }
#'
#' @export
simulate_sccic <- function(n_sims = 500, T_pre = 25, T_post = 15, J = 15,
                           tau_true = 1,
                           dgp = c("linear", "nonlinear", "sc_good", "sc_bad"),
                           alpha = 1, boot_iters = 200, seed = 42,
                           verbose = TRUE) {

  dgp <- match.arg(dgp)
  if (!is.null(seed)) set.seed(seed)

  # For cross-sectional DGP, use N per cell instead of T
  is_cs <- (dgp == "nonlinear")

  results <- data.frame(
    sim = integer(n_sims),
    tau_cic = numeric(n_sims),
    tau_did = numeric(n_sims),
    se_cic = numeric(n_sims),
    ci_lower = numeric(n_sims),
    ci_upper = numeric(n_sims),
    covers = logical(n_sims),
    pre_rmse = numeric(n_sims)
  )

  for (s in seq_len(n_sims)) {
    if (verbose && s %% 50 == 0) cat("Simulation", s, "/", n_sims, "\n")

    res <- tryCatch({
      if (is_cs) {
        .run_cs_sim(T_pre, T_post, tau_true, dgp)
      } else {
        .run_sc_sim(T_pre, T_post, J, tau_true, dgp, alpha, boot_iters)
      }
    }, error = function(e) NULL)

    if (is.null(res)) {
      results[s, ] <- c(s, rep(NA, 7))
      next
    }

    results$sim[s] <- s
    results$tau_cic[s] <- res$tau_cic
    results$tau_did[s] <- res$tau_did
    results$se_cic[s] <- res$se
    results$ci_lower[s] <- res$tau_cic - 1.96 * res$se
    results$ci_upper[s] <- res$tau_cic + 1.96 * res$se
    results$covers[s] <- !is.na(res$se) &&
      (res$tau_cic - 1.96 * res$se) <= tau_true &&
      tau_true <= (res$tau_cic + 1.96 * res$se)
    results$pre_rmse[s] <- res$pre_rmse
  }

  attr(results, "dgp") <- dgp
  attr(results, "tau_true") <- tau_true
  results
}

#' Run one cross-sectional CIC simulation (DGP: nonlinear)
#' @keywords internal
.run_cs_sim <- function(N_cell, N_cell_post, tau_true, dgp) {
  # Cross-sectional: N_cell obs per group-period cell
  # Control group: U ~ N(0, 1)
  # Treated group: U ~ N(0.5, 1) (shifted distribution)
  # Production function:
  #   h(u, t=0) = u
  #   h(u, t=1) = u + 0.4 * u^2  (monotone for u > -1.25, covers ~90% of N(0,1))
  # Treatment: adds tau_true to h(U, 1) for treated in post-period

  U_00 <- stats::rnorm(N_cell, 0, 1)
  U_01 <- stats::rnorm(N_cell_post, 0, 1)
  U_10 <- stats::rnorm(N_cell, 0.5, 1)
  U_11 <- stats::rnorm(N_cell_post, 0.5, 1)

  # Apply production function
  gamma <- 0.4
  y_00 <- U_00                          # h(u, 0) = u
  y_01 <- U_01 + gamma * U_01^2         # h(u, 1) = u + gamma*u^2
  y_10 <- U_10                          # h(u, 0) = u
  y_11 <- U_11 + gamma * U_11^2 + tau_true  # h(u, 1) + tau

  # True counterfactual for treated in post: h(U_11, 1) without treatment
  # = U_11 + gamma * U_11^2
  # True tau = mean(y_11) - mean(U_11 + gamma * U_11^2) = tau_true (by construction)

  result <- cic(y_00, y_01, y_10, y_11, se = TRUE, boot = FALSE)

  list(tau_cic = result$tau, tau_did = result$tau_did,
       se = result$se, pre_rmse = 0)
}

#' Run one SC-CIC simulation
#' @keywords internal
.run_sc_sim <- function(T_pre, T_post, J, tau_true, dgp, alpha, boot_iters) {
  T_total <- T_pre + T_post
  pre_idx <- seq_len(T_pre)
  post_idx <- seq(T_pre + 1, T_total)

  # Common time factor (smooth trend + noise)
  f_t <- 0.1 * seq_len(T_total) + cumsum(stats::rnorm(T_total, 0, 0.2))

  # Donor fixed effects and factor loadings
  mu_d <- stats::rnorm(J, 0, 1)
  lambda_d <- stats::runif(J, 0.5, 1.5)
  sigma <- 0.5

  # Treated unit: sparse convex combination of donors
  n_active <- min(4, J)
  active <- sample(J, n_active)
  w_true <- numeric(J)
  w_true[active] <- stats::runif(n_active, 0.2, 0.6)
  w_true <- w_true / sum(w_true)
  mu_tr <- sum(w_true * mu_d) + 0.3
  lambda_tr <- sum(w_true * lambda_d)

  # Generate latent values for all units
  latent_d <- matrix(NA, T_total, J)
  for (j in seq_len(J)) {
    latent_d[, j] <- mu_d[j] + lambda_d[j] * f_t + stats::rnorm(T_total, 0, sigma)
  }
  latent_tr <- mu_tr + lambda_tr * f_t + stats::rnorm(T_total, 0, sigma)

  # Apply production functions based on DGP
  if (dgp == "linear") {
    # h(u, t) = u for all t. DID is correct. CIC matches DID.
    y_donors <- latent_d
    y_treated <- latent_tr

  } else if (dgp == "sc_good") {
    # Linear dynamics with GOOD SC fit.
    # Same as linear DGP but with more pre-treatment periods and lower
    # noise so the SC fits well and CIC/DID both work.
    # The SC is a linear combination of linear outcomes, so it preserves
    # the distributional dynamics. CIC assumptions are approximately met.
    y_donors <- latent_d
    y_treated <- latent_tr

  } else if (dgp == "sc_bad") {
    # Nonlinear production function applied to ALL units.
    # h(u, pre) = u, h(u, post) = u + gamma * u^2
    # The SC is sum(w_j * h(L_j, post)) which is NOT h(sum(w_j * L_j), post).
    # This means the SC's distributional transport is WRONG even though
    # all units share the same production function.
    # PURPOSE: demonstrate that SC (a linear operation) breaks CIC when
    # the outcome is nonlinear in the latent variable.
    gamma <- 0.3
    y_donors <- latent_d
    y_donors[post_idx, ] <- latent_d[post_idx, ] +
      gamma * latent_d[post_idx, ]^2
    y_treated <- latent_tr
    y_treated[post_idx] <- latent_tr[post_idx] +
      gamma * latent_tr[post_idx]^2
  }

  colnames(y_donors) <- paste0("d", seq_len(J))

  # Add treatment effect to treated in post-period
  y_treated[post_idx] <- y_treated[post_idx] + tau_true

  # Run SC-CIC
  result <- sc_cic(y_treated, y_donors,
                   treatment_period = T_pre + 1,
                   alpha = alpha, boot = TRUE,
                   boot_iters = boot_iters, seed = NULL)

  list(tau_cic = result$tau, tau_did = result$tau_did,
       se = result$boot_se, pre_rmse = result$pre_fit_rmse)
}


#' Summarize simulation results
#'
#' @param results Data frame from \code{simulate_sccic}.
#' @param tau_true True treatment effect.
#' @return Prints summary statistics and returns them invisibly.
#' @export
summarize_simulation <- function(results, tau_true) {
  valid <- !is.na(results$tau_cic)
  r <- results[valid, ]

  dgp <- attr(results, "dgp")
  if (is.null(dgp)) dgp <- "unknown"

  cat("\n=== Simulation Summary (DGP:", dgp, ") ===\n")
  cat("Replications:", sum(valid), "/", nrow(results), "\n")
  cat("True tau:", tau_true, "\n\n")

  # CIC
  bias_cic <- mean(r$tau_cic) - tau_true
  rmse_cic <- sqrt(mean((r$tau_cic - tau_true)^2))
  se_mean <- mean(r$se_cic, na.rm = TRUE)
  coverage <- mean(r$covers, na.rm = TRUE)

  cat("CIC:\n")
  cat("  Mean estimate:", round(mean(r$tau_cic), 4), "\n")
  cat("  Bias:         ", round(bias_cic, 4), "\n")
  cat("  RMSE:         ", round(rmse_cic, 4), "\n")
  cat("  Mean SE:      ", round(se_mean, 4), "\n")
  cat("  95% CI cover: ", round(coverage * 100, 1), "%\n\n")

  # DID
  bias_did <- mean(r$tau_did) - tau_true
  rmse_did <- sqrt(mean((r$tau_did - tau_true)^2))

  cat("DID:\n")
  cat("  Mean estimate:", round(mean(r$tau_did), 4), "\n")
  cat("  Bias:         ", round(bias_did, 4), "\n")
  cat("  RMSE:         ", round(rmse_did, 4), "\n\n")

  if (any(!is.na(r$pre_rmse) & r$pre_rmse > 0)) {
    cat("Pre-treatment fit:\n")
    cat("  Mean RMSE:    ", round(mean(r$pre_rmse[r$pre_rmse > 0], na.rm = TRUE), 4), "\n")
  }

  invisible(list(bias_cic = bias_cic, rmse_cic = rmse_cic,
                 coverage = coverage, bias_did = bias_did, rmse_did = rmse_did))
}