Monte Carlo Analysis for Verifying Expressions

Core functions and packages

library(sreg)
library(dplyr)
library(tidyr)
library(extraDistr)
library(tidyverse)
rm(list = ls())
source("/Users/trifonovjuri/Desktop/sreg/R/2.0(dev)/fun_new.r")

Individual level treatment assignment

Here we provide the results for the case with multiple treatments but without clusters (one covariate so far).

nsim <- 2000
tau_vec <- c(0.5, 1.5)
n.treat <- 2
k <- 3
n <- 1200
treat_sizes <- c(1, 1, 1)
theta_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)
v_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)
hit <- matrix(ncol = n.treat, nrow = nsim)

for (s in 1:nsim)
{
  data_raw <- dgp_po(n = n, theta.vec = tau_vec, n.treat = n.treat, gamma.vec = c(0.4, 0.2, 1)) # Checked that now it returns several X's
  data_df <- run_experiment_multi(data_raw, n.treat = n.treat, k = k, treat_sizes = treat_sizes) # 1 gets treatment 0, 1 gets treatment 1, 1 gets treatment 2 in each triplet
  data_full <- data.frame("Y" = data_df$y, "D" = data_df$A, "X" = data_df$X, "S" = data_df$block)
  #data_full <- data.frame("Y" = data_df$y, "D" = data_df$A, "X" = data.frame(data_df$X, "S" = data_df$block)
  N <- nrow(data_full)
  theta_hat_vec <- theta_hat_mult(data_full$Y, data_full$D, data_full$X, data_full$S)$tau.hat
  v_hat_vec <- var_hat_mult(data_full) / N
  theta_hat_mtrx[s, ] <- theta_hat_vec
  v_hat_mtrx[s, ] <- v_hat_vec
  upper <- theta_hat_vec + qnorm(0.975) * sqrt(v_hat_vec)
  lower <- theta_hat_vec - qnorm(0.975) * sqrt(v_hat_vec)
  hit[s, ] <- (tau_vec < upper) & (tau_vec > lower)
  print(s)
}
result_table <- data.frame(
  `True tau` = tau_vec,
  `E[tau_n]` = round(colMeans(theta_hat_mtrx), 4),
  `True Variance` = round(colMeans(v_hat_mtrx), 4),
  `E[V_n]` = round(apply(theta_hat_mtrx, 2, var), 4),
  `Coverage` = round(colMeans(hit), 4),
  check.names = FALSE
)
print(result_table)

Cluster level treatment assignment

Here we provide the results for the case of cluster level treatment assignment with multiple treatments (one covariate so far).

n_treat <- 3
nsim <- 10000
tau_vec <- c(0.2, 1.2, 0.7)
gamma_vec <- c(0.4, 0.2, 1)
tau_hat_mtrx <- matrix(ncol = n_treat, nrow = nsim)
var_hat_mtrx <- matrix(ncol = n_treat, nrow = nsim)
hit_mtrx <- matrix(ncol = n_treat, nrow = nsim)
G <- 500
Nmax <- 50
k <- 5
treat_sizes <- c(2, 1, 1, 1) # treatment 0 - p_1 individuals, treatment 1 - p_2 individuals, ...
max.support <- Nmax / 10 - 1
# Ng <- rep(Nmax, G)
Ng <- gen.cluster.sizes(G, max.support)

for (s in 1:nsim)
{
  data_pot <- dgp.po.creg(Ng = Ng, tau.vec = tau_vec, G = G, gamma.vec = gamma_vec, n.treat = n_treat)
  proba <- dgp.obs.creg(baseline = data_pot, n.treat = n_treat, k = k, treat_sizes = treat_sizes)
  data_cl <- data.frame("Y" = proba$Y, "D" = proba$D, proba$X, "S" = proba$S, "Ng" = proba$Ng, "G.id" = proba$G.id)
  tau_hat <- theta_hat_mult_cl_old(Y = data_cl$Y, D = data_cl$D, X = data_cl$x_1, S = data_cl$S, G.id = data_cl$G.id, Ng = data_cl$Ng)
  tau_hat_mtrx[s, ] <- tau_hat$tau.hat
  var_hat <- var_hat_mult_cl(Y = data_cl$Y, D = data_cl$D, X = data_cl$x_1, S = data_cl$S, G.id = data_cl$G.id, Ng = data_cl$Ng, tau.hat = tau_hat) / G
  var_hat_mtrx[s, ] <- var_hat
  upper <- tau_hat$tau.hat + qnorm(0.975) * sqrt(var_hat)
  lower <- tau_hat$tau.hat - qnorm(0.975) * sqrt(var_hat)
  hit_mtrx[s, ] <- (tau_vec < upper) & (tau_vec > lower)
  print(s)
}
result_table <- data.frame(
  `True tau` = tau_vec,
  `E[tau_n]` = round(colMeans(tau_hat_mtrx), 4),
  `E[V_n]` = round(colMeans(var_hat_mtrx), 4),
  `True Variance` = round(apply(tau_hat_mtrx, 2, var), 4),
  `Coverage` = round(colMeans(hit_mtrx), 4),
  check.names = FALSE
)
print(result_table)
tau_vec <- c(0.5, 1.5) 
n.treat <- 2
k <- 3
n <- 900
treat_sizes <- c(1, 1, 1)


datim <- sreg.rgen(n = 1200, cluster = TRUE, small.strata = TRUE, tau.vec = c(1.2, 0.8))
X <- data.frame(datim$x_1, datim$x_2, datim$Ng)
sreg(Y = datim$Y, S = datim$S, D = datim$D, G.id = datim$G.id, Ng = datim$Ng, X = X, HC1 = TRUE, small.strata = TRUE)

data_raw <- dgp_po(n = n, theta.vec = tau_vec, n.treat = n.treat, gamma.vec = c(0.4, 0.2, 1)) # Checked that now it returns several X's
data_df <- run_experiment_multi(data_raw, n.treat = n.treat, k = k, treat_sizes = treat_sizes) # 1 gets treatment 0, 1 gets treatment 1, 1 gets treatment 2 in each triplet
Y <- data_df$y
D <- data_df$A
X <- data.frame("x_1" = data_df$x_1, "x_2" = data_df$x_2)#, 'bor' = rnorm(24, 2, 1), 'god' = rnorm(24, 3, 1), 'bro'= rnorm(24, 3, 1), 'bro1'= rnorm(24, 3, 1), 'bro2'= rnorm(24, 3, 1))
S <- data_df$block
data_full <- data.frame(Y, D, X, S)
N <- nrow(data_full)

theta_hat_vec <- theta_hat_mult(Y, D, X, S)#$tau.hat
var_hat_vec   <- var_hat_mult(Y, D, X, S, theta_hat_vec) / N

# Using the aggregated master functions to test:
tau_hat <- tau.hat.sreg.ss(Y, D, X, S)
var_hat <- as.var.sreg.ss(Y, D, X, S, fit = tau_hat) / N
var_hat <- as.var.sreg.ss(Y, D, X, S, fit = tau_hat, HC1 = TRUE) / N
sqrt(var_hat)
tau_hat <- tau.hat.sreg.ss(Y, D, X = NULL, S)
var_hat <- as.var.sreg.ss(Y, D, X = NULL, S, fit = NULL, HC1 = TRUE) / N
X <- data.frame("age" = data_df$x_1)#, "x_2" = data_df$x_2)#, 'bor' = rnorm(24, 2, 1), 'god' = rnorm(24, 3, 1), 'bro'= rnorm(24, 3, 1), 'bro1'= rnorm(24, 3, 1), 'bro2'= rnorm(24, 3, 1), 'bro3' = rnorm(24, 5, 2))
agg_result <- res.sreg.ss(Y, S, D, X, HC1 = TRUE)
agg_result$data
any(sapply(agg_result$beta.hat, function(x) any(is.na(x))))
any(sapply(agg_result$se.rob, function(x) any(is.infinite(x))))
abju <- sreg(Y, S = S, D, G.id = NULL, Ng = NULL, X = NULL, HC1 = TRUE, small.strata = TRUE)
abju$lin.adj

nsim <- 1000
n <- 1200
k <- 3
theta_vec <- c(0.5, 1.5)
n.treat <- 2
treat_sizes <- c(1, 1, 1)
theta_hat_mtrx <- matrix(nrow = nsim, ncol = n.treat)
var_hat_mtrx <- matrix(nrow = nsim, ncol = n.treat)
hit_mtrx <- matrix(nrow = nsim, ncol = n.treat)
for (s in 1:nsim)
{
  data_raw <- dgp_po(n = n, theta.vec = theta_vec, n.treat = n.treat, gamma.vec = c(0.4, 0.2, 1)) # Checked that now it returns several X's
  data_df <- run_experiment_multi(data_raw, n.treat = n.treat, k = k, treat_sizes = treat_sizes) # 1 gets treatment 0, 1 gets treatment 1, 1 gets treatment 2 in each triplet
  Y <- data_df$y
  D <- data_df$A
  X <- data.frame("x_1" = data_df$x_1, "x_2" = data_df$x_2)
  S <- data_df$block
  data_full <- data.frame(Y, D, X, S)
  N <- nrow(data_full)

  fit <- theta_hat_mult(Y, D, X, S)
  theta_hat_vec <- fit$tau.hat
  var_hat_vec <- var_hat_mult(Y, D, X, S, fit) / N

  theta_hat_mtrx[s, ] <- theta_hat_vec
  var_hat_mtrx[s, ] <- var_hat_vec

  upper <- theta_hat_vec + qnorm(0.975) * sqrt(var_hat_vec)
  lower <- theta_hat_vec - qnorm(0.975) * sqrt(var_hat_vec)
  hit_mtrx[s, ] <- (theta_vec < upper) & (theta_vec > lower)
  print(s)
}

colMeans(theta_hat_mtrx)
colMeans(var_hat_mtrx)
apply(theta_hat_mtrx, 2, var)

colMeans(hit_mtrx)
data_full %>%
  group_by(S) %>%
  summarize(
    Y_treated = mean(Y[D == 1], na.rm = TRUE),
    Y_control = mean(Y[D == 0], na.rm = TRUE),
    X_treated = list(colMeans(X[D == 1, , drop = FALSE], na.rm = TRUE)),
    X_control = list(colMeans(X[D == 0, , drop = FALSE], na.rm = TRUE)),
    k = n(),
    l = sum(D == 1),
    q = sum(D == 0),
    .groups = "drop"
  )


agg_data_old <- data_full %>%
  group_by(S) %>%
  summarize(
    Y_treated = mean(Y[D == 1], na.rm = TRUE),
    Y_control = mean(Y[D == 0], na.rm = TRUE),
    X_treated = list(colMeans(X[D == 1, , drop = FALSE], na.rm = TRUE)),
    X_control = list(colMeans(X[D == 0, , drop = FALSE], na.rm = TRUE)),
    k = n(),
    l = sum(D == 1),
    q = sum(D == 0),
    .groups = "drop"
  )

# Turn list columns into two separate columns per covariate
agg_data_old <- agg_data_old %>%
  mutate(
    X_treated_1 = map_dbl(X_treated, 1),
    X_treated_2 = map_dbl(X_treated, 2),
    X_control_1 = map_dbl(X_control, 1),
    X_control_2 = map_dbl(X_control, 2)
  )

# Now view the first few rows
agg_data_old


agg_data <- data_full %>%
  group_split(S) %>%
  map_dfr(~ {
    df <- .
    list(
      S = df$S[1],
      Y_treated = mean(df$Y[df$D == 1], na.rm = TRUE),
      Y_control = mean(df$Y[df$D == 0], na.rm = TRUE),
      X_treated = list(colMeans(df[df$D == 1, grep("^x_", names(df)), drop = FALSE])),
      X_control = list(colMeans(df[df$D == 0, grep("^x_", names(df)), drop = FALSE])),
      k = nrow(df),
      l = sum(df$D == 1),
      q = sum(df$D == 0)
    )
  })

# Turn list columns into two separate columns per covariate
agg_data <- agg_data %>%
  mutate(
    X_treated_1 = map_dbl(X_treated, 1),
    X_treated_2 = map_dbl(X_treated, 2),
    X_control_1 = map_dbl(X_control, 1),
    X_control_2 = map_dbl(X_control, 2)
  )
n_treat <- 3 
nsim <- 10
tau_vec <- c(0.2, 1.2, 0.7)
gamma_vec <- c(0.4, 0.2, 1)
# n.treat = 2
k <- 5
treat_sizes <- c(2, 1, 1, 1)
tau_hat_mtrx <- matrix(ncol = n_treat, nrow = nsim)
tau_hat_mtrx_old <- matrix(ncol = n_treat, nrow = nsim)
var_hat_mtrx <- matrix(ncol = n_treat, nrow = nsim)
var_hat_old_mtrx <- matrix(ncol = n_treat, nrow = nsim)
hit_mtrx <- matrix(ncol = n_treat, nrow = nsim)
G <- 500
Nmax <- 50

# treat_sizes = c(2, 1, 1, 1) # treatment 0 - p_1 individuals, treatment 1 - p_2 individuals, ...
max.support <- Nmax / 10 - 1
# Ng <- rep(Nmax, G)
Ng <- gen.cluster.sizes(G, max.support)

for (s in 1:nsim)
{
  Ng <- gen.cluster.sizes(G, max.support)
  data_pot <- dgp.po.creg(Ng = Ng, tau.vec = tau_vec, G = G, gamma.vec = gamma_vec, n.treat = n_treat)
  proba <- dgp.obs.creg(baseline = data_pot, n.treat = n_treat, k = k, treat_sizes = treat_sizes)
  data_cl <- data.frame("Y" = proba$Y, "D" = proba$D, proba$X, "S" = proba$S, "Ng" = proba$Ng, "G.id" = proba$G.id)
  Y <- data_cl$Y
  D <- data_cl$D
  X <- data.frame("x_1" = data_cl$x_1, 'x_2' = data_cl$x_2)

  S <- data_cl$S
  G.id <- data_cl$G.id
  Ng <- data_cl$Ng
  data_full <- data.frame(Y, D, X, S, G.id, Ng)
  #X <- data.frame("x_1" = data_full$x_1, 'x_2' = data_full$x_2)

  tau_hat <- theta_hat_mult_cl(Y, D, X, S, G.id, Ng)
  var_hat <- var_hat_mult_cl(Y, D, X, S, G.id, Ng, fit = tau_hat) / G

  tau_hat_1 <- tau.hat.creg.ss(Y, D, X, S, G.id, Ng)
  var_hat_1 <- as.var.creg.ss(Y, D, X, S, G.id, Ng, tau_hat_1, HC1 = FALSE) / G
  tau_hat_1 <- tau.hat.creg.ss(Y, D, X = NULL, S, G.id, Ng)
  var_hat_1 <- as.var.creg.ss(Y, D, X = NULL, S, G.id, Ng) / G

  agg_result <- res.creg.ss(Y, S, D, G.id, Ng = NULL, X = X, HC1 = TRUE)
  tau_hat_1  = tau.hat.creg.ss(Y, D, X = X, S, G.id, Ng = NULL)
  as.var.creg.ss(Y, D, X, S, G.id, Ng = NULL, tau_hat_1, HC1 = FALSE) / G
  head(agg_result$lin.adj)
  !check.cluster(data.frame("G.id" = agg_result$data$G.id, agg_result$lin.adj))
  X <- data.frame('Ng' = data_cl$Ng, 'x_2' = data_cl$x_2)
  result <- sreg(Y, S = S, D, G.id, Ng = NULL, X = NULL, HC1 = TRUE, small.strata = TRUE)
  result
  result$small.strata
  # tau_hat_old           <- theta_hat_mult_cl_old(Y, D, X, S, G.id, Ng)
  # var_hat_old           <- var_hat_mult_cl_old(Y, D, X, S, G.id, Ng, tau.hat = tau_hat_old) / G
  tau_hat_mtrx[s, ] <- tau_hat$tau.hat
  # tau_hat_mtrx_old[s, ] <- tau_hat_old$tau.hat
  var_hat_mtrx[s, ] <- var_hat
  # var_hat_old_mtrx[s, ] <- var_hat_old
  upper <- tau_hat$tau.hat + qnorm(0.975) * sqrt(var_hat)
  lower <- tau_hat$tau.hat - qnorm(0.975) * sqrt(var_hat)
  hit_mtrx[s, ] <- (tau_vec < upper) & (tau_vec > lower)
  # var_hat             <- var_hat_mult_cl(Y = data_cl$Y, D = data_cl$D, X = data_cl$x_1, S = data_cl$S, G.id = data_cl$G.id, Ng = data_cl$Ng, tau.hat = tau_hat) / G
  # var_hat_mtrx[s, ]   <- var_hat
  # upper               <- tau_hat$tau.hat + qnorm(0.975) * sqrt(var_hat)
  # lower               <- tau_hat$tau.hat - qnorm(0.975) * sqrt(var_hat)
  # hit_mtrx[s, ]       <- (tau_vec < upper) & (tau_vec > lower)
  print(s)
}

colMeans(tau_hat_mtrx)
colMeans(tau_hat_mtrx_old)
apply(tau_hat_mtrx, 2, var)
colMeans(var_hat_mtrx)
apply(tau_hat_mtrx_old, 2, var)
colMeans(hit_mtrx)

theta_hat_mult <- function(Y, D, X, S) { tau.hat <- numeric(max(D)) beta.hat <- matrix(ncol = ncol(X), nrow = max(D)) data_full <- data.frame(Y, D, X, S) for (d in 1:max(D)) { # Compute averages for treated and control within each stratum

agg_data <- data_full %>%

group_by(S) %>%

summarize(

Y_treated = mean(Y[D == d], na.rm = TRUE),

Y_control = mean(Y[D == 0], na.rm = TRUE),

X_treated = list(colMeans(X[D == d, , drop = FALSE], na.rm = TRUE)),

X_control = list(colMeans(X[D == 0, , drop = FALSE], na.rm = TRUE)),

k = n(),

l = sum(D == d),

q = sum(D == 0),

.groups = "drop"

)

agg_data <- data_full %>% group_split(S) %>% map_dfr(~{ df <- . list( S = df$S[1], Y_treated = mean(df$Y[df$D == d], na.rm = TRUE), Y_control = mean(df$Y[df$D == 0], na.rm = TRUE), X_treated = list(colMeans(df[df$D == d, grep("^x_", names(df)), drop = FALSE])), # mean value among treated in each stratum X_control = list(colMeans(df[df$D == 0, grep("^x_", names(df)), drop = FALSE])), # mean value among control in each stratum k = nrow(df), l = sum(df$D == d), q = sum(df$D == 0) ) })

Create Y_diff vector

Y_diff <- with(agg_data, Y_treated - Y_control)

Compute differences correctly: row-wise for each pair of treated/control vectors

X_diff_mat <- map2(agg_data$X_treated, agg_data$X_control, ~ .x - .y) %>% do.call(rbind, .)

print(X_diff_mat)

print(Y_diff)

data_decomp <- as.data.frame(agg_data)
X_treated_mat <- do.call(rbind, data_decomp$X_treated) X_control_mat <- do.call(rbind, data_decomp$X_control)

lm_model <- lm(Y_diff ~ ., data = as.data.frame(cbind(Y_diff, X_diff_mat)))

print(summary(lm_model))

beta_hat <- unname(lm_model$coefficients[-1])

X_mat <- as.matrix(data_full[, grepl("^x_", names(data_full))]) print(X_mat) X_bar <- colMeans(X_mat) X_dem <- sweep(X_mat, 2, X_bar)

adjusted estimator:

theta_hat_adj <- sum((data_full$Y * (data_full$D == d))) / sum((data_full$D == d)) - sum((data_full$Y) * (data_full$D == 0)) / sum(data_full$D == 0) - as.numeric(t(colMeans(X_dem[data_full$D == d, , drop = FALSE]) - colMeans(X_dem[data_full$D == 0, , drop = FALSE])) %*% beta_hat) tau.hat[d] <- theta_hat_adj beta.hat[d, ] <- beta_hat }

ret_list <- list( tau.hat = tau.hat, beta.hat = beta.hat ) return(ret_list) }

theta_hat_mult_cl_old <- function(Y, D, X, S, G.id, Ng)

{
tau.hat <- numeric(max(D)) beta.hat <- numeric(max(D))

working.df <- data.frame(Y, S, D, G.id, Ng, X)
Y.bar.g <- aggregate(Y ~ G.id, working.df, mean)
#print(Y.bar.g)
cl.lvl.data <- unique(working.df[, c("G.id", "D", "S", "Ng", names(working.df)[6:ncol(working.df)])])
cl.lvl.data <- data.frame("Y.bar" = Y.bar.g$Y, cl.lvl.data)
data <- cl.lvl.data
N.bar.G <- mean(data$Ng) #??? Why is this so weird?
#N.bar.G <- Ng
#print(N.bar.g)

#print(data)
#print(head(working.df, 200))
#print(data)
for (d in 1:max(D))
{
# Compute averages for treated and control within each stratum
agg_data <- data %>%
    group_by(S) %>%
summarise(
    Y_treated = mean(Y.bar[D == d] * N.bar.G, na.rm = TRUE),
    Y_control = mean(Y.bar[D == 0] * N.bar.G, na.rm = TRUE),
    X_treated = mean(x_1[D == d], na.rm = TRUE),
    X_control = mean(x_1[D == 0], na.rm = TRUE),
    k = n(),           # Total units in stratum (should be 2)
    l = sum(D == d),   # Number of treated units (should be 1)
    q = sum(D == 0)
) %>%

mutate( Y_diff = (1/l) * Y_treated - (1/q) * Y_control, X_diff = (1/l) * X_treated - (1/q) * X_control, k = k, l = l, q = q )

data_decomp <- as.data.frame(agg_data)

print(data_decomp)

run the linear model for covariate adjustments

lm_model <- lm(Y_diff ~ X_diff, data = data_decomp) print(summary(lm_model)) beta_hat <- coef(lm_model)[2] X_bar <- mean(data$x_1) X_dem <- data$x_1 - X_bar

adjusted estimator:

theta_hat_adj <- sum((data$Y.bar * data$Ng * (data$D == d))) / sum((data$D == d) * data$Ng) - sum(data$Y.bar * data$Ng * (data$D == 0)) / sum((data$D == 0) * data$Ng) - (sum(X_dem * (data$D == d)) / sum((data$D == d) * data$Ng) - sum(X_dem * (data$D == 0)) / sum((data$D == 0) * data$Ng)) * beta_hat tau.hat[d] <- theta_hat_adj beta.hat[d] <- beta_hat }

return(c(tau.hat, beta.hat)) #NB! WORK ON THE WAY HOW TO EXTRACT BOTH SEPARATELY! -- DONE!

return(tau.hat)

ret_list <- list( tau.hat = tau.hat, beta.hat = beta.hat ) return(ret_list) }

############ try to parallelize

library(devtools) # install devtools library(sreg) library(sandwich) library(lubridate) library(compiler) library(extraDistr) library(VGAM) library(Matrix) library(parallel) library(progress) library(purrr) library(SimDesign) library(pbapply)

rm(list = ls())

options(scipen = 999) # disable scientific notation

num_cores <- detectCores() cl <- makeCluster(num_cores, outfile = "")

Upload the libraries to the cluster

clusterEvalQ(cl, { library(lubridate) library(compiler) library(extraDistr) library(Matrix) library(progress) library(parallel) library(sreg) library(pbapply) library(dplyr) library(purrr) library(tidyverse) source("/Users/trifonovjuri/Desktop/sreg/R/2.0(dev)/fun_new.r") })

sim.func <- function(sim.id) { seed <- 1000 + sim.id set.seed(seed)

# --- DGP settings --- n.treat <- 2 tau.vec <- c(0.2, 1.2) # true ATEs gamma.vec <- c(0.4, 0.2, 1) k <- 4 treat_sizes <- c(1, 1, 2) G <- 12000 Nmax <- 50 max.support <- Nmax / 10 - 1

# --- Generate data --- Ng <- gen.cluster.sizes(G, max.support) data_pot <- dgp.po.creg(Ng = Ng, tau.vec = tau.vec, G = G, gamma.vec = gamma.vec, n.treat = n.treat) proba <- dgp.obs.creg(baseline = data_pot, n.treat = n.treat, k = k, treat_sizes = treat_sizes)

data_cl <- data.frame( Y = proba$Y, D = proba$D, proba$X, S = proba$S, Ng = proba$Ng, G.id = proba$G.id )

Y <- data_cl$Y D <- data_cl$D X <- data.frame("x_1" = data_cl$x_1) # or include other covariates as needed S <- data_cl$S G.id <- data_cl$G.id Ng <- data_cl$Ng

result <- tryCatch({ # Estimate ATEs and variances theta_res <- theta_hat_mult_cl(Y, D, X, S, G.id, Ng) tau_hat <- theta_res$tau.hat var_hat <- var_hat_mult_cl(Y, D, X, S, G.id, Ng, fit = theta_res) / G se_hat <- sqrt(var_hat)

# Confidence intervals and coverage
CI.left <- tau_hat - qnorm(0.975) * se_hat
CI.right <- tau_hat + qnorm(0.975) * se_hat
tstat <- tau_hat / se_hat
ci.hit <- as.numeric(tau.vec >= CI.left & tau.vec <= CI.right)

list(
  tau = tau_hat,
  se = se_hat,
  tstat = tstat,
  CI.left = CI.left,
  CI.right = CI.right,
  ci.hit = ci.hit
)

}, error = function(e) { cat("Simulation", sim.id, "encountered an error:", conditionMessage(e), "\n") list( tau = NA, se = NA, tstat = NA, CI.left = NA, CI.right = NA, ci.hit = NA ) })

return(result) }

simres <- pblapply(1:50000, sim.func, cl=cl) stopCluster(cl) tau <- na.omit(as.matrix(sapply(simres, function(simres) simres$tau))) se <- na.omit(as.matrix(sapply(simres, function(simres) simres$se))) ci.hit <- na.omit(as.matrix(sapply(simres, function(simres) simres$ci.hit))) rowMeans(tau) apply(tau, 1, sd) rowMeans(se) rowMeans(ci.hit)

library("sreg") library("dplyr") library("haven")

' ### Example 1. Simulated Data.

data <- sreg.rgen(n = 1200, tau.vec = c(0.2, 0.5, 0.8), n.strata = 4, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2) Y <- data$Y head(data)

S <- data$S D <- data$D X <- data.frame("age" = data$x_1, "sex" = data$x_2) G.id <- data$G.id Ng <- data$Ng result <- sreg(Y, S, D, G.id = G.id, Ng = Ng, X) sreg(Y, S = S, D, G.id = NULL, Ng = NULL, X = X) print(result) plot(result)

dtapm <- sreg.rgen(n = 100, tau.vec = c(1.2), cluster = TRUE, small.strata = TRUE, k = 2, treat.sizes = c(1, 1)) result <- sreg(Y = dtapm$Y, S = dtapm$S, D = dtapm$D, X = dtapm$X, small.strata = TRUE, G.id = dtapm$G.id, Ng = dtapm$Ng) result head(dtapm) data <- sreg.rgen(n = 300, tau.vec = c(1.2, 0.8), cluster = FALSE, small.strata = TRUE, k = 3, treat.sizes = c(1, 1, 1)) head(data)

result <- sreg(Y = data$Y, S = data$S, D = data$D, X = data.frame('x_1' = data$x_1, 'x_2' = data$x_2), small.strata = TRUE) print(result)

max(result$data$S)

length(result$data$Y) / max(result$data$S)

data("AEJapp") data <- AEJapp Y <- data$gradesq34 D <- data$treatment S <- data$class_level data.clean <- data.frame(Y, D, S) data.clean <- data.clean %>% mutate(D = ifelse(D == 3, 0, D)) Y <- data.clean$Y D <- data.clean$D S <- data.clean$S head(data.clean)

result <- sreg::sreg(Y = Y, S = S, D = D) print(result)

pills <- data$pills_taken age <- data$age_months data.clean <- data.frame(Y, D, S, pills, age) data.clean <- data.clean %>% mutate(D = ifelse(D == 3, 0, D)) Y <- data.clean$Y D <- data.clean$D S <- data.clean$S X <- data.frame("pills" = data.clean$pills, "age" = data.clean$age) result <- sreg::sreg(Y, S, D, G.id = NULL, X = X) print(result)

result

plot_sreg_results <- function(result) { # Load ggplot2 library(ggplot2)

# Construct a data frame from result df <- data.frame( treatment = factor(seq_along(result$tau.hat)), # factor for x-axis tau = result$tau.hat, CI.lower = result$CI.left, CI.upper = result$CI.right )

# Create the plot ggplot(df, aes(x = treatment, y = tau)) + geom_point(size = 3) + geom_errorbar(aes(ymin = CI.lower, ymax = CI.upper), width = 0.1) + labs( x = "Treatment Group", y = "Estimated ATE", title = "Point Estimates and Confidence Intervals for ATE" ) + theme_minimal() }

plot_sreg_results_fancy <- function(result) { library(ggplot2)

df <- data.frame( treatment = factor(seq_along(result$tau.hat)), tau = result$tau.hat, CI.lower = result$CI.left, CI.upper = result$CI.right )

# Add numeric version of treatment for proper ribbon plotting df$treatment_num <- as.numeric(df$treatment)

ggplot(df, aes(x = treatment_num, y = tau)) + # Confidence interval ribbon with transparency geom_ribbon(aes(ymin = CI.lower, ymax = CI.upper), fill = "#1f78b4", alpha = 0.2) + # ATE point estimates geom_point(size = 4, color = "#1f78b4") + # Optional dashed line connecting points geom_line(color = "#1f78b4", linetype = "dashed") + # Horizontal line at 0 geom_hline(yintercept = 0, linetype = "dotted", color = "gray40") + # Set treatment labels on x-axis scale_x_continuous( breaks = df$treatment_num, labels = df$treatment ) + labs( x = "Treatment Group", y = "Estimated ATE", title = "Estimated ATEs with Confidence Intervals" ) + theme_minimal(base_size = 14) + theme( axis.title = element_text(face = "bold"), panel.grid.major.y = element_line(color = "gray90") ) } plot_sreg_results(result) plot_sreg_results_fancy(result) plot_sreg_horizontal(result) plot_sreg_horizontal <- function(result, treatment_labels = NULL) { library(ggplot2)

df <- data.frame( treatment = seq_along(result$tau.hat), tau = result$tau.hat, CI.lower = result$CI.left, CI.upper = result$CI.right, SE = round((result$CI.right - result$tau.hat) / 1.96, 2) )

if (!is.null(treatment_labels)) { df$label <- factor(treatment_labels, levels = rev(treatment_labels)) } else { df$label <- factor(paste("Treatment", df$treatment), levels = rev(paste("Treatment", df$treatment))) }

ggplot(df, aes(y = label)) + # Horizontal CI bar geom_rect(aes(xmin = CI.lower, xmax = CI.upper, ymin = as.numeric(label) - 0.01, ymax = as.numeric(label) + 0.01), fill = "#2c3e50", alpha = 0.8) + # Diamond-shaped point geom_point(aes(x = tau), shape = 23, size = 3, fill = "white", stroke = 1.2, color = "black") + # Text label with tau (SE) geom_text(aes(x = tau, label = sprintf("%.2f (%.2f)", tau, SE)), vjust = -1.5, color = "black", size = 4) + # Vertical line at 0 geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") + labs( x = "Treatment Effect", y = NULL, title = "Estimated ATEs with Confidence Intervals" ) + theme_minimal(base_size = 14) + theme( axis.text.y = element_text(hjust = 1), axis.title.x = element_text(face = "bold") ) }

result

plot_sreg_horizontal <- function(result, treatment_labels = NULL) { library(ggplot2)

df <- data.frame( treatment = seq_along(result$tau.hat), tau = result$tau.hat, CI.lower = result$CI.left, CI.upper = result$CI.right, SE = result$se.rob )

if (!is.null(treatment_labels)) { df$label <- factor(treatment_labels, levels = rev(treatment_labels)) } else { df$label <- factor(paste("Treatment", df$treatment), levels = rev(paste("Treatment", df$treatment))) }

ggplot(df, aes(y = label)) + geom_rect(aes( xmin = CI.lower, xmax = CI.upper, ymin = as.numeric(label) - 0.01, ymax = as.numeric(label) + 0.01, fill = tau ), alpha = 0.8) + geom_point(aes(x = tau), shape = 23, size = 3, fill = "white", stroke = 1.2, color = "black") + geom_text(aes(x = tau, label = sprintf("%.2f (%.2f)", tau, SE)), vjust = -1.5, color = "black", size = 4) + geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") + scale_fill_viridis_c(option = "viridis", name = "Effect Size") + #scale_fill_gradient(low = "#bdd7e7", high = "#08519c", name = "Effect Size") + labs( x = "Treatment Effect", y = NULL, title = "Estimated ATEs with Confidence Intervals" ) + theme_minimal(base_size = 14) + theme( axis.text.y = element_text(hjust = 1), axis.title.x = element_text(face = "bold"), legend.position = "right" ) }

library(viridis) plot_sreg_horizontal(result)

result plot(result)

class(result)

# ------------------------------------------------------------------
#  Classify strata as "small" or "big" in a mixed design
# ------------------------------------------------------------------
# design.classifier <- function(data, S, G.id = NULL, keep.size = FALSE, warn = TRUE) {
#   ## 1. Capture column names (works with bare names or strings)
#   S_name <- if (is.character(substitute(S))) substitute(S) else deparse(substitute(S))
#   G_name <- if (!missing(G.id)) {
#     if (is.character(substitute(G.id))) substitute(G.id) else deparse(substitute(G.id))
#   } else {
#     NULL
#   }

#   ## 2. Compute stratum sizes
#   if (!is.null(G_name)) {
#     # -- clustered: count *clusters* per stratum
#     cluster_strata <- dplyr::distinct(data, .data[[S_name]], .data[[G_name]])
#     strata_sizes   <- dplyr::count(cluster_strata, .data[[S_name]], name = "size")
#   } else {
#     # -- individual: count *units* per stratum
#     strata_sizes   <- dplyr::count(data, .data[[S_name]], name = "size")
#   }

#   ## 3. If every stratum has the same size, nothing to classify
#   if (length(unique(strata_sizes$size)) == 1) {
#     if (!keep.size) strata_sizes$size <- NULL  # cosmetic
#     return(dplyr::left_join(data, strata_sizes, by = S_name))
#   }

#   ## 4. Find the modal size and classify
#   modal_size <- strata_sizes %>%
#     dplyr::count(size) %>%
#     dplyr::arrange(dplyr::desc(n)) %>%
#     dplyr::pull(size) %>%
#     .[[1]]

#   strata_sizes <- strata_sizes %>%
#     dplyr::mutate(stratum_type = ifelse(size == modal_size, "small", "big"))

#   if (!keep.size) strata_sizes$size <- NULL

#   ## 5. Merge back and optionally warn
#   out <- dplyr::left_join(data, strata_sizes, by = S_name)

#   if (warn)
#     warning("Mixed design detected: strata of varying cluster/unit counts. ",
#             "Weighted estimators will be used.", call. = FALSE)

#   return(out)
# }

data_s <- sreg.rgen(n = 3000, tau.vec = c(0.2, 0.8), n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3) Y <- data$Y S <- data$S

sreg(Y = data_s$Y, S = data_s$S, D = data_s$D, G.id = data_s$G.id, Ng = data_s$Ng, X = NULL, small.strata = TRUE)

pum <- res.creg.ss(Y = data_s$Y, D = data_s$D, S = data_s$S, G.id = data_s$G.id, Ng = data_s$Ng, X = data.frame(data_s$x_1, data_s$x_2), HC1 = FALSE)

S <- data_s$S

pum$tau.hat

pum$se.rob

data_b <- sreg.rgen(n = 50, tau.vec = c(0.2, 0.8), n.strata = 2, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)

Step 1: Get the max stratum ID in data_s

max_id <- max(data_s$S)

Step 2: Get unique strata in data_b and assign new IDs

unique_b_strata <- sort(unique(data_b$S)) num_b_strata <- length(unique_b_strata)

Create a named mapping from old to new stratum IDs

new_ids <- seq(max_id + 1, max_id + num_b_strata) stratum_map <- setNames(new_ids, unique_b_strata)

Step 3: Relabel data_b$S

data_b$S <- stratum_map[as.character(data_b$S)]

🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!

max_gid <- max(data_s$G.id) data_b$G.id <- data_b$G.id + max_gid length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s[, -ncol(data_s)], data_b) S <- data$S G.id <- data$G.id

design.classifier(data, S, G.id)

----- ## ----- ## ----- ## ---- ## -----

Step 1: Classify strata

data_all <- design.classifier(data, S, G.id)

Step 2: Split the data

data_small <- dplyr::filter(data_all, stratum_type == "small") data_big <- dplyr::filter(data_all, stratum_type == "big")

sreg(Y = data_small$Y, S = data_small$S, D = data_small$D, G.id = data_small$G.id, Ng = data_small$Ng, X = NULL, small.strata = TRUE)

Step 3: Run the estimators

res_small <- res.sreg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = data_small[, grep("^x_", names(data_small)), drop = FALSE], HC1 = TRUE)

res_small <- res.sreg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = data.frame(data_small$x_1, data_small$x_2), HC1 = TRUE) res_small <- res.creg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, G.id = data_small$G.id, Ng = data_small$Ng, X = data.frame(data_small$x_1, data_small$x_2), HC1 = FALSE) res_small$tau.hat res_small$se.rob

res_big <- res.sreg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = data_big[, grep("^x_", names(data_big)), drop = FALSE], HC1 = TRUE)

res_big <- res.sreg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = NULL, HC1 = TRUE) res_big <- res.creg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = NULL, G.id = data_big$G.id, Ng = data_big$Ng, HC1 = FALSE) res_big$tau.hat res_big$se.rob

Step 4: Combine estimates

N_small <- nrow(data_small) N_big <- nrow(data_big) N_total <- N_small + N_big

tau_hat <- (N_small / N_total) * res_small$tau.hat + (N_big / N_total) * res_big$tau.hat

se_combined <- sqrt((N_small / N_total)^2 * res_small$se.rob^2 + (N_big / N_total)^2 * res_big$se.rob^2)

## Simulations for the combined estimator (sreg)
```r
nsim = 1000
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 3000
n_2 = 50

theta_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)

se_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)
hit <- matrix(ncol = n.treat, nrow = nsim)

for (s in 1:nsim)
{
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]


data <- rbind(data_s, data_b)

## ----- ## ----- ## ----- ## ---- ## ----- #

# Step 1: Classify strata
data_all <- design.classifier(data, S)

# Step 2: Split the data
data_small <- dplyr::filter(data_all, stratum_type == "small")
data_big   <- dplyr::filter(data_all, stratum_type == "big")

# Step 3: Run the estimators
#res_small <- res.sreg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = data_small[, grep("^x_", names(data_small)), drop = FALSE], HC1 = TRUE)
res_small <- res.sreg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = NULL, HC1 = FALSE)

#res_big <- res.sreg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = data_big[, grep("^x_", names(data_big)), drop = FALSE], HC1 = TRUE)
res_big <- res.sreg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = NULL, HC1 = FALSE)


# Step 4: Combine estimates
N_small <- nrow(data_small)
N_big   <- nrow(data_big)
N_total <- N_small + N_big

tau_hat <- (N_small / N_total) * res_small$tau.hat + (N_big / N_total) * res_big$tau.hat

se_combined <- sqrt((N_small / N_total)^2 * res_small$se.rob^2 +
                    (N_big   / N_total)^2 * res_big$se.rob^2)

theta_hat_mtrx[s, ] <- tau_hat
se_hat_mtrx[s, ] <- se_combined

upper <- tau_hat + qnorm(0.975) * se_combined
lower <- tau_hat - qnorm(0.975) * se_combined
hit[s, ] <- (tau.vec < upper) & (tau.vec > lower)
print(s)
}

result_table <- data.frame(
  `True tau` = tau.vec,
  `E[tau_n]` = round(colMeans(theta_hat_mtrx), 4),
  `True Variance` = round(colMeans(se_hat_mtrx), 4),
  `E[V_n]` = round(apply(theta_hat_mtrx, 2, sd), 4),
  `Coverage` = round(colMeans(hit), 4),
  check.names = FALSE
)
print(result_table)

Simulations for the combined estimator (creg)

nsim = 1000
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 3000
n_2 = 50

theta_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)

se_hat_mtrx <- matrix(ncol = n.treat, nrow = nsim)
hit <- matrix(ncol = n.treat, nrow = nsim)

for (s in 1:nsim)
{
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

# 🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!
max_gid <- max(data_s$G.id)
data_b$G.id <- data_b$G.id + max_gid
length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s[, -ncol(data_s)], data_b)

## ----- ## ----- ## ----- ## ---- ## ----- #

# Step 1: Classify strata
data_all <- design.classifier(data, S, G.id)

# Step 2: Split the data
data_small <- dplyr::filter(data_all, stratum_type == "small")
data_big   <- dplyr::filter(data_all, stratum_type == "big")

# Step 3: Run the estimators
#res_small <- res.sreg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = data_small[, grep("^x_", names(data_small)), drop = FALSE], HC1 = TRUE)
res_small <- res.creg.ss(Y = data_small$Y, D = data_small$D, S = data_small$S, X = data.frame(data_small$x_1, data_small$x_2), G.id = data_small$G.id, Ng = data_small$Ng, HC1 = TRUE)

#res_big <- res.sreg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = data_big[, grep("^x_", names(data_big)), drop = FALSE], HC1 = TRUE)
res_big <- res.creg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = NULL, G.id = data_big$G.id, Ng = data_big$Ng, HC1 = FALSE)
#res_big <- res.creg(Y = data_big$Y, D = data_big$D, S = data_big$S, X = data.frame(data_big$x_1, data_big$x_2), G.id = data_big$G.id, Ng = data_big$Ng, HC1 = TRUE)
res_big$tau.hat
res_big$se.rob

# Step 4: Combine estimates
N_small <- nrow(data_small)
N_big   <- nrow(data_big)
N_total <- N_small + N_big

tau_hat <- (N_small / N_total) * res_small$tau.hat + (N_big / N_total) * res_big$tau.hat

se_combined <- sqrt((N_small / N_total)^2 * res_small$se.rob^2 +
                    (N_big   / N_total)^2 * res_big$se.rob^2)

theta_hat_mtrx[s, ] <- tau_hat
se_hat_mtrx[s, ] <- se_combined

upper <- tau_hat + qnorm(0.975) * se_combined
lower <- tau_hat - qnorm(0.975) * se_combined
hit[s, ] <- (tau.vec < upper) & (tau.vec > lower)
print(s)
}

result_table <- data.frame(
  `True tau` = tau.vec,
  `E[tau_n]` = round(colMeans(theta_hat_mtrx), 4),
  `True Variance` = round(colMeans(se_hat_mtrx), 4),
  `E[V_n]` = round(apply(theta_hat_mtrx, 2, sd), 4),
  `Coverage` = round(colMeans(hit), 4),
  check.names = FALSE
)
print(result_table)
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

# 🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!
max_gid <- max(data_s$G.id)
data_b$G.id <- data_b$G.id + max_gid
length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s[, -ncol(data_s)], data_b)
resultim <- res.sreg.mixed(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), HC1 = TRUE)
resultim$tau.hat
resultim$se.rob

##### no clusters #####
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

# 🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!
max_gid <- max(data_s$G.id)
data_b$G.id <- data_b$G.id + max_gid
length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s, data_b)
resultim <- res.sreg.mixed(Y = data$Y, S = data$S, D = data$D,  X = NULL, HC1 = TRUE)
resultim$tau.hat
resultim$se.rob

Testing the native sreg with the mixed design function

##### no clusters #####
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

data <- rbind(data_s, data_b)
resultim <- res.sreg.mixed(Y = data$Y, S = data$S, D = data$D,  X = NULL, HC1 = TRUE)
resultim$tau.hat
resultim$se.rob


res_sreg <- sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = TRUE)
print(res_sreg)
res_sreg$tau.hat
res_sreg$se.rob





data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

# 🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!
max_gid <- max(data_s$G.id)
data_b$G.id <- data_b$G.id + max_gid
length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s[, -ncol(data_s)], data_b)
resultim <- res.creg.mixed(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE)
resultim$tau.hat
resultim$se.rob

result_creg <- sreg(Y = data$Y, S = data$S, D = data$D, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = TRUE)
promo <- res.creg.mixed(Y = data$Y, S = data$S, D = data$D, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE)
promo$tau.hat
promo$se.rob
length(promo$lin.adj[,1])
length(promo$data$G.id)
print(result_creg)

Let's run some tests to ensure that everything is working (no clusters)

# Test 1. data: small strata, option: small strata
set.seed(123)
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 300

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = NULL, X = data.frame(data$x_1), HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = NULL, D = NULL, S = data$S, X = data.frame(data$x_1), HC1 = TRUE, small.strata = TRUE)
print(result)


# Test 2. data: big strata, option: big strata
tau.vec = c(0.2, 0.9, 1.5)
n.treat = length(tau.vec)
n_1 = 1000

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), HC1 = TRUE, small.strata = FALSE)
print(result)


# Test 3. data: small strata, option: big strata
set.seed(123)
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 900

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), HC1 = TRUE, small.strata = FALSE)
print(result)

nsim = 1000
tau_small <- matrix(nrow = nsim, ncol = length(tau.vec)) 
tau_big   <- matrix(nrow = nsim, ncol = length(tau.vec))
se_small  <- matrix(nrow = nsim, ncol = length(tau.vec))
se_big    <- matrix(nrow = nsim, ncol = length(tau.vec))
for (s in 1:nsim){
  data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1), k = 2)
  result_big <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = TRUE, small.strata = FALSE)
  result_small <- sreg(Y = data$Y, D = data$D, S = data$S, X = NULL, HC1 = FALSE, small.strata = TRUE)
  tau_small[s, ] <- result_small$tau.hat
  tau_big[s, ]   <- result_big$tau.hat
  se_small[s, ]  <- result_small$se.rob
  se_big[s, ]    <- result_big$se.rob
  print(s)
}
apply(tau_small, 2, sd)
colMeans(se_big)
colMeans(se_small)


# Test 4. data: big strata, option: small strata 
tau.vec = c(0.2, 0.9)
n.treat = length(tau.vec)
n_1 = 1000
data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 20, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
head(data)
sreg(Y = data$Y, D = data$D, S = data$S, small.strata = TRUE)

# Test 5. data: mixed design, option: small strata (so we should have just a mixed estimator)
tau.vec = c(0.2, 0.9) 
n.treat = length(tau.vec)
n_1 = 3000
n_2 = 50
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = FALSE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

data <- rbind(data_s, data_b)

result <-  sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), HC1 = TRUE, small.strata = TRUE)
result
result$res.big$tau.hat[1]
result$res.small$tau.hat[1]
result$se.rob[1] ^ 2
N_s <- n_1
N_b <- n_2
N_total <- N_s  + N_b

(N_s * N_b) / N_total^3 * (result$res.small$tau.hat[1] - result$res.big$tau.hat[1])^2

sqrt(result$se.rob[1] ^ 2 + ((N_s * N_b) / N_total^3 * (result$res.small$tau.hat[1] - result$res.big$tau.hat[1])^2))

tau_hat <- result$tau.hat[1]
# Test 6. data: mixed design, option: big strata (just a big strata estimator, but should through a warning that estimators are invalid!)
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), HC1 = FALSE, small.strata = FALSE)
proba
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, HC1 = FALSE, small.strata = FALSE)
proba

Let's run some tests to ensure that everything is working (with clusters)

# Test 1. data: small strata, option: small strata
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 900

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = NULL, HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), G.id = NULL, Ng = data$G.id, HC1 = FALSE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = NULL, G.id = data$G.id, Ng = data$G.id, HC1 = TRUE, small.strata = TRUE)
print(result)


# Test 2. data: big strata, option: big strata
tau.vec = c(0.2, 0.9, 1.5)
n.treat = length(tau.vec)
n_1 = 1000

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = TRUE, G.id = data$G.id, Ng = data$Ng, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, HC1 = FALSE, G.id = data$G.id, Ng = data$Ng, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = TRUE, G.id = data$G.id, Ng = data$Ng, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), HC1 = FALSE, G.id = data$G.id, Ng = data$Ng, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = FALSE)
print(result)

result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), G.id = data$G.id, Ng = NULL, HC1 = FALSE, small.strata = FALSE)
print(result)

# Test 3. data: small strata, option: big strata
tau.vec = c(0.2, 0.8)
n.treat = length(tau.vec)
n_1 = 900

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE) # should not match fith small.strata = FALSE unless we change the estimator
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = FALSE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = TRUE)
print(result)
result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = FALSE)
print(result)

result <- sreg(Y = data$Y, D = data$D, S = data$S, X = data.frame(data$x_1), G.id = data$G.id, Ng = NULL, HC1 = TRUE, small.strata = FALSE)
print(result)


# Test 4. data: big strata, option: small strata (should not work and throw a warning)
tau.vec = c(0.2, 0.9)
n.treat = length(tau.vec)
n_1 = 1000
data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 20, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
head(data)
sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, small.strata = TRUE, HC1 = TRUE)
sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, small.strata = TRUE, HC1 = FALSE)
sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, X = data.frame(data$x_1), small.strata = TRUE, HC1 = TRUE)
sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = NULL, X = data.frame(data$x_1), small.strata = TRUE, HC1 = TRUE)

# Test 4. data: big strata, option: small strata 
tau.vec = c(0.2, 0.9)
n.treat = length(tau.vec)
n_1 = 1000
data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 20, cluster = FALSE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
head(data)
sreg(Y = data$Y, D = data$D, S = data$S, small.strata = TRUE)

# Test 5. data: mixed design, option: small strata (so we should have just a mixed estimator)
tau.vec = c(0.2, 0.9)
n.treat = length(tau.vec)
n_1 = 3000
n_2 = 50
data_s <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
Y <- data$Y
S <- data$S

data_b <- sreg.rgen(n = n_2, tau.vec = tau.vec, n.strata = 2, cluster = TRUE, small.strata = FALSE, treat.sizes = c(1, 1), k = 2)
# Step 1: Get the max stratum ID in data_s
max_id <- max(data_s$S)

# Step 2: Get unique strata in data_b and assign new IDs
unique_b_strata <- sort(unique(data_b$S))
num_b_strata <- length(unique_b_strata)

# Create a named mapping from old to new stratum IDs
new_ids <- seq(max_id + 1, max_id + num_b_strata)
stratum_map <- setNames(new_ids, unique_b_strata)

# Step 3: Relabel data_b$S
data_b$S <- stratum_map[as.character(data_b$S)]

# 🧠 Step 4: Make cluster IDs unique across the two datasets!!!!!
max_gid <- max(data_s$G.id)
data_b$G.id <- data_b$G.id + max_gid
length(intersect(data_s$G.id, data_b$G.id)) == 0

data <- rbind(data_s[, -ncol(data_s)], data_b)
Y <- data$Y
S <- data$S
G.id <- data$G.id
Ng <- data$Ng
D <- data$D
data_tst <- data.frame(Y = Y, D = D, S = S, G.id = G.id, Ng = Ng)
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2, data$Ng), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = NULL, HC1 = TRUE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = NULL, HC1 = FALSE, small.strata = TRUE)
proba
proba <-sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = NULL, Ng = NULL, HC1 = FALSE, small.strata = TRUE)
proba

# Test 6. data: mixed design, option: big strata (just a big strata estimator!)
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = data.frame(data$x_1, data$x_2), G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = FALSE)
proba
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, G.id = data$G.id, Ng = data$Ng, HC1 = TRUE, small.strata = FALSE)
proba
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = FALSE)
proba
proba <- sreg(Y = data$Y, S = data$S, D = data$D,  X = NULL, G.id = data$G.id, Ng = NULL, HC1 = FALSE, small.strata = FALSE)
proba




#### Let's play around with out plot! ###
tau.vec = c(0.2, 0.5)
n.treat = length(tau.vec)
n_1 = 900

data <- sreg.rgen(n = n_1, tau.vec = tau.vec, n.strata = 4, cluster = TRUE, small.strata = TRUE, treat.sizes = c(1, 1, 1), k = 3)
result <- sreg(Y = data$Y, D = data$D, S = data$S, G.id = data$G.id, Ng = data$Ng, HC1 = FALSE, small.strata = TRUE)
print(result)

plot(result, treatment_labels = c('FIRST', 'SECOND'), bar_fill = c('blue', 'orange'), title = 'New Plot', point_shape = 23, point_size = 4, point_color = "green", point_fill = 'black', point_stroke = 1.2, label_color = 'purple', label_size = 10,
      zero_line = FALSE, grid = TRUE, bg_color = 'grey')
plot(result, grid = FALSE, bg_color = 'white', y_axis_title = 'Effects', x_axis_title = 'effect size')


Try the sreg package in your browser

Any scripts or data that you put into this service are public.

sreg documentation built on Aug. 25, 2025, 5:14 p.m.