## Vignette setup: collapse source and output into one chunk and prefix
## printed results with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
The MBalance package
implements the Mahalanobis balancing method for estimating the ATE and the ATT or ATC on observational data. Our method has good performance (low bias, small balancing diagnostics) in "Bad Overlap" situations (e.g., $\Pr(T = 1 \mid X)$ is close to 0 or 1, $E\{X \mid T = 1\}$ and $E\{X \mid T = 0\}$ are significantly different, or the covariates are high-dimensional).
## Install the "MBalance" package from GitHub (run once).
## The original guard `if (!require(c(...)))` was invalid: require() accepts a
## single package name, not a vector, so the check never worked as intended.
## The install list below also adds "sbw", which is loaded later.
# pkgs <- c("devtools", "WeightIt", "MASS", "sbw", "cobalt")
# missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
# if (length(missing) > 0) {
#   install.packages(missing)
# }
# devtools::install_github("yimindai0521/MBalance")

library(MBalance)
library(WeightIt)
library(MASS)
library(sbw)
library(cobalt)
We provide the functions covbal
(using the R package WeightIt,
@Gerifer2022Weightit) and sbwbal
(using the R package sbw,
@zubizarreta2021package) to implement the Mahalanobis balancing method and other covariate balancing methods, including inverse probability weighting (ps) in @rosenbaum1987model, entropy balancing (ebal) in @hainmueller2012entropy, covariate balancing propensity score (cbps) in @imai2014covariate, energy balancing (energy) in @huling2020energy, and minimal dispersion approximate balancing weights (sbw) in @wang2020minimal.
## covbal: compute balancing weights and Hajek-type ATE estimates with five
## covariate balancing methods on the same data set.
##
## Arguments:
##   X  - covariate matrix (n x p).
##   Tr - binary treatment indicator (0/1) of length n.
##   Y  - outcome vector of length n.
##
## Returns a list:
##   weight - per-method weight vectors (ps, ebal, cbps, energy, MB).
##   ate    - named vector of ATE estimates, one per method.
covbal <- function(X, Tr, Y){
  data.matrix <- data.frame(X , factor(Tr))
  sample.size <- dim(X)[1]
  dimension <- dim(X)[2]
  character <- names(data.matrix)
  for(j in 1:(dimension+1)){character[j] <- paste(character[j])}
  # Formula "treat ~ X1 + ... + Xp" assembled from the column names
  # (the last column of data.matrix is the treatment factor).
  myformula <- as.formula(paste(character[1 + dimension],paste(" ~ ", paste(character[1:dimension], collapse= "+"))))
  # Inverse probability weighting (propensity score) via WeightIt.
  ps.weight <- weightit(myformula , data = data.matrix, estimand = "ATE", method = "ps")$weights
  # Hajek estimate: normalized weighted treated mean minus weighted control mean.
  ps.ate <- t(ps.weight[Tr == 1] / sum(ps.weight[Tr == 1])) %*% Y[Tr == 1] - t(ps.weight[Tr == 0] / sum(ps.weight[Tr == 0])) %*% Y[Tr == 0]
  # Entropy balancing.
  ebal.weight <- weightit(myformula , data = data.matrix, estimand = "ATE", method = "ebal")$weights
  ebal.ate <- t(ebal.weight[Tr == 1] / sum(ebal.weight[Tr == 1])) %*% Y[Tr == 1] - t(ebal.weight[Tr == 0] / sum(ebal.weight[Tr == 0])) %*% Y[Tr == 0]
  # Zero weights are reset to 1 AFTER the ATE is computed, so this only
  # affects the returned weight vector (e.g. for the ASMD plots).
  # NOTE(review): confirm this post-hoc adjustment is intentional.
  ebal.weight[ebal.weight == 0] <- 1
  # Covariate balancing propensity score.
  cbps.weight <- weightit(myformula , data = data.matrix, estimand = "ATE", method = "cbps")$weights
  cbps.ate <- t(cbps.weight[Tr == 1] / sum(cbps.weight[Tr == 1])) %*% Y[Tr == 1] - t(cbps.weight[Tr == 0] / sum(cbps.weight[Tr == 0])) %*% Y[Tr == 0]
  # Energy balancing.
  energy.weight <- weightit(myformula , data = data.matrix, estimand = "ATE", method = "energy")$weights
  energy.ate <- t(energy.weight[Tr == 1] / sum(energy.weight[Tr == 1])) %*% Y[Tr == 1] - t(energy.weight[Tr == 0] / sum(energy.weight[Tr == 0])) %*% Y[Tr == 0]
  # Mahalanobis balancing: weights fitted separately per group (group1 = 0
  # for controls, group1 = 1 for treated), then rescaled by group size.
  # A zero outcome vector is passed because only the weights are needed here.
  MB.weight <- rep(NA,sample.size)
  MB1.result <- MB(x = X, treat = Tr, group1 = 0, outcome = rep(0,sample.size), method = "MB")
  MB2.result <- MB(x = X, treat = Tr, group1 = 1, outcome = rep(0,sample.size), method = "MB")
  MB.weight[Tr == 0] <- sum(Tr == 0) * MB1.result$weight
  MB.weight[Tr == 1] <- sum(Tr == 1) * MB2.result$weight
  MB.ate <- t(MB.weight[Tr == 1] / sum(MB.weight[Tr == 1])) %*% Y[Tr == 1] - t(MB.weight[Tr == 0] / sum(MB.weight[Tr == 0])) %*% Y[Tr == 0]
  return(list(weight = list(ps = ps.weight, ebal = ebal.weight, cbps = cbps.weight, energy = energy.weight, MB = MB.weight), ate = c(ps = ps.ate, ebal = ebal.ate, cbps = cbps.ate, energy = energy.ate, MB = MB.ate)))
}

## sbwbal: stable balancing weights (sbw package) and the corresponding
## Hajek-type ATE estimate.
##
## The tolerance grid bal_gri starts very tight; whenever sbw() fails at the
## current grid, the smallest tolerance is dropped and sbw() is retried with
## the looser grid until a fit succeeds.
sbwbal <- function(X, Tr, Y){
  data.matrix <- data.frame(X , factor(Tr), Y)
  dimension <- dim(X)[2]
  bal = list()
  bal$bal_gri = c(1e-04, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1)
  character <- names(data.matrix)
  bal$bal_cov <- character[1:dimension]
  sbw.result <- FALSE
  # A successful sbw fit appears to satisfy sum(dim(as.matrix(.))) == 15; on
  # error the tryCatch handler returns FALSE, so the loop retries with a
  # looser tolerance grid.
  # NOTE(review): the magic number 15 and the otherwise-unused global
  # `skip_to_next` look fragile -- confirm against the sbw object structure.
  while(sum(dim(as.matrix(sbw.result))) != 15){
    sbw.result <- tryCatch(sbw.result <- sbw(dat = data.matrix, ind = character[1 + dimension], bal = bal, out = character[2 + dimension], par = list(par_est = "ate")), error = function(e) { skip_to_next <<- FALSE})
    bal$bal_gri <- bal$bal_gri[-1]
  }
  sbw.weight <- sbw.result$dat_weights$sbw_weights
  # Hajek estimate with the fitted sbw weights.
  sbw.ate <- t(sbw.weight[Tr == 1] / sum(sbw.weight[Tr == 1])) %*% Y[Tr == 1] - t(sbw.weight[Tr == 0] / sum(sbw.weight[Tr == 0])) %*% Y[Tr == 0]
  return(list(weight = sbw.weight, ate = sbw.ate))
}
We consider three different simulation settings. The first corresponds to Scenario C in our paper. We first generate the treatment indicator $T$ from $\mathrm{Bernoulli}(0.5)$ for each observation. The observed covariates depend on treatment assignment. If $T = 1$, then $X \sim N(1, \Sigma)$, where the entry in the $j$th row and $k$th column of $\Sigma$ is $\rho^{|j-k|}$ and we set $\rho = 1/2$; otherwise, $X \sim N(0, I)$. The observed outcome is generated from $Y = (1 + T)(X_1 + \cdots + X_{10}) + \epsilon$, where $\epsilon \sim N(0, 1)$ (matching the code below). In this setting, $E\{X \mid T = 1\}$ and $E\{X \mid T = 0\}$ are significantly different, and thus it is a "Bad Overlap" situation.
## Simulate data for Scenario C ("bad overlap" via shifted covariate means).
## Treatment is Bernoulli(0.5); treated covariates are N(1, Sigma) with
## Sigma[j, k] = (1/2)^|j - k|, controls are N(0, I).
## Outcome: Y = (1 + T) * rowSums(X) + N(0, 1) noise.
##
## Arguments:
##   dimension   - number of covariates p.
##   sample.size - number of observations n.
## Returns a list with covariate matrix X, treatment vector treat, outcome Y.
si.data.bad <- function(dimension = 10, sample.size = 200){
  # AR(1)-type covariance: Sigma[i, j] = 2^(-|i - j|), i.e. rho = 1/2.
  # (Fixes the original 2^(-abs(i != j)), which set every off-diagonal entry
  # to 0.5 instead of decaying with |i - j| as described in the text.)
  covmatrix <- 2^(-abs(outer(seq_len(dimension), seq_len(dimension), "-")))
  treat <- rbinom(sample.size, 1, 0.5)
  z1 <- mvrnorm(sample.size, mu = rep(1, dimension), Sigma = covmatrix)
  z0 <- mvrnorm(sample.size, mu = rep(0, dimension), Sigma = diag(dimension))
  # Each row keeps the draw matching its own treatment arm.
  X <- treat * z1 + (1 - treat) * z0
  noise <- rnorm(sample.size)
  # rowSums(X) generalizes the hard-coded X[, 1:10] so any `dimension` works
  # (identical to the original at the default dimension = 10).
  Y <- (1 + treat) * rowSums(X) + noise
  return(list(X = X, treat = treat, Y = Y))
}
The second simulation setting corresponds to Scenario D in our paper. The observed covariates are simulated as $X \sim N(1, I)$. The treatment indicator is simulated from $T \sim \mathrm{Bernoulli}(\pi(X))$ with $\pi(X) = 1/(1 + 19\exp(X_1 + \cdots + X_{10} - 10))$. The outcome is simulated from $Y = (1 + T)(X_1 + \cdots + X_{10}) + \epsilon$, where $\epsilon \sim N(0, 1)$. In this setting, $\Pr(T = 1 \mid X)$ is close to 0 or 1 for many observations, and thus it is a "Bad Overlap" situation.
## Simulate data for Scenario D ("bad overlap" via extreme propensity scores).
## Covariates X ~ N(1, I); treatment T ~ Bernoulli(pi(X)) with
## pi(X) = 1 / (1 + 19 * exp(sum(X) - 10)); outcome
## Y = (1 + T) * rowSums(X) + N(0, 1) noise.
##
## Arguments:
##   dimension   - number of covariates p.
##   sample.size - number of observations n.
## Returns a list with covariate matrix X, treatment vector treat, outcome Y.
si.data.badover <- function(dimension = 10, sample.size = 1000){
  X <- matrix(rnorm(sample.size * dimension) + 1,
              nrow = sample.size, ncol = dimension)
  # rowSums(X) generalizes the hard-coded X[, 1:10] so any `dimension` works
  # (identical to the original at the default dimension = 10).
  ps <- 1 / (1 + 19 * exp(rowSums(X) - 10))
  # rbinom is vectorized over `prob`, so the per-observation loop is not
  # needed.
  treat <- rbinom(sample.size, 1, ps)
  noise <- rnorm(sample.size)
  Y <- (1 + treat) * rowSums(X) + noise
  return(list(X = X, treat = treat, Y = Y))
}
We finally study the performance of our method on high-dimensional data. The final simulation corresponds to Scenario E in our paper. We first generate the observed covariates from $X \sim N(0, \Sigma)$, where the entry in the $j$th row and $k$th column of $\Sigma$ is $\rho^{|j-k|}$ and we set $\rho = 1/2$. The treatment indicator is generated by $T \sim \mathrm{Bernoulli}(\pi(X))$ with $\pi(X) = 1/(1 + \exp(X_1 + (X_2 + \cdots + X_6)/2))$. The outcome is generated from $Y(T) = (1 - T)(X_1 + \cdots + X_6) + T(X_1 + \cdots + X_6)/2$.
## Simulate high-dimensional data (Scenario E).
## Covariates are multivariate normal with AR(1)-type covariance
## Sigma[i, j] = 2^(-|i - j|); only the first six covariates enter the
## propensity score and the outcome, so the signal is sparse.
## NOTE(review): the vignette text describes X ~ N(0, Sigma), but the code
## draws X with mean 1, and the text's outcome assigns the halved sum to the
## treated group while the code halves it for controls -- confirm against
## the paper.
si.data.hd <- function(dimension = 100, sample.size = 200){
  idx <- seq_len(dimension)
  # Sigma[i, j] = 2^(-|i - j|), built in one vectorized step.
  covmatrix <- 2^(-abs(outer(idx, idx, "-")))
  X <- mvrnorm(sample.size, mu = rep(1, dimension), Sigma = covmatrix)
  # Propensity score depends on the first six covariates only.
  ps <- 1 / (1 + exp(rowSums(X[, 1:6]) / 2))
  treat <- vapply(ps, function(p) rbinom(1, 1, p), integer(1))
  noise <- rnorm(sample.size)
  # Controls get half the six-covariate sum, treated get the full sum.
  Y <- (1 + treat) / 2 * rowSums(X[, 1:6]) + noise
  list(X = X, treat = treat, Y = Y)
}
MB
: estimating the average treatment effect. We will use the function \texttt{love.plot} from the R package cobalt (@Greifer2022cobalt) to plot the ASMD and compare the performance of the different covariate balancing methods.
## Plot absolute standardized mean differences (ASMD) for every weighting
## method on a single love.plot.
## Arguments:
##   data   - list with X (covariate matrix) and treat (0/1 vector).
##   result - output of covbal() (weights per method).
##   sbw.re - output of sbwbal() (sbw weights).
##   upper  - upper x-axis limit for the ASMD panel.
## (Extracted helper: the original repeated this block verbatim for both
## scenarios, differing only in the axis limit.)
plot_asmd <- function(data, result, sbw.re, upper) {
  dimension <- ncol(data$X)
  data.matrix <- data.frame(data$X, factor(data$treat))
  vars <- names(data.matrix)
  # Formula "treat ~ X1 + ... + Xp" from the data-frame column names.
  myformula <- as.formula(paste(vars[dimension + 1], "~",
                                paste(vars[1:dimension], collapse = "+")))
  love.plot(myformula, data = data.matrix, estimand = "ATE",
            stats = c("mean.diffs"),
            weights = list(w1 = result$weight$MB, w2 = result$weight$ps,
                           w3 = result$weight$ebal, w4 = result$weight$cbps,
                           w5 = result$weight$energy, w6 = sbw.re$weight),
            var.order = "unadjusted", abs = TRUE, line = TRUE,
            thresholds = c(m = .1),
            sample.names = c("Unweighted", "MB", "PS", "ebal", "cbps",
                             "energy", "sbw"),
            limits = list(m = c(0, upper)), wrap = 20, position = "top")
}

## Scenario C: bad overlap through shifted covariate means.
set.seed(1999)
data <- si.data.bad()
result <- covbal(data$X, data$treat, data$Y)
sbw.re <- sbwbal(data$X, data$treat, data$Y)
plot_asmd(data, result, sbw.re, upper = 1.5)

## Scenario D: bad overlap through extreme propensity scores.
data <- si.data.badover()
result <- covbal(data$X, data$treat, data$Y)
sbw.re <- sbwbal(data$X, data$treat, data$Y)
plot_asmd(data, result, sbw.re, upper = 1)
We can see that the ASMD of MB is significantly lower than that of the other covariate balancing methods.
We repeat the simulation 200 times.
## Monte-Carlo study: repeat Scenarios C and D `iteration` times and report,
## for each weighting method, the bias of its ATE estimate
## (true ATE: 5 for Scenario C, 10 for Scenario D).
mainf1 <- function(iteration = 1000){
  methods <- c("ps", "ebal", "cbps", "energy", "MB")
  ate.bad <- matrix(0, iteration, length(methods))
  ate.badover <- matrix(0, iteration, length(methods))
  for (i in seq_len(iteration)) {
    # Fresh data set per scenario on every replication; record all five
    # ATE estimates at once.
    sim.c <- si.data.bad()
    sim.d <- si.data.badover()
    ate.bad[i, ] <- covbal(sim.c$X, sim.c$treat, sim.c$Y)$ate
    ate.badover[i, ] <- covbal(sim.d$X, sim.d$treat, sim.d$Y)$ate
  }
  bias.names <- paste0(methods, "_bias")
  list(
    bad = as.list(setNames(colMeans(ate.bad) - 5, bias.names)),
    badover = as.list(setNames(colMeans(ate.badover) - 10, bias.names))
  )
}
set.seed(0521)
options(warn = -1)  # silence package warnings during the 200 replications
mainf1(200)
We can see that our method has lower bias than the other methods because our proposed method achieves significantly lower imbalance and thus greatly outperforms them.
hdMB
## Estimating the ATE in high-dimensional data with hdMB.
set.seed(0521)
data <- si.data.hd()
dimension <- dim(data$X)[2]

## Treated group: screen covariates with hdMB, inspect the GMIM path, then
## refit MB on the covariates with the largest GASMD.
result1 <- hdMB(x = data$X, treat = data$treat, outcome = data$Y,
                GASMD.bound = 0.2, group1 = 1)
plot(result1$GMIM)
threshold <- 4
index <- rank(result1$GASMD) >= (dimension - threshold + 1)
result1 <- MB(x = data$X[, index], treat = data$treat, outcome = data$Y,
              group1 = 1)

## Control group: same screening with group1 = 0.
result0 <- hdMB(x = data$X, treat = data$treat, outcome = data$Y,
                GASMD.bound = 0.05, group1 = 0)
plot(result0$GMIM)
threshold <- 16
index <- rank(result0$GASMD) >= (dimension - threshold + 1)
## BUG FIX: the control-group refit originally passed group1 = 1; it must use
## group1 = 0 to mirror the hdMB(..., group1 = 0) screening step above.
result0 <- MB(x = data$X[, index], treat = data$treat, outcome = data$Y,
              group1 = 0)

## An estimate of the ATE: difference of the two one-sided estimates.
result1$AT - result0$AT
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.