#' @title Estimation of bounds on ATE as a function of the proportion of
#' unmeasured confounding.
#'
#' @description \code{get_bound} is the main function to estimate the lower and
#' upper bounds curves as a function of eps, the proportion of
#' unmeasured confounding.
#' @param y nx1 outcome vector in [0, 1].
#' @param a nx1 treatment received vector.
#' @param x nxp \code{data.frame} of covariates.
#' @param ymin infimum of the support of y.
#' @param ymax supremum of the support of y.
#' @param outfam family specifying the error distribution for outcome
#' regression, currently \code{gaussian()} or \code{binomial()} supported.
#' Link should not be specified.
#' @param treatfam family specifying the error distribution for treatment
#' regression, currently \code{binomial()} supported.
#' Link should not be specified.
#' @param sl.lib character vector specifying which libraries to use for the SL.
#' @param model a string specifying the assumption placed on S when
#' computing the bounds. Currently only "x" (\eqn{S \perp (Y, A) | X)} and "xa"
#' (\eqn{S \perp Y | A, X}). Default is model = "x".
#' @param eps vector of arbitrary length specifying the values for the
#' proportion of confounding where the lower and upper bounds curves are
#' evaluated. Default is 0 (no unmeasured confounding).
#' @param delta vector of arbitrary length specifyin the values of delta used
#' to bound maximal confounding among S = 0 units. Default is delta = 1, which
#' imposes no assumption if the outcome Y is bounded.
#' @param nsplits number of splits for the cross-fitting. Default is 5.
#' @param do_mult_boot boolean for whether uniform bands via the multiplier
#' bootstrap need to computed. Default is do_mult_boot=TRUE.
#' @param do_eps_zero boolean for whether estimate of espilon_zero shoul be
#' computed. Default is do_eps_zero=TRUE.
#' @param alpha confidence level. Default is 0.05.
#' @param B number of rademacher rvs sampled. Default is 10000.
#' @param nuis_fns Optional. A nx4 matrix specifying the estimated regression
#' functions evaluated at the observed x, columns should be named: pi0, pi1,
#' mu1, mu0. Default is NULL so that regressions are estimated using the
#' SuperLearner via cross-fitting.
#' @param plugin boolean for whether the estimator for the bounds is of plug-in
#' type: uses g(etahat) rather than its estimator based on influence functions
#' when computing term that multiplies the indicator. So g(etahat) instead of
#' tauhat.
#' @param do_rearrange bollean for whether the precedure by Chernozhukov et al
#' (2008) should be applied to the estimators of the bounds and the CIs.
#' @param do_parallel boolean for whether parallel computing should be used
#' @param ncluster number of clusters used if parallel computing is used.
#' @param show_progress boolean for whether progress bar in estimating
#' regression functions should be shown. Default is FALSE. Currently, only
#' available if do_parallel is FALSE.
#'
#' @return A list containing
#' \item{\code{bounds}}{a length(eps)x12xlength(delta) array, where, for each
#' eps and delta, it has the estimates of lower bound (\code{lb}), upper bound
#' (\code{ub}), lower uniform band for lower bound (\code{ci_lb_lo_unif}),
#' upper uniform band for upper bound (\code{ci_ub_hi_unif}),
#' lower pointwise band for lower bound (\code{ci_lb_lo_pt}),
#' upper pointwise band for lower bound (\code{ci_lb_hi_pt}),
#' lower pointwise band for upper bound (\code{ci_ub_lo_pt}),
#' upper pointwise band for upper bound (\code{ci_ub_hi_pt}),
#' lower confidence band using Imbens & Manski (2004) procedure
#' (\code{ci_im04_lo}), upper confidence band using Imbens & Manski (2004)
#' procedure(\code{ci_im04_hi}).}
#' \item{\code{var_ub}}{estimate of the variance of the upper bound curve as fn
#' of eps.}
#' \item{\code{var_lb}}{estimate of the variance of the lower bound curve as fn
#' of eps.}
#' \item{\code{eps_zero}}{a length(delta)x5 \code{data.frame} with values of
#' delta, estimate of eps0, max(0, ci_lo), min(1, ci_hi), variance of estimate
#' of eps0.}
#' \item{\code{q_lb}}{a list of size \code{nsplits}, where element j is a
#' (num eps) x (num delta) matrix containing estimates of eps-quantile of
#' g(etab) for lower bound, where g(etab) is computed without using obs from
#' fold j}.
#' \item{\code{q_ub}}{a list of size \code{nsplits}, where element j is a
#' (num eps) x (num delta) matrix containing estimates of eps-quantile of
#' g(etab) for upper bound, where g(etab) is computed without using obs from
#' fold j}.
#' \item{\code{lambda_lb}}{a list of size \code{nsplits}, where element j
#' contains a nxlength(eps)xlength(delta) array containing
#' the indicator ghatmat <= q, where q is eps-quantile of ghatmat and
#' ghatmat is g(eta) for lower bound, computed using regression functions
#' estimated using all folds but j and evaluated using obs at fold j.}
#' \item{\code{lambda_ub}}{a list of size \code{nsplits}, where element j
#' contains a nxlength(eps)xlength(delta) array containing
#' the indicator ghatmat > q, where q is (1-eps)-quantile of ghatmat and
#' ghatmat is g(eta) for upper bound, computed using regression functions
#' estimated using all folds but j and evaluated using obs at fold j.}
#' \item{\code{ifvals_lb}}{a n x length(eps) x length(delta) array containing
#' the influence functions for lower bound evaluated at the observed X as a
#' function of epsilon and delta.}
#' \item{\code{ifvals_ub}}{a n x length(eps) x length(delta) array containing
#' the influence functions for upper bound evaluated at the observed X as a
#' function of epsilon and delta.}
#' \item{\code{nuis_fns}}{a list containing estimates of regression functions
#' evaluated at test obs and train obs. See \code{\link{do_crossfit}}.}
#' \item{\code{nuhat}}{a nx1 matrix containing the influence function values
#' for E{E(Y|A = 1, X) - E(Y|A = 0, X)} computed using regressions fns estimated
#' using all obs except those in fold j and evaluated at obs in fold j.}
#' \item{\code{glhat}}{a list of size \code{nsplits}, where element j
#' is (num obs in split j) x length(delta) matrix containing estimates of g(eta)
#' for lower bound computed using regressions fns estimated using all obs except
#' those in fold j and evaluated at obs in fold j.}
#' \item{\code{guhat}}{a list of size \code{nsplits}, where element j
#' is (num obs in split j) x length(delta) matrix containing estimates of g(eta)
#' for upper bound computed using regressions fns estimated using all obs except
#' those in fold j and evaluated at obs in fold j.}
#' \item{\code{glhat_train}}{a list of size \code{nsplits}, where element j
#' is (n - num obs in split j) x length(delta) matrix containing estimates of
#' g(eta) for lower bound computed using regressions fns estimated using all obs
#' except those in fold j and evaluated at obs \emph{not} in fold j.}
#' \item{\code{guhat_train}}{a list of size \code{nsplits}, where element j
#' is (n - num obs in split j) x length(delta) matrix containing estimates of
#' g(eta) for upper bound computed using regressions fns estimated using all obs
#' except those in fold j and evaluated at obs \emph{not} in fold j.}
#' \item{\code{tauhat_lb}}{a list of size \code{nsplits}, where element j
#' is (num obs in split j) x 1 matrix containing the influence function
#' values of the parameter E(g(eta)) with g(eta) for the lower bound computed
#' using regression fns estimated from all obs except those in fold j and
#' evaluated at obs in fold j.}
#' \item{\code{tauhat_ub}}{a list of size \code{nsplits}, where element j
#' is (num obs in split j) x 1 matrix containing the influence function
#' values of the parameter E(g(eta)) with g(eta) for the upper bound computed
#' using regression fns estimated from all obs except those in fold j and
#' evaluated at obs in fold j.}
#' \item{\code{phibar_lb}}{a n x length(eps) x length(delta) array containing
#' values for \code{ifvals_lb} - \code{lambda_lb} * \code{q_lb}.}
#' \item{\code{phibar_ub}}{a n x length(eps) x length(delta) array containing
#' values for \code{ifvals_ub} - \code{lambda_ub} * \code{q_ub}.}
#' \item{\code{phibar_lb_fold}}{a list of size \code{nsplits}, where element j
#' is n x length(eps) x length(delta) array containing
#' values for \code{ifvals_lb} - \code{lambda_lb} * \code{q_lb} computed using
#' regression functions estimated from all obs except those in fold j and
#' evaluated at obs in fold j.}
#' \item{\code{phibar_ub_fold}}{a list of size \code{nsplits}, where element j
#' is n x length(eps) x length(delta) array containing
#' values for \code{ifvals_ub} - \code{lambda_ub} * \code{q_ub} computed using
#' regression functions estimated from all obs except those in fold j and
#' evaluated at obs in fold j.}
#' \item{\code{mult_calpha_lb}}{a ndelta-dimensional vector containing
#' \code{calpha} equal to the z-score used to construct uniform bands for the
#' lower bound of the form psi(eps) \eqn{\pm} \code{calpha} * sigma(eps).}
#' \item{\code{mult_calpha_ub}}{a ndelta-dimensional vector containing
#' \code{calpha} equal to the z-score used to construct uniform bands for the
#' upper bound of the form psi(eps) \eqn{\pm} \code{calpha} * sigma(eps).}
#' \item{\code{im04_calpha}}{a ndelta-dimensional vector containing to the
#' z-score used to construct the confidence interval for partially identified
#' ATE as in Imbens & Manski (2004).}
#'
#' @section Details:
#' As done in the paper, one can see that g(eta) for the lower bound is equal to
#' g(eta) for the upper bound minus delta * (ymax - ymin). Therefore the IFs for
#' E(g(eta)) follows the same relation. They are keep separated just for code
#' clarity.
#'
#' @examples
#' n <- 1000
#' eps <- seq(0, 1, 0.001)
#' delta <- c(0.25, 0.5, 1)
#' a <- rbinom(n, 1, 0.5)
#' x <- as.data.frame(matrix(rnorm(2*n), ncol = 2, nrow = n))
#' ymin <- 0
#' ymax <- 1
#' y <- runif(n, ymin, ymax)
#' res <- get_bound(y = y, a = a, x = x, ymin = ymin, ymax = ymax,
#' outfam = gaussian(), treatfam = binomial(),
#' model = "x", eps = eps, delta = delta,
#' do_mult_boot = TRUE, do_eps_zero = TRUE, nsplits = 5,
#' alpha = 0.05, B = 1000, sl.lib = "SL.glm")
#' print(res$eps_zero)
#' print(head(res$bounds[, , 1]))
#'
#' @references Imbens, G. W., & Manski, C. F. (2004). Confidence intervals for
#' partially identified parameters. \emph{Econometrica}, 72(6), 1845-1857.
#' @references Van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007).
#' Super learner.
#' \emph{Statistical applications in genetics and molecular biology}, 6(1).
#' @references Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo,
#' E., Hansen, C., & Newey, W. K. (2016). Double machine learning for treatment
#' and causal parameters (No. CWP49/16). \emph{cemmap working paper}.
#' @references Kennedy, E. H. (2019). Nonparametric causal effects based on
#' incremental propensity score interventions. \emph{Journal of the American
#' Statistical Association}, 114(526), 645-656.
#' @references Chernozhukov, V., Fernandez-Val, I., & Galichon, A. (2009).
#' Improving point and interval estimators of monotone functions by
#' rearrangement. \emph{Biometrika}, 96(3), 559-575.
#'
#' @export
get_bound <- function(y, a, x, ymin, ymax, outfam, treatfam, sl.lib,
model = "x", eps = 0, delta = 0, nsplits = 5,
do_mult_boot = TRUE, do_eps_zero = TRUE, alpha = 0.05,
B = 10000, nuis_fns = NULL, plugin = FALSE,
do_rearrange = FALSE, do_parallel = FALSE,
ncluster = NULL, show_progress = FALSE) {
if(is.null(nuis_fns)) {
nuis_fns <- do_crossfit(y = y, a = a, x = x, outfam = outfam,
treatfam = treatfam, nsplits = nsplits,
sl.lib = sl.lib, ymin = ymin, ymax = ymax,
do_parallel = do_parallel, ncluster = ncluster,
show_progress = show_progress)
}
pi0hat <- nuis_fns$test[, "pi0"]
pi1hat <- nuis_fns$test[, "pi1"]
mu0hat <- nuis_fns$test[, "mu0"]
mu1hat <- nuis_fns$test[, "mu1"]
pi0hat_train <- nuis_fns$train[, "pi0"]
pi1hat_train <- nuis_fns$train[, "pi1"]
mu0hat_train <- nuis_fns$train[, "mu0"]
mu1hat_train <- nuis_fns$train[, "mu1"]
folds_train <- as.factor(nuis_fns$train[, "fold"])
folds_test <- as.factor(nuis_fns$test[, "fold"])
nobs_fold <- table(folds_test)
nsplits <- length(unique(folds_train))
n <- length(y)
ndelta <- length(delta)
neps <- length(eps)
order_obs <- nuis_fns$order_obs
y <- y[order_obs]
a <- a[order_obs]
x <- x[order_obs, , drop = FALSE]
# Estimate term E(mu1(X) - mu0(X)) (ATE under no unmeasured confounding)
psi0 <- if_gamma(y = y, a = a, aval = 0, pia = pi0hat, mua = mu0hat)
psi1 <- if_gamma(y = y, a = a, aval = 1, pia = pi1hat, mua = mu1hat)
nuhat <- psi1 - psi0
nuhat_fold <- split(nuhat, folds_test)
if(model == "x") {
pi0g <- pi0hat
pi1g <- pi1hat
pi0g_train <- pi0hat_train
pi1g_train <- pi1hat_train
} else if(model == "xa") {
pi0g <- pi0g_train <- 1 - a
pi1g <- pi1g_train <- a
} else {
stop("model not supported!")
}
datg <- data.frame(ymin = ymin, ymax = ymax, pi0g = pi0g, pi1g = pi1g,
mu0 = mu0hat, mu1 = mu1hat)
gl <- get_g(data = datg, delta = delta, upper = FALSE)
gu <- get_g(data = datg, delta = delta, upper = TRUE)
min_gl <- apply(gl, 2, min)
min_gu <- apply(gu, 2, min)
max_gl <- apply(gl, 2, max)
max_gu <- apply(gu, 2, max)
glhat <- by(gl, folds_test, function(x) { as.matrix(x) }, simplify = FALSE)
guhat <- by(gu, folds_test, function(x) { as.matrix(x) }, simplify = FALSE)
datg_train <- data.frame(ymin = ymin, ymax = ymax, pi0g = pi0g_train,
pi1g = pi1g_train, mu0 = mu0hat_train,
mu1 = mu1hat_train)
glhat_train <- by(datg_train, folds_train, get_g, delta = delta, upper = FALSE,
simplify = FALSE)
guhat_train <- by(datg_train, folds_train, get_g, delta = delta, upper = TRUE,
simplify = FALSE)
qhats_lb <- lapply(glhat, get_quant_g, eps = eps, min_g = min_gl,
max_g = max_gl)
qhats_ub <- lapply(guhat, get_quant_g, eps = 1 - eps, min_g = min_gu,
max_g = max_gu)
if(!plugin) {
dat_tau <- data.frame(y = y, a = a, ymin = ymin, ymax = ymax, pi0 = pi0hat,
pi1 = pi1hat, mu0 = mu0hat, mu1 = mu1hat)
tauhat_lb <- by(dat_tau, folds_test, if_tau, delta = delta, upper = FALSE,
simplify = FALSE)
tauhat_ub <- by(dat_tau, folds_test, if_tau, delta = delta, upper = TRUE,
simplify = FALSE)
} else {
tauhat_lb <- glhat
tauhat_ub <- guhat
}
list_lb <- get_ifvals(n = n, neps = neps, ndelta = ndelta, upper = FALSE,
nu = nuhat_fold, tau = tauhat_lb, g = glhat,
quant = qhats_lb, nsplits = nsplits,
nobs_fold = nobs_fold)
list_ub <- get_ifvals(n = n, neps = neps, ndelta = ndelta, upper = TRUE,
nu = nuhat_fold, tau = tauhat_ub, g = guhat,
quant = qhats_ub, nsplits = nsplits,
nobs_fold = nobs_fold)
lambda_l <- list_lb$lambda
lambda_u <- list_ub$lambda
lambdaq_l <- list_lb$lambdaq
lambdaq_u <- list_ub$lambdaq
ifvals_l <- list_lb$ifvals
ifvals_u <- list_ub$ifvals
phibar_l_fold <- Map("-", ifvals_l, lambdaq_l)
phibar_u_fold <- Map("-", ifvals_u, lambdaq_u)
.get_est_fold <- function(x, fun) {
array( apply(x, c(2, 3), fun), dim = c(neps, ndelta) )
}
est_l_fold <- array(vapply(ifvals_l, .get_est_fold, fun = mean,
FUN.VALUE = array(0, dim = c(neps, ndelta))),
dim = c(neps, ndelta, nsplits))
est_u_fold <- array(vapply(ifvals_u, .get_est_fold, fun = mean,
FUN.VALUE = array(0, dim = c(neps, ndelta))),
dim = c(neps, ndelta, nsplits))
est_l <- array(apply(est_l_fold, c(1, 2), mean), dim = c(neps, ndelta))
est_u <- array(apply(est_u_fold, c(1, 2), mean), dim = c(neps, ndelta))
phibar_l <- abind::abind(phibar_l_fold, along = 1)
phibar_u <- abind::abind(phibar_u_fold, along = 1)
var_l <- array(apply(phibar_l, c(2, 3), var), dim = c(neps, ndelta))
var_u <- array(apply(phibar_u, c(2, 3), var), dim = c(neps, ndelta))
if(do_mult_boot) {
estbar_l_fold <- array(vapply(phibar_l_fold, .get_est_fold, fun = mean,
FUN.VALUE = array(0, dim = c(neps, ndelta))),
dim = c(neps, ndelta, nsplits))
estbar_u_fold <- array(vapply(phibar_u_fold, .get_est_fold, fun = mean,
FUN.VALUE = array(0, dim = c(neps, ndelta))),
dim = c(neps, ndelta, nsplits))
lb_estbar <- array(apply(estbar_l_fold, c(1, 2), mean),
dim = c(neps, ndelta))
ub_estbar <- array(apply(estbar_u_fold, c(1, 2), mean),
dim = c(neps, ndelta))
calpha_lb <- do_multboot(n = n, ndelta = ndelta, psihat = lb_estbar,
sigmahat = var_l, ifvals = phibar_l,
alpha = alpha / 2, B = B)
calpha_ub <- do_multboot(n = n, ndelta = ndelta, psihat = -ub_estbar,
sigmahat = var_u, ifvals = -phibar_u,
alpha = alpha / 2, B = B)
calpha_lb <- matrix(rep(calpha_lb, neps), ncol = ndelta, nrow = neps,
byrow = TRUE)
calpha_ub <- matrix(rep(calpha_ub, neps), ncol = ndelta, nrow = neps,
byrow = TRUE)
} else {
calpha_lb <- calpha_ub <- qnorm(1 - alpha / 2)
}
if(do_eps_zero) {
eps_zero <- get_eps_zero(n = n, pt_est = mean(nuhat), eps = eps, lb = est_l,
ub = est_u, ql = qhats_lb, qu = qhats_ub,
ifvals_lb = phibar_l_fold,
ifvals_ub = phibar_u_fold,
delta = delta, alpha = alpha)
} else {
eps_zero <- NULL
}
get_ci <- function(muhat, sigmahat, c) {
lo <- muhat - sigmahat * c
hi <- muhat + sigmahat * c
out <- aperm(array(cbind(lo, hi), dim = c(neps, ndelta, 2)), c(1, 3, 2))
return(out)
}
ci_lb <- get_ci(est_l, sqrt(var_l / n), calpha_lb)
ci_ub <- get_ci(est_u, sqrt(var_u / n), calpha_ub)
ci_lb_pt <- get_ci(est_l, sqrt(var_l / n), qnorm(1 - alpha / 2))
ci_ub_pt <- get_ci(est_u, sqrt(var_u / n), qnorm(1 - alpha / 2))
temp <- array(cbind(est_l, est_u, sqrt(var_l), sqrt(var_u)),
dim = c(neps, ndelta, 4))
cim04 <- apply(temp, c(1, 2), get_im04, n = n, alpha = alpha)
ci_im04_l <- get_ci(est_l, sqrt(var_l/n), cim04)[, 1, , drop = FALSE]
ci_im04_u <- get_ci(est_u, sqrt(var_u/n), cim04)[, 2, , drop = FALSE]
if(do_rearrange) {
ci_lb <- array(apply(ci_lb, c(2, 3), sort, decreasing = TRUE),
dim = c(neps, 2, ndelta))
ci_ub <- array(apply(ci_ub, c(2, 3), sort, decreasing = FALSE),
dim = c(neps, 2, ndelta))
est_l <- array(apply(est_l, 2, sort, decreasing = TRUE),
dim = c(neps, ndelta))
est_u <- array(apply(est_u, 2, sort, decreasing = FALSE),
dim = c(neps, ndelta))
ci_im04_l <- array(apply(ci_im04_l, c(2, 3), sort, decreasing = TRUE),
dim = c(neps, ndelta))
ci_im04_u <- array(apply(ci_im04_u, c(2, 3), sort, decreasing = FALSE),
dim = c(neps, ndelta))
}
ci_im04 <- aperm(array(cbind(ci_im04_l, ci_im04_u), dim = c(neps, ndelta, 2)),
c(1, 3, 2))
# Return results in a user-friendly format
temp_fn <- function(x) {
out1 <- array(c(est_l[, x], est_u[, x]), dim = c(neps, 2))
out2 <- array(c(ci_lb[, 1, x], ci_ub[, 2 , x], ci_lb_pt[, , x],
ci_ub_pt[, , x], ci_im04[, , x]), dim = c(neps, 8))
out <- array(c(out1, out2), dim = c(neps, 10))
return(out)
}
res <- array(vapply(1:ndelta, temp_fn,
FUN.VALUE = array(0, dim = c(neps, 10))),
dim = c(neps, 10, ndelta))
dim2names <- c("lb", "ub", "ci_lb_lo_unif", "ci_ub_hi_unif", "ci_lb_lo_pt",
"ci_lb_hi_pt", "ci_ub_lo_pt", "ci_ub_hi_pt", "ci_im04_lo",
"ci_im04_hi")
dimnames(res) <- list(eps, dim2names, delta)
out <- list(bounds = res, var_ub = var_u, var_lb = var_l,
eps_zero = eps_zero, q_lb = qhats_lb, q_ub = qhats_ub,
lambda_lb = lambda_l, lambda_ub = lambda_u,
ifvals_lb = ifvals_l, ifvals_ub = ifvals_u,
nuis_fns = nuis_fns, nuhat = nuhat_fold, glhat = glhat, guhat = guhat,
glhat_train = glhat_train, guhat_train = guhat_train,
tauhat_l = tauhat_lb, tauhat_u = tauhat_ub, phibar_lb = phibar_l,
phibar_ub = phibar_u, phibar_lb_fold = phibar_l_fold,
phibar_ub_fold = phibar_u_fold, mult_calpha_lb = calpha_lb,
mult_calpha_ub = calpha_ub, im04_calpha = cim04)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.