# NOTE(review): stray line "Nothing" (a scraping artifact, not valid R) commented out.
#' Inverse Probability Weighted Estimation
#'
#' @description
#' `ipw()` is a bring-your-own-model (BYOM) inverse probability weighted
#' estimator for causal inference. You supply a fitted propensity score model
#' and a fitted weighted outcome model; `ipw()` computes causal effect estimates
#' with standard errors that correctly account for the two-step estimation
#' process.
#'
#' `ipw()` currently supports binary exposures with binary or continuous
#' outcomes. For binary outcomes, it returns the risk difference, log risk
#' ratio, and log odds ratio. For continuous outcomes, it returns the difference
#' in means.
#'
#' @param ps_mod A fitted propensity score model of class [stats::glm()],
#' typically a logistic regression with the exposure as the left-hand side of
#' the formula.
#' @param outcome_mod A fitted weighted outcome model of class [stats::glm()]
#' or [stats::lm()], with the outcome as the dependent variable and
#' propensity score weights supplied via the `weights` argument. The weights
#' should be created with a propensity weight function such as [wt_ate()].
#' @param .data A data frame containing the exposure, outcome, and covariates.
#' If `NULL` (the default), `ipw()` extracts data from the model objects.
#' Supply `.data` explicitly if the outcome model formula contains
#' transformations that prevent extraction of the exposure variable from
#' [stats::model.frame()].
#' @param estimand A character string specifying the causal estimand: `"ate"`,
#' `"att"`, `"ato"`, or `"atm"`. If `NULL`, the estimand is inferred from the
#' weights in `outcome_mod`. Auto-detection requires weights created with
#' [wt_ate()], [wt_att()], [wt_atm()], or [wt_ato()].
#' @param ps_link A character string specifying the link function used in the
#' propensity score model: `"logit"`, `"probit"`, or `"cloglog"`. Defaults to
#' the link used by `ps_mod`.
#' @param conf_level Confidence level for intervals. Default is `0.95`.
#'
#' @details
#' # Workflow
#'
#' `ipw()` is designed around a three-step workflow:
#'
#' 1. Fit a propensity score model (e.g., logistic regression of exposure on
#' confounders).
#' 2. Calculate propensity score weights for your estimand (e.g., [wt_ate()])
#' and fit a weighted outcome model.
#' 3. Pass both models to `ipw()` to obtain causal effect estimates with
#' correct standard errors.
#'
#' You are responsible for specifying and fitting both models. `ipw()` handles
#' the variance estimation.
#'
#' # Effect measures
#'
#' For binary outcomes ([stats::glm()] with `family = binomial()`), `ipw()`
#' returns three effect measures:
#' - `rd`: Risk difference (marginal risk in exposed minus unexposed)
#' - `log(rr)`: Log risk ratio
#' - `log(or)`: Log odds ratio
#'
#' For continuous outcomes ([stats::lm()] or [stats::glm()] with
#' `family = gaussian()`), only the difference in means (`diff`) is returned.
#'
#' Use [as.data.frame()] with `exponentiate = TRUE` to obtain risk ratios and
#' odds ratios on their natural scale.
#'
#' # Variance estimation
#'
#' Standard errors are computed via linearization, which correctly accounts for
#' the uncertainty introduced by estimating propensity scores. This avoids the
#' known problem of underestimated standard errors that arises from treating
#' estimated weights as fixed. See Kostouraki et al. (2024) for details.
#'
#' @references
#' Kostouraki A, Hajage D, Rachet B, et al. On variance estimation of the
#' inverse probability-of-treatment weighting estimator: A tutorial for
#' different types of propensity score weights. *Statistics in Medicine*.
#' 2024;43(13):2672--2694. \doi{10.1002/sim.10078}
#'
#' @return An S3 object of class `ipw` with the following components:
#' \describe{
#' \item{`estimand`}{The causal estimand: one of `"ate"`, `"att"`, `"ato"`,
#' or `"atm"`.}
#' \item{`ps_mod`}{The fitted propensity score model.}
#' \item{`outcome_mod`}{The fitted outcome model.}
#' \item{`estimates`}{A data frame with one row per effect measure and the
#' following columns: `effect` (the measure name), `estimate` (point
#' estimate), `std.err` (standard error), `z` (z-statistic),
#' `ci.lower` and `ci.upper` (confidence interval bounds),
#' `conf.level`, and `p.value`.}
#' }
#'
#' @examples
#' # Simulate data with a confounder, binary exposure, and binary outcome
#' set.seed(123)
#' n <- 200
#' x1 <- rnorm(n)
#' z <- rbinom(n, 1, plogis(0.5 * x1))
#' y <- rbinom(n, 1, plogis(-0.5 + 0.8 * z + 0.3 * x1))
#' dat <- data.frame(x1, z, y)
#'
#' # Step 1: Fit a propensity score model
#' ps_mod <- glm(z ~ x1, data = dat, family = binomial())
#'
#' # Step 2: Calculate ATE weights and fit a weighted outcome model
#' wts <- wt_ate(ps_mod)
#' outcome_mod <- glm(y ~ z, data = dat, family = binomial(), weights = wts)
#'
#' # Step 3: Estimate causal effects with correct standard errors
#' result <- ipw(ps_mod, outcome_mod)
#' result
#'
#' # Exponentiate log-RR and log-OR to get RR and OR
#' as.data.frame(result, exponentiate = TRUE)
#'
#' # Continuous outcome example
#' y_cont <- 2 + 0.8 * z + 0.3 * x1 + rnorm(n)
#' dat$y_cont <- y_cont
#' outcome_cont <- lm(y_cont ~ z, data = dat, weights = wts)
#' ipw(ps_mod, outcome_cont)
#'
#' @seealso
#' [wt_ate()], [wt_att()], [wt_atm()], [wt_ato()] for calculating propensity
#' score weights.
#'
#' [ps_trim()], [ps_trunc()] for handling extreme propensity scores before
#' weighting.
#'
#' @export
#' @importFrom stats dnorm family formula model.frame model.matrix model.weights
#' @importFrom stats pnorm predict printCoefmat qnorm var
ipw <- function(
  ps_mod,
  outcome_mod,
  .data = NULL,
  estimand = NULL,
  ps_link = NULL,
  conf_level = 0.95
) {
  # Entry point: combine a fitted propensity score model and a fitted weighted
  # outcome model into causal effect estimates whose standard errors account
  # for the two-step (PS estimation, then outcome) procedure via linearization.
  assert_class(ps_mod, "glm")
  assert_class(outcome_mod, c("glm", "lm"))
  # Design matrix of the PS model; reused below in the linearization correction.
  weight_matrix <- model.matrix(ps_mod)
  # Variable names from the left-hand sides of the two model formulas.
  exposure_name <- fmla_extract_left_chr(ps_mod)
  outcome_name <- fmla_extract_left_chr(outcome_mod)
  if (is.null(.data)) {
    # No data supplied: recover the raw exposure/outcome vectors from the
    # fitted models' frames.
    exposure <- fmla_extract_left_vctr(ps_mod)
    outcome <- fmla_extract_left_vctr(outcome_mod)
  } else {
    # Data supplied: look the columns up by name after validating they exist.
    assert_class(exposure_name, "character", .length = 1)
    assert_class(outcome_name, "character", .length = 1)
    assert_columns_exist(.data, c(exposure_name, outcome_name))
    exposure <- .data[[exposure_name]]
    outcome <- .data[[outcome_name]]
  }
  # Propensity scores; with `newdata = NULL` predict() returns fitted values.
  ps <- predict(ps_mod, type = "response", newdata = .data)
  if (is.null(ps_link)) {
    # Default to the link actually used to fit the PS model.
    ps_link <- ps_mod$family$link
  }
  if (!identical(length(exposure), length(outcome))) {
    abort(c(
      "{.arg exposure} and {.arg outcome} must be the same length.",
      x = "{.arg exposure} is length {length(exposure)}",
      x = "{.arg outcome} is length {length(outcome)}"
    ))
  }
  # todo: allow user to specify existing weights
  # or automatically extract weights from outcome model if they exist
  wts <- extract_weights(outcome_mod)
  # Reconcile the requested estimand with the one recorded on the weights.
  estimand <- check_estimand(wts, estimand)
  # Standardized marginal means under exposure = 1 and exposure = 0.
  marginal_means <- estimate_marginal_means(
    outcome_mod = outcome_mod,
    wts = wts,
    exposure = exposure,
    exposure_name = exposure_name,
    .data = .data
  )
  # Influence terms treating the weights as fixed (robust-sandwich analogue)...
  uncorrected_lin_vars <- linearize_variables_for_wts(
    exposure,
    outcome,
    wts,
    marginal_means
  )
  # ...then corrected for the extra variability of the estimated PS.
  lin_vars <- linearize_variables_for_ps(
    exposure = exposure,
    outcome = outcome,
    wts = wts,
    ps = ps,
    estimand = estimand,
    weight_matrix = weight_matrix,
    marginal_means = marginal_means,
    uncorrected_lin_vars = uncorrected_lin_vars,
    ps_link = ps_link
  )
  # Point estimates, SEs, CIs, and p-values for each effect measure.
  estimates <- calculate_estimates(
    lin_vars = lin_vars,
    marginal_means = marginal_means,
    n = length(outcome),
    linear_regression = is_linear_regression(outcome_mod),
    conf_level = conf_level
  )
  structure(
    list(
      estimand = estimand,
      ps_mod = ps_mod,
      outcome_mod = outcome_mod,
      estimates = estimates
    ),
    class = "ipw"
  )
}
#' @export
print.ipw <- function(x, ...) {
  # Pretty-printer for `ipw` objects: shows the estimand, both model calls,
  # and a coefficient-style table of the effect estimates. Returns `x`
  # invisibly so the object pipes/prints cleanly.
  cat("Inverse Probability Weight Estimator\n")
  # todo: make this more adaptive, e.g. ATE without L2FU
  cat("Estimand:", toupper(x$estimand), "\n\n")
  cat("Propensity Score Model:\n")
  cat(" Call:", paste(deparse(x$ps_mod$call), collapse = "\n"), "\n")
  cat("\n")
  cat("Outcome Model:\n")
  cat(" Call:", paste(deparse(x$outcome_mod$call), collapse = "\n"), "\n")
  cat("\n")
  cat("Estimates:\n")
  # Drop the `effect` column and use it as row labels instead.
  estimates <- x$estimates[-1]
  rownames(estimates) <- x$estimates$effect
  # Bug fix: the column indices must refer to `estimates` *after* the first
  # column is dropped -- estimate = 1, std.err = 2, z = 3. The previous
  # `cs.ind = 2:3, tst.ind = 4` still indexed the original data frame, so
  # printCoefmat formatted std.err/z as estimate/SE and ci.lower as the test
  # statistic.
  printCoefmat(estimates, has.Pvalue = TRUE, cs.ind = 1:2, tst.ind = 3)
  invisible(x)
}
#' @param x An `ipw` object.
#' @param exponentiate If `TRUE`, exponentiate the log risk ratio and log odds
#' ratio to produce risk ratios and odds ratios on their natural scale. The
#' confidence interval bounds are also exponentiated. Standard errors, z
#' statistics, and p-values remain on the log scale. Default is `FALSE`.
#' @param row.names,optional,... Passed to [base::as.data.frame()].
#' @returns `as.data.frame()` returns the `estimates` component as a data
#' frame. When `exponentiate = TRUE`, the `log(rr)` and `log(or)` rows are
#' transformed: point estimates and confidence limits are exponentiated and
#' the effect labels become `"rr"` and `"or"`.
#' @export
#' @rdname ipw
as.data.frame.ipw <- function(
  x,
  row.names = NULL,
  optional = NULL,
  exponentiate = FALSE,
  ...
) {
  # Coerce the estimates table to a plain data frame.
  out <- as.data.frame(
    x$estimates,
    row.names = row.names,
    optional = optional,
    ...
  )
  if (!exponentiate) {
    return(out)
  }
  # Back-transform the log-scale ratio measures to their natural scale.
  # Only the point estimate and CI bounds are exponentiated; std.err, z, and
  # p.value stay on the log scale, where inference is performed.
  log_scale <- out$effect %in% c("log(rr)", "log(or)")
  for (col in c("estimate", "ci.lower", "ci.upper")) {
    out[[col]][log_scale] <- exp(out[[col]][log_scale])
  }
  # Relabel so the effect names match the reported scale.
  out$effect[out$effect == "log(rr)"] <- "rr"
  out$effect[out$effect == "log(or)"] <- "or"
  out
}
calculate_estimates <- function(
  lin_vars,
  marginal_means,
  n,
  linear_regression,
  conf_level
) {
  # Turn per-subject influence values (`lin_vars$l1`, `lin_vars$l0`) and the
  # standardized marginal means into a tidy table of effect estimates with
  # Wald-type inference. For linear regression only the difference in means
  # is returned; for binary outcomes the risk difference, log risk ratio, and
  # log odds ratio are returned.
  z_val <- qnorm(1 - ((1 - conf_level) / 2))

  # One row of Wald inference: variance of an estimator is var(influence) / n.
  # Note: 2 * pnorm(-|z|) is the numerically stable form of the two-sided
  # p-value (2 * (1 - pnorm(|z|)) loses precision for large |z|).
  wald_row <- function(effect, est, influence) {
    se <- sqrt(var(influence) / n)
    z <- est / se
    data.frame(
      effect = effect,
      estimate = est,
      # variances are omitted: they can be recovered as std.err^2
      std.err = se,
      z = z,
      ci.lower = est - z_val * se,
      ci.upper = est + z_val * se,
      conf.level = conf_level,
      p.value = 2 * pnorm(-abs(z))
    )
  }

  mu1 <- marginal_means$mu1
  mu0 <- marginal_means$mu0
  l1 <- lin_vars$l1
  l0 <- lin_vars$l0

  ### RISK DIFFERENCE / DIFFERENCE IN MEANS (raw scale)
  # Influence = (l1 - l0)
  rd <- wald_row(
    effect = if (linear_regression) "diff" else "rd",
    est = mu1 - mu0,
    influence = l1 - l0
  )
  # For continuous outcomes, only the difference in means applies.
  if (linear_regression) {
    return(rd)
  }

  ### RISK RATIO (log scale)
  # RR = mu1 / mu0. Raw-scale influence is (l1/mu1 - l0/mu0); the delta
  # method gives the log(RR) influence as (1 / RR) times that.
  rr <- mu1 / mu0
  log_rr <- wald_row(
    effect = "log(rr)",
    est = log(rr),
    influence = (l1 / mu1 - l0 / mu0) / rr
  )

  ### ODDS RATIO (log scale)
  # OR = [mu1/(1 - mu1)] / [mu0/(1 - mu0)]. Raw-scale influence is
  # l1 / [mu1 (1 - mu1)] - l0 / [mu0 (1 - mu0)]; log(OR) influence is
  # (1 / OR) times that.
  or <- (mu1 / (1 - mu1)) / (mu0 / (1 - mu0))
  or_inf <- l1 / (mu1 * (1 - mu1)) - l0 / (mu0 * (1 - mu0))
  log_or <- wald_row(
    effect = "log(or)",
    est = log(or),
    influence = or_inf / or
  )

  rbind(rd, log_rr, log_or)
}
# Influence-function terms that account for the dependence introduced by the
# weights while treating them as fixed (not estimated); equivalent to the
# usual robust sandwich estimator.
linearize_variables_for_wts <- function(Z, Y, wts, marginal_means) {
  w <- as.double(wts)
  n <- length(Z)
  # Weighted, arm-specific residuals rescaled from the (weighted) group size
  # to the full sample size.
  list(
    l1 = n / marginal_means$n1 * (w * Z * (Y - marginal_means$mu1)),
    l0 = n / marginal_means$n0 * (w * (1 - Z) * (Y - marginal_means$mu0))
  )
}
# Adds the variability from estimating the propensity score model on top of
# the weights-as-fixed linearization terms.
linearize_variables_for_ps <- function(
  exposure,
  outcome,
  wts,
  ps,
  estimand,
  weight_matrix,
  marginal_means,
  uncorrected_lin_vars,
  ps_link
) {
  n_obs <- length(exposure)
  # Derivative of each subject's weight with respect to the PS coefficients.
  dw <- derive_weights(
    exposure = exposure,
    ps = ps,
    weight_matrix = weight_matrix,
    ps_link = ps_link,
    estimand = estimand
  )
  # Inverse of the averaged PS-estimation "bread" matrix.
  bread_inv <- compute_ps_correction_matrix_inv(
    weight_matrix = weight_matrix,
    ps = ps,
    ps_link = ps_link,
    n = n_obs
  )
  # Shared correction step for one exposure arm.
  corrected <- function(base, actual_exposure, mu, n_group) {
    base +
      correct_for_ps(
        exposure = exposure,
        exposure_actual = actual_exposure,
        outcome = outcome,
        ps = ps,
        mu = mu,
        n_group = n_group,
        weight_matrix = weight_matrix,
        weight_derivatives = dw,
        correction_mat = bread_inv,
        n = n_obs
      )
  }
  list(
    l1 = corrected(
      uncorrected_lin_vars$l1,
      exposure,
      marginal_means$mu1,
      marginal_means$n1
    ),
    l0 = corrected(
      uncorrected_lin_vars$l0,
      1 - exposure,
      marginal_means$mu0,
      marginal_means$n0
    )
  )
}
compute_ps_correction_matrix_inv <- function(weight_matrix, ps, ps_link, n) {
  # Inverse of the averaged correction ("bread") matrix for the PS estimating
  # equations: mean over subjects of x_i x_i^T / (p_i (1 - p_i) g'(p_i)^2).
  dlink <- derive_link(ps_link)
  p <- ncol(weight_matrix)
  # Row-by-row outer products x_i x_i^T, flattened one row per subject.
  xxt_rows <- t_tcrossprod_over_rows(weight_matrix)
  scale_vec <- ps * (1 - ps) * dlink(ps)^2
  avg <- colSums(xxt_rows / scale_vec) / n
  solve(matrix(avg, p, p))
}
# Row-wise outer products: out[i, ] holds tcrossprod(mat[i, ]) flattened in
# column-major order. Fully vectorized (no per-row loop): output column
# m = (k - 1) * p + j equals mat[, j] * mat[, k], which matches
# as.vector(x %*% t(x)) for each row x.
t_tcrossprod_over_rows <- function(mat) {
  p <- ncol(mat)
  out <- mat[, rep(seq_len(p), times = p), drop = FALSE] *
    mat[, rep(seq_len(p), each = p), drop = FALSE]
  # Column subsetting can carry dimnames along; the original returned an
  # unnamed matrix, so strip them.
  dimnames(out) <- NULL
  out
}
correct_for_ps <- function(
  exposure,
  exposure_actual = exposure,
  outcome,
  ps,
  mu,
  n_group,
  weight_matrix,
  weight_derivatives,
  correction_mat,
  n
) {
  # Per-subject correction term that propagates PS-estimation uncertainty
  # into the influence values for one exposure arm.
  # Average partial derivatives over subjects.
  avg_partials <- colSums(
    weight_derivatives * exposure_actual * (outcome - mu)
  ) /
    n
  # Map the PS score residuals through the inverse correction matrix...
  score_resid <- (exposure - ps) * weight_matrix
  transform_mat <- correction_mat %*% t(score_resid)
  # ...then contract with the averaged partials to get one value per subject.
  contrib <- rbind(avg_partials) %*% transform_mat
  # Rescale from the (weighted) group size to the full sample, and return a
  # plain unnamed vector.
  contrib <- contrib * (n / n_group)
  unname(drop(contrib))
}
estimate_marginal_means <- function(
  outcome_mod,
  wts,
  exposure,
  exposure_name,
  .data = NULL,
  call = rlang::caller_env()
) {
  # Standardized (marginal) outcome means under each exposure level, plus the
  # weighted group sizes: predict from the weighted outcome model with the
  # exposure column set to each observed level in turn, then average.
  # todo: this could be generalized with split() and lapply()
  if (is.null(.data)) {
    # Fall back to the outcome model's frame; error helpfully if the exposure
    # column was transformed away in the formula.
    .data <- model.frame(outcome_mod)
    check_exposure(.data, exposure_name, call = call)
  }
  # todo: make this more flexible for different values and model specs
  # maybe can optionally accept a function for g-comp
  exposure_values <- sort(unique(exposure))
  if (!isTRUE(length(exposure_values) == 2)) {
    # Bug fix: `call = call` was previously placed *inside* the c() message
    # vector, so it was concatenated into the message instead of being passed
    # as the `call` argument of abort() (cf. the correct usage in
    # check_exposure()).
    abort(
      c(
        "{.code ipw()} currently only supports binary exposures.",
        x = "There are {length(exposure_values)} unique value{?s} of the exposure."
      ),
      call = call
    )
  }
  # Counterfactual data sets: everyone at the higher level / lower level.
  .data_1 <- .data
  .data_0 <- .data
  .data_1[[exposure_name]] <- exposure_values[[2]]
  .data_0[[exposure_name]] <- exposure_values[[1]]
  # Weighted group sizes and standardized means for each arm.
  n1 <- sum(wts[exposure == exposure_values[[2]]])
  mu1 <- mean(predict(outcome_mod, newdata = .data_1, type = "response"))
  n0 <- sum(wts[exposure == exposure_values[[1]]])
  mu0 <- mean(predict(outcome_mod, newdata = .data_0, type = "response"))
  list(
    # exposure = 1
    n1 = n1,
    mu1 = mu1,
    # exposure = 0
    n0 = n0,
    mu0 = mu0
  )
}
derive_weights <- function(
  exposure,
  ps,
  weight_matrix,
  ps_link = c("logit", "probit", "cloglog"),
  estimand = c("ate", "att", "ato", "atm")
) {
  # Derivative of each subject's weight with respect to the PS model
  # coefficients: the estimand-specific scalar derivative times the subject's
  # design-matrix row (one output row per subject).
  estimand <- rlang::arg_match(estimand)
  link_deriv <- derive_link(ps_link)
  per_subject <- switch(
    estimand,
    ate = ate_derivative(exposure, ps, link_deriv),
    att = att_derivative(exposure, ps, link_deriv),
    ato = ato_derivative(exposure, ps, link_deriv),
    atm = atm_derivative(exposure, ps, link_deriv)
  )
  weight_matrix * per_subject
}
ate_derivative <- function(exposure, ps, deriv_link_f) {
  # d(weight)/d(ps) for ATE weights (1/ps treated, 1/(1-ps) control),
  # divided through by the link derivative g'(ps).
  slope <- deriv_link_f(ps)
  treated_term <- -1 / (ps^2 * slope)
  control_term <- 1 / ((1 - ps)^2 * slope)
  ifelse(exposure == 1, treated_term, control_term)
}
att_derivative <- function(exposure, ps, deriv_link_f) {
  # ATT weights are constant (= 1) for the treated, so their derivative is
  # zero; only the control weight ps/(1-ps) depends on the PS.
  control_term <- 1 / ((1 - ps)^2 * deriv_link_f(ps))
  ifelse(exposure == 1, 0, control_term)
}
ato_derivative <- function(exposure, ps, deriv_link_f) {
  # Overlap (ATO) weights: (1-ps) for treated, ps for controls, so the
  # derivative is -1/g'(ps) and +1/g'(ps) respectively.
  base_term <- 1 / deriv_link_f(ps)
  ifelse(exposure == 1, -base_term, base_term)
}
atm_derivative <- function(exposure, ps, deriv_link_f) {
  # Start from the ATE-style derivative...
  slope <- deriv_link_f(ps)
  treated_term <- -1 / (ps^2 * slope)
  control_term <- 1 / ((1 - ps)^2 * slope)
  out <- ifelse(exposure == 1, treated_term, control_term)
  # ...then zero out subjects whose matching weight is the constant 1
  # (treated with ps < 0.5, controls with ps > 0.5): a constant weight has
  # zero derivative with respect to the PS.
  out[exposure == 1 & ps < 0.5] <- 0
  out[exposure == 0 & ps > 0.5] <- 0
  out
}
derive_link <- function(ps_link = c("logit", "probit", "cloglog")) {
  # Returns g'(p), the derivative of the PS model's link function with
  # respect to the probability p, for each supported link.
  ps_link <- rlang::arg_match(ps_link)
  switch(
    ps_link,
    # logit: g(p) = log(p / (1 - p))
    logit = function(p) 1 / (p * (1 - p)),
    # probit: g(p) = qnorm(p)
    probit = function(p) 1 / dnorm(qnorm(p)),
    # cloglog: g(p) = log(-log(1 - p))
    cloglog = function(p) -1 / ((1 - p) * log(1 - p))
  )
}
extract_weights <- function(.mod) {
  # Pull the prior weights stored in the fitted model's frame; NULL when the
  # model was fit without weights.
  model.weights(model.frame(.mod))
}
check_estimand <- function(wts, estimand, call = rlang::caller_env()) {
  # Reconcile the user-supplied `estimand` string with the estimand recorded
  # on causal weight vectors (wt_ate()/wt_att()/wt_ato()/wt_atm()).
  # NOTE: `estimand(wts)` resolves to the package's accessor function, not the
  # character/NULL argument of the same name -- R skips non-function bindings
  # when a name is used in call position.
  if (is_causal_wt(wts)) {
    estimand_from_weights <- estimand(wts)
  } else {
    estimand_from_weights <- NULL
  }
  # Neither source knows the estimand: the user has to say.
  if (is.null(estimand_from_weights) && is.null(estimand)) {
    abort(
      c(
        "Can't determine the estimand from weights.",
        i = "Please specify {.arg estimand}."
      )
    )
  }
  # Only one source available: use it.
  if (is.null(estimand_from_weights)) {
    return(estimand)
  }
  if (is.null(estimand)) {
    return(estimand_from_weights)
  }
  # Both available: they must agree.
  if (!identical(estimand_from_weights, estimand)) {
    # Local copies so cli/glue interpolation below finds them by name.
    .estimand <- estimand
    .estimand_from_weights <- estimand_from_weights
    abort(
      "Estimand in weights different from {.arg estimand}: \\
      {.val { .estimand_from_weights}} vs. {.val { .estimand}}",
      call = call
    )
  }
  estimand
}
check_exposure <- function(.data, .exposure_name, call = rlang::caller_env()) {
  # Verify the exposure column survived into model.frame(outcome_mod); it can
  # go missing when the outcome formula applies transformations.
  assert_class(.exposure_name, "character", .length = 1, call = call)
  if (.exposure_name %in% names(.data)) {
    return(invisible(NULL))
  }
  abort(
    c(
      "{.val { .exposure_name}} not found in {.code model.frame(outcome_mod)}.",
      i = "The outcome model may have transformations in the formula.",
      i = "Please specify {.arg .data}"
    ),
    call = call,
    error_class = "propensity_columns_exist_error"
  )
}
is_linear_regression <- function(outcome_mod) {
  # Gaussian-family GLMs and plain `lm` fits both count as linear regression;
  # any non-Gaussian GLM does not. ipw() has already asserted the model
  # inherits from glm or lm, so the fallthrough case is a plain lm.
  if (!inherits(outcome_mod, "glm")) {
    return(TRUE)
  }
  family(outcome_mod)$family == "gaussian"
}
# NOTE(review): the three lines below are website-embedding boilerplate that
# was accidentally pasted into the source; commented out so the file parses.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.