# R/ind.decomp.R

#' Causal Decomposition with Individualized Interventions
#' 
#' \code{ind.decomp} estimates how much of an initial disparity would remain under two
#' scenarios: (i) everyone follows the optimal treatment regime (OTR), yielding the
#' Individualized Controlled Direct Effect (ICDE); and (ii) the proportion of
#' individuals following the OTR is equalized across groups, yielding the
#' Individualized Interventional Effect (IIE).
#' This function implements the method proposed by Park, Kang, and Lee (2025).
#'
#' @usage
#' ind.decomp(outcome, group, group.ref, risk.factor, intermediates, moderators,
#'   covariates, data, B, cluster = NULL)
#'
#' @details This function first estimates the initial disparity, defined
#'   as the average difference in an outcome between groups within the same levels
#'   of outcome-allowable covariates. Formally,
#'   \deqn{\tau_{c} \equiv E[Y \mid R = 1, c] - E[Y \mid R = 0, c], \quad \text{for } c \in \mathcal{C}}
#'   where \eqn{R = 1} is the comparison group and \eqn{R = 0} is the reference group.
#'   The term \eqn{c \in \mathcal{C}} denotes a level of the baseline covariates.
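#'
#'   In this implementation, the initial disparity is estimated as the coefficient on
#'   the group indicator in a linear regression of the outcome on the group and the
#'   baseline covariates (in the notation of the example below, \code{lm(Y ~ R + C)}).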
#'
#'   For the individualized controlled direct effect, the disparity remaining is defined as
#'   \deqn{
#'   \zeta_{c}^{\mathrm{ICDE}}(d^{opt}) \equiv E[Y(d^{opt}) \mid R = 1, c] - E[Y(d^{opt}) \mid R = 0, c], \quad \text{for } c \in \mathcal{C}
#'   }
#'   where \eqn{d^{opt}} is the optimal value of the risk factor \eqn{M} obtained from
#'   the OTR. This disparity remaining is the difference in the outcome between the
#'   comparison and reference groups after each individual's risk factor \eqn{M} is set
#'   to its OTR-recommended value.
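#'
#'   In this implementation, \eqn{\zeta_{c}^{\mathrm{ICDE}}} is estimated as the
#'   coefficient on the group indicator in an inverse-probability-weighted outcome
#'   regression in which only individuals whose observed risk factor matches their
#'   OTR recommendation receive positive weight.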
#'
#'   For individualized interventional effects, disparity reduction and disparity
#'   remaining due to equalizing compliance rates across groups can be expressed as
#'   follows:
#'   \deqn{
#'   \delta_{c}^{\mathrm{IIE}}(1) \equiv E[Y \mid R = 1, c] - E[Y(K) \mid R = 1, c]
#'   }
#'   \deqn{
#'   \zeta_{c}^{\mathrm{IIE}}(0) \equiv E[Y(K) \mid R = 1, c] - E[Y \mid R = 0, c]
#'   }
#'   where \eqn{Y(K)} represents a potential outcome under the value of \eqn{M} that is
#'   determined by a random draw from the compliance distribution of the reference
#'   group among individuals with the same levels of target-factor-allowable
#'   covariates. Disparity reduction (\eqn{\delta_{c}^{\mathrm{IIE}}(1)}) represents
#'   the reduction in the disparity achieved by intervening to set the comparison
#'   group's rate of following the OTR equal to that of the reference group within
#'   the same target-factor-allowable covariate levels. Disparity remaining
#'   (\eqn{\zeta_{c}^{\mathrm{IIE}}(0)}) quantifies the outcome difference that
#'   persists between the comparison and reference groups even after this intervention.
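#'
#'   In this implementation, the IIE quantities are estimated from a weighted outcome
#'   regression that includes the group indicator, an indicator of following the OTR,
#'   and their interaction, combined with a logistic model for the rate of following
#'   the OTR within each group.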
#'
#' @param outcome a character string indicating the name of the outcome.
#' @param group a character string indicating the name of the social group indicator
#'   such as race or gender.
#' @param group.ref a character string indicating the reference level of the social
#'   group indicator.
#' @param risk.factor a character string indicating the name of the risk factor.
#' @param intermediates a vector containing the names of the intermediate confounders.
#' @param moderators a vector containing the names of the variables whose effects on
#'   the outcome are heterogeneous across levels of the risk factor; these variables
#'   define the decision rule.
#' @param covariates a vector containing the name(s) of the baseline covariate(s).
#' @param data a data frame containing the variables for the analysis.
#' @param B the number of bootstrap replications used in the causal decomposition analysis.
#' @param cluster a character string indicating the name of the cluster variable in
#'   \code{data}. If provided, the cluster bootstrap is used. Default is \code{NULL}.
#'
#' @return a matrix containing the point estimates for:
#' \enumerate{
#'   \item the initial disparity and the proportion recommended for treatment,
#'   \item the disparity remaining and percent reduction based on the individualized
#'   controlled direct effect,
#'   \item the disparity remaining, disparity reduction, and percent reduction based on
#'   the individualized interventional effect.
#' }
#'   It also returns the nonparametric bootstrap confidence intervals for each estimate.
#'
#' @author
#'   Soojin Park, University of California, Riverside, \email{soojinp@@ucr.edu};
#'   Suyeon Kang, University of Central Florida, \email{suyeon.kang@@ucf.edu};
#'   Karen Xu, University of California, Riverside, \email{karenxu@@ucr.edu}.
#'
#' @references
#'   Park, S., Kang, S., & Lee, C. (2025). Simulation-Based Sensitivity Analysis in
#'   Optimal Treatment Regimes and Causal Decomposition with Individualized Interventions.
#'   arXiv preprint arXiv:2506.19010.
#'
#' @seealso \code{\link{ind.sens}}
#'
#' @export
#' @importFrom modelObj buildModelObj
#' @importFrom DynTxRegime optimalClass optTx
#' @importFrom rpart rpart rpart.control
#' @importFrom stats as.formula binomial coef fitted glm glm.control lm plogis sd
#' @examples
#' data(idata)
#' 
#' # NOTE: We recommend using at least B = 500 bootstrap replications.
#' results_unadj <- ind.decomp(outcome = "Y", group = "R", group.ref = "0", risk.factor = "M",
#'                            intermediates = c("X1", "X2", "X3"), moderators = c("X1", "X2"),
#'                            covariates = "C", data = idata, B = 50)
#' results_unadj                            
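#'
#' \dontrun{
#' # Cluster bootstrap (hypothetical: assumes idata contains a cluster
#' # identifier column named "school")
#' results_cl <- ind.decomp(outcome = "Y", group = "R", group.ref = "0", risk.factor = "M",
#'                          intermediates = c("X1", "X2", "X3"), moderators = c("X1", "X2"),
#'                          covariates = "C", data = idata, B = 50, cluster = "school")
#' }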
ind.decomp <- function(outcome, group, group.ref, risk.factor, intermediates, moderators,
                       covariates, data, B, cluster = NULL){
  # Input checks
  data[[group]] <- as.factor(data[[group]])  # ensure the group indicator is a factor
  if (nlevels(data[[group]]) != 2) {
    stop("The social group indicator must be a factor with exactly two levels.")
  }
  # NOTE: Check if the specified ref_level exists
  if (!(group.ref %in% levels(data[[group]]))) {
    stop(sprintf("The specified reference level '%s' is not a level of '%s'", group.ref, group))
  }

  data[[risk.factor]] <- as.factor(data[[risk.factor]])  # ensure the risk factor is a factor
  if (nlevels(data[[risk.factor]]) != 2) {
    stop("The risk factor must be a binary factor with exactly two levels.")
  }

  # Treat the outcome as continuous only if it is numeric with more than 10 unique values
  if (!is.numeric(data[[outcome]]) || length(unique(data[[outcome]])) <= 10) {
    warning(sprintf("Variable '%s' should be a continuous numeric variable.", outcome))
  }

  est1 <- function(data){

    data_R0 <- subset(data, data[[group]] == group.ref)
    data_R1 <- subset(data, data[[group]] != group.ref)

    # Propensity model for the risk factor given the baseline covariates and
    # intermediate confounders, used inside the classification-based OTR search
    moPropen1 <- buildModelObj(model = as.formula(paste("~", paste(c(covariates, intermediates), collapse = " + "))),
                               solver.method = "glm",
                               solver.args = list("family" = "binomial", control = glm.control(maxit = 50)),
                               # TODO: expose the glm.control settings as a function argument with these defaults
                               predict.method = "predict.glm",
                               predict.args = list(type = "response"))
    
    # Classification model: an rpart tree over the moderators learns the decision rule
    moClass1 <- buildModelObj(
      model = as.formula(paste0(" ~ ", paste(c(moderators), collapse = " + "))),
      solver.method = rpart,
      solver.args   = list(method = "class",
                           control = rpart.control(minsplit = 200, cp = 0.01)),
      predict.args  = list(type = "class")
    )

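    # Estimate group-specific OTRs via classification-based estimation
    # (DynTxRegime::optimalClass), separately within the comparison and
    # reference groups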
    fitFS1 <- optimalClass(moPropen = moPropen1,
                           moClass = moClass1,
                           data = data.frame(data_R1), response = data_R1[[outcome]],
                           txName = risk.factor, verbose = FALSE)

    fitFS0 <- optimalClass(moPropen = moPropen1,
                           moClass = moClass1,
                           data = data.frame(data_R0), response = data_R0[[outcome]],
                           txName = risk.factor, verbose = FALSE)


    # Assign each individual the recommended treatment from their group-specific OTR.
    # Indexed assignment avoids the silent recycling that ifelse() applies when the
    # group-specific prediction vectors are shorter than the full data.
    data$opt.M <- NA
    data$opt.M[data[[group]] == group.ref] <- as.character(optTx(fitFS0)$optimalTx)
    data$opt.M[data[[group]] != group.ref] <- as.character(optTx(fitFS1)$optimalTx)
    ta <- table(data$opt.M)
    p.opt <- ta[2] / (ta[2] + ta[1]) * 100  # % of individuals recommended for treatment
    # To visualize a fitted decision rule: rpart.plot(classif(object = fitFS0))
    DATA1 <- data
    DATA1$Ind <- (as.character(DATA1[[risk.factor]]) == DATA1$opt.M)  # followed the OTR?

    # Inverse-probability weights for the observed risk factor given the group,
    # baseline covariates, and intermediate confounders
    fit.m <- glm(as.formula(paste0(risk.factor, " ~ ", paste(c(group, covariates, intermediates), collapse = " + "))),
                 family = binomial(logit), data = DATA1)
    pre.m <- fitted(fit.m)  # predicted probability that the risk factor equals its second level
    p.med <- ifelse(DATA1[[risk.factor]] == levels(DATA1[[risk.factor]])[1], 1 - pre.m, pre.m)

    DATA1$w <- 1 / p.med

    # zeta_ICDE: coefficient on the group indicator from a weighted regression in
    # which only individuals following the OTR (Ind == TRUE) receive positive weight
    fit <- lm(paste0(outcome, " ~ ", paste(c(covariates, group), collapse = " + ")), data = DATA1, weights = Ind * w)
    estimated_zeta_ICDE <- coef(fit)[length(covariates) + 2]  # the intercept and covariates precede the group term

    # zeta_IIE and delta_IIE
    # Covariate-adjusted group-specific intercepts
    fit.lm1 <- lm(paste0(outcome, " ~ ", paste(c(covariates), collapse = " + ")), data = data_R1)
    W_Y1 <- coef(fit.lm1)[1]

    fit.lm0 <- lm(paste0(outcome, " ~ ", paste(c(covariates), collapse = " + ")), data = data_R0)
    W_Y0 <- coef(fit.lm0)[1]

    # Weighted outcome model with a group-by-compliance interaction, plus a logistic
    # model for the probability of following the OTR within each group
    fit.lm <- lm(paste0(outcome, " ~ ", paste(c(covariates, group, "Ind", paste0(group, ":Ind")), collapse = " + ")), data = DATA1, weights = w)
    fit.Ind <- glm(paste0("Ind", " ~ ", paste(c(group), collapse = " + ")), data = DATA1, family = binomial(logit))

    # alpha1: gap in the rate of following the OTR between the comparison and reference groups
    alpha1 <- plogis(sum(coef(fit.Ind)[1:2])) - plogis(coef(fit.Ind)[1])

    coef_names <- names(coef(fit.lm))
    ind_coef <- coef_names[grepl("^Ind", coef_names)]
    interaction_coef <- coef_names[grepl(":Ind", coef_names)]

    # Disparity reduction: the change in the compliance rate times the effect of
    # compliance on the outcome in the comparison group; disparity remaining is
    # the covariate-adjusted gap minus this reduction
    reg_delta_IIE <- alpha1 * sum(coef(fit.lm)[c(ind_coef, interaction_coef)])
    reg_zeta_IIE <- (W_Y1 - W_Y0) - reg_delta_IIE

    return(list(fitFS0 = fitFS0, fitFS1 = fitFS1, p.opt = p.opt,
                estimated_zeta_ICDE = estimated_zeta_ICDE,
                reg_zeta_IIE = reg_zeta_IIE,
                reg_delta_IIE = reg_delta_IIE))
  }

  est.ori <- est1(data)

  reg_delta_IIE <- est.ori$reg_delta_IIE
  reg_zeta_IIE <- est.ori$reg_zeta_IIE
  estimated_zeta_ICDE <- est.ori$estimated_zeta_ICDE
  p.opt <- est.ori$p.opt
  fitFS1 <- est.ori$fitFS1
  fitFS0 <- est.ori$fitFS0

  regzeta_IIE.boot <- regdelta_IIE.boot <- zeta_ICDE.boot <- rep(NA, B)

  # Nonparametric bootstrap; if `cluster` is supplied, whole clusters are
  # resampled with replacement
  for(i in 1:B){
    if (is.null(cluster)) {
      data.boot <- data[sample(nrow(data), size = nrow(data), replace = TRUE), ]
    } else {
      clusters <- unique(data[, cluster])
      units <- sample(clusters, size = length(clusters), replace = TRUE)
      sb <- unlist(lapply(units, function(x) which(data[, cluster] == x)))
      data.boot <- data[sb, ]
    }

    est.boot <- est1(data.boot)

    zeta_ICDE.boot[i] <- est.boot$estimated_zeta_ICDE
    regzeta_IIE.boot[i] <- est.boot$reg_zeta_IIE
    regdelta_IIE.boot[i] <- est.boot$reg_delta_IIE
  }

  SE_zeta_ICDE <- sd(zeta_ICDE.boot)
  SE_regdelta_IIE <- sd(regdelta_IIE.boot)
  SE_regzeta_IIE <- sd(regzeta_IIE.boot)

  # Initial disparity: coefficient on the group indicator, adjusting for the baseline covariates
  results_ini <- lm(as.formula(paste0(outcome, " ~ ", paste(c(group, covariates), collapse = " + "))), data = data)
  estimated_tau <- coef(results_ini)[2]
  SE_tau <- summary(results_ini)$coefficients[2, 2]

  results <- list(# result table
    p.opt = p.opt, fitFS1 = fitFS1, fitFS0 = fitFS0,
    estimated_tau = estimated_tau, SE_tau = SE_tau, # initial disparity
    estimated_zeta_ICDE = as.numeric(estimated_zeta_ICDE), SE_zeta_ICDE = SE_zeta_ICDE, # disparity remaining_ICDE
    reg_delta_IIE = as.numeric(reg_delta_IIE), SE_regdelta_IIE = SE_regdelta_IIE, # disparity reduction_IIE
    reg_zeta_IIE = as.numeric(reg_zeta_IIE), SE_regzeta_IIE = SE_regzeta_IIE # disparity remaining_IIE
  )

  class(results) <- "ind.decomp"

  return(results)
}
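
# A minimal sketch (not part of the package API) of how the returned estimates and
# bootstrap SEs could be tabulated with normal-approximation confidence intervals;
# `res` is a hypothetical object returned by ind.decomp():
#
#   ci <- function(est, se) c(lower = est - 1.96 * se, upper = est + 1.96 * se)
#   with(res, rbind(
#     "Initial disparity"          = c(estimated_tau,       ci(estimated_tau, SE_tau)),
#     "Disparity remaining (ICDE)" = c(estimated_zeta_ICDE, ci(estimated_zeta_ICDE, SE_zeta_ICDE)),
#     "Disparity reduction (IIE)"  = c(reg_delta_IIE,       ci(reg_delta_IIE, SE_regdelta_IIE)),
#     "Disparity remaining (IIE)"  = c(reg_zeta_IIE,        ci(reg_zeta_IIE, SE_regzeta_IIE))))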
