R/eifs.R
In txshift: Efficient Estimation of the Causal Effects of Stochastic Interventions

Documented in eif ipcw_eif_update

utils::globalVariables(c("..v_names"))

#' Compute the Shift Parameter Estimate and the Efficient Influence Function
#'
#' @details Estimate the value of the causal parameter alongside statistical
#'  inference for the parameter estimate based on the efficient influence
#'  function of the target parameter, which takes the following form:
#'  %D(P)(o) = H(a,w)(y - \bar{Q}(a,w)) + \bar{Q}(d(a,w)) - \psi(P)
#'
#' @param Y A \code{numeric} vector of the observed outcomes.
#' @param Qn An object providing the value of the outcome evaluated after
#'  imposing a shift in the treatment. This object is passed in after being
#'  constructed by a call to the internal function \code{est_Q}.
#' @param Hn An object providing values of the auxiliary ("clever") covariate,
#'  constructed from the treatment mechanism and required for targeted minimum
#'  loss-based estimation. This object object should be passed in after being
#'  constructed by a call to the internal function \code{est_Hn}.
#' @param estimator The type of estimator to be fit, either \code{"tmle"} for
#'  targeted maximum likelihood estimation or \code{"onestep"} for a one-step
#'  estimator.
#' @param C_samp Indicator for missingness due to exclusion from second-phase
#'  sample. Used for compatibility with the IPCW-TML estimation routine.
#' @param ipc_weights A \code{numeric} vector that gives inverse probability of
#'  censoring weights for each observation. These are generated by invoking the
#'  routines for estimating the censoring mechanism.
#' @param fluc_mod_out An object giving values of the logistic tilting model
#'  for targeted minimum loss estimation. This type of object should be the
#'  output of the internal routines to perform this step of the TML estimation
#'  procedure, as given by \code{\link{fit_fluctuation}}.
#'
#' @importFrom stats var
#'
#' @return A \code{list} containing the parameter estimate, estimated variance
#'  based on the efficient influence function (EIF), the estimate of the EIF
#'  incorporating inverse probability of censoring weights, and the estimate of
#'  the EIF without the application of such weights.
eif <- function(Y,
                Qn,
                Hn,
                estimator = c("tmle", "onestep"),
                fluc_mod_out = NULL,
                C_samp = rep(1, length(Y)),
                ipc_weights = rep(1, length(Y))) {

  # set TMLE as default estimator type
  estimator <- match.arg(estimator)

  # set Qn to use based on estimator type
  if (estimator == "tmle") {
    Qn_shift <- fluc_mod_out$Qn_shift_star
    Qn_noshift <- fluc_mod_out$Qn_noshift_star
  } else if (estimator == "onestep") {
    Qn_shift <- Qn$upshift
    Qn_noshift <- Qn$noshift
  }

  # normalize inverse probability of censoring weights
  ipc_weights_norm <- ipc_weights / sum(ipc_weights)

  # compute substitution estimator of the stochastic shift parameter
  param_obs_est <- rep(0, length(C_samp))
  param_obs_est[C_samp == 1] <- ipc_weights_norm * Qn_shift
  psi <- sum(param_obs_est)

  # compute the efficient influence function (EIF)
  eif <- rep(0, length(C_samp))
  eif[C_samp == 1] <-
    ipc_weights * (Hn$noshift * (Y - Qn_noshift) + (Qn_shift - psi))

  # this will be the outcome of the extra regression
  eif_unweighted <- rep(0, length(C_samp))
  eif_unweighted[C_samp == 1] <-
    (Hn$noshift * (Y - Qn_noshift) + (Qn_shift - psi))

  # add mean of EIF to parameter estimate if fitting one-step
  # NOTE: the estimate of psi is updated _after_ evaluating the EIF
  if (estimator == "onestep") {
    psi <- psi + mean(eif)
  }

  # compute the variance based on the EIF and scale by number of observations
  var_eif <- stats::var(eif) / length(Y)

  # return the variance and the EIF value at each observation
  out <- list(
    psi = psi, var = var_eif, eif = eif,
    eif_unweighted = eif_unweighted
  )
  return(out)
}

###############################################################################

#' Iterative IPCW Update Procedure of Augmented Efficient Influence Function
#'
#' @details An adaptation of the IPCW-TMLE for iteratively constructing an
#'  efficient inverse probability of censoring weighted TML or one-step
#'  estimator. The efficient influence function of the parameter and updating
#'  the IPC weights in an iterative process, until a convergence criteria is
#'  satisfied.
#'
#' @param data_internal A \code{data.table} containing of the observations
#'  selected into the second-phase sample.
#' @param C_samp A \code{numeric} indicator for missingness due to exclusion
#'  the from second-stage sample.
#' @param V A \code{data.table} giving the values across all observations of
#'  all variables that play a role in the censoring mechanism.
#' @param ipc_mech A \code{numeric} vector of the censoring mechanism estimates
#'  all of the observations, only for the two-phase sampling mechanism. Note
#'  well that these values do NOT account for censoring from loss to follow-up.
#' @param ipc_weights A \code{numeric} vector of inverse probability of
#'  censoring weights, including such weights for censoring due to loss to
#'  follow-up. Without loss to follow-up, these are equivalent to \code{C_samp
#'  / ipc_mech} in an initial run of this procedure.
#' @param Qn_estim A \code{data.table} corresponding to the outcome regression.
#'  This is produced by invoking the internal function \code{est_Q}.
#' @param Hn_estim A \code{data.table} corresponding to values produced in the
#'  computation of the auxiliary ("clever") covariate. This is produced easily
#'  by invoking the internal function \code{est_Hn}.
#' @param estimator The type of estimator to be fit, either \code{"tmle"} for
#'  targeted maximum likelihood estimation or \code{"onestep"} for a one-step
#'  estimator.
#' @param fluctuation A \code{character} giving the type of regression to be
#'  used in traversing the fluctuation submodel. The choices are "weighted" and
#'  "standard".
#' @param flucmod_tol A \code{numeric} indicating the largest value to be
#'  tolerated in the fluctuation model for the targeted minimum loss estimator.
#' @param eif_reg_type Whether a flexible nonparametric function ought to be
#'  used in the dimension-reduced nuisance regression of the targeting step for
#'  the censored data case. By default, the method used is a nonparametric
#'  regression based on the Highly Adaptive Lasso (from \pkg{hal9001}). Set
#'  this to \code{"glm"} to instead use a simple linear regression model. In
#'  this step, the efficient influence function (EIF) is regressed against
#'  covariates contributing to the censoring mechanism (i.e., EIF ~ V | C = 1).
#'
#' @importFrom stats var glm qlogis fitted predict as.formula coef
#' @importFrom data.table as.data.table set copy
#' @importFrom assertthat assert_that
#' @importFrom hal9001 fit_hal
#'
#' @return A \code{list} containing the estimated outcome mechanism, the fitted
#'  fluctuation model for TML updates, the updated inverse probability of
#'  censoring weights (IPCW), the updated estimate of the efficient influence
#'  function, and the estimated IPCW component of the EIF.
ipcw_eif_update <- function(data_internal,
                            C_samp,
                            V,
                            ipc_mech,
                            ipc_weights,
                            Qn_estim,
                            Hn_estim,
                            estimator = c("tmle", "onestep"),
                            fluctuation = NULL,
                            flucmod_tol = 50,
                            eif_reg_type = c("hal", "glm")) {
  # get names of columns in sampling mechanism
  v_names <- names(V)

  # perform submodel fluctuation if computing TMLE
  if (estimator == "tmle" & !is.null(fluctuation)) {
    # fit logistic regression for submodel fluctuation with updated weights
    fitted_fluc_mod <- fit_fluctuation(
      Y = data_internal$Y,
      Qn_scaled = Qn_estim,
      Hn = Hn_estim,
      ipc_weights = ipc_weights,
      method = fluctuation
    )
  } else if (estimator == "onestep" & is.null(fluctuation)) {
    fitted_fluc_mod <- NULL
  }

  # compute EIF using updated weights and updated fluctuation (if TMLE)
  # NOTE: for one-step, this adds the first half of the EIF as the correction
  #       _SO_ the second half (from the reduced regression) is still needed...
  eif_eval <- eif(
    Y = data_internal$Y,
    Qn = Qn_estim,
    Hn = Hn_estim,
    estimator = estimator,
    fluc_mod_out = fitted_fluc_mod,
    C_samp = C_samp,
    ipc_weights = ipc_weights
  )

  # NOTE: upon the first run of this procedure, the above two function calls
  #       have computed only the inefficient IPCW-TMLE, i.e., by fitting the
  #       initial fluctuation model and updating the EIF accordingly

  # estimate the EIF nuisance regression using HAL
  if (eif_reg_type == "hal") {
    # fit HAL regression
    eif_mod <- hal9001::fit_hal(
      X = as.matrix(data_internal[, ..v_names]),
      Y = as.numeric(eif_eval$eif_unweighted[C_samp == 1]),
      max_degree = length(v_names),
      family = "gaussian",
      fit_control = list(
        cv_select = TRUE,
        type.measure = "mse"
      ),
      return_lasso = FALSE,
      yolo = FALSE
    )

    # conditional expectation E[D*|V]
    eif_pred <- stats::predict(
      eif_mod,
      new_data = as.matrix(V)
    )
  } else if (eif_reg_type == "glm") {
    eif_mod <- stats::glm(
      stats::as.formula(paste("eif ~", paste(v_names, collapse = " + "))),
      data = data_internal[, eif := eif_eval$eif_unweighted[C_samp == 1]]
    )

    # conditional expectation E[D*|V]
    eif_pred <- unname(stats::predict(
      eif_mod,
      newdata = V
    ))
  }

  # IPCW-TMLE: fluctuation regression to update the IPC weights
  if (estimator == "tmle") {
    ipcw_fluc_reg_data <-
      data.table::as.data.table(
        list(
          C_samp = C_samp,
          logit_ipcw = stats::qlogis(bound_precision(ipc_mech)),
          eif_reg_cov = (eif_pred / bound_precision(ipc_mech))
        )
      )

    # fit fluctuation model for targeting updates
    # NOTE: set `start` to zero to ensure updates are not too large
    suppressWarnings(
      ipcw_fluc <- stats::glm(
        stats::as.formula("C_samp ~ -1 + offset(logit_ipcw) + eif_reg_cov"),
        data = ipcw_fluc_reg_data,
        family = "binomial",
        start = 0
      )
    )
    coefs_fluc <- stats::coef(ipcw_fluc)

    # safety checks for ensuring sanity of the fluctuation model fit
    if (!ipcw_fluc$converged || abs(max(coefs_fluc)) > flucmod_tol) {
      # if convergence fails or tolerance is violated, fit without `start`
      suppressWarnings(
        ipcw_fluc <- stats::glm(
          stats::as.formula("C_samp ~ -1 + offset(logit_ipcw) + eif_reg_cov"),
          data = ipcw_fluc_reg_data,
          family = "binomial"
        )
      )
      # if the fluctuation model hasn't converged or is unstable, simply set
      # the coefficients to disable updating, i.e., coef(eif_reg_cov) := 0
      if (!ipcw_fluc$converged || abs(max(coefs_fluc)) > flucmod_tol) {
        ipcw_fluc$coefficients <- 0
      }
    }

    # fitted values following submodel fluctuation
    ipc_pred <- unname(stats::fitted(ipcw_fluc))
  } else {
    # just use the initial estimates of censoring probability for one-step
    ipc_pred <- ipc_mech
  }

  # this is the second half of the IPCW-EIF (solved by pi_n fluctuation):
  # 0 = ((C_samp - pi_n) / pi_n) E[f(eif ~ V | C_samp = 1)]
  ipcw_eif_component <- ((C_samp - ipc_pred) / ipc_pred) * eif_pred

  # so, now we need weights to feed back into the previous steps
  ipc_weights <- C_samp / ipc_pred

  # as above, compute TMLE and EIF with NEW WEIGHTS and SUBMODEL FLUCTUATION
  # NOTE: since this is meant to update the EIF components based on the TMLE
  #       update steps, it accomplishes _literally_ nothing for the one-step
  if (estimator == "tmle") {
    # NOTE: update fluctuation with new weights prior to re-computing EIF
    fitted_fluc_mod <- fit_fluctuation(
      Y = data_internal$Y,
      Qn_scaled = Qn_estim,
      Hn = Hn_estim,
      ipc_weights = ipc_weights[C_samp == 1],
      method = fluctuation
    )

    # now, update EIF after re-fitting fluctuation with updated weights
    eif_eval <- eif(
      Y = data_internal$Y,
      Qn = Qn_estim,
      Hn = Hn_estim,
      estimator = estimator,
      fluc_mod_out = fitted_fluc_mod,
      C_samp = C_samp,
      ipc_weights = ipc_weights[C_samp == 1]
    )
  }

  # need to return output such that we can loop over this function
  out <- list(
    Qn_estim = Qn_estim,
    fluc_mod_out = fitted_fluc_mod,
    pi_mech_star = ipc_pred,
    ipc_weights = ipc_weights,
    eif_eval = eif_eval,
    ipcw_eif_component = ipcw_eif_component
  )
  return(out)
}