sherlock: Causal Machine Learning for Segment Discovery and Analysis

Documented in ci_wald

#' Confidence Intervals for Conditional Effects and Evaluation Measures
#'
#' Wald-style confidence intervals for \code{\link[data.table]{data.table}}s
#' with additional columns for estimates of the conditional average treatment
#' effect (CATE) and related segmentation evaluation measures.
#'
#' @param estimation_results A \code{\link[data.table]{data.table}} containing
#'  the input data, augmented to include cross-validated nuisance parameter
#'  estimates, an estimate of the CATE, and a treatment rule assigned based on
#'  the estimated CATE via \code{\link{assign_rule}}. This input object should
#'  be generated by successive calls to \code{\link{set_est_data}} and
#'  \code{\link{est_cate}}, or through a wrapper function that composes these
#'  function calls automatically. For cases in which \code{\link{assign_rule}}
#'  has been called prior, any additional information is preserved.
#' @param segment_by A \code{character} vector specifying the column names in
#'  \code{data_obs} that correspond to the covariates over which segmentation
#'  should be performed. This should be a strict subset of \code{baseline}.
#' @param param_type A \code{character} string (of length one) indicating the
#'  type of estimates for which a Wald-style confidence interval is to be
#'  computed. The choice of \code{"cate"} indicates that this routine is being
#'  invoked thru \code{\link{summarize_segments}} while \code{"effect_measure"}
#'  indicates that the input was generated by \code{\link{est_effect}}.
#' @param outcome_type A \code{character} string (of length one) indicating
#'  whether the outcome variable is binary or continuous-valued. For cases in
#'  which the outcome is binary, the confidence interval for the parameter
#'  estimate is constructed on the logit scale and then back-transformed, in
#'  order to respect the fact that the estimate must lie in the unit interval.
#' @param coverage A \code{numeric} indicating the nominal rate at which the
#'  Wald-style confidence interval is intended to cover the target parameter.
#'
#' @importFrom assertthat assert_that
#' @importFrom data.table setcolorder ":="
#' @importFrom stats qnorm qlogis plogis
#'
#' @return A \code{\link[data.table]{data.table}} matching exactly the input
#'  provided in \code{estimation_results}, augmented to include the lower and
#'  upper bounds from the Wald-style confidence intervals constructed.
#'
#' @keywords internal
ci_wald <- function(estimation_results,
                    segment_by = NULL,
                    param_type = c("cate", "effect_measure"),
                    outcome_type = c("continuous", "binary"),
                    coverage = getOption("sherlock.ci_covers")) {
  # first, let's get Z_(1 - alpha)
  wald_mult <- c(-1, 1) * abs(stats::qnorm(p = (1 - coverage) / 2))

  # assume continuous outcome if more than two levels in outcome node
  if (outcome_type == "continuous") {
    # NOTE: standard error already scaled (i.e., from Var(D_n)/n)

    if (param_type == "cate") {
      # check that segmentation covariates are included
      assertthat::assert_that(!is.null(segment_by))

      # compute the interval around the CATE estimate
      estimation_results[, lwr_ci := (wald_mult[1] * std_err + cate)]
      estimation_results[, upr_ci := (wald_mult[2] * std_err + cate)]
    } else if (param_type == "effect_measure") {
      # compute the interval around the effect measure estimate
      estimation_results[, lwr_ci := (wald_mult[1] * std_err + estimate)]
      estimation_results[, upr_ci := (wald_mult[2] * std_err + estimate)]
    }
  } else if (outcome_type == "binary") {
    # compute the gradient for ratios by application of the delta method
    ratio_gradient_delta <- function(theta) {
      (1 / theta) + (1 / (1 - theta))
    }

    # for binary outcomes, create CI on the logit scale and back-transform
    if (param_type == "cate") {
      theta_ratio <- estimation_results[, stats::qlogis(cate)]
      ratio_grad_theta <- estimation_results[, ratio_gradient_delta(cate)]
    } else if (param_type == "effect_measure") {
      theta_ratio <- estimation_results[, stats::qlogis(estimate)]
      ratio_grad_theta <- estimation_results[, ratio_gradient_delta(estimate)]
    }

    # compute SE on logit scale, back-transform, append lower/upper bounds
    se_eif_logit <- estimation_results[, sqrt(ratio_grad_theta^2 * std_err^2)]
    estimation_results[, lwr_ci := stats::plogis(wald_mult[1] * se_eif_logit +
      theta_ratio)]
    estimation_results[, upr_ci := stats::plogis(wald_mult[2] * se_eif_logit +
      theta_ratio)]
  }

  # re-arrange columns
  if (param_type == "cate") {
    data.table::setcolorder(
      estimation_results,
      c(segment_by, "count", "cate", "lwr_ci", "upr_ci", "std_err")
    )
  } else if (param_type == "effect_measure") {
    data.table::setcolorder(
      estimation_results,
      c("estimate", "lwr_ci", "upr_ci", "std_err")
    )
  }

  # return augmented data.table
  return(estimation_results[])
}