R/iv.R
In CausalFX: Methods for Estimating Causal Effects from Observational Data

Documented in iv print.summary.iv summary.iv

############################################################################################
# iv.R
#
# Standard instrumental variable analysis (for binary data).
#
# Code by
#
#  - Ricardo Silva (ricardo@stats.ucl.ac.uk)
#  - Robin Evans (robin.evans@stats.ox.ac.uk)
#
# Current version: 07/05/2015
# First version: 31/03/2014

#' @title Bayesian Analysis of Binary Instrumental Variables
#' 
#' @description 
#' Perform Bayesian instrumental variable analysis for binary data. This assumes the
#' number of covariates is small enough. Notice that a set of size greater than
#' 20 will raise a flag and require explicit permission from the user by setting
#' \code{force_use} to \code{TRUE}.
#' 
#' @references
#' \url{http://ftp.cs.ucla.edu/pub/stat_ser/r199-jasa.pdf}
#'
#' @param problem a \code{CausalFX} problem instance.
#' @param w index of instrumental variable within the data.
#' @param Z array of indices for covariate adjustment.
#' @param prior_table Dirichlet hyperparameter for contingency tables.
#' @param M number of Monte Carlo samples to compute posterior distributions.
#' @param verbose print relevant messages.
#' @param reject_level if first iteration of rejection sampling has a proportion of rejected
#'                     samples larger than this, reject the model and return no bounds.
#' @param force_use if TRUE, ignore any warnings about size of \code{Z}.
#' 
#' @return A list containing two fields,
#'  \item{\code{bound}}{the point estimate of lower and upper bounds.}
#'  \item{\code{bound_post}}{samples from the posterior distribution over bounds.}
#'         
#' @details
#' Given a joint distribution, this function finds a lower bound and an upper bound on the average causal effect of
#' treatment \eqn{X} on outcome \eqn{Y}, adjusting for covariate set \eqn{Z}. The joint distribution
#' is estimated by assigning a prior to joint contingency table of \code{w}, \code{Z} and the treatment/outcome pair indexed in \code{problem},
#' and ACE bounds are inferred by computing the posterior on the contingency table implied by the instrumental variable assumption.
#' The prior is proportional to a Dirichlet distribution with an effective sample size \code{prior_table}. It assigns probability 
#' zero to models which do not satisfy the constraints of the (conditional)
#' instrumental variable, as described by Balke and Pearl (1997, \emph{Journal of the American Statistical Association}, vol. 92,
#' 1171--1176). Hence, the prior is not an exact Dirichlet distribution, but only proportional to one.
#' Posterior samples for the lower and upper bound are generated by rejection sampling using the unconstrained model as 
#' a proposal. If the model is a bad fit to the data, this might take much computing time. Setting \code{reject_level}
#' to a level smaller than 1 may stop the rejection sampler earlier and reject the model, returning no bounds.
#' 
#' @examples
#' 
#' ## Generate synthetic problems until a (conditional) instrumental variable can be found
#' 
#' while (TRUE) {
#'   problem <- simulateWitnessModel(p = 4, q = 4, par_max = 3, M = 1000)
#'   s <- covsearch(problem, pop_solve = TRUE)
#'   if (length(s$witness) > 0) {
#'     w <- s$witness[1]
#'     Z <- s$Z[[1]]
#'     break
#'   }
#' } 
#' 
#' ## Calculate true effect for evaluation purposes
#' sol_pop <- covsearch(problem, pop_solve = TRUE)
#' effect_pop <- synthetizeCausalEffect(problem)
#' cat(sprintf("ACE (true) = %1.2f\n", effect_pop$effect_real))
#'
#' ## Binary IV bounds
#' sol_iv <- iv(problem, w, Z, prior_table = 10, M = 1000)
#' summary(sol_iv)
#' 
#' @export

iv <- function(problem, w, Z, prior_table, M, verbose = FALSE, reject_level = 0.9, force_use = FALSE) {

  if (class(problem) != "cfx") {
    stop("a cfx object is necessary")
  }
    
  if (!validateData(problem$data)) {
    stop("Data must be binary, encoded numerically as 0s and 1s")
  }
  
  num_Z <- length(Z)
  if (num_Z > 20 && !force_use) {
    stop("Size of covariate set very large. To force use, activate the force_use flag")
  }
  
  num_states <- 2^num_Z
  dat <- problem$data[, c(Z, w, problem$X_idx, problem$Y_idx)]
  counts <- dtoc(dat)  
  dim(counts) <- c(num_states, 2, 2, 2)
      
  if (num_Z > 0) {  
    Z_totals <- marginTable(counts, 1)
    c_Z <- rdirichlet(M, Z_totals + prior_table / num_states)
  } else {
    c_Z <- matrix(rep(1, M), ncol = 1)
  }
    
  bounds_post <- matrix(rep(0, 2 * M), ncol = 2, nrow = M)
  
  for (i in 1:num_states) {
    
    if (verbose) cat(sprintf("State %d out of %d \n", i, num_states))
    
    pre_bounds <- matrix(ncol = 2, nrow = M)
    
    num_remaining <- M
    c_wxy <- as.numeric(counts[i, , , ]) + prior_table / (8 * num_states)
    first_trial <- TRUE
        
    while (num_remaining > 0) {
      
      wxy_samples <- rdirichlet(num_remaining, c_wxy)
      b <- binAnalyticalIV(wxy_samples)
      
      num_valid <- sum(b$valid)
      if (num_valid == 0) next
      next_v <- (M - num_remaining + 1):(M - num_remaining + num_valid)
      
      pre_bounds[next_v, ] <- b$bounds[b$valid == 1, ]
      num_remaining <- num_remaining - num_valid
      
      if (first_trial && num_remaining / M > reject_level) {
        cat("Model falsified\n")
        out <- list(bounds = NULL, bounds_post = NULL, w = w, Z = Z, problem = problem)
        class(out) <- "iv"
        return(out)        
      }
      
      first_trial <- FALSE
      if (verbose) cat(sprintf(" Remaining: %d out of %d\n", num_remaining, M))
            
    }
    
    bounds_post[, 1] <- bounds_post[, 1] + pre_bounds[, 1] * c_Z[, i]
    bounds_post[, 2] <- bounds_post[, 2] + pre_bounds[, 2] * c_Z[, i]
      
  }
  
  out <- list(bounds = colMeans(bounds_post), bounds_post = bounds_post, w = w, Z = Z, problem = problem)
  class(out) <- "iv"
  return(out)
  
}

binAnalyticalIV <- function(P_ZETA) {
  # binAnalyticalIV
  #
  # For a treatment X, outcome Y and instrument W,  This uses the analytical solution for the basic IV model {W -> X, X -> Y, X <- U -> Y}, 
  # U latent, ACE given by P(Y = 1 | do(X = 1)) - P(Y = 1 | do(X = 0)).
  #
  # * Input:
  #
  # - P_ZETA: 8-column matrix of probabilitys, where P_ZETA[,1:4] is P(YX | W = 0), and
  #           entries 1, 2, 3, 4 correspond to YX = {00, 01, 10, 11}. Analogously,
  #           P_ZETA[,1:4] is P(YX | W = 1).
  #
  # * Output:
  #
  # - bottom, upper: the bounds on the ACE
  
  w_0_upper <- pmin(1 - P_ZETA[, 1], 
                    1 - P_ZETA[, 5], 
                    P_ZETA[, 2] + P_ZETA[, 3] + P_ZETA[, 7] + P_ZETA[, 8],
                    P_ZETA[, 3] + P_ZETA[, 4] + P_ZETA[, 6] + P_ZETA[, 7])
  w_0_bottom <- pmax(P_ZETA[, 7],
                    P_ZETA[, 3],
                    P_ZETA[, 3] + P_ZETA[, 4] - P_ZETA[, 5] - P_ZETA[, 8],
                    -P_ZETA[, 1] - P_ZETA[, 4] + P_ZETA[, 7] + P_ZETA[, 8])
  w_1_upper <- pmin(1 - P_ZETA[, 6],
                   1 - P_ZETA[, 2],
                   P_ZETA[, 3] + P_ZETA[, 4] + P_ZETA[, 5] + P_ZETA[, 8],
                   P_ZETA[, 1] + P_ZETA[, 4] + P_ZETA[, 7] + P_ZETA[, 8])
  w_1_bottom <- pmax(P_ZETA[, 8],
                    P_ZETA[, 4],
                    -P_ZETA[, 2] - P_ZETA[, 3] + P_ZETA[, 7] + P_ZETA[, 8],
                    P_ZETA[, 3] + P_ZETA[, 4] - P_ZETA[, 6] - P_ZETA[, 7])
  return(list(bounds = cbind(w_1_bottom - w_0_upper, w_1_upper - w_0_bottom),
              valid = (w_0_upper > w_0_bottom) * (w_1_upper > w_1_bottom)))
}

#' @title Summarize Binary Instrumental Variable Analyses
#' 
#' @description 
#' \code{summary} method for class "\code{iv}".
#'
#' @param object an object of class "\code{iv}", usually a result of a call to \code{\link{iv}}.
#' @param ... other parameters, ignored.
#' @return Besides fields from the \code{iv} object, this includes a list summary statistics,
#'  \item{\code{lq}}{an array of 5 entries corresponding to evenly space quantiles of the lower bound ("minimum", 25\%, median, 75%, "maximum") as
#'                   given by a Monte Carlo approximation.}
#'  \item{\code{lci}}{95\% marginal posterior credible interval for lower bound.}
#'  \item{\code{uq}}{an array of 5 entries corresponding to evenly space quantiles of the upper bound ("minimum", 25\%, median, 75%, "maximum") as
#'                   given by a Monte Carlo approximation.}
#'  \item{\code{uci}}{95\% marginal posterior credible interval for upper bound.}
#'         
#' @seealso
#' The model fitting function \code{\link{iv}}.
#' 
#' @export

summary.iv <- function(object, ...) {
  
  if (class(object) != "iv") {
    stop("an iv object is necessary")
  }
  
  if (!is.null(object$bounds)) {
    lq  <- quantile(object$bounds_post[, 1], probs = seq(0, 1, 0.25), na.rm = TRUE)
    lci <- quantile(object$bounds_post[, 1], probs = c(0.025, 0.975), na.rm = TRUE)
    uq  <- quantile(object$bounds_post[, 2], probs = seq(0, 1, 0.25), na.rm = TRUE)
    uci <- quantile(object$bounds_post[, 2], probs = c(0.025, 0.975), na.rm = TRUE)
  } else {
    lq  <- NULL
    lci <- NULL
    uq  <- NULL
    uci <- NULL
  }
  
  ans <- list(lq = lq, lcqi = lci, uq = uq, uci = uci,
              X_idx = object$problem$X_idx, Y_idx = object$problem$Y_idx, varnames = object$problem$varnames,
              w = object$w, Z = object$Z, bounds = object$bounds, bounds_post = object$bounds_post)
  class(ans) <- "summary.iv"
  return(ans)
  
}

#' @title Print Summaries of Binary Instrumental Variable Analyses
#' 
#' @description 
#' Print output of \code{summary} method for class "\code{iv}".
#'
#' @param x an object of class "\code{summary.iv}", usually a result of a call to \code{\link{summary.iv}}.
#' @param ... other parameters, ignored.
#' 
#' @details
#' Variable names with more than 20 characters are truncated.
#' 
#' @export

print.summary.iv <- function(x, ...) {
  
  if (class(x) != "summary.iv") {
    stop("an summary.iv object is necessary")
  }
  
  cat("\n")
  cat(sprintf("BINARY INSTRUMENTAL VARIABLE ANALYSIS (Treatment %s, Outcome %s)\n",
              x$varnames[[x$X_idx]], x$varnames[[x$Y_idx]]))
  
  cat("\n")
  cat(sprintf("             IV: %s\n", x$varnames[[x$w]]))
  if (length(x$Z) == 0) {
    cat(sprintf("  Covariate set: EMPTY\n"))
  } else {
    cat(sprintf("  Covariate set: %s\n", x$varnames[[x$Z[1]]]))
    for (i in (1 + seq_len(length(x$Z) - 1))) {
      cat(sprintf("                 %s\n", x$varnames[[x$Z[i]]]))
    }
  }    
  cat("\n")

  if (is.null(x$bounds)) {
    cat("Model rejected.\n")
    return()
  }
  
  lq  <- x$lq
  lci <- x$lci
  uq  <- x$uq
  uci <- x$uci
  
  cat(sprintf("             Estimated Interval: [% 1.2f, % 1.2f]\n\n", x$bounds[1], x$bounds[2]))
  cat(sprintf("          Lower bound quantiles: (% 1.2f, % 1.2f, % 1.2f, % 1.2f, % 1.2f)\n", lq[[1]], lq[[2]], lq[[3]], lq[[4]], lq[[5]]))
  cat(sprintf("  95 per cent credible interval: [% 1.2f, % 1.2f]\n", lci[[1]], lci[[2]]))    
  cat(sprintf("          Upper bound quantiles: (% 1.2f, % 1.2f, % 1.2f, % 1.2f, % 1.2f)\n", uq[[1]], uq[[2]], uq[[3]], uq[[4]], uq[[5]]))
  cat(sprintf("  95 per cent credible interval: [% 1.2f, % 1.2f]\n", uci[[1]], uci[[2]]))    
  
}