R/util.R
In CausalFX: Methods for Estimating Causal Effects from Observational Data

Documented in bindagCausalEffectBackdoor synthetizeCausalEffect

############################################################################################
# util.R
#
# Some graphical manipulation functions and causal effect computations.
#
# Code by
#
#  - Ricardo Silva (ricardo@stats.ucl.ac.uk)
#  - Robin Evans (robin.evans@stats.ox.ac.uk)
#
# Current version: 28/04/2015
# First version: 31/03/2014

validateData <- function(dat) {
  # Verifiy that dataset 'dat' is binary, encoded as 0s and 1s
  if (is.null(dat)) return(1)
  N <- ncol(dat) * nrow(dat)
  if (sum(dat == 0) + sum(dat == 1) != N) return(0)
  return(1)
}

dtoc <- function(d) {
  # Convert a dataset into a count table.
  #
  # Input:
  # 
  # - d: a dataset
  #
  # Output:
  #
  # - the count representation of the dataset  
  nc <- ncol(d)
  tmp <- tabulate(d %*% 2^(seq_len(nc) - 1) + 1, nbins = 2^nc)
  return(array(tmp, rep(2, nc)))
}

dsep <- function(x, y, S, dag, ancestrals) {
  # Returns true if x is d-separated from y given S in a dag (with redundant ancestral sets
  # pre-computed).
  #
  # * Input:
  #
  # - x, y: indices of X and Y
  # - S: indices of conditioning set S
  # - dag: the DAG, as a binary adjacency matrix
  # - ancestrals: a list indicating for each variable, which ancestrals they have (excluding themselves)  
  #
  # * Output:
  #
  # - TRUE or FALSE, depending on whether d-separation occurs
  
  ## Get ancestral graph first
  num_v <- dim(dag)[1]
  c_anc <- rep(0, num_v)
  c_anc[c(x, y, S)] <- 1
  c_anc[ancestrals[[x]]] <- 1; c_anc[ancestrals[[y]]] <- 1;
  c_anc[unlist(ancestrals[S])] <- 1
  dag[c_anc == 0, ] <- 0  
  
  ## Marry parents
  for (s in S) {
    parents <- (dag[s, ] == 1)
    dag[parents, parents] <- 1
  }
  
  ## Remove S and non-ancestors
  rmv <- c(which(c_anc == 0), S)
  if (length(rmv)) {
    G <- dag[-rmv, -rmv]
    x <- x - sum(rmv < x)
    y <- y - sum(rmv < y)
  } else {
    G <- dag
  }
  
  ## Convert to undirected
  G <- (G + t(G) > 0)
  
  ## G[S, ] <- 0; G[, S] <- 0
  if (sum(G[x, ]) == 0 || sum(G[y,  ]) == 0) return(TRUE)
  
  ## test for graph connectivity between x and y; turns out to be
  ## quicker to use matrix multiplication than igraph
  G2 <- G
  for (i in seq_len(dim(G)[1])) {
    if (G2[x, y] > 0) return(FALSE)
    G2 <- G %*% G2
  }
  return(TRUE)
}

bindagMonteCarloCausalEffect <- function(problem, M) {
  # Finds the true ACE for X --> Y using Monte Carlo simulation.
  #
  # * Input:
  #
  #
  # * Output:
  #
  # - a scalar, the true ACE (estimated by Monte Carlo)

  if (class(problem) != "cfx") {
    stop("a CausalFX object is necessary")
  }
  
  if (is.null(problem$graph) || is.null(problem$model)) {
    stop("a true model specification is necessary")    
  }
  
  num_v <- nrow(problem$graph)
  
  graph_edge <- c()
  for (v in 1:num_v) {
    parents <- which(problem$graph[v, ] == 1)
    for (p in parents) graph_edge <- c(graph_edge, c(p, v))
  }
  graph_temp <- graph(graph_edge)
  v_order <- topological.sort(graph_temp)
  
  sim_data <- matrix(nrow = M, ncol = num_v)
  
  for (v in v_order) {
    
    if (v != problem$X_idx) {
      parents <- which(problem$graph[v, ] == 1)
      cpt <- problem$model[[v]]
    } else {
      parents <- c()
      cpt <- 0.5
    }
    
    if (length(parents) == 0) {
      sim_data[, v] <- runif(M) < cpt
      next
    }
    p_idx <- sim_data[, parents, drop = FALSE] %*% 2^((length(parents) - 1):0) + 1
    sim_data[, v] <- as.numeric(runif(M) < cpt[p_idx])
    
  }
  
  do_0.rows <- which(sim_data[, problem$X_idx] == 0)
  do_1.rows <- which(sim_data[, problem$X_idx] == 1)
  effect <- mean(sim_data[do_1.rows, problem$Y_idx]) - mean(sim_data[do_0.rows, problem$Y_idx])
  
  ## Use problem$counts to estimate naive causal effect P(Y = 1 | X = 1) - P(Y = 1 | X = 0)
  tmp <- conditionTable(dtoc(problem$data[, c(problem$X_idx, problem$Y_idx)]), 2, 1)
  effect_naive2 <- tmp[2, 2] - tmp[2, 1]
  
  return(list(effect_real = effect, effect_naive2 = effect_naive2))
  
}

#' @title Computes Average Causal Effects by Covariate Adjustment in Binary Models using a Given Causal Model
#' 
#' @description
#' 
#' Computes the average causal effect (ACE) of a given treatment variable \eqn{X} on a given 
#' outcome \eqn{Y} for the models generated by \code{\link{simulateWitnessModel}}. This assumes 
#' the synthetic models are small enough, as adjustment is done by brute force calculation.
#' 
#' @param problem a \code{\link{cfx}} problem instance for the ACE of a given treatment \eqn{X} on a given outcome \eqn{Y}.
#'                This problem instance should have a fully specified causal model, including a graph and conditional
#'                probability tables. It must also be small enough so that the joint probability must have been
#'                pre-computed.
#'
#' @return A list containing three different types of estimand:
#'   \item{\code{effect_real}}{the true ACE.}
#'   \item{\code{effect_naive}}{the result of a naive adjustment using all of the observed covariates.}
#'   \item{\code{effect_naive2}}{the result of a naive adjustment using no covariates.}
#'
#' @details
#' The algorithm implemented is a naive one. When creating the \code{cfx} object, field \code{num_v_max} must be
#' large enough so that the joint distribution is computed in advance. Only for relatively small models (approximately 20 
#' variables in total) this will be feasible.
#' 
#' @examples
#' 
#' ## Generate a synthetic problem
#' problem <- simulateWitnessModel(p = 4, q = 4, par_max = 3, M = 1000)
#' 
#' ## Idealized case: suppose we know the true distribution, 
#' ## get "exact" ACE estimands for different adjustment sets
#' sol_pop <- covsearch(problem, pop_solve = TRUE)
#' effect_pop <- synthetizeCausalEffect(problem)
#' cat(sprintf(
#'   "ACE (true) = %1.2f\nACE (adjusting for all) = %1.2f\nACE (adjusting for nothing) = %1.2f\n", 
#'    effect_pop$effect_real, effect_pop$effect_naive, effect_pop$effect_naive2))
#'
#' @export

synthetizeCausalEffect <- function(problem) {
  
  if (class(problem) != "cfx") {
    stop("a CausalFX object is necessary")
  }
  if (is.null(problem$probs)) {
    stop("specification of model full joint probability is necessary")
  }
  
  num_v <- nrow(problem$graph)
  num_states <- 2^num_v
  
  ## store parents as matrix of Ts and Fs
  nparents <- rowSums(problem$graph)
  
  ## get joint distribution
  probs <- problem$probs
  
  ## Naive effect: condition on everybody (non-latent)
  S <- seq_len(num_v)[-c(problem$X_idx, problem$Y_idx, problem$latent_idx)]
  k <- length(S)
  P_SXY <- marginTable(probs, c(S, problem$X_idx, problem$Y_idx))
  P_SX <- marginTable(P_SXY, seq_len(k + 1))
  P_S <- marginTable(P_SX, seq_len(k))
  
  num_s <- 2^k
  
  sq <- seq(num_s)
  effect_naive <- sum((P_SXY[3 * num_s + sq] / P_SX[num_s + sq]  -
                       P_SXY[2 * num_s + sq] / P_SX[        sq]) * P_S[sq])
  
  ## Naive effect 2: condition on nobody
  P_Y.X <- conditionTable(P_SXY, k + 2, k + 1)
  effect_naive2 <- P_Y.X[2, 2] - P_Y.X[2, 1]
  
  ## Real effect: condition on everybody (including latents)
  S <- seq_len(num_v)[-c(problem$X_idx, problem$Y_idx)]
  k <- length(S)
  P_SXY <- marginTable(probs, c(S, problem$X_idx, problem$Y_idx))
  P_SX <- marginTable(P_SXY, seq_len(k + 1))
  P_S <- marginTable(P_SX, seq_len(k))
  
  num_s <- 2^k
  sq <- seq(num_s)
  
  effect <- sum((P_SXY[3 * num_s + sq] / P_SX[num_s + sq]  -
                 P_SXY[2 * num_s + sq] / P_SX[        sq]) * P_S[sq])
  
  return(list(effect_real = effect, effect_naive = effect_naive, effect_naive2 = effect_naive2))
  
}

#' @title Estimates Average Causal Effects by Covariate Adjustment in Binary Models
#' 
#' @description
#' 
#' Estimate the average causal effect (ACE) of a given treatment variable
#' \eqn{X} on a given outcome \eqn{Y} by adjusting for a set \eqn{S} of covariates (equivalenly, assuming
#' \eqn{S} blocks all backdoor paths in the causal graph that generates the observations). . Bounds are based on finding conditional instrumental variables
#' using the faithfulness assumption relaxed to allow for a moderate degree of unfaithfulness.
#' Candidate models are generated from the method described in \code{\link{covsearch}}.
#' 
#' @references 
#' Pearl, J. (2000) \emph{Causality: Models, Reasoning and Inference}. Cambridge University Press.
#' 
#' @param problem a \code{\link{cfx}} problem instance for the ACE of a given treatment \eqn{X} on a given outcome \eqn{Y}.
#' @param prior_table effective sample size hyperparameter of a Dirichlet prior for testing independence with contingency tables.
#' @param S an array indicating the indices of the covariates used in the adjustment.
#'
#' @return the estimated ACE.
#'
#' @details
#' The algorithm implemented is a naive one. If the dimensionality of \emph{S} is larger than around 20, this may crash the system.
#' This function is assumed to be used along other methods such as \code{\link{covsearch}} and \code{\link{wpp}}, which generate
#' relativelty small covariate sets.
#' 
#' @examples
#' 
#' ## Generate a synthetic problem
#' problem <- simulateWitnessModel(p = 4, q = 4, par_max = 3, M = 1000)
#' 
#' ## Idealized case: suppose we know the true distribution, 
#' ## get "exact" ACE estimands for different adjustment sets
#' sol_pop <- covsearch(problem, pop_solve = TRUE)
#' effect_pop <- synthetizeCausalEffect(problem)
#' cat(sprintf(
#'   "ACE (true) = %1.2f\nACE (adjusting for all) = %1.2f\nACE (adjusting for nothing) = %1.2f\n", 
#'    effect_pop$effect_real, effect_pop$effect_naive, effect_pop$effect_naive2))
#'
#' ## Perform inference and report results
#' covariate_hat <- covsearch(problem, cred_calc = TRUE, M = 1000)
#' if (length(covariate_hat$witness) > 1) {
#'   sol_ACE <- bindagCausalEffectBackdoor(problem, prior_table = 10, S = covariate_hat$Z[[1]])
#'   cat(sprintf("Estimated ACE = %1.2f\n", sol_ACE))
#' }
#' 
#' @export

bindagCausalEffectBackdoor <- function(problem, prior_table, S) {

  if (class(problem) != "cfx") {
    stop("a CausalFX object is necessary")
  }
  if (is.null(problem$probs) && is.null(problem$data)) {
    stop("data or specification of model full joint probability is necessary")
  }
  
  k <- length(S)
  if (k > 20) {
    warning("Warning, this may take all of your memory resources\n")
  }
  
  if (!is.null(problem$probs)) {
    probs <- problem$probs
    P_SXY <- marginTable(probs, c(S, problem$X_idx, problem$Y_idx))
  } else {
    counts <- dtoc(problem$data[, c(S, problem$X_idx, problem$Y_idx)]) + prior_table / 2^(ncol(problem$data) - (2 + length(S)))
    P_SXY  <- counts / sum(counts)
  }
  P_SX   <- marginTable(P_SXY, seq_len(k + 1))
  P_S    <- marginTable(P_SX, seq_len(k))
  
  num_s <- 2^k
  sq <- seq(num_s)
  ACE <- sum((P_SXY[3 * num_s + sq] / P_SX[num_s + sq]  -
              P_SXY[2 * num_s + sq] / P_SX[        sq]) * P_S[sq])
 
  return(ACE)
  
}

binParamPosteriorSampling <- function(counts, w, x, y, Z, M, prior_table = 10) {
  # Generates a sample of the posterior distribution of the contingency table parameters of a model
  # for p(W, X, Y | Z).
  #
  # * Input:
  #
  # - counts: array of binary counts of the same dimensionality of (w, x, y, Z)
  # - w, x, y, Z: the indices of the corresponding variables (Z being a vector here)
  # - M: sample size for Monte Carlo simulations
  # - prior_table: Dirichlet hyperparameter for contingency tables
  #
  # * Output:
  #
  # - theta_samples: a list with fields, 
  #
  #        sW, posterior samples of P(W = 1), 
  #      sXW0, posterior samples of P(X = 1 | W = 0, Z = z));
  #      sXW1, analogously;
  #      sYX0, posterior samples of P(Y = 1 | X = 0, Z = z);
  #      sYX1, analogously
  #
  # - Z_samples: samples from the marginal of Z, assuming its dimensionality is small enough
  #              to that there is no memory overflow
  
  num_states <- 2^length(Z)
  theta_samples <- list()
  
  # matrix: each row corresponds to state of Z
  totals <- marginTable(counts, c(Z, w, x, y))  
  dim(totals) <- c(num_states, 2, 2, 2)
  
  for (i in 1:num_states) { 
    
    # Marginal of W = 1
    
    N_w1 <- sum(totals[i, 2, , ])
    N_w0 <- sum(totals[i, 1, , ])
    alpha <- c(N_w0, N_w1) + prior_table / (num_states * 2)
    sW <- rbeta(M, alpha[2], alpha[1])
    
    # X = 1 given W
    
    N_x1w1 <- sum(totals[i, 2, 2, ])
    N_x0w1 <- sum(totals[i, 2, 1, ])
    alpha <- c(N_x0w1, N_x1w1) + prior_table / (num_states * 4)
    sXW1 <- rbeta(M, alpha[2], alpha[1])
    
    N_x1w0 <- sum(totals[i, 1, 2, ])
    N_x0w0 <- sum(totals[i, 1, 1, ])
    alpha <- c(N_x0w0, N_x1w0) + prior_table / (num_states * 4)
    sXW0 <- rbeta(M, alpha[2], alpha[1])
    
    # Y = 1 given X
    
    N_y1x1 <- sum(totals[i, , 2, 2])
    N_y0x1 <- sum(totals[i, , 2, 1])
    alpha <- c(N_y0x1, N_y1x1) + prior_table / (num_states * 4)
    sYX1 <- rbeta(M, alpha[2], alpha[1])
    
    N_y1x0 <- sum(totals[i, , 1, 2])
    N_y0x0 <- sum(totals[i, , 1, 1])
    alpha <- c(N_y0x0, N_y1x0) + prior_table / (num_states * 4)
    sYX0 <- rbeta(M, alpha[2], alpha[1])
    
    # Wrap it up
    
    theta_samples[[i]] <- list(sW = sW, sXW0 = sXW0, sXW1 = sXW1, sYX0 = sYX0, sYX1 = sYX1)
    
  }
  
  # Marginal of Z
  
  if (length(Z) > 0) {
    Z_totals <- marginTable(totals, 1)
    Z_samples <- rdirichlet(M, Z_totals + prior_table / num_states)
  } else {
    Z_samples <- matrix(rep(1, M), ncol = 1)
  }
  
  # Return
  
  return(list(theta_samples = theta_samples, Z_samples = Z_samples))
}

binParamPosteriorExpectation <- function(counts, w, x, y, Z, prior_table = 10) {
  # Generates the posterior expected distribution of the contingency table parameters of a model
  # for p(W, X, Y | Z).
  #
  # * Input:
  #
  # - counts: array of binary counts
  # - w, x, y, Z: the indices of the corresponding variables (Z being a vector here)
  # - prior_table: Dirichlet hyperparameter for contingency tables
  #
  # * Output:
  #
  # - theta_mean: a list with fields, 
  #
  #        sW, posterior expected value of P(W = 1), 
  #      sXW0, posterior expected value of P(X = 1 | W = 0, Z = z));
  #      sXW1, analogously;
  #      sYX0, posterior expected value of P(Y = 1 | X = 0, Z = z);
  #      sYX1, analogously
  #
  # - Z_mean: posterior expected values of the marginal of Z, 
  #
  # It is assumed the dimensionality of the problem is small so that there is no memory overflow.
  
  num_states <- 2^length(Z)
  theta_mean <- list()
  
  # matrix: each row corresponds to state of Z
  totals <- marginTable(counts, c(Z, w, x, y)) + prior_table / 2^(length(Z) + 3)
  dim(totals) <- c(num_states, 2, 2, 2)
  
  for (i in 1:num_states) { 
    
    # Marginal of W = 1
    
    N_w1 <- sum(totals[i, 2, , ])
    N_w0 <- sum(totals[i, 1, , ])
    sW   <- N_w1 / (N_w0 + N_w1)
    
    # X = 1 given W
    
    N_x1w1 <- sum(totals[i, 2, 2, ])
    N_x0w1 <- sum(totals[i, 2, 1, ])
    sXW1   <- N_x1w1 / (N_x1w1 + N_x0w1)
    
    N_x1w0 <- sum(totals[i, 1, 2, ])
    N_x0w0 <- sum(totals[i, 1, 1, ])
    sXW0   <- N_x1w0 / (N_x1w0 + N_x0w0)
    
    # Y = 1 given X
    
    N_y1x1 <- sum(totals[i, , 2, 2])
    N_y0x1 <- sum(totals[i, , 2, 1])
    sYX1   <- N_y1x1 / (N_y1x1 + N_y0x1)
    
    N_y1x0 <- sum(totals[i, , 1, 2])
    N_y0x0 <- sum(totals[i, , 1, 1])
    sYX0   <- N_y1x0 / (N_y1x0 + N_y0x0)
    
    # Wrap it up
    
    theta_mean[[i]] <- list(sW = sW, sXW0 = sXW0, sXW1 = sXW1, sYX0 = sYX0, sYX1 = sYX1)
    
  }
  
  # Marginal of Z
  
  if (length(Z) > 0) {
    Z_totals <- marginTable(totals, 1)
    Z_mean <- Z_totals / sum(Z_totals)
  } else {
    Z_mean <- 1
  }
  
  # Return
  
  return(list(theta_mean = theta_mean, Z_mean = Z_mean))
}

rowMins <- function(x) do.call(pmin, as.data.frame(x))

rowMaxs <- function(x) do.call(pmax, as.data.frame(x))
Any scripts or data that you put into this service are public.
CausalFX documentation built on May 2, 2019, 5:39 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CausalFX
Methods for Estimating Causal Effects from Observational Data

R/util.R
In CausalFX: Methods for Estimating Causal Effects from Observational Data

Defines functions validateData dtoc dsep bindagMonteCarloCausalEffect synthetizeCausalEffect bindagCausalEffectBackdoor binParamPosteriorSampling binParamPosteriorExpectation rowMins rowMaxs

Documented in bindagCausalEffectBackdoor synthetizeCausalEffect

Try the CausalFX package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CausalFX Methods for Estimating Causal Effects from Observational Data

R/util.R In CausalFX: Methods for Estimating Causal Effects from Observational Data

Defines functions validateData dtoc dsep bindagMonteCarloCausalEffect synthetizeCausalEffect bindagCausalEffectBackdoor binParamPosteriorSampling binParamPosteriorExpectation rowMins rowMaxs

Documented in bindagCausalEffectBackdoor synthetizeCausalEffect

Try the CausalFX package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CausalFX
Methods for Estimating Causal Effects from Observational Data

R/util.R
In CausalFX: Methods for Estimating Causal Effects from Observational Data