R/synthetize.R

Defines functions simulateWitnessModel getBinDAGModelData logitDist patternRepeat

Documented in simulateWitnessModel

############################################################################################
# synthetize.R
#
# Generates synthetic examples of WPP problems. Also includes some functions to generate
# 
#
# Code by
#
#  - Ricardo Silva (ricardo@stats.ucl.ac.uk)
#  - Robin Evans (robin.evans@stats.ox.ac.uk)
#
# Current version: 28/04/2015
# First version: 31/03/2014

#' @title Generates Synthetic CausalFX Problems
#' 
#' @description
#' This function generates simple synthetic problems that can be used to test 
#' the methods in the \code{CausalFX} package. \code{CausalFX} problems are objects
#' of class \code{\link{cfx}}, and specify a causal inference task of estimating the effect 
#' of a given treatment \eqn{X} on a given outcome \eqn{Y}, with a corresponding dataset.
#' This function generates only binary data from a multinomial distribution.
#' 
#' @param p number of background variables (besides \eqn{X} and \eqn{Y}).
#' @param q number of sink variables.
#' @param par_max maximum number of parents in the background set.
#' @param M sample size.
#' @param no_sol if \code{TRUE}, then latent variables are parents of both \eqn{X} 
#'               and \eqn{Y}, meaning no adjustment set will theoretically be found
#'               (barring sampling variability) if a method such as \code{\link{covsearch}} 
#'               is applied.
#'
#' @return An object of class \code{\link{cfx}}.
#' 
#' @details
#' The function first generates a directed acyclic graph with a given number of variables which have no latent common 
#' parents with treatment \eqn{X} and outcome \eqn{Y}, which we call "background variables". Conditioning
#' on a subset of the background variables will block all measured confounding in this problem. 
#' The function then generates a set of "sink" variables \eqn{K} which have one common latent parent with 
#' either \eqn{X} or \eqn{Y}, but are otherwise not adjacent to any observed variable. Conditioning on the sink variables
#' will generate confounding paths between treatment and outcome. Latent variables are 
#' a  pool of independent variables with no parents. If \code{no_sol} is \code{FALSE}, they are parents of either \eqn{X} or \eqn{Y} but not both. 
#' If \code{no_sol} is \code{TRUE}, then all latent variables are parents of both \eqn{X} and \eqn{Y} and as
#' such no adjustment set with observed variables will remove unmeasured confounding between treatment and outcome.
#' Remaining parents for observed variables are sampled uniformly at random from the pool of background
#' variables obeying the constraint on the maximum number of parents given by \code{par_max}.
#' 
#' Given a graph structure, each variable \eqn{i} is given a binary conditional distribution, defining the probability
#' of \eqn{i} being equal to 1 given its parents in the graph. This conditional distribution is generated
#' randomly by a logistic regression model with pairwise interactions, where coefficients are generated by 
#' samples from independent Gaussians with zero mean and standard deviation 10 / number of parents. 
#'
#' @importFrom igraph graph 
#' @importFrom igraph graph.empty
#' @importFrom igraph topological.sort
#' @import rje
#' @export

simulateWitnessModel <- function(p, q, par_max, M, no_sol = FALSE) {

  ## Preliminaries: here, do rejection sampling on a smaller model to
  ## test whether there is a witness set or not
  if (par_max < 1) stop("Must allow at least one parent")
  
  if (no_sol == FALSE) {
    while (TRUE) {
      g_dummy <- matrix(0, nrow = p + 2, ncol = p + 2)
      
      num_par <- pmin(seq_len(p) - 1, sample.int(par_max, p, replace = TRUE))
      for (i in 2:p) {
        num_par_i <- min(i - 1, sample.int(par_max, 1))
        par_i <- sample.int(i - 1, num_par[i])
        g_dummy[i, par_i] <- 1
      }
      for (i in (p + 1):(p + 2)) {
        num_par_i <- min(p, sample(par_max, 1))
        par_i <- sample(p, num_par_i)
        g_dummy[i, par_i] <- 1
      }
      g_dummy[p + 2, p + 1] <- 1
      
      m <- getBinDAGModelData(g_dummy, 1)
      problem_dummy <- cfx(x = p + 1, y = p + 2, g = g_dummy)
      true_witness <- covsearch(problem_dummy, pop_solve = TRUE)
      if (length(true_witness$witness) > 0) break
    }
  }
  
  ## Generate graph
  q <- max(2, q) # Minimum number of sink variables is 2
  num_h <- q     # The pool of latent variables is also given by q
  num_var <- p + q + 2 + num_h
  
  X_idx <- p + q + 1
  Y_idx <- p + q + 2
  
  g <- matrix(rep(0, num_var^2), ncol = num_var) # variable order: X and Y are indexed as p + q + 1 and
  # p + q + 2. Latent variables are indexed
  # as p + q + 3 ... num_var
  # Observable variables
  for (i in 2:(p + q)) {
    num_par_i <- min(i - 1, sample(1:par_max, 1))
    par_i <- sample(1:(i - 1), num_par_i)
    g[i, par_i] <- 1
  }
  for (i in c(X_idx, Y_idx)) {
    num_par_i <- min(p, sample(1:par_max, 1))
    par_i <- sample(1:p, num_par_i)
    g[i, par_i] <- 1
  }
  g[Y_idx, X_idx] <- 1
  
  ## If g_dummy has been generated, override g with it
  if (no_sol == FALSE) {
    g[1:p, 1:p]   <- g_dummy[1:p, 1:p]
    g[X_idx, 1:p] <- g_dummy[p + 1, 1:p]
    g[Y_idx, 1:p] <- g_dummy[p + 2, 1:p]
  }
  
  ## for (i in (p + 1):(p + q)) g[i, (p + q + 3):num_var] <- 1 # SINK <- all H
  g[p + seq_len(q), (p + q + 3):num_var] <- 1 # SINK <- all H
  
  latent_idx <- (p + q + 3):num_var
  
  if (no_sol) { # Guarantee no solution: make all latent parents be parents of X and Y
    g[X_idx, p + q + 2 + 1:q] <- 1
    g[Y_idx, p + q + 2 + 1:q] <- 1
  }
  else {
    ## Latent variables: flip a coin, then decide whether latent variable will be linked to
    ## X or Y. But the first one is assigned to X and the second to Y
    g[X_idx, p + q + 3] <- 1
    g[Y_idx, p + q + 4] <- 1
    
    if (q > 2) {
      whX <- runif(q - 2, 0.5)
      g[X_idx, p + q + 2 + seq(3, q)[whX <  0.5]] <- 1
      g[Y_idx, p + q + 2 + seq(3, q)[whX >= 0.5]] <- 1      
    }
  }
  
  ## Generate model
  m <- getBinDAGModelData(g, M)
  problem <- cfx(x = X_idx, y = Y_idx, g = g, latent_idx = latent_idx, dat = m$data, model = m$model)
  return(problem)
}

getBinDAGModelData <- function(g, M) {
  # Given a DAG, generate a binary model and a simulate corresponding sample
  #
  # * Input
  #
  # - g: binary matrix representing a DAG, each row represent the corresponding parents
  # - M: sample size
  #
  # * Output
  #
  # - model: a list of CPTs, where model[[i]] is P(V_i = 1 | parents = p), p being a parent
  #          configuration. Configurations are sorted as follows. If [1 2 3 ... (k - 2) (k - 1) k]
  #          are the parents, the corresponding configuration indices are
  #          1 + bin2dec(000...000), 1 + bin2dec(000...001), 1 + bin2dec(000...010), 1 + bin2dec(111...11)
  # - data: binary data
  
  num_v <- nrow(g)
  
  ## get topological ordering
  graph_edge <- c(rbind(col(g)[g == 1], row(g)[g == 1]))
  graph_temp <- graph(graph_edge)
  v_order <- topological.sort(graph_temp)
  
  ## Create model and sample
  model <- list()
  dat <- matrix(nrow = M, ncol = num_v)
  
  for (v in v_order) {
    
    parents <- which(g[v, ] == 1)    
    num_parents <- length(parents)
    
    cpt <- logitDist(num_parents, nu = 10, TRUE) # Non-linearities, to make problems harder   
    prob_cpt <- which(cpt > 0.975)               # Allowing  extreme probabilities also make the problems much harder
    cpt[prob_cpt] <- 0.95 + runif(length(prob_cpt)) * 0.025
    prob_cpt <- which(cpt < 0.025)
    cpt[prob_cpt] <- 0.05 - runif(length(prob_cpt)) * 0.025
    
    model[[v]] <- cpt
    
    if (num_parents == 0) {
      dat[, v] <- as.numeric(runif(M) < cpt)
    } else {
      p_idx <- dat[, parents, drop = FALSE] %*% 2^((num_parents - 1):0) + 1
      dat[, v] <- as.numeric(runif(M) < cpt[p_idx])
    }
    
  }  
  
  return(list(model = model, data = dat))
}

logitDist <- function(np, nu = 6, interactions = FALSE) {
  # Generate distributions of the form P(Y = 1 | PA = x) using logistic
  # link function.
  #
  # Input:
  #
  #  - np           : integer giving number of parents
  #  - interactions : logical specifying whether to include two-way interactions
  #
  # Output:
  #
  #  - A simulated function PP(Y = 1 | PA = x)
  #  
  # If interactions = FALSE
  # logit P(Y=1 | PA1=x1, PA2=x2, ... ,PAk=xk) = sum_i (-1)^{xi} \alpha_i/2
  # where \alpha_i is an independent normal r.v., and k is the number of
  # parents.
  # If interactions = TRUE, then add on sum_{i<j} (-1)^{xi*xj} \beta_{ij}/2
  # for \beta_ij with same distribution.
  
  if (np == 0) return(runif(1))
  effs <- rnorm(np, sd = nu / np)
  out <- c(-1, 1) * effs[1] / 2
  for (i in seq_len(np)[-1]) out <- outer(out, c(-1, 1) * effs[i] / 2, "+")
  
  if (interactions && np > 1) {
    ints <- matrix(rnorm(np^2, sd = 2 * nu / np), np, np)
    for (j in seq_len(np)[-1]) for (i in seq_len(j - 1)) {
      out = out + patternRepeat(c(1, -1, -1, 1) * ints[i, j] / 2, c(i, j), rep(2, np), careful = FALSE)
    }
  }
  return(expit(c(out)))
}

patternRepeat <- function(x, which, n, careful = TRUE) {
  if (careful) {
    tmp <- unique.default(which)
    if (!all(tmp == which)) {
      warning("repeated indices ignored")
      which = tmp
    }
    if (length(which) == 0) {
      if(length(x) != 1) stop("x not of correct length")
      else return(rep.int(x, prod(n)))
    }
    if (max(which) > length(n))
      stop("n not long enough")
    if (prod(n[which]) != length(x))
      stop("x not of correct length")
  }
  else if (length(which) == 0) return(rep.int(x, prod(n)))
  
  tmp <- seq_along(n)[-which]
  if (length(tmp) == 0)
    return(x)
  
  for (add.in in tmp) {
    bl <- prod(n[seq_len(add.in - 1)])
    x <- x[rep.int(seq_len(bl), n[add.in]) + rep(seq.int(from = 0,
                                                         by = bl, length.out = length(x)/bl), each = bl * n[add.in])]
  }
  return(x)
}

Try the CausalFX package in your browser

Any scripts or data that you put into this service are public.

CausalFX documentation built on May 2, 2019, 5:39 a.m.