Nothing
############################################################################################
# synthetize.R
#
# Generates synthetic examples of WPP problems. Also includes some functions to generate
#
#
# Code by
#
# - Ricardo Silva (ricardo@stats.ucl.ac.uk)
# - Robin Evans (robin.evans@stats.ox.ac.uk)
#
# Current version: 28/04/2015
# First version: 31/03/2014
#' @title Generates Synthetic CausalFX Problems
#'
#' @description
#' This function generates simple synthetic problems that can be used to test
#' the methods in the \code{CausalFX} package. \code{CausalFX} problems are objects
#' of class \code{\link{cfx}}, and specify a causal inference task of estimating the effect
#' of a given treatment \eqn{X} on a given outcome \eqn{Y}, with a corresponding dataset.
#' This function generates only binary data from a multinomial distribution.
#'
#' @param p number of background variables (besides \eqn{X} and \eqn{Y}).
#' @param q number of sink variables.
#' @param par_max maximum number of parents in the background set.
#' @param M sample size.
#' @param no_sol if \code{TRUE}, then latent variables are parents of both \eqn{X}
#' and \eqn{Y}, meaning no adjustment set will theoretically be found
#' (barring sampling variability) if a method such as \code{\link{covsearch}}
#' is applied.
#'
#' @return An object of class \code{\link{cfx}}.
#'
#' @details
#' The function first generates a directed acyclic graph with a given number of variables which have no latent common
#' parents with treatment \eqn{X} and outcome \eqn{Y}, which we call "background variables". Conditioning
#' on a subset of the background variables will block all measured confounding in this problem.
#' The function then generates a set of "sink" variables \eqn{K} which have one common latent parent with
#' either \eqn{X} or \eqn{Y}, but are otherwise not adjacent to any observed variable. Conditioning on the sink variables
#' will generate confounding paths between treatment and outcome. Latent variables are
#' a pool of independent variables with no parents. If \code{no_sol} is \code{FALSE}, they are parents of either \eqn{X} or \eqn{Y} but not both.
#' If \code{no_sol} is \code{TRUE}, then all latent variables are parents of both \eqn{X} and \eqn{Y} and as
#' such no adjustment set with observed variables will remove unmeasured confounding between treatment and outcome.
#' Remaining parents for observed variables are sampled uniformly at random from the pool of background
#' variables obeying the constraint on the maximum number of parents given by \code{par_max}.
#'
#' Given a graph structure, each variable \eqn{i} is given a binary conditional distribution, defining the probability
#' of \eqn{i} being equal to 1 given its parents in the graph. This conditional distribution is generated
#' randomly by a logistic regression model with pairwise interactions, where coefficients are generated by
#' samples from independent Gaussians with zero mean and standard deviation 10 / number of parents.
#'
#' @importFrom igraph graph
#' @importFrom igraph graph.empty
#' @importFrom igraph topological.sort
#' @import rje
#' @export
simulateWitnessModel <- function(p, q, par_max, M, no_sol = FALSE) {
## Preliminaries: here, do rejection sampling on a smaller model to
## test whether there is a witness set or not
if (par_max < 1) stop("Must allow at least one parent")
if (no_sol == FALSE) {
while (TRUE) {
g_dummy <- matrix(0, nrow = p + 2, ncol = p + 2)
num_par <- pmin(seq_len(p) - 1, sample.int(par_max, p, replace = TRUE))
for (i in 2:p) {
num_par_i <- min(i - 1, sample.int(par_max, 1))
par_i <- sample.int(i - 1, num_par[i])
g_dummy[i, par_i] <- 1
}
for (i in (p + 1):(p + 2)) {
num_par_i <- min(p, sample(par_max, 1))
par_i <- sample(p, num_par_i)
g_dummy[i, par_i] <- 1
}
g_dummy[p + 2, p + 1] <- 1
m <- getBinDAGModelData(g_dummy, 1)
problem_dummy <- cfx(x = p + 1, y = p + 2, g = g_dummy)
true_witness <- covsearch(problem_dummy, pop_solve = TRUE)
if (length(true_witness$witness) > 0) break
}
}
## Generate graph
q <- max(2, q) # Minimum number of sink variables is 2
num_h <- q # The pool of latent variables is also given by q
num_var <- p + q + 2 + num_h
X_idx <- p + q + 1
Y_idx <- p + q + 2
g <- matrix(rep(0, num_var^2), ncol = num_var) # variable order: X and Y are indexed as p + q + 1 and
# p + q + 2. Latent variables are indexed
# as p + q + 3 ... num_var
# Observable variables
for (i in 2:(p + q)) {
num_par_i <- min(i - 1, sample(1:par_max, 1))
par_i <- sample(1:(i - 1), num_par_i)
g[i, par_i] <- 1
}
for (i in c(X_idx, Y_idx)) {
num_par_i <- min(p, sample(1:par_max, 1))
par_i <- sample(1:p, num_par_i)
g[i, par_i] <- 1
}
g[Y_idx, X_idx] <- 1
## If g_dummy has been generated, override g with it
if (no_sol == FALSE) {
g[1:p, 1:p] <- g_dummy[1:p, 1:p]
g[X_idx, 1:p] <- g_dummy[p + 1, 1:p]
g[Y_idx, 1:p] <- g_dummy[p + 2, 1:p]
}
## for (i in (p + 1):(p + q)) g[i, (p + q + 3):num_var] <- 1 # SINK <- all H
g[p + seq_len(q), (p + q + 3):num_var] <- 1 # SINK <- all H
latent_idx <- (p + q + 3):num_var
if (no_sol) { # Guarantee no solution: make all latent parents be parents of X and Y
g[X_idx, p + q + 2 + 1:q] <- 1
g[Y_idx, p + q + 2 + 1:q] <- 1
}
else {
## Latent variables: flip a coin, then decide whether latent variable will be linked to
## X or Y. But the first one is assigned to X and the second to Y
g[X_idx, p + q + 3] <- 1
g[Y_idx, p + q + 4] <- 1
if (q > 2) {
whX <- runif(q - 2, 0.5)
g[X_idx, p + q + 2 + seq(3, q)[whX < 0.5]] <- 1
g[Y_idx, p + q + 2 + seq(3, q)[whX >= 0.5]] <- 1
}
}
## Generate model
m <- getBinDAGModelData(g, M)
problem <- cfx(x = X_idx, y = Y_idx, g = g, latent_idx = latent_idx, dat = m$data, model = m$model)
return(problem)
}
getBinDAGModelData <- function(g, M) {
# Given a DAG, generate a binary model and a simulate corresponding sample
#
# * Input
#
# - g: binary matrix representing a DAG, each row represent the corresponding parents
# - M: sample size
#
# * Output
#
# - model: a list of CPTs, where model[[i]] is P(V_i = 1 | parents = p), p being a parent
# configuration. Configurations are sorted as follows. If [1 2 3 ... (k - 2) (k - 1) k]
# are the parents, the corresponding configuration indices are
# 1 + bin2dec(000...000), 1 + bin2dec(000...001), 1 + bin2dec(000...010), 1 + bin2dec(111...11)
# - data: binary data
num_v <- nrow(g)
## get topological ordering
graph_edge <- c(rbind(col(g)[g == 1], row(g)[g == 1]))
graph_temp <- graph(graph_edge)
v_order <- topological.sort(graph_temp)
## Create model and sample
model <- list()
dat <- matrix(nrow = M, ncol = num_v)
for (v in v_order) {
parents <- which(g[v, ] == 1)
num_parents <- length(parents)
cpt <- logitDist(num_parents, nu = 10, TRUE) # Non-linearities, to make problems harder
prob_cpt <- which(cpt > 0.975) # Allowing extreme probabilities also make the problems much harder
cpt[prob_cpt] <- 0.95 + runif(length(prob_cpt)) * 0.025
prob_cpt <- which(cpt < 0.025)
cpt[prob_cpt] <- 0.05 - runif(length(prob_cpt)) * 0.025
model[[v]] <- cpt
if (num_parents == 0) {
dat[, v] <- as.numeric(runif(M) < cpt)
} else {
p_idx <- dat[, parents, drop = FALSE] %*% 2^((num_parents - 1):0) + 1
dat[, v] <- as.numeric(runif(M) < cpt[p_idx])
}
}
return(list(model = model, data = dat))
}
logitDist <- function(np, nu = 6, interactions = FALSE) {
# Generate distributions of the form P(Y = 1 | PA = x) using logistic
# link function.
#
# Input:
#
# - np : integer giving number of parents
# - interactions : logical specifying whether to include two-way interactions
#
# Output:
#
# - A simulated function PP(Y = 1 | PA = x)
#
# If interactions = FALSE
# logit P(Y=1 | PA1=x1, PA2=x2, ... ,PAk=xk) = sum_i (-1)^{xi} \alpha_i/2
# where \alpha_i is an independent normal r.v., and k is the number of
# parents.
# If interactions = TRUE, then add on sum_{i<j} (-1)^{xi*xj} \beta_{ij}/2
# for \beta_ij with same distribution.
if (np == 0) return(runif(1))
effs <- rnorm(np, sd = nu / np)
out <- c(-1, 1) * effs[1] / 2
for (i in seq_len(np)[-1]) out <- outer(out, c(-1, 1) * effs[i] / 2, "+")
if (interactions && np > 1) {
ints <- matrix(rnorm(np^2, sd = 2 * nu / np), np, np)
for (j in seq_len(np)[-1]) for (i in seq_len(j - 1)) {
out = out + patternRepeat(c(1, -1, -1, 1) * ints[i, j] / 2, c(i, j), rep(2, np), careful = FALSE)
}
}
return(expit(c(out)))
}
patternRepeat <- function(x, which, n, careful = TRUE) {
if (careful) {
tmp <- unique.default(which)
if (!all(tmp == which)) {
warning("repeated indices ignored")
which = tmp
}
if (length(which) == 0) {
if(length(x) != 1) stop("x not of correct length")
else return(rep.int(x, prod(n)))
}
if (max(which) > length(n))
stop("n not long enough")
if (prod(n[which]) != length(x))
stop("x not of correct length")
}
else if (length(which) == 0) return(rep.int(x, prod(n)))
tmp <- seq_along(n)[-which]
if (length(tmp) == 0)
return(x)
for (add.in in tmp) {
bl <- prod(n[seq_len(add.in - 1)])
x <- x[rep.int(seq_len(bl), n[add.in]) + rep(seq.int(from = 0,
by = bl, length.out = length(x)/bl), each = bl * n[add.in])]
}
return(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.