# simulation_helpers.R --- nicolagnecco/causalXtreme: causal discovery in heavy-tailed models

#' Randomly pick elements
#'
#' Select each element of \code{vec} independently with probability
#' \code{prob} and return a vector with the selected elements.
#'
#' @param vec Numeric vector. Vector containing the elements to select.
#' @param prob Numeric --- between 0 and 1. The probability of selecting
#' each element of \code{vec}.
#'
#' @return Numeric vector. A vector with the selected elements.
#'
pick_elements <- function(vec, prob){
  # one independent Bernoulli(prob) draw per element
  draws <- stats::rbinom(n = length(vec), size = 1, prob = prob)
  vec[as.logical(draws)]
}

#' Inverse mirror uniform
#'
#' Produces the quantile of a mirrored uniform distribution
#' associated to the probability \code{prob}. The mirrored uniform
#' distribution has support [-max, -min] \eqn{\cup} [min, max].
#'
#' @param prob Numeric --- between 0 and 1. The probability associated
#' to the desired quantile.
#' @param min Numeric --- above 0. The lower bound of the positive half of
#' the support.
#' @param max Numeric --- above 0. The upper bound of the positive half of
#' the support. Note that \code{max} must be strictly greater than
#' \code{min}.
#'
#' @return Numeric --- between \code{-max} and \code{max}. The quantile
#' associated to the probability \code{prob}.
inverse_mirror_uniform <- function(prob, min, max){

  # max must be strictly greater than min
  if (!(max > min)){
    stop(paste("The maximum value, max, must be strictly greater",
               "than the minimum value, min."))
  }

  # both bounds must be positive: the support is mirrored around 0
  # (scalar checks, so use `&&` rather than the vectorized `&`)
  if (!(min > 0 && max > 0)){
    stop(paste("Both the minimum and the maximum values, min and max,",
               "must be positive."))
  }

  # Piecewise quantile function: probabilities in [0, 1/2) map onto the
  # negative half [-max, -min]; probabilities in [1/2, 1] map onto [min, max].
  if (prob == 0){
    -max
  } else if (prob > 0 && prob < 1 / 2) {
    2 * prob * (max - min) - max
  } else if (prob >= 1 / 2 && prob < 1){
    (2 * prob - 1) * (max - min) + min
  } else if (prob == 1){
    max
  } else {
    stop("The probability, prob, must be between 0 and 1.")
  }
}

#' Sample from uniform family
#'
#' Sample n elements from a uniform distribution with lower and upper
#' bound equal to \code{min} and \code{max}, respectively.
#' If \code{mirror == TRUE}, the elements are sampled from a mirrored
#' uniform distribution with support [-max, -min] \eqn{\cup} [min, max].
#'
#' @param n Integer. The number of elements to sample.
#' @param min Numeric. The lower bound of the distribution support.
#' If \code{mirror == TRUE}, this represents the lower bound of
#' the positive half of the support.
#' @param max Numeric. The upper bound of the distribution support.
#' If \code{mirror == TRUE}, this represents the upper bound of
#' the positive half of the support.
#' Note that \code{max} must be strictly greater than \code{min}.
#' @param mirror Logical. Should the elements be sampled from a
#' mirrored uniform distribution? If \code{mirror == TRUE},
#' \code{min} and \code{max} must be strictly greater than 0.
#'
#' @return Numeric vector. A vector with the sampled elements.
sample_uniform <- function(n, min, max, mirror = FALSE){

  # max must be strictly greater than min
  if (!(max > min)){
    stop(paste("The maximum value, max, must be strictly greater",
               "than the minimum value, min."))
  }

  if (isTRUE(mirror)){
    # Draw uniform probabilities and map each through the mirrored-uniform
    # quantile function, which also validates min > 0 and max > 0.
    # vapply replaces purrr::map_dbl: same result, no third-party dependency.
    vapply(stats::runif(n), inverse_mirror_uniform,
           FUN.VALUE = numeric(1), min = min, max = max)
  } else {
    stats::runif(n = n, min = min, max = max)
  }
}

#' Simulate random DAG
#'
#' Simulates a directed acyclic graph (DAG) and returns its adjacency matrix.
#' Copyright (c) 2013 Jonas Peters \email{jonas.peters@@math.ku.dk}.
#'
#' @param p Integer --- greater than 0. Number of nodes.
#' @param prob_connect Numeric --- between 0 and 1. The probability that an edge
#' \eqn{i {\rightarrow} j} is added to the DAG.
#' @param caus_order Numeric vector. The causal order of the DAG.
#' If the argument is not provided it is generated randomly.
#'
#' @return Square binary matrix. A matrix representing the random DAG.
random_dag <- function(p, prob_connect,
                       caus_order = sample(p, p, replace = FALSE)){

  # validate the number of nodes
  if (p < 1){
    stop("The number of nodes p must be greater than 0.")
  }

  # a user-supplied causal order must cover all p nodes
  if (!missing(caus_order)){
    if (length(caus_order) != p){
      stop(paste("The number of nodes p does not match with the length of",
                 "the causal order."))
    }
  }

  # start from an empty graph
  adjacency <- matrix(0, nrow = p, ncol = p)

  if (p > 1){
    # walk the causal order; each node may point to any node after it
    for (pos in seq_len(p - 1)){
      children <- pick_elements(caus_order[(pos + 1):p], prob_connect)
      adjacency[caus_order[pos], children] <- 1
    }
  }
  adjacency
}

#' Sample random coefficients
#'
#' Sample random coefficients from a uniform distribution
#' for the given DAG \code{dag}.
#' Copyright (c) 2013 Jonas Peters \email{jonas.peters@@math.ku.dk}.
#'
#' @inheritParams compute_caus_order
#' @param lb Numeric. The lower bound of the coefficients that can be
#' sampled.
#' @param ub Numeric. The upper bound of the coefficients that can be
#' sampled. It must be strictly greater than \code{lb}.
#' @param two_intervals Logical. Should the coefficients be sampled
#' from two symmetric uniform distributions?
#' If \code{two_intervals == TRUE}, \code{lb} and \code{ub} must be
#' positive.
#' @return Square numeric matrix. The weighted adjacency matrix of the
#' underlying DAG \code{dag}.
random_coeff <- function(dag, lb = 0.1, ub = 0.9, two_intervals = TRUE){

  # check if dag is a (non-weighted) adjacency matrix
  if (!all(dag %in% c(0, 1))){
    stop("The entries of dag must be either 0 or 1.")
  }

  # fill the positions of the edges with random coefficients
  adj_mat <- matrix(0, nrow = NROW(dag), ncol = NCOL(dag))
  num_coeff <- sum(dag)
  adj_mat[dag == 1] <- sample_uniform(num_coeff, lb, ub, two_intervals)

  # Bug fix: return the full weighted matrix. Previously the function ended
  # on the subassignment, whose value is the sampled coefficient VECTOR
  # (returned invisibly), so callers never received the matrix.
  adj_mat
}

#' Add confounders to a DAG
#'
#' Add confounders (i.e., hidden variables) at random to the DAG \code{dag}.
#' Each confounder is a source node and acts only on two observed variables.
#' The probability that a pair of observed variables is confounded is set by
#' \code{prob_confound}.
#'
#' @inheritParams compute_caus_order
#' @param prob_confound Numeric --- between 0 and 1. The probability that
#' a pair of observed nodes is confounded.
#' @return List. The list is made of:
#' \itemize{
#' \item \code{dag_confounders} --- Square binary matrix. Represents the full
#' DAG made of observed and hidden variables.
#' \item \code{pos_confounders} --- Integer vector. Represents the position
#' of confounders (rows and columns) in \code{dag_confounders}.
#' }
#'
# NOTE(review): the function header was missing from the scraped source
# (the roxygen block was followed directly by the body). The name and
# signature below are reconstructed from the docs and body --- verify
# against the original package.
add_confounders <- function(dag, prob_confound){

  # check if dag is a (non-weighted) adjacency matrix
  if (!all(dag %in% c(0, 1))){
    stop("The entries of dag must be either 0 or 1.")
  }

  # Consider all possible (unordered) pairs of observed nodes
  p <- NROW(dag)
  node_pairs <- which(upper.tri(dag), arr.ind = TRUE)
  max_n_confounders <- NROW(node_pairs)

  # Choose how many pairs are confounded, and which ones
  n_confounders <- stats::rbinom(n = 1, size = max_n_confounders,
                                 prob = prob_confound)
  confounders <- sample(x = seq_len(max_n_confounders), size = n_confounders,
                        replace = FALSE)

  # Exit early if there are no confounders
  if (n_confounders == 0){
    return(list(dag_confounders = dag,
                pos_confounders = integer(0)))
  }

  # Append one source node per confounder, with edges into the two
  # observed variables of its pair
  dag_confounders <- dag

  for (i in seq_len(n_confounders)){
    confounded_pair <- node_pairs[confounders[i], ]
    col_to_append <- rep(0, NROW(dag_confounders))
    row_to_append <- rep(0, NROW(dag_confounders) + 1)
    row_to_append[confounded_pair] <- 1
    dag_confounders <- cbind(dag_confounders, col_to_append)
    dag_confounders <- rbind(dag_confounders, row_to_append)
  }

  # drop the dimnames introduced by cbind/rbind
  dag_confounders <- unname(dag_confounders)
  pos_confounders <- (p + 1):NROW(dag_confounders)

  list(dag_confounders = dag_confounders,
       pos_confounders = pos_confounders)
}

#' Simulate noise observations
#'
#' Sample \code{n} observations for \code{p} independent noise variables
#' from a certain distribution \code{distr}.
#'
#' @param n Positive integer. The number of observations, must be larger
#' than 1.
#' @param p Positive integer. The number of variables, must be larger
#' than 1.
#' @param distr Character. The distribution of the noise. It is one of:
#' \itemize{
#' \item \code{student_t}, in this case the user has to specify the
#' \code{tail_index}, i.e., the degrees of freedom,
#' \item \code{gaussian},
#' \item \code{log_normal}.
#' }
#' @param tail_index Positive numeric. The tail index, i.e., degrees
#' of freedom, of the noise. Only used when \code{distr == "student_t"}.
#' @return Numeric matrix. Dataset matrix with \code{n}
#' rows (observations) and \code{p} columns (variables).
simulate_noise <- function(n, p, distr = c("student_t", "gaussian",
                                           "log_normal"), tail_index){

  # scalar condition, so use `||` rather than the vectorized `|`
  if (p <= 1 || n <= 1){
    stop("n and p must be larger than 1!")
  }

  distr <- match.arg(distr)

  # draw n * p iid values from the requested distribution
  draws <- switch(distr,
                  "student_t" = stats::rt(n * p, df = tail_index),
                  "gaussian" = stats::rnorm(n * p),
                  "log_normal" = stats::rlnorm(n * p))

  # fill the n x p dataset column by column
  array(draws, dim = c(n, p))
}

#' Generate data from non-linear Structural Equation Model
#'
#' Generate data from a non-linear Structural Causal Model as described in the
#' paper "Causal discovery in heavy-tailed models".
#'
#' @inheritParams get_all_paths
#' @param noise Numeric matrix. Dataset matrix with \code{n}
#' rows (observations) and \code{p} columns (variables).
#' @return Numeric matrix. Dataset matrix with \code{n}
#' rows (observations) and \code{p} columns (variables).
# NOTE(review): the function header and the definitions of `p` and `betas`
# were missing from the scraped source (the body referenced them undefined).
# The definition below is a reconstruction --- verify against the original
# package.
nonlinear_scm <- function(adj_mat, noise){

  n <- NROW(noise)
  p <- NCOL(noise)

  # binary DAG underlying the weighted adjacency matrix
  dag <- (adj_mat != 0) * 1
  nodes <- compute_caus_order(dag)

  dataset <- matrix(0, nrow = n, ncol = p)
  dataset_transf <- matrix(0, nrow = n, ncol = p)

  # Generate each variable following the causal order: a linear combination
  # of the hockeystick-transformed parents plus the variable's own noise.
  for (i in nodes){
    betas <- adj_mat[, i]
    eps <- noise[, i]
    dataset[, i] <- dataset_transf %*% betas + eps
    dataset_transf[, i] <- broken_hockeystick(dataset[, i])
  }

  dataset
}

#' Apply shifted-hockeystick function
#'
#' Apply the shifted-hockeystick transformation to a numeric vector
#' \code{v}. Entries whose rank (ties broken by position) lies strictly
#' above \code{floor(n * q_low)} and at most \code{ceiling(n * q_high)}
#' are set to zero, so only the extreme lower and upper tails of \code{v}
#' are kept.
#'
#' @param v Numeric vector.
#' @param q_low Numeric, between 0 and 1.
#' @param q_high Numeric, between 0 and 1.
#' @return Numeric vector.
broken_hockeystick <- function(v, q_low = 0.05, q_high = 0.95){
  n_obs <- length(v)
  ranks <- rank(v, ties.method = "first")
  # positions in the bulk of the distribution, to be zeroed out
  bulk <- which(ranks > floor(n_obs * q_low) & ranks <= ceiling(n_obs * q_high))
  v[bulk] <- 0
  v
}

#' Transform to uniform margins
#'
#' Transform the numeric vector \code{v} to uniform margins
#' between 0 and 1 via its normalized ranks.
#'
#' @inheritParams broken_hockeystick
#' @return Numeric vector.
uniform_margin <- function(v){
  # empirical CDF evaluated at each entry (ties averaged by rank())
  rank(v) / length(v)
}

# nicolagnecco/causalXtreme documentation built on Jan. 29, 2020, 5:50 p.m.