simulation_settings <- function(){
## void -> tibble
## returns a tibble with simulation settings
nexp <- 50
my_args <- list(
experiment = 1:nexp,
n = c(5e2, 1e3, 1e4),
p = c(4, 7, 10, 15, 20, 30, 50),
distr = c('student_t'),
tail_index = c(1.5, 2.5, 3.5),
has_confounder = c(F, T),
is_nonlinear = c(F, T),
has_uniform_margins = c(F, T))
my_args <- expand.grid(my_args, stringsAsFactors = F) %>%
as_tibble() %>%
rowwise() %>%
mutate(prob_connect = min(5/(p - 1), 1/2)) %>%
filter(has_confounder + is_nonlinear + has_uniform_margins <= 1) %>%
rowid_to_column("id") %>%
select(id, everything())
return(my_args)
}
method_settings <- function(){
## void -> tibble
## returns a tibble with all the used methods
method_tbl <- tibble(method = c("ease", "ica_lingam",
"direct_lingam", "pc_rank", "random"))
return(method_tbl)
}
set_simulations <- function(experiment_ids=1:NROW(simulation_settings()),
method_nms=method_settings()$method,
seed){
## numeric_vector character_vector integer -> list
## prepares the settings for the simulations
## INPUTS:
## - experiment_ids: a numeric vector that specifies the rows of the tibble generated
## by calling the function simulation_settings().
## - method_ids: a character vector that specifies the method to use. This is
## a subset of methods from the output of method_settings().
## - seed: an integer seed that determines the outcome of the simulation.
##
## RETURNS:
## The function returns a list made of:
## - simulation_arguments: a tibble generated by the function simulation_settings(),
## where only the rows in experiments_ids are kept.
## - method_arguments: a character vectors generated by the function method_settings(),
## where only the methods in method_nms are kept.
## NOTE:
## The simulations are fully repeatable. That is, even after running the full
## simulation, you can repeat it over a subset of arguments/methods.
## This is possible because this function assigns to each simulation run
## and method a unique random seed. These random seeds are generated with
## L'Ecuyer RNG method and are independent of each other.
# set simulation options
simulation_arguments <- simulation_settings()
m <- NROW(simulation_arguments)
if (any(!(experiment_ids %in% 1:m))){
stop(paste("Argument experiment_ids must contain integers between 1 and ",
m, ".", sep = ""))
}
experiment_ids <- sort(unique(experiment_ids))
# set methods
method_arguments <- method_settings()
k <- NROW(method_arguments)
if (any(!(method_nms %in% method_arguments$method))){
stop(paste("Argument method_nms must contains a subset of these methods: ",
paste(method_arguments, collapse = ", "), ".", sep = ""))
}
# create independent RNG streams with L'Ecuyer method
rng <- RNGseq(m + m * k, seed = seed)
# select RNG streams for simulations
sims_rng_stream <- rng[1:m]
simulation_arguments$rng <- sims_rng_stream
# select RNG streams for methods
meth_rng_stream <- list()
for (j in 1:k){
# cat(j*m + experiment_ids, '\n')
meth_rng_stream[[j]] <- rng[j * m + experiment_ids]
}
method_arguments$rng <- meth_rng_stream
# filter simulations
simulation_arguments <- simulation_arguments %>%
filter(id %in% experiment_ids)
# filter methods
method_arguments <- method_arguments %>%
filter(method %in% method_nms)
# return list
list(simulation_arguments=simulation_arguments,
method_arguments=method_arguments)
}
get_method_args <- function(method = c("ease", "lingam", "ica_lingam",
"direct_lingam",
"pc", "pc_rank", "random"), n){
## character integer -> list
## produces a list with the arguments used by the method in the simulation
method <- match.arg(method)
switch(method,
ease = {
list(k = floor(n ** 0.4))
},
ica_lingam = {
list(contrast_fun = "logcosh")
},
direct_lingam = {
list()
},
pc_rank = {
list(alpha = 5e-4)
},
random = {
list()
},
lingam = {
list(contrast_fun = "logcosh")
})
}
causal_discovery_wrapper <- function(dat, method, set_args){
time1 <- Sys.time()
ll <- causal_discovery(dat, method, set_args)
ll$time <- Sys.time() - time1
return(ll)
}
wrapper_sim <- function(i, j, sims_args, method_args, trimdata = FALSE,
meth_args = FALSE){
# Checks
if (meth_args){
if (!("set_args" %in% names(method_args))){
stop("When meth_args = TRUE, the tibble `method_args` must contain the
column `set_args`.")
}
} else {
if ("set_args" %in% names(method_args)){
stop("When meth_args = FALSE, the tibble `method_args` must *not* contain the
column `set_args`.")
}
}
# Set up index variables
m <- nrow(sims_args)
k <- nrow(method_args)
cat("Simulation", i, "out of", m, "--- Inner iteration", j, "out of", k, "\n")
# Simulate data
current_exp <- sims_args[i, ]
args_simulate <- current_exp %>% select(-experiment, -id, -rng)
rng_sims <- current_exp$rng[[1]]
rngtools::setRNG(rng_sims)
X <- do.call(simulate_data, args_simulate)
if (trimdata){
X$dataset <- trim_data(X$dataset)
}
n <- NROW(X$dataset)
# Run method
method <- method_args$method[j]
if (meth_args){
set_args <- method_args$set_args[[j]]
} else {
set_args <- get_method_args(method, n)
}
rng_method <- method_args$rng[[j]][[i]]
rngtools::setRNG(rng_method)
out <- causal_discovery_wrapper(dat = X$dataset,
method = method,
set_args = set_args)
# Evaluate algorithms
algo_result <- causal_metrics(simulated_data = X,
estimated_graphs = out[1:2])
# Return tibble
res <- current_exp %>% select(id) %>%
bind_cols(tibble(method = method)) %>%
bind_cols(tibble(sid = algo_result$sid,
shd = algo_result$shd,
time = out$time))
if (meth_args){
# create tibble with argument name-value pair
if (!length(set_args)){
args_tbl <- tibble(arg_name = "", arg_value = NA)
} else {
args_tbl <- tibble(arg_name = names(set_args)[1],
arg_value = set_args[[1]])
}
res <- res %>%
bind_cols(args_tbl)
}
# return value
return(res)
}
trim_data <- function(data){
## numeric_matrix -> numeric_matrix
## keeps observations in the tails
n <- nrow(data)
p <- ncol(data)
data_trimmed <- data %>%
as_tibble() %>%
mutate(id = 1:n()) %>%
gather(key = "variab",
value = "value",
-id) %>%
group_by(variab) %>%
mutate(uq = quantile(value, .9),
lq = quantile(value, .1),
bulk = value > lq & value < uq,
tail = !bulk) %>%
select(id, variab, tail) %>%
ungroup() %>%
spread(variab, tail)
is_in_tail <- (apply(data_trimmed[, -1], 1, sum) >= sqrt(p))
is_in_bulk <- !(is_in_tail)
c(sum(is_in_bulk)/n)
tail_id <- which(is_in_tail)
bulk_id <- which(is_in_bulk)
bulk_id <- sample(x = bulk_id, size = length(bulk_id) / 2)
ids <- sample(c(tail_id))
data[ids, ]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.