reproduce_paper_results/simulation_functions.R

simulation_settings <- function(){
  ## void -> tibble
  ## returns a tibble with simulation settings

  nexp <- 50
  my_args <- list(
    experiment = 1:nexp,
    n = c(5e2, 1e3, 1e4),
    p = c(4, 7, 10, 15, 20, 30, 50),
    distr = c('student_t'),
    tail_index = c(1.5, 2.5, 3.5),
    has_confounder = c(F, T),
    is_nonlinear = c(F, T),
    has_uniform_margins = c(F, T))

  my_args <- expand.grid(my_args, stringsAsFactors = F) %>%
    as_tibble() %>%
    rowwise() %>%
    mutate(prob_connect = min(5/(p - 1), 1/2)) %>%
    filter(has_confounder + is_nonlinear + has_uniform_margins <= 1) %>%
    rowid_to_column("id") %>%
    select(id, everything())

  return(my_args)
}

method_settings <- function(){
  ## void -> tibble
  ## returns a tibble with all the used methods
  method_tbl <- tibble(method = c("ease", "ica_lingam",
                                  "direct_lingam", "pc_rank", "random"))

  return(method_tbl)
}

set_simulations <- function(experiment_ids=1:NROW(simulation_settings()),
                            method_nms=method_settings()$method,
                            seed){
  ## numeric_vector character_vector integer -> list
  ## prepares the settings for the simulations
  ## INPUTS:
  ## - experiment_ids: a numeric vector that specifies the rows of the tibble generated
  ## by calling the function simulation_settings().
  ## - method_ids: a character vector that specifies the method to use. This is
  ## a subset of methods from the output of method_settings().
  ## - seed: an integer seed that determines the outcome of the simulation.
  ##
  ## RETURNS:
  ## The function returns a list made of:
  ## - simulation_arguments: a tibble generated by the function simulation_settings(),
  ## where only the rows in experiments_ids are kept.
  ## - method_arguments: a character vectors generated by the function method_settings(),
  ## where only the methods in method_nms are kept.
  ## NOTE:
  ## The simulations are fully repeatable. That is, even after running the full
  ## simulation, you can repeat it over a subset of arguments/methods.
  ## This is possible because this function assigns to each simulation run
  ## and method a unique random seed. These random seeds are generated with
  ## L'Ecuyer RNG method and are independent of each other.

  # set simulation options
  simulation_arguments <- simulation_settings()
  m <- NROW(simulation_arguments)
  if (any(!(experiment_ids %in% 1:m))){
    stop(paste("Argument experiment_ids must contain integers between 1 and ",
               m, ".", sep = ""))
  }

  experiment_ids <- sort(unique(experiment_ids))

  # set methods
  method_arguments <- method_settings()
  k <- NROW(method_arguments)
  if (any(!(method_nms %in% method_arguments$method))){
    stop(paste("Argument method_nms must contains a subset of these methods: ",
               paste(method_arguments, collapse = ", "), ".", sep = ""))
  }

  # create independent RNG streams with L'Ecuyer method
  rng <- RNGseq(m + m * k, seed = seed)

  # select RNG streams for simulations
  sims_rng_stream <- rng[1:m]
  simulation_arguments$rng <- sims_rng_stream

  # select RNG streams for methods
  meth_rng_stream <- list()

  for (j in 1:k){
    # cat(j*m + experiment_ids, '\n')
    meth_rng_stream[[j]] <- rng[j * m + experiment_ids]
  }

  method_arguments$rng <- meth_rng_stream

  # filter simulations
  simulation_arguments <- simulation_arguments %>%
    filter(id %in% experiment_ids)

  # filter methods
  method_arguments <- method_arguments %>%
    filter(method %in% method_nms)

  # return list
  list(simulation_arguments=simulation_arguments,
       method_arguments=method_arguments)

}

get_method_args <- function(method = c("ease", "lingam", "ica_lingam",
                                       "direct_lingam",
                                       "pc", "pc_rank", "random"), n){
  ## character integer -> list
  ## produces a list with the arguments used by the method in the simulation
  method <- match.arg(method)
  switch(method,
         ease = {
           list(k = floor(n ** 0.4))
         },
         ica_lingam = {
           list(contrast_fun = "logcosh")
         },
         direct_lingam = {
           list()
         },
         pc_rank = {
           list(alpha = 5e-4)
         },
         random = {
           list()
         },
         lingam = {
           list(contrast_fun = "logcosh")
         })
}

causal_discovery_wrapper <- function(dat, method, set_args){
  time1 <- Sys.time()
  ll <- causal_discovery(dat, method, set_args)
  ll$time <- Sys.time() - time1

  return(ll)
}

wrapper_sim <- function(i, j, sims_args, method_args, trimdata = FALSE,
                        meth_args = FALSE){

  # Checks
  if (meth_args){
    if (!("set_args" %in% names(method_args))){
      stop("When meth_args = TRUE, the tibble `method_args` must contain the
           column `set_args`.")
    }
  } else {
    if ("set_args" %in% names(method_args)){
      stop("When meth_args = FALSE, the tibble `method_args` must *not* contain the
           column `set_args`.")
    }
  }

  # Set up index variables
  m <- nrow(sims_args)
  k <- nrow(method_args)
  cat("Simulation", i, "out of", m, "--- Inner iteration", j, "out of", k, "\n")

  # Simulate data
  current_exp <- sims_args[i, ]
  args_simulate <- current_exp %>% select(-experiment, -id, -rng)

  rng_sims <- current_exp$rng[[1]]
  rngtools::setRNG(rng_sims)
  X <- do.call(simulate_data, args_simulate)

  if (trimdata){
    X$dataset <- trim_data(X$dataset)
  }

  n <- NROW(X$dataset)


  # Run method
  method <- method_args$method[j]

  if (meth_args){
    set_args <- method_args$set_args[[j]]
  } else {
    set_args <- get_method_args(method, n)
  }

  rng_method <- method_args$rng[[j]][[i]]
  rngtools::setRNG(rng_method)

  out <- causal_discovery_wrapper(dat = X$dataset,
                                  method = method,
                                  set_args = set_args)

  # Evaluate algorithms
  algo_result <- causal_metrics(simulated_data = X,
                                estimated_graphs = out[1:2])

  # Return tibble
  res <- current_exp %>% select(id) %>%
    bind_cols(tibble(method = method)) %>%
    bind_cols(tibble(sid = algo_result$sid,
                     shd = algo_result$shd,
                     time = out$time))

  if (meth_args){
    # create tibble with argument name-value pair
    if (!length(set_args)){
      args_tbl <- tibble(arg_name = "", arg_value = NA)
    } else {
      args_tbl <- tibble(arg_name = names(set_args)[1],
                         arg_value = set_args[[1]])
    }

    res <- res %>%
      bind_cols(args_tbl)
  }

  # return value
  return(res)
}

trim_data <- function(data){
  ## numeric_matrix -> numeric_matrix
  ## keeps observations in the tails

  n <- nrow(data)
  p <- ncol(data)
  data_trimmed <- data %>%
    as_tibble() %>%
    mutate(id = 1:n()) %>%
    gather(key = "variab",
           value = "value",
           -id) %>%
    group_by(variab) %>%
    mutate(uq = quantile(value, .9),
           lq = quantile(value, .1),
           bulk = value > lq & value < uq,
           tail = !bulk) %>%
    select(id, variab, tail) %>%
    ungroup() %>%
    spread(variab, tail)

  is_in_tail <- (apply(data_trimmed[, -1], 1, sum) >= sqrt(p))
  is_in_bulk <- !(is_in_tail)

  c(sum(is_in_bulk)/n)

  tail_id <- which(is_in_tail)
  bulk_id <- which(is_in_bulk)
  bulk_id <- sample(x = bulk_id, size = length(bulk_id) / 2)

  ids <- sample(c(tail_id))

  data[ids, ]
}
nicolagnecco/causalXtreme documentation built on April 21, 2024, 4:22 a.m.