
#' MICE-assisted Approximate Bayesian Calibration
#'
#' Produces a list with multiple waves of proposed input parameter values to
#' match a vector of target features.
#'
#' The \code{model} wrapper function for the simulation model must have a vector
#' of model input parameter values as its one and only argument. Furthermore,
#' it must return a vector of model features. These model features are then
#' compared against the target features.
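#'
#' A minimal sketch of such a wrapper, assuming a hypothetical two-parameter
#' toy model in which the first element of the input vector is the random seed
#' and the remaining elements are the model parameters:
#' \preformatted{
#' toy.model <- function(inputvector) {
#'   set.seed(inputvector[1])            # random seed
#'   sims <- rnorm(n = 1000,
#'                 mean = inputvector[2], # first model parameter
#'                 sd = inputvector[3])   # second model parameter
#'   c(mean(sims), sd(sims))             # vector of model features
#' }
#' }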
#'
#' @param targets.empirical The vector of target features
#' @param model Wrapper function for the simulation model. See details for a
#'   description of the required format.
#' @param RMSD.tol.max Tolerance for the root mean squared distance between
#'   target features and model output features
#' @param min.givetomice Minimal number of observations in the training dataset
#'   to which MICE is applied
#' @param n.experiments Number of parameter combinations in each wave of model
#' runs
#' @param start.experiments If set to NULL (default), start experiments will be
#'   drawn uniformly from the prior distributions. If a matrix of input
#'   parameter values (e.g. the output of a previous calibration) is supplied,
#'   the model is run with these inputs instead of drawing from the prior
#'   distributions. To resume where a previous calibration ended, supply
#'   start.experiments as a data.frame with inputs, outputs, seed, wave and
#'   RMSD values.
#' @param lls Vector of lower limits of the prior distribution of input
#'   parameter values
#' @param uls Vector of upper limits of the prior distribution of input
#'   parameter values
#' @param strict.positive.params Vector of indices that indicate which of the
#'   input parameters are strictly positive. Set to zero if there are no such
#'   parameters.
#' @param probability.params Vector of indices that indicate which of the input
#'   parameters are strictly between 0 and 1. Set to zero if there are no such
#'   parameters.
#' @param inside_prior TRUE by default. If FALSE, parameter sampling is not
#'   restricted to the initial ranges of the prior distribution during the
#'   subsequent algorithm steps.
#' @param method Method used by MICE. Currently, only "norm" is supported.
#' @param predictorMatrix As in mice::mice. Can be "complete", "LASSO" (Least
#'   Absolute Shrinkage and Selection Operator), or a user-defined matrix of
#'   zeros and ones. Diagonal must always be zeros. Ones indicate which
#'   variables are included in the chained equations in MICE.
#' @param maxit The maxit argument used in MICE (number of times that the
#'   chained equations are cycled through)
#' @param maxwaves The maximum number of waves of model runs
#' @param n_cores The number of cores available for parallel model runs.
#'   Default = 1, i.e. serial execution of model runs.
#' @param multinode TRUE or FALSE (default). If TRUE, model runs are distributed
#'   over the cores of multiple nodes, using doSNOW and snow as the back-end to
#'   the foreach package. If FALSE and n_cores > 1, model runs are distributed
#'   over the cores of a single node, using the parallel package.
#' @return A list with the following components:
#'   \item{results}{A list with \code{maxwaves} elements. Each element is
#'   itself a list with 6 named elements:
#'   \itemize{
#'     \item \code{thiswave.io}: Input and output of the simulations of this
#'       wave.
#'     \item \code{thiswave.median.features}: The l1 median of this wave's
#'       model output.
#'     \item \code{upto.thiswave.median.features}: The l1 median of all model
#'       output up to and including this wave.
#'     \item \code{closest.upto.thiswave.median.features}: The l1 median of the
#'       \code{min.givetomice} model features that are closest to the target
#'       features, up to this wave.
#'     \item \code{max.RMSD}: The largest root mean square distance of the
#'       \code{min.givetomice} model features that are closest to the target
#'       features, up to this wave. \code{max.RMSD} is expected to decrease
#'       with increasing waves.
#'     \item \code{closest.upto.thiswave.io}: Input and output of the
#'       \code{min.givetomice} simulations whose features are closest to the
#'       target features, up to this wave. These input parameters make up the
#'       approximate posterior distribution.
#'   }}
#'   \item{targets}{The vector of target features}
#'   \item{secondspassed}{The time (in seconds) it took MABC to complete the
#'   calibration}
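#'
#' @examples
#' \dontrun{
#' # Illustrative sketch only: a hypothetical two-parameter toy model with
#' # made-up targets and prior limits.
#' toy.model <- function(inputvector) {
#'   set.seed(inputvector[1])
#'   sims <- rnorm(1000, mean = inputvector[2], sd = inputvector[3])
#'   c(mean(sims), sd(sims))
#' }
#' calib <- MABC(targets.empirical = c(1, 2),
#'               model = toy.model,
#'               lls = c(-5, 0.01),
#'               uls = c(5, 5),
#'               strict.positive.params = 2,
#'               probability.params = 0,
#'               maxwaves = 2,
#'               n_cores = 1)
#' }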
#'
#' @import mice
#' @importFrom gsubfn strapplyc
#' @importFrom randtoolbox sobol
#' @importFrom pcaPP l1median
#' @importFrom glmnet cv.glmnet
#' @importFrom mvtnorm dmvnorm
#' @importFrom gtools smartbind
#' @importFrom purrr modify_depth
#' @import dplyr
#' @import tidyverse
#' @export

MABC <- function(targets.empirical,
                 model,
                 RMSD.tol.max = 2,
                 min.givetomice = 64,
                 n.experiments = 256,
                 start.experiments = NULL,
                 lls,
                 uls,
                 strict.positive.params,
                 probability.params,
                 inside_prior = TRUE,
                 method = "norm",
                 predictorMatrix = "complete",
                 maxit = 50,
                 maxwaves = 4,
                 n_cores = 1,
                 multinode = FALSE){
  # 0. Start the clock
  ptm <- proc.time()
  # initiating the list where all MABC output will be stored
  calibration.list <- list()
  results <- vector("list", length = maxwaves)
  calibration.list$results <- results
  wave <- 1 # initiating the loop of waves of simulations (one iteration is one wave)
  max.RMSD <- Inf # initially it is infinitely large, but in later iterations it shrinks
  #sim.results.with.design.df <- NULL # Will be growing with each wave (appending)
  #sim.results.with.design.df.selected <- NULL
  #final.intermediate.features <- NULL

  modelstring <- unlist(paste0(deparse(model), collapse = " "))
  input.vector.length <- max(unique(stats::na.omit(as.numeric(gsubfn::strapplyc(modelstring, "[[](\\d+)[]]", simplify = TRUE))))) - 1 # minus one because the first input parameter is the random seed
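  # Illustration of the extraction above (hypothetical wrapper): if the deparsed
  # model body contains "inputvector[1]", "inputvector[2]" and "inputvector[3]",
  # the regex captures 1, 2 and 3, so input.vector.length becomes 3 - 1 = 2
  # (the seed in position 1 is not a model parameter).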


  # input.vector.length <- max(unique(na.omit(as.numeric(unlist(strsplit(unlist(paste0(deparse(model), collapse = " ")), "[^0-9]+"))))))
  output.vector.length <- length(targets.empirical)


  x.names <- paste0("x.", seq_len(input.vector.length))
  y.names <- paste0("y.", seq_len(output.vector.length))
  x.offset <- length(x.names)
  sim.results.with.design.df <- data.frame(matrix(vector(), 0, (input.vector.length + output.vector.length),
                                                  dimnames = list(c(), c(x.names, y.names))),
                                           stringsAsFactors = FALSE) # Will be growing with each wave (appending)
  # sim.results.with.design.df.selected does not need to be initialised here; it is created inside the wave loop
  final.intermediate.features <- rep(NA, times = length(targets.empirical))

  # 1. Start loop of waves
  while (wave <= maxwaves & max.RMSD > 0){
    print(c("wave", wave), quote = FALSE)

    if (wave == 1){
      if (identical(start.experiments, NULL)) {
        # 2. Initial, naive results, based on Sobol sequences
        range.width <- uls - lls
        ll.mat <- matrix(rep(lls, n.experiments), nrow = n.experiments, byrow = TRUE)
        range.width.mat <- matrix(rep(range.width, n.experiments), nrow = n.experiments, byrow = TRUE)
        sobol.seq.0.1 <- sobol(n = n.experiments, dim = length(lls), init = TRUE, scrambling = 1, seed = 1, normal = FALSE)
        experiments <- ll.mat + sobol.seq.0.1 * range.width.mat
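        # Each row of `experiments` is lls + u * (uls - lls), where u is a row of
        # the Sobol sequence in [0, 1]^d, giving a space-filling initial design
        # over the prior box.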
      } else {
        if (ncol(start.experiments) == input.vector.length) {
          experiments <- start.experiments
        }
      }

    }

    if (exists("experiments", mode = "numeric")){
      if (multinode == TRUE){
        sim.results.simple <- model.mpi.run(model = model,
                                            actual.input.matrix = experiments,
                                            seed_count = 0,
                                            n_cores = n_cores)
      } else {
        sim.results.simple <- model.parallel.run(model = model,
                                                 actual.input.matrix = experiments,
                                                 seed_count = 0,
                                                 n_cores = n_cores)
      }

      new.sim.results.with.design.df <- as.data.frame(cbind(experiments,
                                                            sim.results.simple,
                                                            rep(wave, times = nrow(experiments))))

      names(new.sim.results.with.design.df) <- c(x.names, y.names, "seed", "wave")

      new.sim.results.with.design.complete <- stats::complete.cases(new.sim.results.with.design.df)
      new.sim.results.with.design.df <- dplyr::filter(new.sim.results.with.design.df,
                                                      new.sim.results.with.design.complete)

      if (wave == 1){ # TRUE for the first wave only
        sim.results.with.design.df <- rbind(sim.results.with.design.df,
                                            new.sim.results.with.design.df)
      } else {
        sim.results.with.design.df <- rbind(dplyr::select(sim.results.with.design.df,
                                                          -contains("RMSD")),
                                            new.sim.results.with.design.df)
      }



      diff.matrix <- sweep(x = sim.results.with.design.df[ , ((1 + x.offset):(x.offset + length(targets.empirical)))],
                           MARGIN = 2,
                           targets.empirical)
      rel.diff.matrix <- sweep(diff.matrix, MARGIN = 2,
                               targets.empirical, FUN = "/")
      squared.rel.diff.matrix <- rel.diff.matrix^2
      sum.squared.rel.diff <- rowSums(squared.rel.diff.matrix)
      RMSD <- sqrt(sum.squared.rel.diff / length(targets.empirical))
      sim.results.with.design.df$RMSD <- RMSD
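      # RMSD per experiment: sqrt(mean(((y - target) / target)^2)) across the
      # target features. Worked example (made-up numbers): with targets
      # c(10, 20) and model features c(12, 18), the relative differences are
      # 0.2 and -0.1, so RMSD = sqrt((0.04 + 0.01) / 2) ~= 0.158.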

    } else {
      sim.results.with.design.df <- start.experiments
      sim.results.with.design.df$wave <- 1 # everything that happened previously is now part of wave 1
      new.sim.results.with.design.df <- dplyr::select(sim.results.with.design.df,
                                                      -contains("RMSD"))
      RMSD <- sim.results.with.design.df$RMSD
    }

    n.close.to.targets <- min.givetomice
    final.intermediate.features <- targets.empirical

    # 5. Select n.close.to.targets shortest distances
    dist.order <- order(RMSD) # Ordering the RMSD values from small to large.
    selected.distances <- dist.order[1:n.close.to.targets]
    sim.results.with.design.df.selected <- sim.results.with.design.df[selected.distances, ]

    calibration.list$results[[wave]]$thiswave.io <- new.sim.results.with.design.df

    # 5.aaaa Keeping track of medians
    # The median of the simulations in the latest wave
    calibration.list$results[[wave]]$thiswave.median.features <- pcaPP::l1median(dplyr::select(new.sim.results.with.design.df, contains("y.")))
    # The median of all simulations up to and including the latest wave
    calibration.list$results[[wave]]$upto.thiswave.median.features <- pcaPP::l1median(dplyr::select(sim.results.with.design.df, contains("y.")))
    # The median of the simulations to give to mice
    calibration.list$results[[wave]]$closest.upto.thiswave.median.features <- pcaPP::l1median(dplyr::select(sim.results.with.design.df.selected, contains("y.")))

    # 5.b. Record the highest RMSD value among the selected experiments
    max.RMSD <- max(sim.results.with.design.df.selected$RMSD)
    calibration.list$results[[wave]]$max.RMSD <- max.RMSD

    # 6. Record selected experiments to give to mice for this wave
    calibration.list$results[[wave]]$closest.upto.thiswave.io <- sim.results.with.design.df.selected

    mice.test <- list()
    if (max.RMSD <= RMSD.tol.max & wave < maxwaves){
      # 7. Put intermediate features in dataframe format
      final.intermediate.features.df <- as.data.frame(matrix(final.intermediate.features, ncol = length(final.intermediate.features)))
      names(final.intermediate.features.df) <- y.names

      # 8. Prepare dataframe to give to mice: selected experiments plus intermediate features
      df.give.to.mice <- gtools::smartbind(dplyr::select(sim.results.with.design.df.selected,
                                                         -one_of(c("RMSD", "seed", "wave"))), # adding target to training dataset
                                           final.intermediate.features.df)

      if (!identical(strict.positive.params, 0)){
        df.give.to.mice[, strict.positive.params] <- log(df.give.to.mice[, strict.positive.params])
      }
      if (!identical(probability.params, 0)){
        df.give.to.mice[, probability.params] <- log(df.give.to.mice[, probability.params] / (1 - df.give.to.mice[, probability.params])) # logit transformation
      }
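      # These transforms map bounded parameters to the real line so that the
      # "norm" imputation model in mice can propose unconstrained values; the
      # proposals are backtransformed (exp and inverse-logit) further below.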

      if (is.numeric(predictorMatrix)){
        predictorMatrix.give.to.mice <- predictorMatrix
      }

      if (identical(predictorMatrix, "LASSO")){
        predictorMatrix.LASSO <- diag(0, ncol = ncol(df.give.to.mice), nrow = ncol(df.give.to.mice))
        all.names <- names(df.give.to.mice)

        nrows.training.df <- dplyr::select(sim.results.with.design.df.selected,
                                           -one_of(c("RMSD", "seed", "wave"))) %>% nrow()
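        # For each column of df.give.to.mice treated in turn as the response,
        # fit a cross-validated LASSO on the training rows and mark the
        # predictors with non-zero coefficients (at lambda.1se) with a 1 in the
        # corresponding row of predictorMatrix.LASSO.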

        for(y.index in 1:ncol(df.give.to.mice)){
          x4lasso <- as.matrix(df.give.to.mice[1:nrows.training.df, -y.index])
          y4lasso <- as.numeric(df.give.to.mice[1:nrows.training.df, y.index])
          alpha <- 1
          cvfit <- glmnet::cv.glmnet(x = x4lasso,
                                     y = y4lasso,
                                     family = "gaussian",
                                     alpha = alpha,
                                     nlambda = 20)
          remaining.indices <- stats::coef(cvfit, s = "lambda.1se")@i
          nonzero.names <- names(df.give.to.mice[-nrow(df.give.to.mice), -y.index])[remaining.indices] # These are the columns with non-zero coefficients
          col.indices <- all.names %in% nonzero.names
          predictorMatrix.LASSO[y.index, col.indices] <- 1
        }

        # We will now replace the "all zero" rows in predictorMatrix.LASSO with rows of the "complete" predictorMatrix
        # Finding out which rows are all zero
        all.zero.rows <- rowSums(predictorMatrix.LASSO) == 0
        # Creating complete predictorMatrix
        predictorMatrix.complete <- (1 - diag(1, ncol(df.give.to.mice)))
        # Doing the appropriate replacements
        predictorMatrix.LASSO[all.zero.rows, ] <- predictorMatrix.complete[all.zero.rows, ]
        predictorMatrix.give.to.mice <- predictorMatrix.LASSO
      }

      if (identical(predictorMatrix, "complete")){
        predictorMatrix.give.to.mice <- (1 - diag(1, ncol(df.give.to.mice)))
      }

      # do imputation
      mice.test <- tryCatch(mice.parallel(df.give.to.mice,
                                          m = 1,
                                          method = method,
                                          predictorMatrix = predictorMatrix.give.to.mice,
                                          maxit = maxit,
                                          printFlag = FALSE,
                                          n_cores = 1,
                                          n_experiments = 10 * n.experiments),
                            error = function(mice.err) {
                              return(list())
                            })
    }
    if (length(mice.test) > 0){

      # 11. Turn mice proposals into a new matrix of experiments
      experiments <- mice.test %>%
        purrr::modify_depth(1, "imp") %>%
        unlist() %>%
        matrix(byrow = TRUE,
               ncol = length(x.names))

      # Before we check the suitability of the new experimental input parameter values, we must backtransform the log values to natural values
      if (!identical(strict.positive.params, 0)){
        experiments[, strict.positive.params] <- exp(experiments[, strict.positive.params])
      }
      # And we must also backtransform the logit-transformed values
      if (!identical(probability.params, 0)){
        experiments[, probability.params] <- exp(experiments[, probability.params]) / (1 + exp(experiments[, probability.params]))
      }

      # Lastly, we must create a new matrix, of the same dimensions as the naive "experiments" matrix,
      # But sampled with replacement using the inverse probability weights.
      experiments.df <- data.frame(experiments)

      # If inside_prior is TRUE, discard mice proposals that fall outside the prior limits.
      # sign(sweep(...)) is +1 for every parameter strictly above its lower limit
      # (respectively strictly below its upper limit), so a proposal is kept only when
      # all length(lls) parameters pass both checks.
      within.prior.limits <- rep(TRUE, n.experiments)
      if (inside_prior == TRUE){ # experiments.df has one row per mice proposal and length(lls) columns
        params.above.lls <- sign(sweep(x = experiments.df, MARGIN = 2, lls)) %>% rowSums()
        params.below.uls <- sign(sweep(x = -experiments.df, MARGIN = 2, -uls)) %>% rowSums()
        within.prior.limits <- params.above.lls %in% length(lls) & params.below.uls %in% length(uls)
        experiments.df <- experiments.df[within.prior.limits, ]
      }

      if (nrow(experiments.df) < n.experiments){
        wave <- maxwaves + 1
      } else {
        set.seed(0) # for reproducibility

        # Weight each proposal by the inverse of its density under a multivariate
        # normal fitted to the proposal cloud, so that over-represented regions
        # are down-weighted when resampling the next wave of experiments.
        mu.experiments <- colMeans(experiments.df)
        cov.experiments <- stats::cov(experiments.df)
        densities.experiments <- mvtnorm::dmvnorm(experiments.df, mu.experiments, cov.experiments)
        weights.experiments <- 1 / densities.experiments

        experiments <- matrix(unlist(dplyr::sample_n(experiments.df,
                                                     size = n.experiments,
                                                     replace = TRUE,
                                                     weight = weights.experiments)),
                              byrow = FALSE,
                              ncol = length(x.names))

        wave <- wave + 1
      }
    } else {
      wave <- maxwaves + 1
    }
  }

  # 15. Target features
  calibration.list$targets <- targets.empirical

  # 16. Stop clock and return calibration list
  calibration.list$secondspassed <- proc.time() - ptm # Stop the clock

  return(calibration.list)
}