R/gmjmcmc.R
In FBMS: Flexible Bayesian Model Selection and Model Averaging

# Title     : GMJMCMC
# Objective : Genetically Modified Mode Jumping MCMC algorithm
# Created by: jonlachmann
# Created on: 2021-02-11

# Allow the package to access Rcpp functions
#' @useDynLib FBMS
#' @importFrom Rcpp sourceCpp
NULL

#' Main algorithm for GMJMCMC (Genetically Modified MJMCMC)
#'
#' @param x matrix containing the design matrix with data to use in the algorithm
#' @param y response variable
#' @param transforms A character vector including the names of the non-linear functions to be used by the modification and the projection operator. 
#' @param P The number of population iterations for GMJMCMC. The default value is P = 10, which was used in our initial example for illustrative purposes. However, a larger value, such as P = 50, is typically more appropriate for most practical applications.
#' @param N The number of MJMCMC iterations per population. 
#' The default value is N = 100; however, for real applications, a larger value such as N = 1000 or higher is often preferable.
#' @param N.final The number of MJMCMC iterations performed for the final population. Per default one has N.final = N, but for practical applications, a much larger value (e.g., N.final = 1000) is recommended. Increasing N.final is particularly important if predictions and inferences are based solely on the last population.
#' @param probs A list of various probability vectors used by GMJMCMC, generated by \code{gen.probs.gmjmcmc}. 
#'   The key component \code{probs.gen} defines probabilities of different operators in the feature generation process. 
#'   Defaults typically favor interactions and modifications (0.4 each) over projections and mutations (0.1 each) to encourage interpretable nonlinear features.
#' @param params A list of various parameter vectors used by GMJMCMC, generated by \code{gen.params.gmjmcmc}.
#' @param loglik.pi A function specifying the marginal log-posterior of the model up to a constant, including the logarithm of the model prior:
#'   \eqn{\log p(M|Y) = \text{const} + \log p(Y|M) + \log p(M)}.
#'   Typically assumes a Gaussian model with Zellner's with \eqn{g = max(n,p^2) by default}.
#' @param loglik.alpha Relevant only if the non-linear projection features depend on parameters \eqn{\alpha}. 
#'   If \eqn{\alpha} is estimated, this argument specifies the corresponding marginal log-likelihood. 
#'   The default method sets all \eqn{\alpha} to 1 (fastest, but sometimes suboptimal). 
#'   Alternative estimation strategies ("deep" and "random") are implemented in \pkg{FBMS}.
#' @param mlpost_params All parameters for the estimator function loglik.pi
#' @param intercept Logical. Whether to include an intercept in the design matrix. Default is \code{TRUE}. No variable selection is performed on the intercept.
#' @param fixed Integer specifying the number of leading columns in the design matrix to always include in the model. Default is 0.
#' @param sub Logical. If \code{TRUE}, uses subsampling or a stochastic approximation approach to the likelihood rather than the full likelihood. Default is \code{FALSE}. 
#' @param verbose Logical. Whether to print messages during execution. Default is \code{TRUE} for \code{gmjmcmc} and \code{FALSE} for the parallel version.
#'
#' @return A list containing the following elements:
#' \item{models}{All models per population.}
#' \item{mc.models}{All models accepted by mjmcmc per population.}
#' \item{populations}{All features per population.}
#' \item{marg.probs}{Marginal feature probabilities per population.}
#' \item{model.probs}{Marginal feature probabilities per population.}
#' \item{model.probs.idx}{Marginal feature probabilities per population.}
#' \item{best.margs}{Best marginal model probability per population.}
#' \item{accept}{Acceptance rate per population.}
#' \item{accept.tot}{Overall acceptance rate.}
#' \item{best}{Best marginal model probability throughout the run, represented as the maximum value in \code{unlist(best.margs)}.}
#'
#' @examples
#' result <- gmjmcmc(y = matrix(rnorm(100), 100),
#' x = matrix(rnorm(600), 100), 
#' P = 2, 
#' transform = c("p0", "exp_dbl"))
#' summary(result)
#' plot(result)
#'
#' @export gmjmcmc
gmjmcmc <- function (
  x,
  y,
  transforms,
  P = 10,
  N = 100,
  N.final = NULL,
  probs = NULL,
  params = NULL,
  loglik.pi = NULL,
  loglik.alpha = gaussian.loglik.alpha,
  mlpost_params = list(family = "gaussian", beta_prior = list(type = "g-prior")),
  intercept = TRUE,
  fixed = 0,
  sub = FALSE,
  verbose = TRUE
) {
  # Verify that the data is well-formed
  if (intercept) {
    x <- cbind(1, x)
    fixed <- fixed + 1
  }
  data <- check.data(x, y, fixed, verbose)

  if (is.null(loglik.pi)) {
    if (is.null(mlpost_params$beta_prior$type) || is.null(mlpost_params$family))
      stop("mlpost_params$beta_prior and mlpost_params$family must be specified")
    loglik.pi <- select.mlpost.fun(mlpost_params$beta_prior$type, mlpost_params$family)
    if (mlpost_params$family == "gaussian")
      mlpost_params$beta_prior <- gen.mlpost.params.lm(mlpost_params$beta_prior$type, mlpost_params$beta_prior, ncol(data$x) - 1, nrow(data$x))
    else {
      mlpost_params$beta_prior <- gen.mlpost.params.glm(mlpost_params$beta_prior$type, mlpost_params$beta_prior, ncol(data$x) - 1, nrow(data$x))
      mlpost_params$beta_prior$type <- mlpost_params$beta_prior$type
    }
    
  }
  
  # Generate default probabilities and parameters if there are none supplied.
  if (is.null(probs)) probs <- gen.probs.gmjmcmc(transforms)
  if (is.null(params)) params <- gen.params.gmjmcmc(ncol(data$x) - data$fixed)
  if (!is.null(mlpost_params)) params$mlpost <- mlpost_params
  # Extract labels from column names in dataframe
  labels <- get.labels(data, verbose)
  # Set the transformations option
  set.transforms(transforms)
  # Acceptance probability per population
  accept <- vector("list", P)
  accept <- lapply(accept, function (x) x <- 0)
  # A list of populations that have been visited
  S <- vector("list", P)
  # A list of models that have been visited, refering to the populations
  models <- vector("list", P)
  mc.models <- vector("list", P)
  # A list of all the marginal probabilities for the features, per population
  marg.probs <- vector("list", P)
  # A list of all the marginal probabilities for the models, per population
  model.probs <- vector("list", P)
  # A list of all the indices of the models which the marginal probabilities for the models refer to, per population
  model.probs.idx <- vector("list", P)
  # A list of all the best marginal model likelihoods, per population
  best.margs <- vector("list", P)

  # Create first population
  F.0 <- gen.covariates(data)
  if (is.null(params$prel.select))
    S[[1]] <- F.0
  else
    S[[1]] <- F.0[params$prel.select]

  complex <- complex.features(S[[1]])

  if(length(N.final) == 0)
    N.final <- N
  ### Main algorithm loop - Iterate over P different populations
  N.this <- N
  for (p in seq_len(P)) {
    # Set population iteration count
    if (p == P) N.this <- N.final
    # Precalculate covariates and put them in data.t
    if (length(params$feat$prel.filter) > 0 | p != 1) data.t <- precalc.features(data, S[[p]])
    else data.t <- data
    
    # Initialize first model of population
    model.cur <- as.logical(rbinom(n = length(S[[p]]), size = 1, prob = 0.5))
    model.cur.res <- loglik.pre(loglik.pi, model.cur, complex, data.t, params$mlpost, NULL, FALSE)
    model.cur <- list(prob = 0, model = model.cur, coefs = model.cur.res$coefs, crit = model.cur.res$crit, alpha = 0)
    best.crit <- model.cur$crit # Reset first best criteria value

    # Run MJMCMC over the population
    if (verbose) cat(paste("Population", p, "begin."))
    mjmcmc_res <- mjmcmc.loop(data.t, complex, loglik.pi, model.cur, N.this, probs, params, sub, verbose)
    if (verbose) cat(paste("\nPopulation", p, "done.\n"))

    # Add the models visited in the current population to the model list
    models[[p]] <- mjmcmc_res$models
    mc.models[[p]] <- mjmcmc_res$mc.models
    # Store marginal likelihoods for current features
    marg.probs[[p]] <- mjmcmc_res$marg.probs
    # Store marginal likelihoods for the visited models
    model.probs[[p]] <- mjmcmc_res$model.probs
    # Store indices for which the marginal likelihoods for the visited models refer to
    model.probs.idx[[p]] <- mjmcmc_res$model.probs.idx
    # Store best marginal model probability for current population
    best.margs[[p]] <- mjmcmc_res$best.crit
    # Store the accepted number of steps for the current population
    accept[[p]] <- mjmcmc_res$accept
    # Print the marginal posterior distribution of the features after MJMCMC
    if (verbose) {
      cat(paste("\rCurrent best crit:", mjmcmc_res$best.crit, "\n"))
      cat("Feature importance:\n")
      print_dist(marg.probs[[p]], sapply(S[[p]], print.feature, labels = labels, round = 2), probs$filter)
    }
    if (params$rescale.large) prev.large <- params$large
    # Generate a new population of features for the next iteration (if this is not the last)
    if (p != P) {
      S[[p + 1]] <- gmjmcmc.transition(S[[p]], F.0, data, loglik.alpha, marg.probs[[1]], marg.probs[[p]], labels, probs, params$feat, verbose)
      complex <- complex.features(S[[p + 1]])
      if (params$rescale.large) params$large <- lapply(prev.large, function(x) x * length(S[[p + 1]]) / length(S[[p]]))
    }
  }
  # Calculate acceptance rate
  accept.tot <- sum(unlist(accept)) / (N * (P - 1) + N.final)
  accept <- lapply(accept, function (x) x / N)
  accept[[P]] <- accept[[P]] * N / N.final
  # Return formatted results
  results <- list(
    models = models,                   # All models per population
    mc.models = mc.models,             # Only mjmcmc models per population
    populations = S,                   # All features per population
    marg.probs = marg.probs,           # Marginal feature probabilities per population
    model.probs = model.probs,         # Marginal feature probabilities per population
    model.probs.idx = model.probs.idx, # Marginal feature probabilities per population
    best.margs = best.margs,           # Best marginal model probability per population
    accept = accept,                   # Acceptance rate per population
    accept.tot = accept.tot,           # Overall acceptance rate
    best = max(unlist(best.margs)),    # Best marginal model probability throughout the run
    transforms = transforms            # Transformations used by the model
  )
  results$labels <- labels
  results$fixed <- fixed
  results$intercept <- intercept
  results$ncov <- ncol(data$x) - data$fixed
  attr(results, "class") <- "gmjmcmc"
  return(results)
}


#' Subalgorithm for generating a new population of features in GMJMCMC (Genetically Modified MJMCMC)
#'
#' @param S.t The current population of features
#' @param F.0 The initial population of features, i.e. the bare covariates
#' @param data The data used in the model, here we use it to generate alphas for new features
#' @param loglik.alpha The log likelihood function to optimize the alphas for
#' @param marg.probs The marginal inclusion probabilities of the current features
#' @param marg.probs.F.0 The marginal inclusion probabilities of the initial population of features
#' @param labels Variable labels for printing
#' @param probs A list of the various probability vectors to use
#' @param params A list of the various parameters for all the parts of the algorithm
#' @param verbose A logical denoting if messages should be printed
#'
#' @return The updated population of features, that becomes S.t+1
#' 
#' @noRd
#' 
gmjmcmc.transition <- function (S.t, F.0, data, loglik.alpha, marg.probs.F.0, marg.probs, labels, probs, params, verbose = TRUE) {
  # Sample which features to keep based on marginal inclusion below probs$filter
  feats.keep <- as.logical(rbinom(n = length(marg.probs), size = 1, prob = pmin(marg.probs / probs$filter, 1)))

  # Always keep original covariates if that setting is on
  if (params$keep.org) {
    if (params$prel.filter > 0) {
      # Do preliminary filtering if turned on
      feats.keep[(seq_along(F.0))[marg.probs.F.0 > params$prel.filter]] <- T
    } # Keep all if no preliminary filtering
    else feats.keep[seq_along(F.0)] <- T
  }

  # Avoid removing too many features
  if (length(feats.keep) > 0 && mean(feats.keep) < params$keep.min & sum(feats.keep) < params$pop.max/2) {
    feats.add.n <- round((params$keep.min - mean(feats.keep)) * length(feats.keep))
    feats.add <- sample(which(!feats.keep), feats.add.n)
    if ((length(feats.add) + sum(feats.keep)) >= params$pop.max)
      feats.keep[feats.add] <- T
  }
  
  if (sum(feats.keep)>params$pop.max) {
    warning("Number of features to keep greater than pop.max! 
            Continue with first pop.max features to be kept!
            \n Ignore if the final set of features with high probabilities is smaller than the specified $feat$pop.max
            \n Otherwise check your tuning parameters and increase $feat$pop.max or probs$filter!")
    feats.keep[which(feats.keep==TRUE)[(params$pop.max+1):length(which(feats.keep==TRUE))]] <- FALSE
  }

  # Create a list of which features to replace
  feats.replace <- which(!feats.keep)

  # TODO: Let filtered features become part of new features - tuning parameter
  # Create a list of inclusion probabilities
  marg.probs.use <- c(rep(params$eps, length(F.0)), pmin(pmax(marg.probs, params$eps), (1-params$eps)))

  # Perform the replacements
  if (length(S.t) > params$pop.max)
    feats.replace <- sort(feats.replace,decreasing = T)
  for (i in feats.replace) {
    prev.size <- length(S.t)
    prev.feat.string <- print.feature(S.t[[i]], labels = labels, round = 2)
    if (prev.size > params$pop.max) {
      cat("Removed feature", prev.feat.string, "\n")
      S.t[[i]] <- NULL
    } else {
      S.t[[i]] <- gen.feature(c(F.0, S.t), marg.probs.use, data, loglik.alpha, probs, length(F.0), params, verbose)
      if (prev.size > length(S.t)) {
        if (verbose) {
          cat("Removed feature", prev.feat.string, "\n")
          cat("Population shrinking, returning.\n")
        }
        return(S.t)
      }
      if (verbose) cat("Replaced feature", prev.feat.string, "with", print.feature(S.t[[i]], labels=labels, round = 2), "\n")
      feats.keep[i] <- T
      marg.probs.use[i] <- mean(marg.probs.use)
    }
  }

  # Add additional features if the population is not at max size
  if (length(S.t) < params$pop.max) {
    for (i in (length(S.t)+1):params$pop.max) {
      prev.size <- length(S.t)
      S.t[[i]] <- gen.feature(c(F.0, S.t), marg.probs.use, data, loglik.alpha, probs, length(F.0), params, verbose)
      if (prev.size == length(S.t)) {
        if (verbose) cat("Population not growing, returning.\n")
        return(S.t)
      }
      if (verbose) cat("Added feature", print.feature(S.t[[i]], labels=labels, round = 2), "\n")
      marg.probs.use <- c(marg.probs.use, params$eps)
    }
  }
  return(S.t)
}