# R/gp_opt.R


#' Calculates matrix of absolute distances raised to a power
#'
#' Used internally, calculates the absolute distances for values in a matrix
#' with possibly unequal dimensions, and raises these to a power.
#'
#' @param x1 numeric vector, with length corresponding to the number of rows in
#'   the returned matrix.
#' @param x2 numeric vector, with length corresponding to the number of columns
#'   in the returned matrix. If not specified, `x1` will be used for `x2`.
#' @param pow single numeric value, the power that all distances are raised to.
#'   Defaults to `2`, corresponding to pairwise, squared, Euclidean distances.
#'
#' @return Matrix with `length(x1)` rows and `length(x2)` columns containing
#'   the calculated absolute pairwise distances raised to `pow`.
#'
#' @keywords internal
#'
pow_abs_dist <- function(x1, x2 = x1, pow = 2) {
  m <- matrix(nrow = length(x1), ncol = length(x2))
  for (i in seq_along(x2)) {
    # R stores matrices in column-major order
    m[, i] <- abs(x1 - x2[i])^pow
  }
  m
}
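
# Illustrative sketch, not part of the package source: pairwise squared
# distances for three points, using the default `pow = 2`:
#
#   pow_abs_dist(c(0, 0.5, 1))
#   #>      [,1] [,2] [,3]
#   #> [1,] 0.00 0.25 1.00
#   #> [2,] 0.25 0.00 0.25
#   #> [3,] 1.00 0.25 0.00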



#' Estimates covariance matrices used by Gaussian process optimisation
#'
#' Used internally, estimates covariance matrices used by the Gaussian process
#' optimisation function. Calculates pairwise absolute distances raised to a
#' power (which defaults to `2`) using the [pow_abs_dist()] function, divides
#' the result by a `lengthscale` hyperparameter (which defaults to `1`, i.e., no
#' changes due to division), and subsequently returns the inverse exponentiation
#' of the resulting matrix.
#'
#' @inheritParams pow_abs_dist
#' @param g single numerical value; jitter/nugget value added to the diagonal
#'   if not `NULL` (the default); should be supplied if `x1` is the same as
#'   `x2`, to avoid potentially negative values in the matrix diagonal due to
#'   numerical instability.
#' @param lengthscale single numerical value; lengthscale hyperparameter that
#'   the matrix returned from [pow_abs_dist()] is divided by before the inverse
#'   exponentiation is done.
#'
#' @return Covariance matrix with `length(x1)` rows and `length(x2)` columns
#'   used by the Gaussian process optimiser.
#'
#' @keywords internal
#'
cov_mat <- function(x1, x2 = x1, g = NULL, pow = 2, lengthscale = 1) {
  res <- exp(-pow_abs_dist(x1, x2, pow) / lengthscale)
  if (!is.null(g)) {
    diag(res) <- diag(res) + g
  }
  res
}
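
# Illustrative sketch, not part of the package source: with the defaults
# `pow = 2` and `lengthscale = 1`, cov_mat() yields a squared exponential
# kernel, exp(-|x1 - x2|^2), with the nugget `g` added to the diagonal:
#
#   cov_mat(c(0, 1), g = 1e-8)
#   #>           [,1]      [,2]
#   #> [1,] 1.0000000 0.3678794
#   #> [2,] 0.3678794 1.0000000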



#' Gaussian process-based optimisation
#'
#' Used internally. Simple Gaussian process-based Bayesian optimisation
#' function, used to find the next value to evaluate (as `x`) in the
#' [calibrate_trial()] function. Uses only a single input dimension, which may
#' be rescaled to the `[0, 1]` range by the function, and a covariance structure
#' based on absolute distances between values, raised to a power (`pow`) and
#' subsequently divided by `lengthscale` before the inverse exponentiation of
#' the resulting matrix is used. The `pow` and `lengthscale` hyperparameters
#' consequently control the smoothness by controlling the rate of decay of
#' correlations with distance.\cr
#' The optimisation algorithm uses bi-directional uncertainty bounds in an
#' acquisition function that suggests the next target to evaluate, with wider
#' uncertainty bounds (higher `kappa`) leading to increased 'exploration' (i.e.,
#' the function is more prone to suggest new target values where the uncertainty
#' is high and often further from the best evaluation so far) and narrower
#' uncertainty bounds leading to increased 'exploitation' (i.e., the function is
#' more prone to suggest new target values relatively close to the mean
#' predictions from the model).\cr
#' The `dir` argument controls whether the suggested value (based on both
#' uncertainty bounds) should be the value closest to `target` in either
#' direction (`dir = 0`), or whether values at or above `target` (`dir > 0`)
#' or at or below `target` (`dir < 0`), if any, are preferred.\cr
#' When the function being evaluated is noise-free and monotonically increasing
#' or decreasing, the optimisation function can narrow the range of predictions
#' based on the input evaluations (`narrow = TRUE`), leading to a finer grid of
#' potential new targets to suggest compared to when predictions are spaced over
#' the full range.\cr
#' If the suggested new value at which to evaluate the function has already
#' been evaluated, random noise will be added to ensure evaluation at a new
#' value (if `narrow` is `FALSE`, noise is drawn from a normal distribution
#' with the currently suggested value as mean and the standard deviation of
#' the `x` values as SD, truncated to the range of the `x` values; if `narrow`
#' is `TRUE`, a new value is drawn from a uniform distribution within the
#' current narrowed range. For both strategies, the process is repeated until
#' the suggested value is 'new').
#' \cr The Gaussian process model used is partially based on code from Gramacy
#' 2020 (with permission), see **References**.
#'
#' @param x numeric vector, the previous values where the function being
#'   calibrated was evaluated.
#' @param y numeric vector, the corresponding results of the previous
#'   evaluations at the `x` values (must be of the same length as `x`).
#' @param target single numeric value, the desired target value for the
#'   calibration process.
#' @param dir single numeric value (default `0`), used when selecting the next
#'   value to evaluate at. See [which_nearest()] for further description.
#' @param resolution single integer (default `5000`), size of the grid at which
#'   the predictions used to select the next value to evaluate at are made.\cr
#'   **Note:** memory use and time will substantially increase with higher
#'   values.
#' @param kappa single numeric value `> 0` (default `1.96`), used for the width
#'   of uncertainty bounds (based on the Gaussian process posterior predictive
#'   distribution), which are used to select the next value to evaluate at.
#' @param pow single numerical value, passed to [cov_mat()]; controls the
#'   smoothness of the Gaussian process. Should be between `1` (no smoothness,
#'   piecewise straight lines between each subsequent `x`/`y`-coordinate if
#'   `lengthscale` described below is `1`) and `2`; defaults to `1.95`, which
#'   leads to slightly faster decay of correlations when `x` values are
#'   internally scaled to the `[0, 1]`-range compared to `2`.
#' @param lengthscale single numerical value (default `1`) or numerical vector
#'   of length `2`; all values must be finite and non-negative. If a single
#'   value is provided, this will be used as the `lengthscale` hyperparameter
#'   and passed directly to [cov_mat()]. If a numerical vector of length `2` is
#'   provided, the second value must be higher than the first and the optimal
#'   `lengthscale` in this range will be found using an optimisation algorithm.
#'   If any value is `0`, a minimum amount of noise will be added as
#'   lengthscales must be `> 0`. Controls smoothness/decay in combination
#'   with `pow`.
#' @param scale_x single logical value; if `TRUE` (the default) the `x`-values
#'   will be scaled to the `[0, 1]` range according to the minimum/maximum
#'   values provided. If `FALSE`, the model will use the original scale. If
#'   distances on the original scale are small, scaling may be preferred. The
#'   returned values will always be on the original scale.
#' @param noisy single logical value. If `FALSE` (the default), a noiseless
#'   process is assumed, and interpolation between values is performed (i.e.,
#'   with no uncertainty at the evaluated `x`-values); if `TRUE`, the `y`-values
#'   are assumed to come from a noisy process, and regression is performed
#'   (i.e., some uncertainty at the evaluated `x`-values will be included in the
#'   predictions, with the amount estimated using an optimisation algorithm).
#' @param narrow single logical value. If `FALSE` (the default), predictions are
#'   evenly spread over the full `x`-range. If `TRUE`, the prediction grid will
#'   be spread evenly over an interval consisting of the two `x`-values with
#'   corresponding `y`-values closest to the target in opposite directions. This
#'   setting should only be used if `noisy` is `FALSE` and only if the function
#'   can safely be assumed to be monotonically increasing or decreasing, in
#'   which case this will lead to a faster search and a smoother prediction grid
#'   in the relevant region without increasing memory use.
#'
#' @return List containing two elements: `next_x`, a single numerical value,
#'   the suggested next `x` value at which to evaluate the function, and
#'   `predictions`, a `data.frame` with `resolution` rows and four columns:
#'   `x`, the `x` grid values where predictions are made; `y_hat`, the
#'   predicted means; and `lub` and `uub`, the lower and upper uncertainty
#'   bounds of the predictions according to `kappa`.
#'
#' @importFrom stats optimise optim runif rnorm sd var
#'
#' @references
#'
#' Gramacy RB (2020). Chapter 5: Gaussian Process Regression. In: Surrogates:
#' Gaussian Process Modeling, Design and Optimization for the Applied Sciences.
#' Chapman Hall/CRC, Boca Raton, Florida, USA.
#' [Available online](https://bookdown.org/rbg/surrogates/chap5.html).
#'
#' Greenhill S, Rana S, Gupta S, Vellanki P, Venkatesh S (2020). Bayesian
#' Optimization for Adaptive Experimental Design: A Review. IEEE Access, 8,
#' 13937-13948. \doi{10.1109/ACCESS.2020.2966228}
#'
#' @keywords internal
#'
gp_opt <- function(x, y, target, dir = 0, resolution = 5000,
                   kappa = 1.96, pow = 1.95, lengthscale = 1,
                   scale_x = TRUE, noisy = FALSE, narrow = FALSE) {

  # Define grid to evaluate over
  x_grid <- if (narrow) { # Only evaluate between nearest values in both directions
    sort(seq(from = x[which_nearest(y, target, 1)],
             to = x[which_nearest(y, target, -1)],
             length.out = resolution))
  } else { # Evaluate over the full range
    seq(from = min(x), to = max(x), length.out = resolution)
  }

  # Scale x and grid to 0-1 range if desired
  if (scale_x) {
    x_work <- (x - min(x)) / (max(x) - min(x))
    x_grid_work <- (x_grid - min(x)) / (max(x) - min(x))
  } else {
    x_work <- x
    x_grid_work <- x_grid
  }

  # Set or optimise g (jitter/nugget) and lengthscale
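  # Square root of machine epsilon: conventional minimal jitter ensuring
  # numerical stability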
  eps <- .Machine$double.eps^0.5
  lengthscale <- ifelse(lengthscale <= 0, eps, lengthscale)
  if (noisy) { # Do regression (i.e., fit to noisy observations)
    if (length(lengthscale) != 1) { # Optimise both g and lengthscale
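      # fn below is the negative concentrated (profile) log-likelihood of the
      # GP, constants dropped (cf. Gramacy 2020, chapter 5); par[1] is the
      # lengthscale, par[2] the nugget g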
      opt <- optim(
        par = c(exp(mean(log(lengthscale))), 0.1 * var(y)),
        fn = function(par, D, y) {
          K <- exp(-D / par[1]) + diag(par[2], length(y))
          log_det_modulus <- determinant(K, logarithm = TRUE)$modulus
          length(y) * log(t(y) %*% solve(K) %*% y) / 2 + log_det_modulus / 2
        },
        gr = function(par, D, y) {
          K <- exp(-D / par[1]) + diag(par[2], length(y))
          Ki <- solve(K)
          dotK <- K * D / par[1]^2 # Element-wise derivative of K wrt lengthscale
          Kiy <- Ki %*% y
          d_lengthscale <- -(length(y) / 2 * t(Kiy) %*% dotK %*% Kiy / (t(y) %*% Kiy) -
                               sum(diag(Ki %*% dotK)) / 2)
          d_g <- -length(y) / 2 * t(Kiy) %*% Kiy / (t(y) %*% Kiy) + sum(diag(Ki)) / 2
          c(d_lengthscale, d_g)
        },
        method = "L-BFGS-B",
        lower = c(lengthscale[1], eps),
        upper = c(lengthscale[2], var(y)),
        D = pow_abs_dist(x_work, pow = pow),
        y = y)
      lengthscale <- opt$par[1]
      g <- opt$par[2]
    } else { # Optimise only g
      g <- optimise(
        f = function(g, D, y) {
          K <- exp(-D / lengthscale) + diag(g, length(y))
          log_det_modulus <- determinant(K, logarithm = TRUE)$modulus
          length(y) * log(t(y) %*% solve(K) %*% y) / 2 + log_det_modulus / 2
        },
        interval = c(eps, var(y)),
        D = pow_abs_dist(x_work, pow = pow),
        y = y
      )$minimum
    }
  } else { # Do interpolation (i.e., fit to noise-free observations)
    g <- eps # Minimal jitter always required
    # Optimise lengthscale if a range is provided, otherwise keep unchanged
    if (length(lengthscale) != 1) {
      lengthscale <- optimise(
        f = function(lengthscale, D, y) {
          K <- exp(-D / lengthscale) + diag(eps, length(y))
          log_det_modulus <- determinant(K, logarithm = TRUE)$modulus
          length(y) * log(t(y) %*% solve(K) %*% y) / 2 + log_det_modulus / 2
        },
        interval = lengthscale,
        D = pow_abs_dist(x_work, pow = pow),
        y = y)$minimum
    }
  }

  # Calculate predictions and bounds of acquisition function
  K_inv <- solve(cov_mat(x_work, g = g, pow = pow, lengthscale = lengthscale))
  Kx <- cov_mat(x_grid_work, x_work, pow = pow, lengthscale = lengthscale)
  Kxx <- cov_mat(x_grid_work, g = g, pow = pow, lengthscale = lengthscale)
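  # Standard GP predictive (kriging) equations: mean Kx %*% K_inv %*% y,
  # covariance tau_squared * (Kxx - Kx %*% K_inv %*% t(Kx))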
  y_hat <- Kx %*% K_inv %*% y # Predicted y for grid values
  # Appropriately scaled prediction uncertainty bounds at grid values
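  # tau_squared is the plug-in (profile maximum likelihood) estimate of the
  # GP scale parameter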
  tau_squared <- drop(t(y) %*% K_inv %*% y / length(y))
  sigma <- sqrt(abs(diag(tau_squared * (Kxx - Kx %*% K_inv %*% t(Kx)))))
  lub <- y_hat - kappa * sigma
  uub <- y_hat + kappa * sigma

  # Find next target to evaluate (considering both uncertainty bounds simultaneously)
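  # which_nearest() searches the concatenated bounds c(lub, uub); the modulo
  # arithmetic maps the index in this 2 * resolution-length vector back to x_grid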
  next_x <- x_grid[(which_nearest(c(lub, uub), target, dir) - 1) %% resolution + 1]
  while (next_x %in% x) {
    if (narrow) {
      next_x <- runif(1, min = min(x_grid), max = max(x_grid))
    } else {
      next_x <- max(min(next_x + rnorm(1, 0, sd(x)), max(x)), min(x))
    }
  }

  # Return results
  list(next_x = next_x,
       predictions = data.frame(x = x_grid, y_hat = y_hat, lub = lub, uub = uub))
}
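
# Minimal usage sketch, not part of the package source (assumes the internal
# which_nearest() helper from this package is available; f() stands in for a
# hypothetical noise-free, monotonically increasing function being calibrated):
#
#   f <- function(x) x^2
#   x <- c(0.5, 1, 1.5, 2)
#   y <- f(x)
#   res <- gp_opt(x, y, target = 2, narrow = TRUE)
#   res$next_x            # suggested next value to evaluate, here in [1, 1.5]
#   head(res$predictions) # prediction grid with uncertainty bounds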
