# (stray scraped text removed; not part of the package source)
# nolint start
#' @name optimisers
#'
#' @title optimisation methods
#' @description Functions to set up optimisers (which find parameters that
#' maximise the joint density of a model) and change their tuning parameters,
#' for use in [opt()]. For details of the algorithms and how to
#' tune them, see the [TensorFlow optimiser docs](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers), or the [Tensorflow Probability optimiser docs](https://www.tensorflow.org/probability/api_docs/python/tfp/optimizer).
#'
#' @details The optimisers `powell()`, `cg()`, `newton_cg()`,
#' `l_bfgs_b()`, `tnc()`, `cobyla()`, and `slsqp()` are
#' now defunct. They will error when called in greta 0.5.0. These have been
#' removed because they are no longer available in TensorFlow 2.0. Note that
#' optimiser `momentum()` has been replaced with `gradient_descent()`
#'
#'
#'
#' @return an `optimiser` object that can be passed to [opt()].
#'
#' @examples
#' \dontrun{
#' # use optimisation to find the mean and sd of some data
#' x <- rnorm(100, -2, 1.2)
#' mu <- variable()
#' sd <- variable(lower = 0)
#' distribution(x) <- normal(mu, sd)
#' m <- model(mu, sd)
#'
#' # configure optimisers & parameters via 'optimiser' argument to opt
#' opt_res <- opt(m, optimiser = bfgs())
#'
#' # compare results with the analytic solution
#' opt_res$par
#' c(mean(x), sd(x))
#' }
NULL
# nolint end
# defunct some optimisers
optimiser_defunct_error <- function(optimiser) {
  # Signal an error for optimisers removed in greta 0.5.0 (they relied on
  # interfaces that are no longer available in TensorFlow 2.0).
  #
  # optimiser: character name of the defunct optimiser function, interpolated
  #   into the error message via cli/glue markup.
  cli::cli_abort(
    c(
      "The optimiser, {.fun {optimiser}}, is defunct and has been removed \\
      in {.pkg greta} 0.5.0.",
      "Please use a different optimiser.",
      # use British spelling to match the package name and the help topic
      "See {.code ?optimisers} for details on which optimisers are removed."
    )
  )
}
# deprecate some optimisers
optimiser_deprecation_warning <- function(version = "0.4.0") {
  # Shared warning for optimisers scheduled for removal; `version` is the
  # greta release in which removal is planned.
  bullets <- c(
    "This optimiser is deprecated and will be removed in {.pkg greta} \\
    {.val {version}}.",
    "Please use a different optimiser."
  )
  # cli_warn evaluates the glue markup in this function's frame, so
  # {version} resolves to the argument above
  cli::cli_warn(bullets)
}
new_optimiser <- function(name,
                          method,
                          parameters,
                          class,
                          other_args) {
  # Low-level constructor for optimiser objects. Returns a classed list;
  # the first class is "<name>_optimiser" for name-specific dispatch, the
  # second is the shared "optimiser" class.
  #
  # name: character tag for the optimiser (used in the class name).
  # method: character string naming the TF/TFP callable to use.
  # parameters: named list of tuning parameters for that callable.
  # class: constructor/class object recorded for later dispatch.
  # other_args: named list of additional arguments.
  structure(
    list(
      name = name,
      method = method,
      parameters = parameters,
      class = class,
      other_args = other_args
    ),
    class = c(paste0(name, "_optimiser"), "optimiser")
  )
}
define_tf_optimiser <- function(name,
                                method,
                                parameters = list(),
                                other_args = list()) {
  # Convenience wrapper: an optimiser backed by the TF keras API, dispatched
  # via the `tf_optimiser` class (defined elsewhere in the package).
  # Arguments are forwarded positionally; order matches new_optimiser().
  new_optimiser(name, method, parameters, tf_optimiser, other_args)
}
define_tf_compat_optimiser <- function(name,
                                       method,
                                       parameters = list(),
                                       other_args = list()) {
  # Convenience wrapper: an optimiser backed by the TF1 compat layer,
  # dispatched via the `tf_compat_optimiser` class (defined elsewhere).
  # Arguments are forwarded positionally; order matches new_optimiser().
  new_optimiser(name, method, parameters, tf_compat_optimiser, other_args)
}
define_tfp_optimiser <- function(name,
                                 method,
                                 parameters = list(),
                                 other_args = list()) {
  # Convenience wrapper: an optimiser backed by TensorFlow Probability,
  # dispatched via the `tfp_optimiser` class (defined elsewhere).
  # Arguments are forwarded positionally; order matches new_optimiser().
  new_optimiser(name, method, parameters, tfp_optimiser, other_args)
}
#' @rdname optimisers
#'
#' @param objective_function A function that accepts a point as a real Tensor
#' and returns a Tensor of real dtype containing the value of the function at
#' that point. The function to be minimized. If `batch_evaluate_objective` is
#' TRUE, the function may be evaluated on a Tensor of shape `[n+1] + s` where
#' n is the dimension of the problem and s is the shape of a single point in
#' the domain (so n is the size of a Tensor representing a single point). In
#' this case, the expected return value is a Tensor of shape `[n+1]`. Note
#' that this method does not support univariate functions so the problem
#' dimension n must be strictly greater than 1.
#' @param initial_vertex Tensor of real dtype and any shape that can be
#' consumed by the `objective_function`. A single point in the domain that
#' will be used to construct an axes aligned initial simplex.
#' @param step_sizes Tensor of real dtype and shape broadcasting compatible
#' with `initial_vertex`. Supplies the simplex scale along each axes.
#' @param func_tolerance Single numeric number. The algorithm stops if the
#' absolute difference between the largest and the smallest function value
#' on the vertices of the simplex is below this number. Default is 1e-08.
#' @param position_tolerance Single numeric number. The algorithm stops if
#' the largest absolute difference between the coordinates of the vertices
#' is below this threshold.
#' @param reflection (optional) Positive Scalar Tensor of same dtype as
#' `initial_vertex`. This parameter controls the scaling of the reflected
#' vertex. See, [Press et al(2007)](https://numerical.recipes/book.html)
#' for details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param expansion (optional) Positive Scalar Tensor of same dtype as
#' `initial_vertex`. Should be greater than 1 and reflection. This parameter
#' controls the expanded scaling of a reflected vertex.See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param contraction (optional) Positive scalar Tensor of same dtype as
#' `initial_vertex`. Must be between 0 and 1. This parameter controls the
#' contraction of the reflected vertex when the objective function at the
#' reflected point fails to show sufficient decrease. See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param shrinkage (Optional) Positive scalar Tensor of same dtype as
#' `initial_vertex`. Must be between 0 and 1. This parameter is the scale by
#' which the simplex is shrunk around the best point when the other steps fail
#' to produce improvements. See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#'
#' @export
#'
nelder_mead <- function(
    objective_function = NULL,
    initial_vertex = NULL,
    step_sizes = NULL,
    func_tolerance = 1e-08,
    position_tolerance = 1e-08,
    reflection = NULL,
    expansion = NULL,
    contraction = NULL,
    shrinkage = NULL) {
  # Collect the full argument set expected by
  # tfp$optimizer$nelder_mead_minimize. Arguments greta does not expose
  # (initial_simplex, objective_at_*, name) are pinned to NULL, and
  # parallel_iterations is fixed at 1.
  params <- list(
    objective_function = objective_function,
    initial_simplex = NULL,
    initial_vertex = initial_vertex,
    step_sizes = step_sizes,
    objective_at_initial_simplex = NULL,
    objective_at_initial_vertex = NULL,
    func_tolerance = func_tolerance,
    position_tolerance = position_tolerance,
    parallel_iterations = 1L,
    reflection = reflection,
    expansion = expansion,
    contraction = contraction,
    shrinkage = shrinkage,
    name = NULL
  )
  define_tfp_optimiser(
    name = "nelder_mead",
    method = "tfp$optimizer$nelder_mead_minimize",
    parameters = params
  )
}
#' @rdname optimisers
#'
#' @param value_and_gradients_function A function that accepts a point as a
#' real Tensor and returns a tuple of Tensors of real dtype containing the
#' value of the function and its gradient at that point. The function to be
#' minimized. The input should be of shape `[..., n]`, where n is the size of
#' the domain of input points, and all others are batching dimensions. The
#' first component of the return value should be a real Tensor of matching
#' shape `[...]`. The second component (the gradient) should also be of
#' shape `[..., n]` like the input value to the function.
#' @param initial_position real Tensor of shape `[..., n]`. The starting point,
#' or points when using batching dimensions, of the search procedure. At
#' these points the function value and the gradient norm should be finite.
#' @param tolerance Scalar Tensor of real dtype. Specifies the gradient
#' tolerance for the procedure. If the supremum norm of the gradient vector
#' is below this number, the algorithm is stopped. Default is 1e-08.
#' @param x_tolerance Scalar Tensor of real dtype. If the absolute change in
#' the position between one iteration and the next is smaller than this
#' number, the algorithm is stopped. Default of 0L.
#' @param f_relative_tolerance Scalar Tensor of real dtype. If the relative
#' change in the objective value between one iteration and the next is
#' smaller than this value, the algorithm is stopped.
#' @param initial_inverse_hessian_estimate Optional Tensor of the same dtype
#' as the components of the output of the value_and_gradients_function. If
#' specified, the shape should broadcastable to shape `[..., n, n]`; e.g. if a
#' single `[n, n]` matrix is provided, it will be automatically broadcasted to
#' all batches. Alternatively, one can also specify a different hessian
#' estimate for each batch member. For the correctness of the algorithm, it
#' is required that this parameter be symmetric and positive definite.
#' Specifies the starting estimate for the inverse of the Hessian at the
#' initial point. If not specified, the identity matrix is used as the
#' starting estimate for the inverse Hessian.
#' @param stopping_condition (Optional) A function that takes as input two
#' Boolean tensors of shape `[...]`, and returns a Boolean scalar tensor. The
#' input tensors are converged and failed, indicating the current status of
#' each respective batch member; the return value states whether the
#' algorithm should stop. The default is `tfp$optimizer.converged_all` which
#' only stops when all batch members have either converged or failed. An
#' alternative is `tfp$optimizer.converged_any` which stops as soon as one
#' batch member has converged, or when all have failed.
#' @param validate_args Logical, default TRUE. When TRUE, optimizer
#' parameters are checked for validity despite possibly degrading runtime
#' performance. When FALSE invalid inputs may silently render incorrect outputs.
#' @param max_line_search_iterations Python int. The maximum number of
#' iterations for the hager_zhang line search algorithm.
#' @param f_absolute_tolerance Scalar Tensor of real dtype. If the absolute
#' change in the objective value between one iteration and the next is
#' smaller than this value, the algorithm is stopped.
#'
#' @export
bfgs <- function(value_and_gradients_function = NULL,
                 initial_position = NULL,
                 tolerance = 1e-08,
                 x_tolerance = 0L,
                 f_relative_tolerance = 0L,
                 initial_inverse_hessian_estimate = NULL,
                 stopping_condition = NULL,
                 validate_args = TRUE,
                 max_line_search_iterations = 50L,
                 f_absolute_tolerance = 0L) {
  # Collect the full argument set expected by tfp$optimizer$bfgs_minimize.
  # parallel_iterations is fixed at 1 and the op name is left NULL.
  params <- list(
    value_and_gradients_function = value_and_gradients_function,
    initial_position = initial_position,
    tolerance = tolerance,
    x_tolerance = x_tolerance,
    f_relative_tolerance = f_relative_tolerance,
    initial_inverse_hessian_estimate = initial_inverse_hessian_estimate,
    parallel_iterations = 1L,
    stopping_condition = stopping_condition,
    validate_args = validate_args,
    max_line_search_iterations = max_line_search_iterations,
    f_absolute_tolerance = f_absolute_tolerance,
    name = NULL
  )
  define_tfp_optimiser(
    name = "bfgs",
    method = "tfp$optimizer$bfgs_minimize",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
powell <- function() optimiser_defunct_error("powell")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (replaced by gradient_descent())
momentum <- function() optimiser_defunct_error("momentum")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
cg <- function() optimiser_defunct_error("cg")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
newton_cg <- function() optimiser_defunct_error("newton_cg")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
l_bfgs_b <- function() optimiser_defunct_error("l_bfgs_b")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
tnc <- function() optimiser_defunct_error("tnc")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
cobyla <- function() optimiser_defunct_error("cobyla")
#' @rdname optimisers
#' @export
#'
# defunct: always errors (removed with the move to TF2)
slsqp <- function() optimiser_defunct_error("slsqp")
#' @rdname optimisers
#' @export
#'
#' @param learning_rate the size of steps (in parameter space) towards the
#' optimal value. Default value 0.01
#' @param momentum hyperparameter that accelerates gradient descent in the
#' relevant direction and dampens oscillations. Defaults to 0, which is
#' vanilla gradient descent.
#' @param nesterov Whether to apply Nesterov momentum. Defaults to FALSE.
gradient_descent <- function(learning_rate = 0.01,
                             momentum = 0,
                             nesterov = FALSE) {
  # (Stochastic) gradient descent with optional momentum / Nesterov
  # acceleration, via the legacy keras SGD class.
  params <- list(
    learning_rate = learning_rate,
    momentum = momentum,
    nesterov = nesterov
  )
  define_tf_optimiser(
    name = "gradient_descent",
    method = "tf$keras$optimizers$legacy$SGD",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @param rho the decay rate
#' @param epsilon a small constant used to condition gradient updates
adadelta <- function(learning_rate = 0.001, rho = 1, epsilon = 1e-08) {
  # Adadelta via the legacy keras class (not the TF2 replacement).
  params <- list(
    learning_rate = learning_rate,
    rho = rho,
    epsilon = epsilon
  )
  define_tf_optimiser(
    name = "adadelta",
    method = "tf$keras$optimizers$legacy$Adadelta",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @param initial_accumulator_value initial value of the 'accumulator' used to
#' tune the algorithm
#'
adagrad <- function(learning_rate = 0.8,
                    initial_accumulator_value = 0.1,
                    epsilon = 1e-08) {
  # Adagrad via the legacy keras class (not the TF2 replacement).
  params <- list(
    learning_rate = learning_rate,
    initial_accumulator_value = initial_accumulator_value,
    epsilon = epsilon
  )
  define_tf_optimiser(
    name = "adagrad",
    method = "tf$keras$optimizers$legacy$Adagrad",
    parameters = params
  )
}
# nolint start
#' @rdname optimisers
#' @export
#'
#' @param global_step the current training step number
#' @param initial_gradient_squared_accumulator_value initial value of the
#' accumulators used to tune the algorithm
#' @param l1_regularization_strength L1 regularisation coefficient (must be 0 or
#' greater)
#' @param l2_regularization_strength L2 regularisation coefficient (must be 0 or
#' greater)
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on AdagradDAOptimiser](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/AdagradDAOptimizer) for more detail.
#'
adagrad_da <- function(learning_rate = 0.8,
                       global_step = 1L,
                       initial_gradient_squared_accumulator_value = 0.1,
                       l1_regularization_strength = 0,
                       l2_regularization_strength = 0) {
  # TF1-only optimiser accessed through the compat layer; deprecated and
  # scheduled for removal in greta 0.6.0.
  optimiser_deprecation_warning(version = "0.6.0")
  params <- list(
    learning_rate = learning_rate,
    global_step = global_step,
    initial_gradient_squared_accumulator_value =
      initial_gradient_squared_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )
  define_tf_compat_optimiser(
    name = "adagrad_da",
    method = "tf$compat$v1$train$AdagradDAOptimizer",
    parameters = params
  )
}
# nolint end
#' @rdname optimisers
#' @export
#'
#' @param beta_1 exponential decay rate for the 1st moment estimates
#' @param beta_2 exponential decay rate for the 2nd moment estimates
#' @param amsgrad Boolean. Whether to apply AMSGrad variant of this algorithm
#' from the paper "On the Convergence of Adam and beyond". Defaults to FALSE.
#'
adam <- function(learning_rate = 0.1,
                 beta_1 = 0.9,
                 beta_2 = 0.999,
                 amsgrad = FALSE,
                 epsilon = 1e-08) {
  # Adam via the legacy keras class (not the TF2 replacement).
  params <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon,
    amsgrad = amsgrad
  )
  define_tf_optimiser(
    name = "adam",
    method = "tf$keras$optimizers$legacy$Adam",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
adamax <- function(learning_rate = 0.001,
                   beta_1 = 0.9,
                   beta_2 = 0.999,
                   epsilon = 1e-07) {
  # Adamax (infinity-norm variant of Adam) via the legacy keras class.
  params <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon
  )
  define_tf_optimiser(
    name = "adamax",
    method = "tf$keras$optimizers$legacy$Adamax",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @param learning_rate_power power on the learning rate, must be 0 or less
#' @param l2_shrinkage_regularization_strength A float value, must be greater
#' than or equal to zero. This differs from L2 above in that the L2 above is
#' a stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
#' When input is sparse shrinkage will only happen on the active weights.
#' @param beta A float value, representing the beta value from the paper by
#' [McMahan et al 2013](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf). Defaults to 0
#'
ftrl <- function(learning_rate = 1,
                 learning_rate_power = -0.5,
                 initial_accumulator_value = 0.1,
                 l1_regularization_strength = 0,
                 l2_regularization_strength = 0,
                 l2_shrinkage_regularization_strength = 0,
                 beta = 0) {
  # FTRL ("follow the regularized leader") via the legacy keras class.
  params <- list(
    learning_rate = learning_rate,
    learning_rate_power = learning_rate_power,
    initial_accumulator_value = initial_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength,
    l2_shrinkage_regularization_strength =
      l2_shrinkage_regularization_strength,
    beta = beta
  )
  define_tf_optimiser(
    name = "ftrl",
    method = "tf$keras$optimizers$legacy$Ftrl",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on ProximalGradientDescentOptimizer](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/ProximalGradientDescentOptimizer) for more detail.
#'
proximal_gradient_descent <- function(learning_rate = 0.01,
                                      l1_regularization_strength = 0,
                                      l2_regularization_strength = 0) {
  # TF1-only optimiser accessed through the compat layer; deprecated and
  # scheduled for removal in greta 0.6.0.
  optimiser_deprecation_warning(version = "0.6.0")
  params <- list(
    learning_rate = learning_rate,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )
  define_tf_compat_optimiser(
    name = "proximal_gradient_descent",
    method = "tf$compat$v1$train$ProximalGradientDescentOptimizer",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on ProximalAdagradOptimizer](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/ProximalAdagradOptimizer) for more detail.
#'
proximal_adagrad <- function(learning_rate = 1,
                             initial_accumulator_value = 0.1,
                             l1_regularization_strength = 0,
                             l2_regularization_strength = 0) {
  # TF1-only optimiser accessed through the compat layer; deprecated and
  # scheduled for removal in greta 0.6.0.
  optimiser_deprecation_warning(version = "0.6.0")
  params <- list(
    learning_rate = learning_rate,
    initial_accumulator_value = initial_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )
  define_tf_compat_optimiser(
    name = "proximal_adagrad",
    method = "tf$compat$v1$train$ProximalAdagradOptimizer",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
nadam <- function(learning_rate = 0.001,
                  beta_1 = 0.9,
                  beta_2 = 0.999,
                  epsilon = 1e-07) {
  # Nadam (Adam with Nesterov momentum) via the legacy keras class.
  params <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon
  )
  define_tf_optimiser(
    name = "nadam",
    method = "tf$keras$optimizers$legacy$Nadam",
    parameters = params
  )
}
#' @rdname optimisers
#' @export
#'
#' @param centered Boolean. If TRUE, gradients are normalized by the estimated
#' variance of the gradient; if FALSE, by the uncentered second moment.
#' Setting this to TRUE may help with training, but is slightly more
#' expensive in terms of computation and memory. Defaults to FALSE.
rms_prop <- function(learning_rate = 0.1,
                     rho = 0.9,
                     momentum = 0,
                     epsilon = 1e-10,
                     centered = FALSE) {
  # RMSprop via the legacy keras class (not the TF2 replacement).
  params <- list(
    learning_rate = learning_rate,
    rho = rho,
    momentum = momentum,
    epsilon = epsilon,
    centered = centered
  )
  define_tf_optimiser(
    name = "rms_prop",
    method = "tf$keras$optimizers$legacy$RMSprop",
    parameters = params
  )
}
#' @noRd
#' @export
# optimiser objects share the sampler print method; `print.sampler` is
# defined elsewhere in the package
print.optimiser <- print.sampler
# (stray text from a web scrape, commented out so the file parses:)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.