R/optimisers.R


# nolint start
#' @name optimisers
#'
#' @title optimisation methods
#' @description Functions to set up optimisers (which find parameters that
#'   maximise the joint density of a model) and change their tuning parameters,
#'   for use in [opt()]. For details of the algorithms and how to
#'   tune them, see the
#'   [SciPy
#'    optimiser docs](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) or the
#'   [TensorFlow
#'    optimiser docs](https://www.tensorflow.org/api_docs/python/tf/contrib/opt).
#'
#' @details The optimisers `powell()`, `cg()`, `newton_cg()`,
#'   `l_bfgs_b()`, `tnc()`, `cobyla()`, and `slsqp()` are
#'   deprecated. They will be removed in greta 0.4.0, since they will no longer
#'   be available in TensorFlow 2.0, on which that version of greta will depend.
#'
#' @return an `optimiser` object that can be passed to [opt()].
#'
#' @examples
#' \dontrun{
#' # use optimisation to find the mean and sd of some data
#' x <- rnorm(100, -2, 1.2)
#' mu <- variable()
#' sd <- variable(lower = 0)
#' distribution(x) <- normal(mu, sd)
#' m <- model(mu, sd)
#'
#' # configure optimisers & parameters via 'optimiser' argument to opt
#' opt_res <- opt(m, optimiser = bfgs())
#'
#' # compare results with the analytic solution
#' opt_res$par
#' c(mean(x), sd(x))
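#'
#' # optimiser tuning parameters can be adjusted via their arguments; for
#' # example (an illustrative sketch using the adam() optimiser defined below)
#' opt_res_adam <- opt(m, optimiser = adam(learning_rate = 0.05))
#' opt_res_adam$par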
#' }
NULL
# nolint end

# deprecate some optimisers
optimiser_deprecation_warning <- function() {
  warning("This optimiser is deprecated and will be removed in greta 0.4.0. ",
    "Please use a different optimiser.",
    call. = FALSE
  )
}

# set up an optimiser object
define_scipy_optimiser <- function(name,
                                   method,
                                   parameters = list(),
                                   other_args = list(uses_callbacks = TRUE)) {
  obj <- list(
    name = name,
    method = method,
    parameters = parameters,
    other_args = other_args,
    class = scipy_optimiser
  )

  class_name <- paste0(name, "_optimiser")
  class(obj) <- c(class_name, "optimiser")
  obj
}

define_tf_optimiser <- function(name,
                                method,
                                parameters = list(),
                                other_args = list()) {
  obj <- list(
    name = name,
    method = method,
    parameters = parameters,
    class = tf_optimiser,
    other_args = other_args
  )

  class_name <- paste0(name, "_optimiser")
  class(obj) <- c(class_name, "optimiser")
  obj
}
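
# For illustration, a sketch of the object these constructors build (based
# only on the definitions above; not run at package load):
#   o <- define_scipy_optimiser(name = "bfgs", method = "BFGS")
#   class(o)       # c("bfgs_optimiser", "optimiser")
#   o$method       # "BFGS"
#   o$parameters   # list()
#   o$other_args   # list(uses_callbacks = TRUE)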

#' @rdname optimisers
#' @export
#'
nelder_mead <- function() {
  define_scipy_optimiser(
    name = "nelder_mead",
    method = "Nelder-Mead"
  )
}

#' @rdname optimisers
#' @export
#'
powell <- function() {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "powell",
    method = "Powell"
  )
}

#' @rdname optimisers
#' @export
#'
cg <- function() {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "cg",
    method = "CG"
  )
}

#' @rdname optimisers
#' @export
#'
bfgs <- function() {
  define_scipy_optimiser(
    name = "bfgs",
    method = "BFGS"
  )
}

#' @rdname optimisers
#' @export
#'
newton_cg <- function() {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "newton_cg",
    method = "Newton-CG"
  )
}

#' @rdname optimisers
#' @export
#'
#' @param maxcor maximum number of 'variable metric corrections' used to define
#'   the approximation to the Hessian matrix
#' @param maxls maximum number of line search steps per iteration
#'
l_bfgs_b <- function(maxcor = 10, maxls = 20) {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "l_bfgs_b",
    method = "L-BFGS-B",
    parameters = list(
      maxcor = as.integer(maxcor),
      maxls = as.integer(maxls)
    )
  )
}
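
# For example (an illustrative sketch; calling l_bfgs_b() emits the
# deprecation warning above):
#   opt(m, optimiser = l_bfgs_b(maxcor = 5, maxls = 10))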

#' @rdname optimisers
#' @export
#'
#' @param max_cg_it maximum number of Hessian * vector evaluations per iteration
#' @param stepmx maximum step for the line search
#' @param rescale log10 scaling factor used to trigger rescaling of the objective
#'
tnc <- function(max_cg_it = -1, stepmx = 0, rescale = -1) {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "tnc",
    method = "TNC",
    parameters = list(
      maxCGit = as.integer(max_cg_it),
      stepmx = stepmx,
      rescale = rescale
    )
  )
}

#' @rdname optimisers
#' @export
#'
#' @param rhobeg reasonable initial changes to the variables
#'
#' @details The `cobyla()` optimiser does not provide information about the
#'   number of iterations or convergence, so these elements of the output are
#'   set to `NA`.
#'
cobyla <- function(rhobeg = 1) {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "cobyla",
    method = "COBYLA",
    parameters = list(
      rhobeg = rhobeg
    ),
    other_args = list(uses_callbacks = FALSE)
  )
}
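
# For example (illustrative, per the note above): after
#   res <- opt(m, optimiser = cobyla())
# the iterations and convergence elements of res are set to NA.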

#' @rdname optimisers
#' @export
#'
slsqp <- function() {
  optimiser_deprecation_warning()

  define_scipy_optimiser(
    name = "slsqp",
    method = "SLSQP"
  )
}


#' @rdname optimisers
#' @export
#'
#' @param learning_rate the size of steps (in parameter space) towards the
#'   optimal value
gradient_descent <- function(learning_rate = 0.01) {
  define_tf_optimiser(
    name = "gradient_descent",
    method = "tf$compat$v1$train$GradientDescentOptimizer",
    parameters = list(
      learning_rate = learning_rate
    )
  )
}
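
# For example (an illustrative sketch): a smaller learning rate takes smaller
# steps through parameter space, so it will typically need more iterations:
#   opt(m, optimiser = gradient_descent(learning_rate = 0.001))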


#' @rdname optimisers
#' @export
#'
#' @param rho the decay rate
#' @param epsilon a small constant used to condition gradient updates
adadelta <- function(learning_rate = 0.001, rho = 1, epsilon = 1e-08) {
  define_tf_optimiser(
    name = "adadelta",
    method = "tf$compat$v1$train$AdadeltaOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      rho = rho,
      epsilon = epsilon
    )
  )
}

#' @rdname optimisers
#' @export
#'
#' @param initial_accumulator_value initial value of the 'accumulator' used to
#'   tune the algorithm
#'
adagrad <- function(learning_rate = 0.8,
                    initial_accumulator_value = 0.1) {
  define_tf_optimiser(
    name = "adagrad",
    method = "tf$compat$v1$train$AdagradOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      initial_accumulator_value = initial_accumulator_value
    )
  )
}

# nolint start
#' @rdname optimisers
#' @export
#'
#' @param global_step the current training step number
#' @param initial_gradient_squared_accumulator_value initial value of the
#'   accumulators used to tune the algorithm
#' @param l1_regularization_strength L1 regularisation coefficient (must be 0 or
#'   greater)
#' @param l2_regularization_strength L2 regularisation coefficient (must be 0 or
#'   greater)
#'
adagrad_da <- function(learning_rate = 0.8,
                       global_step = 1L,
                       initial_gradient_squared_accumulator_value = 0.1,
                       l1_regularization_strength = 0,
                       l2_regularization_strength = 0) {
  define_tf_optimiser(
    name = "adagrad_da",
    method = "tf$compat$v1$train$AdagradDAOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      global_step = global_step,
      initial_gradient_squared_accumulator_value =
        initial_gradient_squared_accumulator_value,
      l1_regularization_strength = l1_regularization_strength,
      l2_regularization_strength = l2_regularization_strength
    )
  )
}
# nolint end

#' @rdname optimisers
#' @export
#'
#' @param momentum the momentum of the algorithm
#' @param use_nesterov whether to use Nesterov momentum
#'
momentum <- function(learning_rate = 0.001,
                     momentum = 0.9,
                     use_nesterov = TRUE) {
  define_tf_optimiser(
    name = "momentum",
    method = "tf$compat$v1$train$MomentumOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      momentum = momentum,
      use_nesterov = use_nesterov
    )
  )
}

#' @rdname optimisers
#' @export
#'
#' @param beta1 exponential decay rate for the 1st moment estimates
#' @param beta2 exponential decay rate for the 2nd moment estimates
#'
adam <- function(learning_rate = 0.1,
                 beta1 = 0.9,
                 beta2 = 0.999,
                 epsilon = 1e-08) {
  define_tf_optimiser(
    name = "adam",
    method = "tf$compat$v1$train$AdamOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      beta1 = beta1,
      beta2 = beta2,
      epsilon = epsilon
    )
  )
}
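
# For example (an illustrative sketch): lowering beta1 makes the first-moment
# (gradient) estimate decay faster, i.e. gives it a shorter memory:
#   opt(m, optimiser = adam(learning_rate = 0.1, beta1 = 0.8))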

#' @rdname optimisers
#' @export
#'
#' @param learning_rate_power power on the learning rate, must be 0 or less
#'
ftrl <- function(learning_rate = 1,
                 learning_rate_power = -0.5,
                 initial_accumulator_value = 0.1,
                 l1_regularization_strength = 0,
                 l2_regularization_strength = 0) {
  define_tf_optimiser(
    name = "ftrl",
    method = "tf$compat$v1$train$FtrlOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      learning_rate_power = learning_rate_power,
      initial_accumulator_value = initial_accumulator_value,
      l1_regularization_strength = l1_regularization_strength,
      l2_regularization_strength = l2_regularization_strength
    )
  )
}

#' @rdname optimisers
#' @export
#'
proximal_gradient_descent <- function(learning_rate = 0.01,
                                      l1_regularization_strength = 0,
                                      l2_regularization_strength = 0) {
  define_tf_optimiser(
    name = "proximal_gradient_descent",
    method = "tf$compat$v1$train$ProximalGradientDescentOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      l1_regularization_strength = l1_regularization_strength,
      l2_regularization_strength = l2_regularization_strength
    )
  )
}

#' @rdname optimisers
#' @export
#'
proximal_adagrad <- function(learning_rate = 1,
                             initial_accumulator_value = 0.1,
                             l1_regularization_strength = 0,
                             l2_regularization_strength = 0) {
  define_tf_optimiser(
    name = "proximal_adagrad",
    method = "tf$compat$v1$train$ProximalAdagradOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      initial_accumulator_value = initial_accumulator_value,
      l1_regularization_strength = l1_regularization_strength,
      l2_regularization_strength = l2_regularization_strength
    )
  )
}

#' @rdname optimisers
#' @export
#'
#' @param decay discounting factor for the gradient
#'
rms_prop <- function(learning_rate = 0.1,
                     decay = 0.9,
                     momentum = 0,
                     epsilon = 1e-10) {
  define_tf_optimiser(
    name = "rms_prop",
    method = "tf$compat$v1$train$RMSPropOptimizer",
    parameters = list(
      learning_rate = learning_rate,
      decay = decay,
      momentum = momentum,
      epsilon = epsilon
    )
  )
}

#' @noRd
#' @export
print.optimiser <- print.sampler