# nolint start
#' @name optimisers
#'
#' @title optimisation methods
#' @description Functions to set up optimisers (which find parameters that
#' maximise the joint density of a model) and change their tuning parameters,
#' for use in [opt()]. For details of the algorithms and how to
#' tune them, see the [TensorFlow optimiser docs](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers), or the [TensorFlow Probability optimiser docs](https://www.tensorflow.org/probability/api_docs/python/tfp/optimizer).
#'
#' @details The optimisers `powell()`, `cg()`, `newton_cg()`,
#' `l_bfgs_b()`, `tnc()`, `cobyla()`, and `slsqp()` are
#' now defunct. They will error when called in greta 0.5.0. These were removed
#' because they are no longer available in TensorFlow 2.0. Note that
#' optimiser `momentum()` has been replaced with `gradient_descent()`.
#'
#'
#'
#' @return an `optimiser` object that can be passed to [opt()].
#'
#' @examples
#' \dontrun{
#' # use optimisation to find the mean and sd of some data
#' x <- rnorm(100, -2, 1.2)
#' mu <- variable()
#' sd <- variable(lower = 0)
#' distribution(x) <- normal(mu, sd)
#' m <- model(mu, sd)
#'
#' # configure optimisers & parameters via 'optimiser' argument to opt
#' opt_res <- opt(m, optimiser = bfgs())
#'
#' # compare results with the analytic solution
#' opt_res$par
#' c(mean(x), sd(x))
#' }
NULL
# nolint end
# defunct some optimisers
# Abort with a cli-formatted error stating that an optimiser is defunct.
# `optimiser` is the optimiser's function name as a string; it is interpolated
# into the message by cli.
optimiser_defunct_error <- function(optimiser) {
  defunct_msg <- c(
    "The optimiser, {.fun {optimiser}}, is defunct and has been removed \\
in {.pkg greta} 0.5.0.",
    "Please use a different optimiser.",
    "See {.code ?optimisers} for detail on which optimizers are removed."
  )
  cli::cli_abort(defunct_msg)
}
# deprecate some optimisers
# Emit a cli-formatted warning that the calling optimiser is deprecated.
# `version` is the greta version quoted in the message as the removal version.
optimiser_deprecation_warning <- function(version = "0.4.0") {
  deprecation_msg <- c(
    "This optimiser is deprecated and will be removed in {.pkg greta} \\
{.val {version}}.",
    "Please use a different optimiser."
  )
  cli::cli_warn(deprecation_msg)
}
# Low-level constructor for optimiser objects.
#
# Bundles the supplied fields into a list and gives it two S3 classes:
# "<name>_optimiser" followed by the shared "optimiser" class.
new_optimiser <- function(name, method, parameters, class, other_args) {
  fields <- list(
    name = name,
    method = method,
    parameters = parameters,
    class = class,
    other_args = other_args
  )
  s3_classes <- c(glue::glue("{name}_optimiser"), "optimiser")
  structure(fields, class = s3_classes)
}
# Build an optimiser object whose `class` field is `tf_optimiser`
# (an object defined outside this part of the file).
define_tf_optimiser <- function(name,
                                method,
                                parameters = list(),
                                other_args = list()) {
  new_optimiser(
    name,
    method,
    parameters,
    class = tf_optimiser,
    other_args = other_args
  )
}
# Build an optimiser object whose `class` field is `tf_compat_optimiser`
# (an object defined outside this part of the file).
define_tf_compat_optimiser <- function(name,
                                       method,
                                       parameters = list(),
                                       other_args = list()) {
  new_optimiser(
    name,
    method,
    parameters,
    class = tf_compat_optimiser,
    other_args = other_args
  )
}
# Build an optimiser object whose `class` field is `tfp_optimiser`
# (an object defined outside this part of the file).
define_tfp_optimiser <- function(name,
                                 method,
                                 parameters = list(),
                                 other_args = list()) {
  new_optimiser(
    name,
    method,
    parameters,
    class = tfp_optimiser,
    other_args = other_args
  )
}
#' @rdname optimisers
#'
#' @param objective_function A function that accepts a point as a real Tensor
#' and returns a Tensor of real dtype containing the value of the function at
#' that point. The function to be minimized. If `batch_evaluate_objective` is
#' TRUE, the function may be evaluated on a Tensor of shape `[n+1] + s` where
#' n is the dimension of the problem and s is the shape of a single point in
#' the domain (so n is the size of a Tensor representing a single point). In
#' this case, the expected return value is a Tensor of shape `[n+1]`. Note
#' that this method does not support univariate functions so the problem
#' dimension n must be strictly greater than 1.
#' @param initial_vertex Tensor of real dtype and any shape that can be
#' consumed by the `objective_function`. A single point in the domain that
#' will be used to construct an axis-aligned initial simplex.
#' @param step_sizes Tensor of real dtype and shape broadcasting compatible
#' with `initial_vertex`. Supplies the simplex scale along each axis.
#' @param func_tolerance Single numeric number. The algorithm stops if the
#' absolute difference between the largest and the smallest function value
#' on the vertices of the simplex is below this number. Default is 1e-08.
#' @param position_tolerance Single numeric number. The algorithm stops if
#' the largest absolute difference between the coordinates of the vertices
#' is below this threshold.
#' @param reflection (optional) Positive Scalar Tensor of same dtype as
#' `initial_vertex`. This parameter controls the scaling of the reflected
#' vertex. See, [Press et al(2007)](https://numerical.recipes/book.html)
#' for details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param expansion (optional) Positive Scalar Tensor of same dtype as
#' `initial_vertex`. Should be greater than 1 and reflection. This parameter
#' controls the expanded scaling of a reflected vertex. See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param contraction (optional) Positive scalar Tensor of same dtype as
#' `initial_vertex`. Must be between 0 and 1. This parameter controls the
#' contraction of the reflected vertex when the objective function at the
#' reflected point fails to show sufficient decrease. See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#' @param shrinkage (Optional) Positive scalar Tensor of same dtype as
#' `initial_vertex`. Must be between 0 and 1. This parameter is the scale by
#' which the simplex is shrunk around the best point when the other steps fail
#' to produce improvements. See,
#' [Press et al(2007)](https://numerical.recipes/book.html) for
#' details. If not specified, uses the dimension dependent prescription of
#' Gao and Han (2012) \doi{10.1007/s10589-010-9329-3}
#'
#' @export
#'
nelder_mead <- function(objective_function = NULL,
                        initial_vertex = NULL,
                        step_sizes = NULL,
                        func_tolerance = 1e-08,
                        position_tolerance = 1e-08,
                        reflection = NULL,
                        expansion = NULL,
                        contraction = NULL,
                        shrinkage = NULL) {
  # Parameter list stored on the optimiser object. Entries without a matching
  # nelder_mead() argument are fixed here: NULL for the initial-simplex and
  # objective-at-initial-* entries and for `name`, 1L for
  # `parallel_iterations`.
  nelder_mead_parameters <- list(
    objective_function = objective_function,
    initial_simplex = NULL,
    initial_vertex = initial_vertex,
    step_sizes = step_sizes,
    objective_at_initial_simplex = NULL,
    objective_at_initial_vertex = NULL,
    func_tolerance = func_tolerance,
    position_tolerance = position_tolerance,
    parallel_iterations = 1L,
    reflection = reflection,
    expansion = expansion,
    contraction = contraction,
    shrinkage = shrinkage,
    name = NULL
  )

  define_tfp_optimiser(
    name = "nelder_mead",
    method = "tfp$optimizer$nelder_mead_minimize",
    parameters = nelder_mead_parameters
  )
}
#' @rdname optimisers
#'
#' @param value_and_gradients_function A function that accepts a point as a
#' real Tensor and returns a tuple of Tensors of real dtype containing the
#' value of the function and its gradient at that point. The function to be
#' minimized. The input should be of shape `[..., n]`, where n is the size of
#' the domain of input points, and all others are batching dimensions. The
#' first component of the return value should be a real Tensor of matching
#' shape `[...]`. The second component (the gradient) should also be of
#' shape `[..., n]` like the input value to the function.
#' @param initial_position real Tensor of shape `[..., n]`. The starting point,
#' or points when using batching dimensions, of the search procedure. At
#' these points the function value and the gradient norm should be finite.
#' @param tolerance Scalar Tensor of real dtype. Specifies the gradient
#' tolerance for the procedure. If the supremum norm of the gradient vector
#' is below this number, the algorithm is stopped. Default is 1e-08.
#' @param x_tolerance Scalar Tensor of real dtype. If the absolute change in
#' the position between one iteration and the next is smaller than this
#' number, the algorithm is stopped. Default of 0L.
#' @param f_relative_tolerance Scalar Tensor of real dtype. If the relative
#' change in the objective value between one iteration and the next is
#' smaller than this value, the algorithm is stopped.
#' @param initial_inverse_hessian_estimate Optional Tensor of the same dtype
#' as the components of the output of the value_and_gradients_function. If
#' specified, the shape should be broadcastable to shape `[..., n, n]`; e.g. if
#' a single `[n, n]` matrix is provided, it will be automatically broadcasted
#' to all batches. Alternatively, one can also specify a different hessian
#' estimate for each batch member. For the correctness of the algorithm, it
#' is required that this parameter be symmetric and positive definite.
#' Specifies the starting estimate for the inverse of the Hessian at the
#' initial point. If not specified, the identity matrix is used as the
#' starting estimate for the inverse Hessian.
#' @param stopping_condition (Optional) A function that takes as input two
#' Boolean tensors of shape `[...]`, and returns a Boolean scalar tensor. The
#' input tensors are converged and failed, indicating the current status of
#' each respective batch member; the return value states whether the
#' algorithm should stop. The default is `tfp$optimizer$converged_all` which
#' only stops when all batch members have either converged or failed. An
#' alternative is `tfp$optimizer$converged_any` which stops as soon as one
#' batch member has converged, or when all have failed.
#' @param validate_args Logical, default TRUE. When TRUE, optimizer
#' parameters are checked for validity despite possibly degrading runtime
#' performance. When FALSE invalid inputs may silently render incorrect outputs.
#' @param max_line_search_iterations Python int. The maximum number of
#' iterations for the hager_zhang line search algorithm.
#' @param f_absolute_tolerance Scalar Tensor of real dtype. If the absolute
#' change in the objective value between one iteration and the next is
#' smaller than this value, the algorithm is stopped.
#'
#' @export
bfgs <- function(value_and_gradients_function = NULL,
                 initial_position = NULL,
                 tolerance = 1e-08,
                 x_tolerance = 0L,
                 f_relative_tolerance = 0L,
                 initial_inverse_hessian_estimate = NULL,
                 stopping_condition = NULL,
                 validate_args = TRUE,
                 max_line_search_iterations = 50L,
                 f_absolute_tolerance = 0L) {
  # Parameter list stored on the optimiser object. `parallel_iterations` and
  # `name` have no matching bfgs() argument and are fixed at 1L and NULL.
  bfgs_parameters <- list(
    value_and_gradients_function = value_and_gradients_function,
    initial_position = initial_position,
    tolerance = tolerance,
    x_tolerance = x_tolerance,
    f_relative_tolerance = f_relative_tolerance,
    initial_inverse_hessian_estimate = initial_inverse_hessian_estimate,
    parallel_iterations = 1L,
    stopping_condition = stopping_condition,
    validate_args = validate_args,
    max_line_search_iterations = max_line_search_iterations,
    f_absolute_tolerance = f_absolute_tolerance,
    name = NULL
  )

  define_tfp_optimiser(
    name = "bfgs",
    method = "tfp$optimizer$bfgs_minimize",
    parameters = bfgs_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @param ... Ignored. Accepted by the defunct optimisers only so that a call
#'   which supplies arguments still reaches the informative "defunct" error
#'   instead of failing earlier with an "unused argument" error.
powell <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("powell")
}
#' @rdname optimisers
#' @export
#'
momentum <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("momentum")
}
#' @rdname optimisers
#' @export
#'
cg <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("cg")
}
#' @rdname optimisers
#' @export
#'
newton_cg <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("newton_cg")
}
#' @rdname optimisers
#' @export
#'
l_bfgs_b <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("l_bfgs_b")
}
#' @rdname optimisers
#' @export
#'
tnc <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("tnc")
}
#' @rdname optimisers
#' @export
#'
cobyla <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("cobyla")
}
#' @rdname optimisers
#' @export
#'
slsqp <- function(...) {
  # `...` is deliberately never evaluated; the call always errors
  optimiser_defunct_error("slsqp")
}
#' @rdname optimisers
#' @export
#'
#' @param learning_rate the size of steps (in parameter space) towards the
#' optimal value. Default value 0.01
#' @param momentum hyperparameter that accelerates gradient descent in the
#' relevant direction and dampens oscillations. Defaults to 0, which is
#' vanilla gradient descent.
#' @param nesterov Whether to apply Nesterov momentum. Defaults to FALSE.
gradient_descent <- function(learning_rate = 0.01,
                             momentum = 0,
                             nesterov = FALSE) {
  # tuning parameters recorded on the optimiser object, unchanged
  sgd_parameters <- list(
    learning_rate = learning_rate,
    momentum = momentum,
    nesterov = nesterov
  )

  define_tf_optimiser(
    name = "gradient_descent",
    method = "tf$keras$optimizers$legacy$SGD",
    parameters = sgd_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @param rho the decay rate
#' @param epsilon a small constant used to condition gradient updates
adadelta <- function(learning_rate = 0.001, rho = 1, epsilon = 1e-08) {
  # tuning parameters recorded on the optimiser object, unchanged
  adadelta_parameters <- list(
    learning_rate = learning_rate,
    rho = rho,
    epsilon = epsilon
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Adadelta"
  define_tf_optimiser(
    name = "adadelta",
    method = "tf$keras$optimizers$legacy$Adadelta",
    parameters = adadelta_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @param initial_accumulator_value initial value of the 'accumulator' used to
#' tune the algorithm
#'
adagrad <- function(learning_rate = 0.8,
                    initial_accumulator_value = 0.1,
                    epsilon = 1e-08) {
  # tuning parameters recorded on the optimiser object, unchanged
  adagrad_parameters <- list(
    learning_rate = learning_rate,
    initial_accumulator_value = initial_accumulator_value,
    epsilon = epsilon
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Adagrad"
  define_tf_optimiser(
    name = "adagrad",
    method = "tf$keras$optimizers$legacy$Adagrad",
    parameters = adagrad_parameters
  )
}
# nolint start
#' @rdname optimisers
#' @export
#'
#' @param global_step the current training step number
#' @param initial_gradient_squared_accumulator_value initial value of the
#' accumulators used to tune the algorithm
#' @param l1_regularization_strength L1 regularisation coefficient (must be 0 or
#' greater)
#' @param l2_regularization_strength L2 regularisation coefficient (must be 0 or
#' greater)
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on AdagradDAOptimizer](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/AdagradDAOptimizer) for more detail.
#'
adagrad_da <- function(learning_rate = 0.8,
                       global_step = 1L,
                       initial_gradient_squared_accumulator_value = 0.1,
                       l1_regularization_strength = 0,
                       l2_regularization_strength = 0) {
  # warn first, before any argument is evaluated
  optimiser_deprecation_warning(version = "0.6.0")

  adagrad_da_parameters <- list(
    learning_rate = learning_rate,
    global_step = global_step,
    initial_gradient_squared_accumulator_value =
      initial_gradient_squared_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )

  define_tf_compat_optimiser(
    name = "adagrad_da",
    method = "tf$compat$v1$train$AdagradDAOptimizer",
    parameters = adagrad_da_parameters
  )
}
# nolint end
#' @rdname optimisers
#' @export
#'
#' @param beta_1 exponential decay rate for the 1st moment estimates
#' @param beta_2 exponential decay rate for the 2nd moment estimates
#' @param amsgrad Boolean. Whether to apply AMSGrad variant of this algorithm
#' from the paper "On the Convergence of Adam and beyond". Defaults to FALSE.
#'
adam <- function(learning_rate = 0.1,
                 beta_1 = 0.9,
                 beta_2 = 0.999,
                 amsgrad = FALSE,
                 epsilon = 1e-08) {
  # note: the stored list puts `epsilon` before `amsgrad`, unlike the
  # argument order of adam() itself
  adam_parameters <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon,
    amsgrad = amsgrad
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Adam"
  define_tf_optimiser(
    name = "adam",
    method = "tf$keras$optimizers$legacy$Adam",
    parameters = adam_parameters
  )
}
#' @rdname optimisers
#' @export
#'
adamax <- function(learning_rate = 0.001,
                   beta_1 = 0.9,
                   beta_2 = 0.999,
                   epsilon = 1e-07) {
  # tuning parameters recorded on the optimiser object, unchanged
  adamax_parameters <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Adamax"
  define_tf_optimiser(
    name = "adamax",
    method = "tf$keras$optimizers$legacy$Adamax",
    parameters = adamax_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @param learning_rate_power power on the learning rate, must be 0 or less
#' @param l2_shrinkage_regularization_strength A float value, must be greater
#' than or equal to zero. This differs from L2 above in that the L2 above is
#' a stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
#' When input is sparse shrinkage will only happen on the active weights.
#' @param beta A float value, representing the beta value from the paper by
#' [McMahan et al 2013](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf). Defaults to 0
#'
ftrl <- function(learning_rate = 1,
                 learning_rate_power = -0.5,
                 initial_accumulator_value = 0.1,
                 l1_regularization_strength = 0,
                 l2_regularization_strength = 0,
                 l2_shrinkage_regularization_strength = 0,
                 beta = 0) {
  # tuning parameters recorded on the optimiser object, unchanged
  ftrl_parameters <- list(
    learning_rate = learning_rate,
    learning_rate_power = learning_rate_power,
    initial_accumulator_value = initial_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength,
    l2_shrinkage_regularization_strength =
      l2_shrinkage_regularization_strength,
    beta = beta
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Ftrl"
  define_tf_optimiser(
    name = "ftrl",
    method = "tf$keras$optimizers$legacy$Ftrl",
    parameters = ftrl_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on ProximalGradientDescentOptimizer](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/ProximalGradientDescentOptimizer) for more detail.
#'
proximal_gradient_descent <- function(learning_rate = 0.01,
                                      l1_regularization_strength = 0,
                                      l2_regularization_strength = 0) {
  # warn first, before any argument is evaluated
  optimiser_deprecation_warning(version = "0.6.0")

  proximal_gd_parameters <- list(
    learning_rate = learning_rate,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )

  define_tf_compat_optimiser(
    name = "proximal_gradient_descent",
    method = "tf$compat$v1$train$ProximalGradientDescentOptimizer",
    parameters = proximal_gd_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @note This optimizer isn't supported in TF2, so proceed with caution. See
#' the [TF docs on ProximalAdagradOptimizer](https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/ProximalAdagradOptimizer) for more detail.
#'
proximal_adagrad <- function(learning_rate = 1,
                             initial_accumulator_value = 0.1,
                             l1_regularization_strength = 0,
                             l2_regularization_strength = 0) {
  # warn first, before any argument is evaluated
  optimiser_deprecation_warning(version = "0.6.0")

  proximal_adagrad_parameters <- list(
    learning_rate = learning_rate,
    initial_accumulator_value = initial_accumulator_value,
    l1_regularization_strength = l1_regularization_strength,
    l2_regularization_strength = l2_regularization_strength
  )

  define_tf_compat_optimiser(
    name = "proximal_adagrad",
    method = "tf$compat$v1$train$ProximalAdagradOptimizer",
    parameters = proximal_adagrad_parameters
  )
}
#' @rdname optimisers
#' @export
#'
nadam <- function(learning_rate = 0.001,
                  beta_1 = 0.9,
                  beta_2 = 0.999,
                  epsilon = 1e-07) {
  # tuning parameters recorded on the optimiser object, unchanged
  nadam_parameters <- list(
    learning_rate = learning_rate,
    beta_1 = beta_1,
    beta_2 = beta_2,
    epsilon = epsilon
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$Nadam"
  define_tf_optimiser(
    name = "nadam",
    method = "tf$keras$optimizers$legacy$Nadam",
    parameters = nadam_parameters
  )
}
#' @rdname optimisers
#' @export
#'
#' @param centered Boolean. If TRUE, gradients are normalized by the estimated
#' variance of the gradient; if FALSE, by the uncentered second moment.
#' Setting this to TRUE may help with training, but is slightly more
#' expensive in terms of computation and memory. Defaults to FALSE.
rms_prop <- function(learning_rate = 0.1,
                     rho = 0.9,
                     momentum = 0,
                     epsilon = 1e-10,
                     centered = FALSE) {
  # tuning parameters recorded on the optimiser object, unchanged
  rms_prop_parameters <- list(
    learning_rate = learning_rate,
    rho = rho,
    momentum = momentum,
    epsilon = epsilon,
    centered = centered
  )

  # non-legacy alternative, currently disabled: "tf$keras$optimizers$RMSprop"
  define_tf_optimiser(
    name = "rms_prop",
    method = "tf$keras$optimizers$legacy$RMSprop",
    parameters = rms_prop_parameters
  )
}
# S3 print method for "optimiser" objects: an alias of `print.sampler`, which
# is not defined in this part of the file and must already exist when this
# line is evaluated.
#' @noRd
#' @export
print.optimiser <- print.sampler
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.