# Various gradient-based optimization routines: steepest descent, conjugate
# gradient, BFGS etc.
# Gradient Direction -----------------------------------------------------------
# Creates a direction sub stage
make_direction <- function(sub_stage) {
make_sub_stage(sub_stage, "direction")
}
# Steepest Descent
#
# normalize - If TRUE, then the returned direction vector is normalized to unit
# length. This can be useful for some adaptive line search methods, so that the
# total step length is purely determined by the line search value, rather than
# the product of the line search value and the magnitude of the direction.
sd_direction <- function(normalize = FALSE) {
make_direction(list(
init = function(opt, stage, sub_stage, par, fg, iter) {
opt$cache$gr_curr <- NULL
list(opt = opt)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
sub_stage$value <- -opt$cache$gr_curr
if (sub_stage$normalize) {
sub_stage$value <- normalize(sub_stage$value)
}
list(sub_stage = sub_stage)
},
normalize = normalize
))
}
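# A minimal sketch of the calculation above, free of the stage machinery
# (illustrative only, not part of the package API; the package's normalize()
# helper is assumed to divide by the Euclidean norm):
sd_direction_demo <- function(gm, normalize = FALSE) {
  pm <- -gm
  if (normalize) {
    pm <- pm / sqrt(sum(pm * pm))
  }
  pm
}
# e.g. sd_direction_demo(c(3, 4), normalize = TRUE) gives c(-0.6, -0.8)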
# Conjugate Gradient ------------------------------------------------------
# Conjugate gradient
#
# ortho_check - If TRUE, check that successive gradient vectors are
# sufficiently orthogonal. If the orthogonality check fails, the next step is
# steepest descent.
# nu - the orthogonality threshold. Used only if ortho_check is TRUE. Compared
# with |g_old . g_new| / g_new . g_new
# cg_update - Function to generate the next direction using a method of e.g.
# Fletcher-Reeves or Polak-Ribiere. Pass one of the cg_update functions
# below, e.g. pr_plus_update
# preconditioner - set to "l-bfgs" to precondition the gradient with an L-BFGS
# approximation to the inverse Hessian; memory and eps are then used by the
# preconditioner.
cg_direction <- function(ortho_check = FALSE, nu = 0.1,
cg_update = pr_plus_update,
preconditioner = "", memory = 5,
eps = .Machine$double.eps) {
  cg <- make_direction(list(
    ortho_check = ortho_check,
    nu = nu,
    cg_update = cg_update,
    cg_restart = cg_restart,
    eps = eps,
init = function(opt, stage, sub_stage, par, fg, iter) {
if (preconditioner == "l-bfgs") {
res <- lbfgs_init(opt, stage, sub_stage, par, fg, iter)
res$sub_stage$memory <- memory
res$sub_stage$eps <- eps
sub_stage$preconditioner <- res$sub_stage
}
sub_stage$value <- rep(0, length(par))
sub_stage$pm_old <- rep(0, length(par))
opt$cache$gr_curr <- NULL
opt$cache$gr_old <- NULL
list(opt = opt, sub_stage = sub_stage)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
gm_old <- opt$cache$gr_old
pm_old <- sub_stage$pm_old
# direction is initially steepest descent
pm <- -gm
# wm = Pg or just g if we're not preconditioning
wm <- gm
if (!is.null(gm_old)
&& (!sub_stage$ortho_check
|| !sub_stage$cg_restart(gm, gm_old, sub_stage$nu))) {
precondition_fn <- NULL
if (preconditioner == "l-bfgs" && !is.null(opt$cache$gr_old)) {
lbfgs <- sub_stage$preconditioner
ym <- gm - opt$cache$gr_old
sm <- opt$cache$update_old
lbfgs <- lbfgs_memory_update(lbfgs, ym, sm, lbfgs$eps)
precondition_fn <- function(rm) {
lbfgs_solve(rm, lbfgs, scale_inverse = TRUE, eps = lbfgs$eps)
}
wm <- precondition_fn(gm)
# p <- -Pg
pm <- -wm
sub_stage$preconditioner <- lbfgs
}
beta <- sub_stage$cg_update(gm, gm_old, pm_old, sub_stage$eps,
wm = wm,
preconditioner = precondition_fn
)
pm <- pm + (beta * pm_old)
descent <- dot(gm, pm)
if (descent >= 0) {
# message("Next CG direction is not a descent direction, resetting to SD")
pm <- -gm
}
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
},
after_step = function(opt, stage, sub_stage, par, fg, iter, par0,
update) {
sub_stage$pm_old <- sub_stage$value
list(sub_stage = sub_stage)
},
depends = c("gradient_old")
))
if (preconditioner == "l-bfgs") {
cg$depends <- append(cg$depends, c("update_old"))
}
cg
}
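# The essence of calculate() above, stripped of preconditioning and caching:
# an illustrative sketch (not part of the package API), assuming the
# cg_update functions and the dot() helper defined elsewhere in this package.
cg_step_demo <- function(gm, gm_old, pm_old, cg_update = pr_plus_update) {
  beta <- cg_update(gm, gm_old, pm_old)
  pm <- -gm + beta * pm_old
  if (dot(gm, pm) >= 0) {
    # not a descent direction: reset to steepest descent
    pm <- -gm
  }
  pm
}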
# CG update formulae, grouped according to their numerators, following the
# discussion in Hager and Zhang's survey paper
#
# The FR, CD and DY updates are all susceptible to "jamming": they can end up
# with very small step sizes and make little progress.
#
# Preconditioned expressions are based on those given by the Hager-Zhang
# CG survey (2006), the L-CG_DESCENT paper (2013) and the damped CG paper by
# Al-Baali, Caliciotti, Fasano and Roma (2017), except for the preconditioned
# LS, DY and PR-FR updates, which I haven't seen explicitly written down but
# which are easy to derive from the other examples.
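# For quick reference, the unpreconditioned formulae implemented below,
# written as numerator / denominator, with y = g - g_old and p_old the
# previous direction:
#   FR:  g.g / g_old.g_old    CD: -g.g / p_old.g_old    DY:  g.g / p_old.y
#   HS:  y.g / p_old.y        PR:  y.g / g_old.g_old    LS: -y.g / p_old.g_old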
# Fletcher-Reeves update.
fr_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
wm_old <- preconditioner(gm_old)
}
else {
if (is.null(wm)) {
wm <- gm
}
wm_old <- gm_old
}
dot(gm, wm) / (dot(gm_old, wm_old) + eps)
}
# Conjugate Descent update due to Fletcher.
cd_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
}
else {
if (is.null(wm)) {
wm <- gm
}
}
dot(-gm, wm) / (dot(pm_old, gm_old) + eps)
}
# The Dai-Yuan update.
dy_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
ym <- gm - gm_old
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
}
else {
if (is.null(wm)) {
wm <- gm
}
}
dot(gm, wm) / (dot(pm_old, ym) + eps)
}
# HS, PR and LS share a numerator. According to Hager and Zhang, they
# perform better in practice than the FR, CD and DY updates, despite less
# being known about their provable global convergence properties.
# The Hestenes-Stiefel update.
hs_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
ym <- gm - gm_old
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
}
  else {
    if (is.null(wm)) {
      wm <- gm
    }
  }
dot(ym, wm) / (dot(pm_old, ym) + eps)
}
# An "HS+" modification of Hestenes-Stiefel, in analogy to the "PR+" variant of
# Polak-Ribiere suggested by Powell. As far as I can tell, Hager and Zhang
# suggested this modification.
# Hager, W. W., & Zhang, H. (2006).
# A survey of nonlinear conjugate gradient methods.
# \emph{Pacific Journal of Optimization}, \emph{2}(1), 35-58.
hs_plus_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
beta <- hs_update(gm, gm_old, pm_old, eps,
wm = wm, preconditioner = preconditioner
)
max(0, beta)
}
# The Polak-Ribiere method for updating the CG direction. Also known as
# Polak-Ribiere-Polyak (PRP)
pr_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
ym <- gm - gm_old
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
wm_old <- preconditioner(gm_old)
}
else {
if (is.null(wm)) {
wm <- gm
}
wm_old <- gm_old
}
dot(wm, ym) / (dot(gm_old, wm_old) + eps)
}
# The "PR+" update due to Powell. Polak-Ribiere update, but if negative,
# restarts the CG from steepest descent. Prevents a possible lack of
# convergence when using a Wolfe line search.
pr_plus_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
beta <- pr_update(gm, gm_old, pm_old, eps,
wm = wm, preconditioner = preconditioner
)
max(0, beta)
}
# Liu-Storey update
ls_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL, preconditioner = NULL) {
ym <- gm - gm_old
if (!is.null(preconditioner)) {
if (is.null(wm)) {
wm <- preconditioner(gm)
}
}
else {
if (is.null(wm)) {
wm <- gm
}
}
dot(-ym, wm) / (dot(pm_old, gm_old) + eps)
}
# Hager-Zhang update as used in CG_DESCENT, theta = 2
hz_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
wm = NULL,
preconditioner = NULL) {
ym <- gm - gm_old
if (!is.null(preconditioner)) {
vm <- preconditioner(ym)
if (is.null(wm)) {
wm <- preconditioner(gm)
}
}
else {
vm <- ym
if (is.null(wm)) {
wm <- gm
}
}
ipy <- 1 / (dot(pm_old, ym) + eps)
(dot(ym, wm) * ipy) - 2 * dot(ym, vm) * ipy * dot(pm_old, gm) * ipy
}
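# In the unpreconditioned case the expression above reduces to the usual
# Hager-Zhang formula, beta = y.g / p.y - 2 (y.y) (p.g) / (p.y)^2, with
# y = g - g_old and p = pm_old.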
# "Restricted" Hager-Zhang update as used in CG_DESCENT to ensure
# convergence. Analogous to the PR+ and HS+ updates, but dynamically adjusts
# the lower bound as convergence occurs. Choice of eta is from the CG_DESCENT
# paper
hz_plus_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
                           wm = NULL, preconditioner = NULL) {
beta <- hz_update(gm, gm_old, pm_old, eps,
wm = wm, preconditioner = preconditioner
)
eta <- 0.01
  eta_k <- -1 / (norm2(pm_old) * min(eta, norm2(gm_old)))
max(eta_k, beta)
}
# The PR-FR update suggested by Gilbert and Nocedal (1992)
prfr_update <- function(gm, gm_old, pm_old, eps = .Machine$double.eps,
                        wm = NULL, preconditioner = NULL) {
bpr <- pr_update(gm, gm_old, pm_old,
eps = eps,
wm = wm, preconditioner = preconditioner
)
bfr <- fr_update(gm, gm_old, pm_old,
eps = eps,
wm = wm, preconditioner = preconditioner
)
if (bpr < -bfr) {
beta <- -bfr
}
else if (abs(bpr) <= bfr) {
beta <- bpr
}
else if (bpr > bfr) {
beta <- bfr
}
else {
stop("Problem in PR-FR Update")
}
beta
}
# Restart criterion due to Powell
# Checks whether successive gradient vectors are sufficiently orthogonal:
# restart (return TRUE) if |g_new . g_old| / g_new . g_new is greater than or
# equal to nu.
cg_restart <- function(g_new, g_old, nu = 0.1) {
# could only happen on first iteration
if (is.null(g_old)) {
return(TRUE)
}
ortho_test <- abs(dot(g_new, g_old)) / dot(g_new)
should_restart <- ortho_test >= nu
should_restart
}
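# Two illustrative calls (nu = 0.1): parallel successive gradients fail the
# orthogonality check and trigger a restart; orthogonal ones do not.
# cg_restart(c(1, 0), c(1, 0)) # TRUE: |g_new . g_old| / g_new . g_new = 1
# cg_restart(c(1, 0), c(0, 1)) # FALSE: the ratio is 0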
# BFGS --------------------------------------------------------------------
# The Broyden Fletcher Goldfarb Shanno method
# scale_inverse - if TRUE, scale the inverse Hessian approximation on the first
# step.
bfgs_direction <- function(eps = .Machine$double.eps,
scale_inverse = FALSE) {
make_direction(list(
eps = eps,
init = function(opt, stage, sub_stage, par, fg, iter) {
n <- length(par)
sub_stage$value <- rep(0, n)
if (!is.null(fg$hi)) {
hm <- fg$hi(par)
if (methods::is(hm, "numeric")) {
# Allow for just a vector to be passed, representing a diagonal
# approximation to the Hessian inverse. But we'll store it as the
# full matrix.
hm <- diag(hm)
}
sub_stage$hm <- hm
}
else {
sub_stage$hm <- diag(1, n)
}
opt$cache$gr_curr <- NULL
opt$cache$gr_old <- NULL
list(opt = opt, sub_stage = sub_stage)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
gm_old <- opt$cache$gr_old
if (!is.null(gm_old)) {
sm <- opt$cache$update_old
hm <- sub_stage$hm
ym <- gm - gm_old
# Nocedal suggests this heuristic for scaling the first
# approximation in Chapter 6 Section "Implementation"
# Also used in the definition of L-BFGS
if (iter == 2 && scale_inverse) {
gamma <- dot(sm, ym) / dot(ym)
hm <- gamma * hm
}
sub_stage$hm <- bfgs_update(hm, sm, ym, eps = sub_stage$eps)
}
pm <- as.vector(-sub_stage$hm %*% gm)
descent <- dot(gm, pm)
if (descent >= 0) {
pm <- -gm
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
},
depends = c("gradient_old", "update_old")
))
}
# H - inverse Hessian
# s - difference in iterates x
# y - difference in gradient of x
bfgs_update <- function(hm, sm, ym, eps) {
rho <- 1 / (dot(ym, sm) + eps)
im <- diag(1, nrow(hm))
rss <- rho * outer(sm, sm)
irsy <- im - rho * outer(sm, ym)
irys <- im - rho * outer(ym, sm)
(irsy %*% (hm %*% irys)) + rss
}
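# Sanity-check sketch for the update (illustrative only, not part of the
# package API): the updated approximation satisfies the secant equation
# H_new y = s exactly when eps = 0.
bfgs_secant_demo <- function() {
  sm <- c(1, 2)
  ym <- c(3, 1)
  hm_new <- bfgs_update(diag(2), sm, ym, eps = 0)
  as.vector(hm_new %*% ym) # equals sm, i.e. c(1, 2)
}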
# SR1 ---------------------------------------------------------------------
# The Symmetric-Rank-1 Update. See section 6.2 of Nocedal and Wright.
# A Broyden-class Quasi Newton method, like BFGS, but using a rank-1 matrix
# to update the inverse Hessian approximation (i.e. the outer product of a
# vector).
# The good news: SR1 updates are sometimes better approximations to the Hessian
# than BFGS (according to Nocedal and Wright).
# The bad news: SR1 updates can contain a division by zero.
# The less-bad news: we can detect this and use the last Hessian approximation
# in these cases.
# The bad news 2: SR1 updates are not guaranteed to result in a positive
# definite Hessian, i.e. you won't always get a descent direction. As a result,
# Nocedal & Wright suggest using it with a trust-region approach rather than
# line search. Here we simply replace the SR1 update with the BFGS version to
# ensure a descent direction.
sr1_direction <- function(eps = .Machine$double.eps,
scale_inverse = FALSE, skip_bad_update = TRUE,
tol = 1e-8, try_bfgs = TRUE) {
make_direction(list(
eps = eps,
init = function(opt, stage, sub_stage, par, fg, iter) {
n <- length(par)
sub_stage$value <- rep(0, n)
if (!is.null(fg$hi)) {
hm <- fg$hi(par)
if (methods::is(hm, "numeric")) {
# Allow for just a vector to be passed, representing a diagonal
# approximation to the Hessian inverse. But we'll store it as the
# full matrix.
hm <- diag(hm)
}
sub_stage$hm <- hm
}
else {
sub_stage$hm <- diag(1, n)
}
opt$cache$gr_curr <- NULL
opt$cache$gr_old <- NULL
list(opt = opt, sub_stage = sub_stage)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
gm_old <- opt$cache$gr_old
if (!is.null(gm_old)) {
sm <- opt$cache$update_old
hm <- sub_stage$hm
ym <- gm - gm_old
if (iter == 2 && scale_inverse) {
# Nocedal suggests this heuristic for scaling the first
# approximation in Chapter 6 Section "Implementation" for BFGS
# Also used in the definition of L-BFGS
gamma <- dot(sm, ym) / dot(ym)
hm <- gamma * hm
}
shy <- as.vector(sm - hm %*% ym)
up_den <- dot(shy, ym)
# Skip rule based on 6.26 in Nocedal & Wright
# Their version uses strict < rather than <=, but we want to catch
# the case where one of shy or ym is exactly zero
if (skip_bad_update && abs(up_den) <= tol * norm2(shy) * norm2(ym)) {
# message("SR1: skipping bad update")
sub_stage$hm <- hm
}
else {
sub_stage$hm <- hm + outer(shy, shy) / up_den
}
}
pm <- as.vector(-sub_stage$hm %*% gm)
descent <- dot(gm, pm)
if (descent >= 0) {
if (try_bfgs) {
# message("SR1 iter ", iter, " not a descent direction, trying BFGS")
hm_bfgs <- bfgs_update(hm, sm, ym, sub_stage$eps)
pm <- as.vector(-hm_bfgs %*% gm)
descent <- dot(gm, pm)
if (descent >= 0) {
# This should never happen: BFGS H should always be pd.
pm <- -gm
}
else {
# Only store the BFGS H for next iteration if it's a descent
sub_stage$hm <- hm_bfgs
}
}
else {
pm <- -gm
}
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
},
depends = c("gradient_old", "update_old")
))
}
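# The rank-1 update used in calculate() above, as a standalone sketch
# (illustrative only, not part of the package API; uses the dot() helper
# defined elsewhere in this package). Like BFGS it satisfies the secant
# equation H_new y = s, but the result need not be positive definite.
sr1_update_demo <- function(hm, sm, ym) {
  shy <- as.vector(sm - hm %*% ym)
  hm + outer(shy, shy) / dot(shy, ym)
}
# e.g. sr1_update_demo(diag(2), c(1, 2), c(3, 1)) %*% c(3, 1) gives c(1, 2)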
# L-BFGS ------------------------------------------------------------------
lbfgs_init <- function(opt, stage, sub_stage, par, fg, iter) {
opt$cache$gr_curr <- NULL
opt$cache$gr_old <- NULL
n <- length(par)
sub_stage$value <- rep(0, n)
sub_stage$rhos <- c()
sub_stage$sms <- c()
sub_stage$yms <- c()
list(opt = opt, sub_stage = sub_stage)
}
# Initial guess for solving Bp = q for p by using an approximation
# for H, the inverse of B, and calculating Hq directly.
# You can provide the inverse Hessian function fg$hi and par, which can return
# either a matrix (not a great idea if it's dense), or a vector, in which case
# it's assumed to represent a diagonal matrix.
# Or you can set scale_inverse to TRUE and the usual L-BFGS guess for H as
# given by Nocedal and Wright is used; rho and eps must not be NULL in that
# case.
# Or you can set scale_inverse = FALSE and you will get q back directly, i.e.
# H = I is assumed. This is also what happens on the first iteration, even
# when scale_inverse = TRUE.
lbfgs_guess <- function(qm, scale_inverse = TRUE,
rho = NULL, ym = NULL, eps = NULL,
fg = NULL, par = NULL) {
# User-defined hessian inverse approximations
if (!is.null(fg) && !is.null(fg$hi)) {
hm <- fg$hi(par)
if (methods::is(hm, "matrix")) {
# Full matrix: not necessarily a great idea for memory usage
pm <- hm %*% qm
}
else {
# It's a vector representing a diagonal matrix
pm <- hm * qm
}
}
else {
# The usual L-BFGS guess
if (!is.null(rho) && !is.null(ym) && scale_inverse) {
# Eqn 7.20 in Nocedal & Wright
gamma <- 1 / (rho * (dot(ym) + eps))
hm <- rep(gamma, length(ym))
pm <- hm * qm
}
else {
# Effectively H = I
# This will happen on the first iteration
pm <- qm
}
}
pm
}
# Solve Bp = q for p by the two-loop recursion. If fg and par are non-NULL
# and fg has the Hessian inverse function hi defined, it will be used to
# initialize the guess for p. Otherwise, if scale_inverse = TRUE, the
# usual L-BFGS guess is used. Otherwise, q is used.
# Note that we usually want to find p = -Hg, so usually take the negative of
# the return value
lbfgs_solve <- function(qm, lbfgs_state, scale_inverse, eps,
fg = NULL, par = NULL) {
sms <- lbfgs_state$sms
yms <- lbfgs_state$yms
rhos <- lbfgs_state$rhos
alphas <- rep(0, length(rhos))
# loop backwards latest values first
for (i in length(rhos):1) {
alphas[i] <- rhos[[i]] * dot(sms[[i]], qm)
qm <- qm - alphas[[i]] * yms[[i]]
}
# Choose estimate of inverse Hessian approximation
pm <- lbfgs_guess(
qm, scale_inverse, rhos[[length(rhos)]], yms[[length(yms)]],
eps, fg, par
)
# loop forwards
for (i in 1:length(rhos)) {
beta <- rhos[[i]] * dot(yms[[i]], pm)
pm <- pm + sms[[i]] * (alphas[[i]] - beta)
}
pm
}
# Update lbfgs_state with new ym, sm, rho, removing old values if we've
# reached the memory limit
# One real calculation (rho), otherwise just lots of boring housekeeping
lbfgs_memory_update <- function(lbfgs_state, ym, sm, eps) {
rho <- 1 / (dot(ym, sm) + eps)
rhos <- lbfgs_state$rhos
sms <- lbfgs_state$sms
yms <- lbfgs_state$yms
# discard oldest values if we've reached memory limit
if (length(sms) == lbfgs_state$memory) {
sms <- sms[2:length(sms)]
yms <- yms[2:length(yms)]
rhos <- rhos[2:length(rhos)]
}
# append latest values to memory
sms <- c(sms, list(sm))
yms <- c(yms, list(ym))
rhos <- c(rhos, list(rho))
lbfgs_state$sms <- sms
lbfgs_state$yms <- yms
lbfgs_state$rhos <- rhos
lbfgs_state
}
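# Equivalence sketch (illustrative only, not part of the package API): with
# H0 = I (scale_inverse = FALSE) and a single (s, y) pair in memory, the
# two-loop recursion reproduces the explicit BFGS update applied to q
# (taking eps = 0 in both).
lbfgs_two_loop_demo <- function() {
  sm <- c(1, 2)
  ym <- c(3, 1)
  qm <- c(-1, 4)
  state <- lbfgs_memory_update(list(memory = 1), ym, sm, eps = 0)
  p1 <- lbfgs_solve(qm, state, scale_inverse = FALSE, eps = 0)
  p2 <- as.vector(bfgs_update(diag(2), sm, ym, eps = 0) %*% qm)
  max(abs(p1 - p2)) # effectively zero
}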
# The Limited Memory BFGS method
#
# memory - The number of previous updates to store.
# scale_inverse - if TRUE, scale the inverse Hessian approximation at each step.
lbfgs_direction <- function(memory = 5, scale_inverse = FALSE,
eps = .Machine$double.eps) {
if (memory < 1) {
stop("memory must be > 0")
}
make_direction(list(
memory = memory,
k = 0,
eps = eps,
init = lbfgs_init,
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
gm_old <- opt$cache$gr_old
if (is.null(gm_old)) {
        # On the first iteration, use steepest descent, unless a user-supplied
        # inverse Hessian approximation is available in fg
pm <- lbfgs_guess(-gm, fg = fg, par = par)
}
else {
# Update the memory
# y_{k-1}, s_{k-1} using notation in Nocedal and Wright
ym <- gm - gm_old
sm <- opt$cache$update_old
sub_stage <- lbfgs_memory_update(sub_stage, ym, sm, sub_stage$eps)
# Solve Bp = -g with updated memory
# lbfgs_solve returns Hg, so take negative of result
pm <- -lbfgs_solve(gm, sub_stage, scale_inverse, sub_stage$eps, fg, par)
}
descent <- dot(gm, pm)
if (descent >= 0) {
pm <- -gm
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
},
depends = c("gradient_old", "update_old")
))
}
# Newton Method -----------------------------------------------------------
# Newton method. Requires the Hessian to be calculated, via a function hs in fg.
newton_direction <- function(try_safe_chol = FALSE) {
make_direction(list(
init = function(opt, stage, sub_stage, par, fg, iter) {
opt$cache$gr_curr <- NULL
list(opt = opt)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
if (!is.null(fg$hs)) {
# B, an approximation to the (or the exact) Hessian
# We now need to solve Bp = -g for p
bm <- fg$hs(par)
if (methods::is(bm, "matrix")) {
if (try_safe_chol) {
rm <- safe_chol(bm)
}
else {
chol_result <- try(
{
# O(N^3)
rm <- chol(bm)
},
silent = TRUE
)
if (methods::is(chol_result, "try-error")) {
rm <- NULL
}
}
if (is.null(rm)) {
# message("Hessian is not positive-definite, resetting to SD")
pm <- -gm
}
else {
# Forward and back solving is "only" O(N^2)
pm <- hessian_solve(rm, gm)
}
}
else {
# vector: assume it's a diagonal approximation of B
pm <- -(1 / bm) * gm
}
}
else if (!is.null(fg$hi)) {
# H, an approximation to (or exact) inverse of the Hessian
        hm <- fg$hi(par)
if (methods::is(hm, "matrix")) {
pm <- -hm %*% gm
}
else {
# vector: assume it's a diagonal approximation of H
pm <- -hm * gm
}
}
else {
stop("No hi or hs function available for Hessian information")
}
descent <- dot(gm, pm)
if (descent >= 0) {
pm <- -gm
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
}
))
}
# A partial Hessian approach: calculates the Cholesky decomposition of the
# Hessian (or some approximation) on the first iteration only (when
# hessian_every = 0), or every hessian_every iterations otherwise. Steps in
# between solve using the stored decomposition and the current gradient.
partial_hessian_direction <- function(hessian_every = 0) {
make_direction(list(
init = function(opt, stage, sub_stage, par, fg, iter) {
if (hessian_every == 0) {
hm <- fg$hs(par)
sub_stage$rm <- chol(hm)
}
list(sub_stage = sub_stage)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
if (hessian_every > 0 && iter %% hessian_every == 0) {
hm <- fg$hs(par)
sub_stage$rm <- chol(hm)
}
gm <- opt$cache$gr_curr
rm <- sub_stage$rm
pm <- hessian_solve(rm, gm)
descent <- dot(gm, pm)
if (descent >= 0) {
pm <- -gm
}
sub_stage$value <- pm
list(sub_stage = sub_stage)
}
))
}
# Solves U'Ux = b
# U is upper triangular; we need to solve U'(Ux) = b
# Step 1: Solve U'y = b, which is a forward solve
# Step 2: Solve Ux = y, which is a back solve
# The explicit transpose in Step 1 can be avoided
# by passing the transpose argument to backsolve
upper_solve <- function(um, bm) {
backsolve(um, backsolve(um, bm, transpose = TRUE))
}
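# Quick check of the two triangular solves against base R's solve(), using a
# small symmetric positive definite matrix (illustrative only, not part of
# the package API):
upper_solve_demo <- function() {
  am <- matrix(c(4, 1, 0, 1, 3, 1, 0, 1, 2), nrow = 3) # SPD
  um <- chol(am) # am == t(um) %*% um
  bm <- c(1, 2, 3)
  max(abs(upper_solve(um, bm) - solve(am, bm))) # effectively zero
}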
# Given the upper triangular Cholesky decomposition of the Hessian (or an
# approximation), U, and the gradient vector, g, solve U'Up = -g
hessian_solve <- function(um, gm) {
# This monkeying with dimensions is a tiny hack for the case of a repeated
# block diagonal Hessian, so that you only need to provide the N x N block.
nucol <- ncol(um)
ngcol <- length(gm) / nucol
dim(gm) <- c(nucol, ngcol)
pm <- upper_solve(um, -gm)
dim(pm) <- NULL
pm
}
# Attempts to ensure a safe Cholesky decomposition of a Hessian by detecting
# a failure, rebuilding the Hessian by setting negative eigenvalues to a small
# positive value and then trying again. Not a fast procedure!
#
# Suggested by
# https://www.r-bloggers.com/fixing-non-positive-definite-correlation-matrices-using-r-2/
# Refs:
# Brissette, F. P., Khalili, M., & Leconte, R. (2007).
# Efficient stochastic generation of multi-site synthetic precipitation data.
# Journal of Hydrology, 345(3), 121-133.
# https://www.etsmtl.ca/getattachment/Unites-de-recherche/Drame/Publications/Brissette_al07---JH.pdf
#
# Rebonato, R., & Jaeckel, P. (2011).
# The most general methodology to create a valid correlation matrix for risk
# management and option pricing purposes.
# doi 10.21314/JOR.2000.023
safe_chol <- function(hm, eps = 1e-10) {
rm <- NULL
chol_result <- try(
{
# O(N^3)
rm <- chol(hm)
},
silent = TRUE
)
if (methods::is(chol_result, "try-error")) {
# Also O(N^3)
eig <- eigen(hm)
    eig$values[eig$values < 0] <- eps
hm <- eig$vectors %*% (eig$values * diag(nrow(hm))) %*% t(eig$vectors)
chol_result <- try(
{
rm <- chol(hm)
},
silent = TRUE
)
}
rm
}
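# Illustrative call: this matrix has eigenvalues 3 and -1, so chol() fails,
# but safe_chol() returns the factor of a nearby positive definite matrix.
# safe_chol(matrix(c(1, 2, 2, 1), nrow = 2))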
# Truncated Newton --------------------------------------------------------
# Truncated Newton, aka "Line Search Newton-CG"
# See Nocedal & Wright Chapter 7, algorithm 7.1
# L-BFGS preconditioner suggested e.g. by Hager & Zhang (2006)
# also implemented in minfunc
tn_direction <- function(init = 0, exit_criterion = "curvature",
preconditioner = "", memory = 5,
eps = .Machine$double.eps) {
tn <- make_direction(list(
init = function(opt, stage, sub_stage, par, fg, iter) {
if (preconditioner == "l-bfgs") {
res <- lbfgs_init(opt, stage, sub_stage, par, fg, iter)
res$sub_stage$memory <- memory
res$sub_stage$eps <- eps
sub_stage$preconditioner <- res$sub_stage
}
list(sub_stage = sub_stage)
},
calculate = function(opt, stage, sub_stage, par, fg, iter) {
gm <- opt$cache$gr_curr
precondition_fn <- NULL
# In the context of non-linear CG, Hager and Zhang (2006) mention
# using the approximation of the inverse Hessian as a preconditioner P
# replacing g with Pg and using L-BFGS specifically.
# In linear CG, use r instead of g
if (preconditioner == "l-bfgs" && !is.null(opt$cache$gr_old)) {
lbfgs <- sub_stage$preconditioner
ym <- gm - opt$cache$gr_old
sm <- opt$cache$update_old
lbfgs <- lbfgs_memory_update(lbfgs, ym, sm, lbfgs$eps)
precondition_fn <- function(rm) {
# This solves Bp = r for p => p = Hr which is what we want,
# so don't take negative (as needed for L-BFGS solution of pm)
lbfgs_solve(rm, lbfgs, scale_inverse = TRUE, eps = lbfgs$eps)
}
sub_stage$preconditioner <- lbfgs
}
if (init == 0) {
# Standard initialization. Saves on a finite difference gradient
# calculation because we know the residual is g
zm <- 0
}
else if (init == "l-bfgs" && preconditioner == "l-bfgs") {
# Use the L-BFGS guess as we've done a lot of the work already
# A potentially bad idea of my own invention
if (!is.null(opt$cache$gr_old)) {
lbfgs <- sub_stage$preconditioner
zm <- -lbfgs_solve(gm, lbfgs, scale_inverse = TRUE, eps = lbfgs$eps)
}
else {
zm <- 0
}
}
else {
# "previous", i.e. use the result from the last iteration
# Martens (2010)
if (!is.null(sub_stage$value)) {
zm <- sub_stage$value
}
else {
zm <- 0
}
}
inner_res <- tn_inner_cg(opt, fg, par, gm, precondition_fn,
zm = zm,
exit_criterion = exit_criterion
)
opt <- inner_res$opt
pm <- inner_res$zm
descent <- dot(gm, pm)
if (descent >= 0) {
# message("TN: Not a descent direction")
pm <- -gm
}
sub_stage$value <- pm
list(opt = opt, sub_stage = sub_stage)
}
))
if (preconditioner == "l-bfgs") {
tn$depends <- append(tn$depends, c("gradient_old", "update_old"))
}
tn
}
# Linear CG inner loop of truncated Newton
# Inner loop from Nocedal & Wright Chapter 7, Algo 7.1
# with modification for preconditioner as in Chapter 5, Algo 5.3
# Gives up when negative curvature or convergence occurs
# returns a list with opt (containing updated gradient count)
# and zm, the final estimate of pm solving Bp = -g
# max_iter Maximum number of iterations for the inner CG loop. With exact
# arithmetic, we would expect the loop to terminate in N steps at most,
# but rounding errors can prevent that and we want to stop way before that
# anyway. Default is from TNPACK (Xie and Schlick 1999).
# exit_criterion: either "curvature" (check for negative curvature) or
# "strong", a comparison of descent directions used in TNPACK (Xie and
# Schlick 1999)
tn_inner_cg <- function(opt, fg, par, gm, preconditioner = NULL, zm = 0,
exit_criterion = "curvature", max_iter = 40) {
gn <- norm2(gm)
eps <- min(0.5, sqrt(gn)) * gn
if (length(zm) == 1 && zm == 0) {
rm <- gm
}
else {
if (opt$counts$gr >= opt$convergence$max_gr) {
zm <- -gm
return(list(
opt = opt,
zm = zm
))
}
Bd <- bd_approx(fg, par, zm, gm)
opt$counts$gr <- opt$counts$gr + 1
rm <- Bd + gm
}
if (!is.null(preconditioner)) {
ym <- preconditioner(rm)
}
else {
ym <- rm
}
dm <- -ym
dot_rm_ym <- dot(rm, ym)
j <- 0
while (j < max_iter) {
if (opt$counts$gr >= opt$convergence$max_gr) {
if (j == 0) {
zm <- -gm
}
return(list(
opt = opt,
zm = zm
))
}
Bd <- bd_approx(fg, par, dm, gm)
opt$counts$gr <- opt$counts$gr + 1
dBd <- dot(dm, Bd)
if (exit_criterion == "curvature") {
if (dBd <= 0) {
# -ve curvature, bail out
if (j == 0) {
# Use steepest descent if the initial Bd estimate is unusable
zm <- -gm
}
break
}
}
alpha <- dot_rm_ym / dBd
# only used in the strong curvature criterion
zm_old <- zm
zm <- zm + alpha * dm
if (exit_criterion == "strong") {
# alternative exit criterion used in TNPACK
if (dot(gm, zm) >= dot(gm, zm_old) + 1e-15) {
if (j == 0) {
zm <- -gm
}
else {
zm <- zm_old
}
break
}
}
rm <- rm + alpha * Bd
if (norm2(rm) < eps) {
break
}
if (!is.null(preconditioner)) {
ym <- preconditioner(rm)
}
else {
ym <- rm
}
dot_rm_ym_new <- dot(rm, ym)
beta <- dot_rm_ym_new / dot_rm_ym
dm <- beta * dm - ym
dot_rm_ym <- dot_rm_ym_new
j <- j + 1
}
list(
opt = opt,
zm = zm
)
}
# Finite difference approximation of the Hessian-vector product, Bd
# The default step size, h, is suggested by
# Andrei, N. (2009). Accelerated conjugate gradient algorithm with finite
# difference Hessian/vector product approximation for unconstrained
# optimization. Journal of Computational and Applied Mathematics, 230(2),
# 570-582.
# Found in:
# http://timvieira.github.io/blog/post/2014/02/10/gradient-vector-product/
# Something similar is used in minfunc
bd_approx <- function(fg, par, dm, gm,
h = 2 * sqrt(.Machine$double.eps) *
(1 + norm2(par)) / norm2(dm)) {
g_fwd <- fg$gr(par + h * dm)
(g_fwd - gm) / h
}
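# Illustrative check (not part of the package API): on a quadratic
# f(x) = 0.5 x' A x the gradient is A x and the Hessian-vector product is
# exactly A d, so the approximation should closely match A %*% d.
bd_approx_demo <- function() {
  am <- matrix(c(4, 1, 1, 3), nrow = 2)
  fg <- list(gr = function(x) as.vector(am %*% x))
  par <- c(1, -1)
  dm <- c(0.5, 2)
  bd_approx(fg, par, dm, gm = fg$gr(par)) # close to as.vector(am %*% dm)
}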
# Gradient Dependencies ------------------------------------------------------------
# Calculate the gradient at par.
require_gradient <- function(opt, stage, par, fg, iter) {
if (!has_gr_curr(opt, iter)) {
opt <- calc_gr_curr(opt, par, fg$gr, iter)
if (any(!is.finite(opt$cache$gr_curr))) {
opt$terminate <- list(
what = "gr_inf",
val = Inf
)
opt$is_terminated <- TRUE
}
}
list(opt = opt)
}
attr(require_gradient, "event") <- "before gradient_descent"
attr(require_gradient, "name") <- "gradient"
# After each step, caches the current gradient as the old gradient.
require_gradient_old <- function(opt, par, fg, iter, par0, update) {
opt$cache$gr_old <- opt$cache$gr_curr
opt
}
attr(require_gradient_old, "event") <- "after step"
attr(require_gradient_old, "name") <- "gradient_old"