# mize.R
# In mize: Unconstrained Numerical Optimization Algorithms

## ----setup, include = FALSE, echo = FALSE, message = FALSE--------------------
# Set the knitr chunk options applied to the chunks in this document.
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>"
)
# Attach the package that provides mize().
library(mize)

## ----Defining a function and gradient to optimize-----------------------------
# Objective used by the examples: a list holding the function (fn) and its
# gradient (gr), both taking a parameter vector x and reading x[1] and x[2].
rb_fg <- list(
  # Scalar objective value at x.
  fn = function(x) {
    x1 <- x[1]
    x2 <- x[2]
    100 * (x2 - x1 * x1) ^ 2 + (1 - x1) ^ 2
  },
  # Two-element gradient of fn at x.
  gr = function(x) {
    x1 <- x[1]
    resid <- x[2] - x1 * x1
    c(
      -400 * x1 * resid - 2 * (1 - x1),
      200 * resid
    )
  }
)

## ----A function list with an optional fg item---------------------------------
# Same objective as a list of three functions: fn (value), gr (gradient) and
# fg, which computes both in one call and shares the intermediate terms.
rb_fg <- list(
  # Scalar objective value at x.
  fn = function(x) {
    x1 <- x[1]
    x2 <- x[2]
    100 * (x2 - x1 * x1) ^ 2 + (1 - x1) ^ 2
  },
  # Two-element gradient of fn at x.
  gr = function(x) {
    x1 <- x[1]
    resid <- x[2] - x1 * x1
    c(
      -400 * x1 * resid - 2 * (1 - x1),
      200 * resid
    )
  },
  # Value and gradient together, returned as list(fn = ..., gr = ...).
  fg = function(x) {
    x1 <- x[1]
    resid <- x[2] - x1 * x1
    slack <- 1 - x1
    value <- 100 * resid ^ 2 + slack ^ 2
    grad <- c(
      -400 * x1 * resid - 2 * slack,
      200 * resid
    )
    list(fn = value, gr = grad)
  }
)

## ----Defining a starting point------------------------------------------------
# Initial parameter vector passed as the first argument to the mize() calls.
rb0 <- c(-1.2, 1.0)

## ----Defaults-----------------------------------------------------------------
# Optimize from rb0 without setting any optional argument.
res <- mize(par = rb0, fg = rb_fg)

# Final parameter values (should be close to c(1, 1)).
res$par

# Function value at that point (should be close to 0).
res$f

# Number of iterations taken.
res$iter

# Number of function evaluations.
res$nf

# Number of gradient evaluations.
res$ng

# Reason the optimization terminated.
res$terminate

## ----Verbose mode-------------------------------------------------------------
# Short run (at most 10 iterations) with verbose output switched on.
res <- mize(
  par = rb0,
  fg = rb_fg,
  max_iter = 10,
  grad_tol = 1e-3,
  ginf_tol = 1e-3,
  verbose = TRUE
)

## ----Log every 10 iterations--------------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  grad_tol = 1e-3,
  log_every = 10,
  verbose = TRUE
)

## ----Returning stored progress------------------------------------------------
# Keep the progress record in the result and print it.
res <- mize(
  par = rb0,
  fg = rb_fg,
  log_every = 10,
  store_progress = TRUE
)
res$progress

## ----Steepest descent---------------------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "SD",
  max_iter = 10
)

## ----BFGS---------------------------------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "BFGS",
  max_iter = 10
)

## ----BFGS without scaled Hessian----------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "BFGS",
  scale_hess = FALSE,
  max_iter = 10
)

## ----LBFGS--------------------------------------------------------------------
# L-BFGS with the memory argument set to 7.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "L-BFGS",
  memory = 7,
  max_iter = 10
)

## ----LBFGS without scaled Hessian---------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "L-BFGS",
  scale_hess = FALSE,
  max_iter = 10
)

## ----CG with PR+--------------------------------------------------------------
# Conjugate gradient with no cg_update given.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  max_iter = 10
)

## ----CG with HZ+--------------------------------------------------------------
# Conjugate gradient with the "HZ+" update requested explicitly.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  cg_update = "HZ+",
  max_iter = 10
)

## ----NAG----------------------------------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "NAG",
  max_iter = 10
)

## ----NAG with 100 steps-------------------------------------------------------
# Longer run that stores progress so log(f) can be plotted against the
# number of function evaluations.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "NAG",
  max_iter = 100,
  store_progress = TRUE
)
plot(
  res$progress$nf, log(res$progress$f),
  type = "l"
)
res$f

## ----NAG with 100 steps and less aggressive momentum--------------------------
# Same run with nest_q = 0.001, kept in resq so both curves can be compared.
resq <- mize(
  par = rb0,
  fg = rb_fg,
  method = "NAG",
  nest_q = 0.001,
  max_iter = 100,
  store_progress = TRUE
)
# The y-range covers both runs; the nest_q run is overlaid in red.
plot(
  res$progress$nf, log(res$progress$f),
  type = "l",
  ylim = range(log(res$progress$f), log(resq$progress$f))
)
lines(
  resq$progress$nf, log(resq$progress$f),
  col = "red"
)
resq$f

## ----Momentum-----------------------------------------------------------------
# Momentum method with a constant mom_schedule of 0.9.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  max_iter = 10
)

## ----Momentum plot------------------------------------------------------------
# 100-iteration run with stored progress, then plot log(f) against the
# number of function evaluations and print the final f.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  max_iter = 100,
  store_progress = TRUE
)
plot(
  res$progress$nf, log(res$progress$f),
  type = "l"
)
res$f

## ----momentum with a switch function------------------------------------------
# Switch from a momentum of 0.4 to 0.8 at iteration 5.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = "switch",
  mom_init = 0.4,
  mom_final = 0.8,
  mom_switch_iter = 5,
  max_iter = 10
)

## ----momentum with a ramp function--------------------------------------------
# "ramp" schedule with the same initial (0.4) and final (0.8) momentum.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = "ramp",
  mom_init = 0.4,
  mom_final = 0.8,
  max_iter = 10
)

## ----momentum with nesterov schedule------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = "nsconvex",
  max_iter = 10
)

## ----momentum with nesterov schedule and non-zero q---------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = "nsconvex",
  nest_q = 0.001,
  max_iter = 10
)

## ----momentum with random momentum--------------------------------------------
# Custom momentum schedule: ignores both arguments and returns a single
# uniform random draw between 0 and 1.
mom_fn <- function(iter, max_iter) {
  stats::runif(1L)
}
# Pass the function mom_fn itself as the momentum schedule.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = mom_fn,
  max_iter = 10
)

## ----Simplified Nesterov momentum---------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_type = "nesterov",
  mom_schedule = 0.9,
  max_iter = 10
)

## ----Nesterov versus classical momentum---------------------------------------
# resc: run without mom_type set (labelled "classical" below).
resc <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  max_iter = 100,
  store_progress = TRUE
)
# resn: identical settings plus mom_type = "nesterov".
resn <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_type = "nesterov",
  mom_schedule = 0.9,
  max_iter = 100,
  store_progress = TRUE
)
# Best f found for Nesterov momentum
resn$f
# Best f found for classical momentum
resc$f
# Classical run drawn first, Nesterov run overlaid in red on a shared y-range.
plot(
  resc$progress$nf, log(resc$progress$f),
  type = "l",
  ylim = range(log(resc$progress$f), log(resn$progress$f))
)
lines(
  resn$progress$nf, log(resn$progress$f),
  col = "red"
)

## ----Nesterov momentum with convex approximation------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_type = "nesterov",
  mom_schedule = "nsconvex",
  nest_convex_approx = TRUE,
  max_iter = 10
)

## ----other Wolfe line search--------------------------------------------------
# Rasmussen line search
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "Rasmussen",
  max_iter = 10
)
# Use Mark Schmidt's minFunc line search
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "Schmidt",
  max_iter = 10
)
# Hager-Zhang line search
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "Hager-Zhang",
  max_iter = 10
)
# Hager-Zhang can be abbreviated to "HZ"
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "HZ",
  max_iter = 10
)
# More-Thuente can also be set explicitly
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "More-Thuente",
  max_iter = 10
)
# More-Thuente can be abbreviated to "MT"
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "MT",
  max_iter = 10
)

## ----Line search parameters---------------------------------------------------
# CG with the "HZ+" update and explicit c1 / c2 line search constants.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  cg_update = "HZ+",
  c1 = 0.1,
  c2 = 0.5,
  max_iter = 10
)

## ----Line search with slope ratio---------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  step_next_init = "slope",
  max_iter = 10
)

## ----Line search with Hager-Zhang QuadStep------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  line_search = "mt",
  step_next_init = "hz",
  max_iter = 10
)

## ----Line search with scipy initialization------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  step0 = "scipy",
  max_iter = 10
)

## ----Line search with initial step length of 1--------------------------------
# An initial guess of 1 for the step length isn't bad for L-BFGS
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "L-BFGS",
  step0 = 1,
  max_iter = 10
)

## ----BFGS with no Newton step-------------------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "BFGS",
  try_newton_step = FALSE,
  max_iter = 10
)

## ----alternative Wolfe conditions---------------------------------------------
# Rasmussen line search with standard Wolfe conditions
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "Rasmussen",
  strong_curvature = FALSE,
  max_iter = 10
)
# Hager-Zhang with strong Wolfe conditions
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "HZ",
  strong_curvature = TRUE,
  approx_armijo = FALSE,
  max_iter = 10
)
# More-Thuente with approx Armijo conditions
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "CG",
  line_search = "MT",
  approx_armijo = TRUE,
  max_iter = 10
)

## ----constant step size-------------------------------------------------------
# Steepest descent with line_search = "constant", step0 = 0.01 and
# norm_direction = TRUE.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "SD",
  norm_direction = TRUE,
  line_search = "constant",
  step0 = 0.01,
  max_iter = 10
)

## ----backtracking with cubic interpolation------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  line_search = "backtracking",
  step0 = 1,
  c1 = 0.1,
  max_iter = 10
)

## ----backtracking with halved step size---------------------------------------
res <- mize(
  par = rb0,
  fg = rb_fg,
  line_search = "backtracking",
  step0 = 1,
  step_down = 0.5,
  c1 = 0.1,
  max_iter = 10
)

## ----bold driver--------------------------------------------------------------
# increase step size by 10%, but reduce by 50%
res <- mize(
  par = rb0,
  fg = rb_fg,
  line_search = "bold",
  step0 = 1,
  step_up = 1.1,
  step_down = 0.5,
  max_iter = 10
)

## ----max line search functions------------------------------------------------
# No more than 10 gradient evaluations allowed per line search
res <- mize(
  par = rb0,
  fg = rb_fg,
  ls_max_gr = 10,
  max_iter = 10
)

## -----------------------------------------------------------------------------
# DBD method with dbd_weight = 0.5 and explicit step_up / step_down values.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "DBD",
  step0 = "rasmussen",
  step_up = 1.1,
  step_down = 0.5,
  dbd_weight = 0.5,
  max_iter = 10
)

## ----t-SNE style DBD parameters-----------------------------------------------
# DBD with step_up_fun = "+", step_up = 0.2 and step_down = 0.8.
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "DBD",
  step0 = "rasmussen",
  step_up = 0.2,
  step_up_fun = "+",
  step_down = 0.8,
  max_iter = 10
)

## -----------------------------------------------------------------------------
# DBD with rel_tol and abs_tol explicitly set
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "DBD",
  step0 = "rasmussen",
  step_up = 0.2,
  step_up_fun = "+",
  step_down = 0.8,
  max_iter = 10,
  abs_tol = 1e-8,
  rel_tol = 1e-8
)
# 10 gradient calculations as expected
res$ng
# But 10 function calculations too, only used in the tolerance check
res$nf

# Turn off rel_tol and abs_tol and let max_iter handle termination
# (grad_tol = 1e-5 is also passed in this call)
res <- mize(
  par = rb0,
  fg = rb_fg,
  method = "DBD",
  step0 = "rasmussen",
  step_up = 0.2,
  step_up_fun = "+",
  step_down = 0.8,
  max_iter = 10,
  abs_tol = NULL,
  rel_tol = NULL,
  grad_tol = 1e-5
)
# 11 gradient calculations
res$ng
# Only one function evaluation needed (to calculate res$f)
res$nf

## ----momentum with restart----------------------------------------------------
# resc: momentum run with no restart argument.
resc <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  max_iter = 100,
  store_progress = TRUE
)
# resf: same settings with restart = "fn".
resf <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  restart = "fn",
  max_iter = 100,
  store_progress = TRUE
)
# resg: same settings with restart = "gr".
resg <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  restart = "gr",
  max_iter = 100,
  store_progress = TRUE
)
# Plot log(f) against function evaluations for all three runs on a shared
# y-range: resc as the base line, resf in red, resg in blue.
plot(
  resc$progress$nf, log(resc$progress$f),
  type = "l",
  ylim = range(
    log(resc$progress$f), log(resf$progress$f),
    log(resg$progress$f)
  )
)
lines(
  resf$progress$nf, log(resf$progress$f),
  col = "red"
)
lines(
  resg$progress$nf, log(resg$progress$f),
  col = "blue"
)

## ----momentum with restart and wait time--------------------------------------
# resfw: restart = "fn" with restart_wait = 1.
resfw <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  restart = "fn",
  restart_wait = 1,
  max_iter = 100,
  store_progress = TRUE
)
# resgw: restart = "gr" with restart_wait = 1.
resgw <- mize(
  par = rb0,
  fg = rb_fg,
  method = "MOM",
  mom_schedule = 0.9,
  restart = "gr",
  restart_wait = 1,
  max_iter = 100,
  store_progress = TRUE
)
# Compare against resc, resf and resg, which must already exist when this
# chunk runs. The y-range spans all five runs.
# NOTE(review): resg contributes to ylim but is not drawn here - confirm
# whether a lines() call for it was intended.
plot(
  resc$progress$nf, log(resc$progress$f),
  type = "l",
  ylim = range(
    log(resc$progress$f), log(resf$progress$f),
    log(resg$progress$f), log(resfw$progress$f),
    log(resgw$progress$f)
  )
)
lines(
  resf$progress$nf, log(resf$progress$f),
  col = "red"
)
lines(
  resfw$progress$nf, log(resfw$progress$f),
  col = "blue"
)
lines(
  resgw$progress$nf, log(resgw$progress$f),
  col = "orange"
)