psqn_generic | R Documentation |
Optimization method for generic partially separable functions.
psqn_generic(
par,
fn,
n_ele_func,
rel_eps = 1e-08,
max_it = 100L,
n_threads = 1L,
c1 = 1e-04,
c2 = 0.9,
use_bfgs = TRUE,
trace = 0L,
cg_tol = 0.5,
strong_wolfe = TRUE,
env = NULL,
max_cg = 0L,
pre_method = 1L,
mask = as.integer(c()),
gr_tol = -1
)
psqn_aug_Lagrang_generic(
par,
fn,
n_ele_func,
consts,
n_constraints,
multipliers = as.numeric(c()),
penalty_start = 1L,
rel_eps = 1e-08,
max_it = 100L,
max_it_outer = 100L,
violations_norm_thresh = 1e-06,
n_threads = 1L,
c1 = 1e-04,
c2 = 0.9,
tau = 1.5,
use_bfgs = TRUE,
trace = 0L,
cg_tol = 0.5,
strong_wolfe = TRUE,
env = NULL,
max_cg = 0L,
pre_method = 1L,
mask = as.integer(c()),
gr_tol = -1
)
par |
Initial values for the parameters. |
fn |
Function to compute the element functions and their derivatives. Each call computes an element function. See the examples section. |
n_ele_func |
Number of element functions. |
rel_eps |
Relative convergence threshold. |
max_it |
Maximum number of iterations. |
n_threads |
Number of threads to use. |
c1 |
Thresholds for the Wolfe condition. |
c2 |
Thresholds for the Wolfe condition. |
use_bfgs |
Logical for whether to use BFGS updates or SR1 updates. |
trace |
Integer where larger values gives more information during the optimization. |
cg_tol |
Threshold for the conjugate gradient method. |
strong_wolfe |
|
env |
Environment to evaluate |
max_cg |
Maximum number of conjugate gradient iterations in each iteration. Use zero if there should not be a limit. |
pre_method |
Preconditioning method in the conjugate gradient method. Zero yields no preconditioning, one yields diagonal preconditioning, two yields the incomplete Cholesky factorization from Eigen, and three yields a block diagonal preconditioning. One and three are fast options with three seeming to work well for some poorly conditioned problems. |
mask |
zero based indices for parameters to mask (i.e. fix). |
gr_tol |
convergence tolerance for the Euclidean norm of the gradient. A negative value yields no check. |
consts |
Function to compute the constraints which must be equal to zero. See the example Section. |
n_constraints |
The number of constraints. |
multipliers |
Staring values for the multipliers in the augmented
Lagrangian method. There needs to be the same number of multipliers as the
number of constraints. An empty vector, |
penalty_start |
Starting value for the penalty parameterin the augmented Lagrangian method. |
max_it_outer |
Maximum number of augmented Lagrangian steps. |
violations_norm_thresh |
Threshold for the norm of the constraint violations. |
tau |
Multiplier used for the penalty parameter between each outer iterations. |
The function follows the method described by Nocedal and Wright (2006)
and mainly what is described in Section 7.4. Details are provided
in the psqn vignette. See vignette("psqn", package = "psqn")
.
The partially separable function we consider can be quite general and the
only restriction is that we can write the function to be minimized as a sum
of so-called element functions each of which only depends on a small number
of the parameters. A more restricted version is available through the
psqn
function.
The optimization function is also available in C++ as a header-only library. Using C++ may reduce the computation time substantially. See the vignette in the package for examples.
A list like psqn
and psqn_aug_Lagrang
.
Nocedal, J. and Wright, S. J. (2006). Numerical Optimization (2nd ed.). Springer.
Lin, C. and Moré, J. J. (1999). Incomplete Cholesky factorizations with limited memory. SIAM Journal on Scientific Computing.
# example with a GLM as in the vignette
# assign the number of parameters and number of observations
set.seed(1)
K <- 20L
n <- 5L * K
# simulate the data
truth_limit <- runif(K, -1, 1)
dat <- replicate(
n, {
# sample the indices
n_samp <- sample.int(5L, 1L) + 1L
indices <- sort(sample.int(K, n_samp))
# sample the outcome, y, and return
list(y = rpois(1, exp(sum(truth_limit[indices]))),
indices = indices)
}, simplify = FALSE)
# we need each parameter to be present at least once
stopifnot(length(unique(unlist(
lapply(dat, `[`, "indices")
))) == K) # otherwise we need to change the code
# assign the function we need to pass to psqn_generic
#
# Args:
# i cluster/element function index.
# par the parameters that this element function depends on. It has length zero
# if we need to pass the one-based indices of the parameters that the i'th
# element function depends on.
# comp_grad TRUE of the gradient should be computed.
r_func <- function(i, par, comp_grad){
z <- dat[[i]]
if(length(par) == 0L)
# return the indices
return(z$indices)
eta <- sum(par)
exp_eta <- exp(eta)
out <- -z$y * eta + exp_eta
if(comp_grad)
attr(out, "grad") <- rep(-z$y + exp_eta, length(z$indices))
out
}
# minimize the function
R_res <- psqn_generic(
par = numeric(K), fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
trace = 0L, rel_eps = 1e-9, max_it = 1000L, env = environment())
# get the same as if we had used optim
R_func <- function(x){
out <- vapply(dat, function(z){
eta <- sum(x[z$indices])
-z$y * eta + exp(eta)
}, 0.)
sum(out)
}
R_func_gr <- function(x){
out <- numeric(length(x))
for(z in dat){
idx_i <- z$indices
eta <- sum(x[idx_i])
out[idx_i] <- out[idx_i] -z$y + exp(eta)
}
out
}
opt <- optim(numeric(K), R_func, R_func_gr, method = "BFGS",
control = list(maxit = 1000L))
# we got the same
all.equal(opt$value, R_res$value)
# also works if we fix some parameters
to_fix <- c(7L, 1L, 18L)
par_fix <- numeric(K)
par_fix[to_fix] <- c(-1, -.5, 0)
R_res <- psqn_generic(
par = par_fix, fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
trace = 0L, rel_eps = 1e-9, max_it = 1000L, env = environment(),
mask = to_fix - 1L) # notice the -1L because of the zero based indices
# the equivalent optim version is
opt <- optim(
numeric(K - length(to_fix)),
function(par) { par_fix[-to_fix] <- par; R_func (par_fix) },
function(par) { par_fix[-to_fix] <- par; R_func_gr(par_fix)[-to_fix] },
method = "BFGS", control = list(maxit = 1000L))
res_optim <- par_fix
res_optim[-to_fix] <- opt$par
# we got the same
all.equal(res_optim, R_res$par, tolerance = 1e-5)
all.equal(R_res$par[to_fix], par_fix[to_fix]) # the parameters are fixed
# add equality constraints
idx_constrained <- list(c(2L, 19L, 11L, 7L), c(3L, 5L, 8L), 9:7)
# evaluates the c(x) in equalities c(x) = 0.
#
# Args:
# i constrain index.
# par the constrained parameters. It has length zero if we need to pass the
# one-based indices of the parameters that the i'th constrain depends on.
# what integer which is zero if the function should be returned and one if the
# gradient should be computed.
consts <- function(i, par, what){
if(length(par) == 0)
# need to return the indices
return(idx_constrained[[i]])
if(i == 1){
out <- exp(sum(par[1:2])) + exp(sum(par[3:4])) - 1
if(what == 1)
attr(out, "grad") <- c(rep(exp(sum(par[1:2])), 2),
rep(exp(sum(par[3:4])), 2))
} else if(i == 2){
# the parameters need to be on a circle
out <- sum(par^2) - 1
if(what == 1)
attr(out, "grad") <- 2 * par
} else if(i == 3){
out <- sum(par) - .5
if(what == 1)
attr(out, "grad") <- rep(1, length(par))
}
out
}
# optimize with the constraints and masking
res_consts <- psqn_aug_Lagrang_generic(
par = par_fix, fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
trace = 0L, rel_eps = 1e-8, max_it = 1000L, env = environment(),
consts = consts, n_constraints = length(idx_constrained),
mask = to_fix - 1L)
res_consts
# the constraints are satisfied
consts(1, res_consts$par[idx_constrained[[1]]], 0) # ~ 0
consts(2, res_consts$par[idx_constrained[[2]]], 0) # ~ 0
consts(3, res_consts$par[idx_constrained[[3]]], 0) # ~ 0
# compare with the alabama package
if(require(alabama)){
ala_fit <- auglag(
par_fix, R_func, R_func_gr,
heq = function(x){
c(x[to_fix] - par_fix[to_fix],
consts(1, x[idx_constrained[[1]]], 0),
consts(2, x[idx_constrained[[2]]], 0),
consts(3, x[idx_constrained[[3]]], 0))
}, control.outer = list(trace = 0L))
cat(sprintf("Difference in objective value is %.6f. Parametes are\n",
ala_fit$value - res_consts$value))
print(rbind(alabama = ala_fit$par,
psqn = res_consts$par))
cat("\nOutput from all.equal\n")
print(all.equal(ala_fit$par, res_consts$par))
}
# the overhead here is though quite large with the R interface from the psqn
# package. A C++ implementation is much faster as shown in
# vignette("psqn", package = "psqn"). The reason it is that it is very fast
# to evaluate the element functions in this case
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.