| psqn_generic | R Documentation | 
Optimization method for generic partially separable functions.
psqn_generic(
  par,
  fn,
  n_ele_func,
  rel_eps = 1e-08,
  max_it = 100L,
  n_threads = 1L,
  c1 = 1e-04,
  c2 = 0.9,
  use_bfgs = TRUE,
  trace = 0L,
  cg_tol = 0.5,
  strong_wolfe = TRUE,
  env = NULL,
  max_cg = 0L,
  pre_method = 1L,
  mask = as.integer(c()),
  gr_tol = -1
)
psqn_aug_Lagrang_generic(
  par,
  fn,
  n_ele_func,
  consts,
  n_constraints,
  multipliers = as.numeric(c()),
  penalty_start = 1L,
  rel_eps = 1e-08,
  max_it = 100L,
  max_it_outer = 100L,
  violations_norm_thresh = 1e-06,
  n_threads = 1L,
  c1 = 1e-04,
  c2 = 0.9,
  tau = 1.5,
  use_bfgs = TRUE,
  trace = 0L,
  cg_tol = 0.5,
  strong_wolfe = TRUE,
  env = NULL,
  max_cg = 0L,
  pre_method = 1L,
  mask = as.integer(c()),
  gr_tol = -1
)
| par | Initial values for the parameters. | 
| fn | Function to compute the element functions and their derivatives. Each call computes an element function. See the examples section. | 
| n_ele_func | Number of element functions. | 
| rel_eps | Relative convergence threshold. | 
| max_it | Maximum number of iterations. | 
| n_threads | Number of threads to use. | 
| c1 | Thresholds for the Wolfe condition. | 
| c2 | Thresholds for the Wolfe condition. | 
| use_bfgs | Logical for whether to use BFGS updates or SR1 updates. | 
| trace | Integer where larger values gives more information during the optimization. | 
| cg_tol | Threshold for the conjugate gradient method. | 
| strong_wolfe | 
 | 
| env | Environment to evaluate  | 
| max_cg | Maximum number of conjugate gradient iterations in each iteration. Use zero if there should not be a limit. | 
| pre_method | Preconditioning method in the conjugate gradient method. Zero yields no preconditioning, one yields diagonal preconditioning, two yields the incomplete Cholesky factorization from Eigen, and three yields a block diagonal preconditioning. One and three are fast options with three seeming to work well for some poorly conditioned problems. | 
| mask | zero based indices for parameters to mask (i.e. fix). | 
| gr_tol | convergence tolerance for the Euclidean norm of the gradient. A negative value yields no check. | 
| consts | Function to compute the constraints which must be equal to zero. See the example Section. | 
| n_constraints | The number of constraints. | 
| multipliers | Staring values for the multipliers in the augmented
Lagrangian method. There needs to be the same number of multipliers as the
number of constraints. An empty vector,  | 
| penalty_start | Starting value for the penalty parameterin the augmented Lagrangian method. | 
| max_it_outer | Maximum number of augmented Lagrangian steps. | 
| violations_norm_thresh | Threshold for the norm of the constraint violations. | 
| tau | Multiplier used for the penalty parameter between each outer iterations. | 
The function follows the method described by Nocedal and Wright (2006)
and mainly what is described in Section 7.4. Details are provided
in the psqn vignette. See vignette("psqn", package = "psqn").
The partially separable function we consider can be quite general and the
only restriction is that we can write the function to be minimized as a sum
of so-called element functions each of which only depends on a small number
of the parameters. A more restricted version is available through the
psqn function.
The optimization function is also available in C++ as a header-only library. Using C++ may reduce the computation time substantially. See the vignette in the package for examples.
A list like psqn and psqn_aug_Lagrang.
Nocedal, J. and Wright, S. J. (2006). Numerical Optimization (2nd ed.). Springer.
Lin, C. and Moré, J. J. (1999). Incomplete Cholesky factorizations with limited memory. SIAM Journal on Scientific Computing.
# example with a GLM as in the vignette
# assign the number of parameters and number of observations
set.seed(1)
K <- 20L
n <- 5L * K
# simulate the data
truth_limit <- runif(K, -1, 1)
dat <- replicate(
  n, {
    # sample the indices
    n_samp <- sample.int(5L, 1L) + 1L
    indices <- sort(sample.int(K, n_samp))
    # sample the outcome, y, and return
    list(y = rpois(1, exp(sum(truth_limit[indices]))),
         indices = indices)
  }, simplify = FALSE)
# we need each parameter to be present at least once
stopifnot(length(unique(unlist(
  lapply(dat, `[`, "indices")
))) == K) # otherwise we need to change the code
# assign the function we need to pass to psqn_generic
#
# Args:
#   i cluster/element function index.
#   par the parameters that this element function depends on. It has length zero
#       if we need to pass the one-based indices of the parameters that the i'th
#       element function depends on.
#   comp_grad TRUE of the gradient should be computed.
r_func <- function(i, par, comp_grad){
  z <- dat[[i]]
  if(length(par) == 0L)
    # return the indices
    return(z$indices)
  eta <- sum(par)
  exp_eta <- exp(eta)
  out <- -z$y * eta + exp_eta
  if(comp_grad)
    attr(out, "grad") <- rep(-z$y + exp_eta, length(z$indices))
  out
}
# minimize the function
R_res <- psqn_generic(
  par = numeric(K), fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
  trace = 0L, rel_eps = 1e-9, max_it = 1000L, env = environment())
# get the same as if we had used optim
R_func <- function(x){
  out <- vapply(dat, function(z){
    eta <- sum(x[z$indices])
    -z$y * eta + exp(eta)
  }, 0.)
  sum(out)
}
R_func_gr <- function(x){
  out <- numeric(length(x))
  for(z in dat){
    idx_i <- z$indices
    eta <- sum(x[idx_i])
    out[idx_i] <- out[idx_i] -z$y + exp(eta)
  }
  out
}
opt <- optim(numeric(K), R_func, R_func_gr, method = "BFGS",
             control = list(maxit = 1000L))
# we got the same
all.equal(opt$value, R_res$value)
# also works if we fix some parameters
to_fix <- c(7L, 1L, 18L)
par_fix <- numeric(K)
par_fix[to_fix] <- c(-1, -.5, 0)
R_res <- psqn_generic(
  par = par_fix, fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
  trace = 0L, rel_eps = 1e-9, max_it = 1000L, env = environment(),
  mask = to_fix - 1L) # notice the -1L because of the zero based indices
# the equivalent optim version is
opt <- optim(
  numeric(K - length(to_fix)),
  function(par) { par_fix[-to_fix] <- par; R_func   (par_fix) },
  function(par) { par_fix[-to_fix] <- par; R_func_gr(par_fix)[-to_fix] },
  method = "BFGS", control = list(maxit = 1000L))
res_optim <- par_fix
res_optim[-to_fix] <- opt$par
# we got the same
all.equal(res_optim, R_res$par, tolerance = 1e-5)
all.equal(R_res$par[to_fix], par_fix[to_fix]) # the parameters are fixed
# add equality constraints
idx_constrained <- list(c(2L, 19L, 11L, 7L), c(3L, 5L, 8L), 9:7)
# evaluates the c(x) in equalities c(x) = 0.
#
# Args:
#   i constrain index.
#   par the constrained parameters. It has length zero if we need to pass the
#       one-based indices of the parameters that the i'th constrain depends on.
#   what integer which is zero if the function should be returned and one if the
#        gradient should be computed.
consts <- function(i, par, what){
  if(length(par) == 0)
    # need to return the indices
    return(idx_constrained[[i]])
  if(i == 1){
    out <- exp(sum(par[1:2])) + exp(sum(par[3:4])) - 1
    if(what == 1)
      attr(out, "grad") <- c(rep(exp(sum(par[1:2])), 2),
                             rep(exp(sum(par[3:4])), 2))
  } else if(i == 2){
    # the parameters need to be on a circle
    out <- sum(par^2) - 1
    if(what == 1)
      attr(out, "grad") <- 2 * par
  } else if(i == 3){
    out <- sum(par) - .5
    if(what == 1)
      attr(out, "grad") <- rep(1, length(par))
  }
  out
}
# optimize with the constraints and masking
res_consts <- psqn_aug_Lagrang_generic(
  par = par_fix, fn = r_func, n_ele_func = length(dat), c1 = 1e-4, c2 = .1,
  trace = 0L, rel_eps = 1e-8, max_it = 1000L, env = environment(),
  consts = consts, n_constraints = length(idx_constrained),
  mask = to_fix - 1L)
res_consts
# the constraints are satisfied
consts(1, res_consts$par[idx_constrained[[1]]], 0) # ~ 0
consts(2, res_consts$par[idx_constrained[[2]]], 0) # ~ 0
consts(3, res_consts$par[idx_constrained[[3]]], 0) # ~ 0
# compare with the alabama package
if(require(alabama)){
    ala_fit <- auglag(
      par_fix, R_func, R_func_gr,
      heq = function(x){
        c(x[to_fix] - par_fix[to_fix],
          consts(1, x[idx_constrained[[1]]], 0),
          consts(2, x[idx_constrained[[2]]], 0),
          consts(3, x[idx_constrained[[3]]], 0))
      }, control.outer = list(trace = 0L))
    cat(sprintf("Difference in objective value is %.6f. Parametes are\n",
                ala_fit$value - res_consts$value))
    print(rbind(alabama = ala_fit$par,
                psqn = res_consts$par))
    cat("\nOutput from all.equal\n")
    print(all.equal(ala_fit$par, res_consts$par))
}
# the overhead here is though quite large with the R interface from the psqn
# package. A C++ implementation is much faster as shown in
# vignette("psqn", package = "psqn"). The reason it is that it is very fast
# to evaluate the element functions in this case
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.