#' Accelerated proximal gradient optimization
#'
#' This function implements an accelerated proximal gradient method (Nesterov
#' 2007, Beck and Teboulle 2009). It solves: \deqn{min_x (f(x) + h(x)), x \in
#' R^dim_x} where \eqn{f} is smooth, convex and \eqn{h} is non-smooth, convex
#' but simple in that we can easily evaluate the proximal operator of h.
#'
#' @param grad_f A function that computes the gradient of f :
#' \eqn{grad_f(v,opts) = df(v)/dv}
#' @param prox_h A function that computes the proximal operator of h :
#' \eqn{prox_h(v,t,opts) = argmin_x (t*h(x) + 1/2 * norm(x-v)^2)}
#' @param dim_x The dimension of the unknown \eqn{x}
#' @param opts List of parameters, both for the \code{apg} function and for the
#' \code{grad_f} and \code{prox_h} functions. For \code{apg}, the list can
#' contain the following fields: \itemize{ \item \code{X_INIT}: initial
#' starting point (default \code{0}) \item \code{USE_RESTART} : use adaptive
#' restart scheme (default \code{TRUE}) \item \code{MAX_ITERS} : maximum
#' iterations before termination (default \code{2000}) \item \code{EPS} :
#' tolerance for termination (default \code{1e-6}) \item \code{ALPHA} :
#' step-size growth factor (default \code{1.01}) \item \code{BETA} : step-size
#' shrinkage factor (default \code{0.5}) \item \code{QUIET} : if false writes
#' out information every 100 iters (default \code{FALSE}) \item
#' \code{GEN_PLOTS} : if true generates plots of norm of proximal gradient
#' (default \code{TRUE}) \item \code{USE_GRA} : if true uses UN-accelerated
#' proximal gradient descent (typically slower) (default \code{FALSE}) \item
#' \code{STEP_SIZE} : starting step-size estimate, if not set then apg makes
#' initial guess (default \code{NULL}) \item \code{FIXED_STEP_SIZE} : don't
#' change step-size (forward or back tracking), uses initial step-size
#' throughout, only useful if good STEP_SIZE set (default \code{FALSE}) } In
#' addition, \code{opts} can contain other fields that will be passed to the
#' \code{grad_f} and \code{prox_h} functions. This code was borrowed and
#' adapted from the MATLAB version of Brendan O'Donoghue available at
#' \code{https://github.com/bodono/apg}
#'
#' @return A list with \code{x}, the solution of the problem: \deqn{min_x (f(x)
#' + h(x)), x \in R^dim_x ,} and \code{t}, the last step size.
#'
#' @export
#' @examples # Solve a Lasso problem:
#' # min_x 1/2 norm( A%*%x - b )^2 + lambda ||x||_1
#' n <- 50
#' m <- 20
#' lambda <- 1
#' A <- matrix(rnorm(m*n), nrow=n)
#' b <- rnorm(n)
#' r <- apg(grad.quad, prox.l1, m, list(A=A, b=b, lambda=lambda) )
#' # This gives the same result as:
#' # m <- glmnet(A,b,alpha=1, standardize=FALSE,lambda=1/50,intercept=FALSE)
apg <- function(grad_f, prox_h, dim_x, opts) {
# Set default parameters
X_INIT <- numeric(dim_x) # initial starting point
USE_RESTART <- TRUE # use adaptive restart scheme
MAX_ITERS <- 2000 # maximum iterations before termination
EPS <- 1e-6 # tolerance for termination
ALPHA <- 1.01 # step-size growth factor
BETA <- 0.5 # step-size shrinkage factor
QUIET <- FALSE # if false writes out information every 100 iters
GEN_PLOTS <- TRUE # if true generates plots of norm of proximal gradient
USE_GRA <- FALSE # if true uses UN-accelerated proximal gradient descent (typically slower)
STEP_SIZE = NULL # starting step-size estimate, if not set then apg makes initial guess
FIXED_STEP_SIZE <- FALSE # don't change step-size (forward or back tracking), uses initial step-size throughout, only useful if good STEP_SIZE set
# Replace the default parameters by the ones provided in opts if any
for (u in c("X_INIT","USE_RESTART", "MAX_ITERS", "EPS", "ALPHA", "BETA", "QUIET", "GEN_PLOTS", "USE_GRA", "STEP_SIZE", "FIXED_STEP_SIZE")) {
eval(parse(text=paste('if (exists("',u,'", where=opts)) ',u,' <- opts[["',u,'"]]',sep='')))
}
# Initialization
x <- X_INIT
y <- x
g <- grad_f(y, opts)
if (norm_vec(g) < EPS) {
return(list(x=x,t=0))
}
theta <- 1
# Initial step size
if (is.null(STEP_SIZE)) {
# Barzilai-Borwein step-size initialization:
t <- 1 / norm_vec(g)
x_hat <- x - t*g
g_hat <- grad_f(x_hat, opts)
t <- abs(sum( (x - x_hat)*(g - g_hat)) / (sum((g - g_hat)^2)))
} else {
t <- STEP_SIZE
}
# Main loop
for (k in seq(MAX_ITERS)) {
if (!QUIET && (k %% 100==0)) {
message(paste('iter num ',k,', norm(tGk): ',err1,', step-size: ',t,sep=""))
}
x_old <- x
y_old <- y
# The proximal gradient step (update x)
x <- prox_h( y - t*g, t, opts)
# The error for the stopping criterion
err1 <- norm_vec(y-x) / max(1,norm_vec(x))
if (err1 < EPS) break
# Update theta for acceleration
if(!USE_GRA)
theta <- 2/(1 + sqrt(1+4/(theta^2)))
else
theta <- 1
end
# Update y
if (USE_RESTART && sum((y-x)*(x-x_old))>0) {
x <- x_old
y <- x
theta <- 1
} else {
y <- x + (1-theta)*(x-x_old)
}
# New gradient
g_old <- g
g <- grad_f(y,opts)
# Update stepsize by TFOCS-style backtracking
if (!FIXED_STEP_SIZE) {
t_hat <- 0.5*sum((y-y_old)^2)/abs(sum((y - y_old)*(g_old - g)))
t <- min( ALPHA*t, max( BETA*t, t_hat ))
}
}
if (!QUIET) {
message(paste('iter num ',k,', norm(tGk): ',err1,', step-size: ',t,sep=""))
if (k==MAX_ITERS) message(paste('Warning: maximum number of iterations reached'))
message('Terminated')
}
# Return solution and step size
return(list(x=x,t=t))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.