#' Stochastic gradient descent
#'
#' Run stochastic gradient descent in order to optimize the induced loss
#' function given a model and data.
#'
#' @param formula an object of class \code{"\link{formula}"} (or one that can be
#' coerced to that class): a symbolic description of the model to be fitted.
#' The details can be found in \code{"\link{glm}"}.
#' @param data an optional data frame, list or environment (or object coercible
#' by \code{\link[base]{as.data.frame}} to a data frame) containing the
#' variables in the model. If not found in data, the variables are taken from
#' environment(formula), typically the environment from which glm is called.
#' @param model character specifying the model to be used: \code{"lm"} (linear
#' model), \code{"glm"} (generalized linear model), \code{"cox"} (Cox
#' proportional hazards model), \code{"gmm"} (generalized method of moments),
#' \code{"m"} (M-estimation). See \sQuote{Details}.
#' @param model.control a list of parameters for controlling the model.
#' \describe{
#' \item{\code{family} (\code{"glm"})}{a description of the error distribution and
#' link function to be used in the model. This can be a character string
#' naming a family function, a family function or the result of a call to
#' a family function. (See \code{\link[stats]{family}} for details of
#' family functions.)}
#' \item{\code{rank} (\code{"glm"})}{logical. Should the rank of the design matrix
#' be checked?}
#' \item{\code{fn} (\code{"gmm"})}{a function \eqn{g(\theta,x)} which returns a
#' \eqn{k}-vector corresponding to the \eqn{k} moment conditions. It is a
#' required argument if \code{gr} not specified.}
#' \item{\code{gr} (\code{"gmm"})}{a function to return the gradient. If
#' unspecified, a finite-difference approximation will be used.}
#' \item{\code{nparams} (\code{"gmm"})}{number of model parameters. This is
#' automatically determined for other models.}
#' \item{\code{type} (\code{"gmm"})}{character specifying the generalized method of
#' moments procedure: \code{"twostep"} (Hansen, 1982), \code{"iterative"}
#' (Hansen et al., 1996). Defaults to \code{"iterative"}.}
#' \item{\code{wmatrix} (\code{"gmm"})}{weighting matrix to be used in the loss
#' function. Defaults to the identity matrix.}
#' \item{\code{loss} (\code{"m"})}{character specifying the loss function to be
#' used in the estimating equation. Default is the Huber loss.}
#' \item{\code{lambda1}}{L1 regularization parameter. Default is 0.}
#' \item{\code{lambda2}}{L2 regularization parameter. Default is 0.}
#' }
#' @param sgd.control an optional list of parameters for controlling the estimation.
#' \describe{
#' \item{\code{method}}{character specifying the method to be used: \code{"sgd"},
#' \code{"implicit"}, \code{"asgd"}, \code{"ai-sgd"}, \code{"momentum"},
#' \code{"nesterov"}. Default is \code{"ai-sgd"}. See \sQuote{Details}.}
#' \item{\code{lr}}{character specifying the learning rate to be used:
#' \code{"one-dim"}, \code{"one-dim-eigen"}, \code{"d-dim"},
#' \code{"adagrad"}, \code{"rmsprop"}. Default is \code{"one-dim"}.
#' See \sQuote{Details}.}
#' \item{\code{lr.control}}{vector of scalar hyperparameters one can
#' set dependent on the learning rate. For hyperparameters aimed
#' to be left as default, specify \code{NA} in the corresponding
#' entries. See \sQuote{Details}.}
#' \item{\code{start}}{starting values for the parameter estimates. Default is
#' random initialization around zero.}
#' \item{\code{size}}{number of SGD estimates to store for diagnostic purposes
#' (distributed log-uniformly over total number of iterations)}
#' \item{\code{reltol}}{relative convergence tolerance. The algorithm stops
#' if it is unable to change the relative mean squared difference in the
#' parameters by more than the amount. Default is \code{1e-05}.}
#' \item{\code{npasses}}{the maximum number of passes over the data. Default
#' is 3.}
#' \item{\code{pass}}{logical. Should \code{tol} be ignored and run the
#' algorithm for all of \code{npasses}?}
#' \item{\code{shuffle}}{logical. Should the algorithm shuffle the data set
#' including for each pass?}
#' \item{\code{verbose}}{logical. Should the algorithm print progress?}
#' }
#' @param \dots arguments to be used to form the default \code{sgd.control}
#' arguments if it is not supplied directly.
#' @param x,y a design matrix and the respective vector of outcomes.
#'
#' @details
#' Models:
#' The Cox model assumes that the survival data is ordered when passed
#' in, i.e., such that the risk set of an observation i is all data points after
#' it.
#'
#' Methods:
#' \describe{
#' \item{\code{sgd}}{stochastic gradient descent (Robbins and Monro, 1951)}
#' \item{\code{implicit}}{implicit stochastic gradient descent (Toulis et al.,
#' 2014)}
#' \item{\code{asgd}}{stochastic gradient with averaging (Polyak and Juditsky,
#' 1992)}
#' \item{\code{ai-sgd}}{implicit stochastic gradient with averaging (Toulis et
#' al., 2015)}
#' \item{\code{momentum}}{"classical" momentum (Polyak, 1964)}
#' \item{\code{nesterov}}{Nesterov's accelerated gradient (Nesterov, 1983)}
#' }
#'
#' Learning rates and hyperparameters:
#' \describe{
#' \item{\code{one-dim}}{scalar value prescribed in Xu (2011) as
#' \deqn{a_n = scale * gamma/(1 + alpha*gamma*n)^(-c)}
#' where the defaults are
#' \code{lr.control = (scale=1, gamma=1, alpha=1, c)}
#' where \code{c} is \code{1} if implemented without averaging,
#' \code{2/3} if with averaging}
#' \item{\code{one-dim-eigen}}{diagonal matrix
#' \code{lr.control = NULL}}
#' \item{\code{d-dim}}{diagonal matrix
#' \code{lr.control = (epsilon=1e-6)}}
#' \item{\code{adagrad}}{diagonal matrix prescribed in Duchi et al. (2011) as
#' \code{lr.control = (eta=1, epsilon=1e-6)}}
#' \item{\code{rmsprop}}{diagonal matrix prescribed in Tieleman and Hinton
#' (2012) as
#' \code{lr.control = (eta=1, gamma=0.9, epsilon=1e-6)}}
#' }
#'
#' @return
#' An object of class \code{"sgd"}, which is a list containing the following
#' components:
#' \item{model}{name of the model}
#' \item{coefficients}{a named vector of coefficients}
#' \item{converged}{logical. Was the algorithm judged to have converged?}
#' \item{estimates}{estimates from algorithm stored at each iteration
#' specified in \code{pos}}
#' \item{fitted.values}{the fitted mean values}
#' \item{pos}{vector of indices specifying the iteration number each estimate
#' was stored for}
#' \item{residuals}{the residuals, that is response minus fitted values}
#' \item{times}{vector of times in seconds it took to complete the number of
#' iterations specified in \code{pos}}
#' \item{model.out}{a list of model-specific output attributes}
#'
#' @author Dustin Tran, Tian Lan, Panos Toulis, Ye Kuang, Edoardo Airoldi
#' @references
#' John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods for
#' online learning and stochastic optimization. \emph{Journal of Machine
#' Learning Research}, 12:2121-2159, 2011.
#'
#' Yurii Nesterov. A method for solving a convex programming problem with
#' convergence rate \eqn{O(1/k^2)}. \emph{Soviet Mathematics Doklady},
#' 27(2):372-376, 1983.
#'
#' Boris T. Polyak. Some methods of speeding up the convergence of iteration
#' methods. \emph{USSR Computational Mathematics and Mathematical Physics},
#' 4(5):1-17, 1964.
#'
#' Boris T. Polyak and Anatoli B. Juditsky. Acceleration of stochastic
#' approximation by averaging. \emph{SIAM Journal on Control and Optimization},
#' 30(4):838-855, 1992.
#'
#' Herbert Robbins and Sutton Monro. A stochastic approximation method.
#' \emph{The Annals of Mathematical Statistics}, pp. 400-407, 1951.
#'
#' Panos Toulis, Jason Rennie, and Edoardo M. Airoldi, "Statistical analysis of
#' stochastic gradient methods for generalized linear models", In
#' \emph{Proceedings of the 31st International Conference on Machine Learning},
#' 2014.
#'
#' Panos Toulis, Dustin Tran, and Edoardo M. Airoldi, "Stability and optimality
#' in stochastic gradient descent", arXiv preprint arXiv:1505.02417, 2015.
#'
#' Wei Xu. Towards optimal one pass large scale learning with averaged
#' stochastic gradient descent. arXiv preprint arXiv:1107.2490, 2011.
#'
#' # Dimensions
#' @examples
#' ## Linear regression
#' set.seed(42)
#' N <- 1e4
#' d <- 5
#' X <- matrix(rnorm(N*d), ncol=d)
#' theta <- rep(5, d+1)
#' eps <- rnorm(N)
#' y <- cbind(1, X) %*% theta + eps
#' dat <- data.frame(y=y, x=X)
#' sgd.theta <- sgd(y ~ ., data=dat, model="lm")
#' sprintf("Mean squared error: %0.3f", mean((theta - as.numeric(sgd.theta$coefficients))^2))
#'
#'
#' @useDynLib sgd
#' @import MASS
#' @importFrom methods new
#' @importFrom Rcpp evalCpp
#' @importFrom stats gaussian is.empty.model model.matrix model.response rnorm coef fitted predict
################################################################################
# Classes
################################################################################
#' @export
sgd <- function(x, ...) UseMethod("sgd")
################################################################################
# Methods
################################################################################
#' @export
sgd.default <- function(x, ...) {
stop("class of x is not a formula, function, or matrix")
}
#' @export
#' @rdname sgd
sgd.formula <- function(formula, data, model,
model.control=list(),
sgd.control=list(...),
...) {
call <- match.call() # set call function to match on arguments
# 1. Validity check.
if (missing(data)) {
data <- environment(formula)
}
# 2. Build X and Y according to the formula.
mf <- match.call(expand.dots=FALSE)
m <- match(c("formula", "data"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
Y <- model.response(mf, "any")
if (length(dim(Y)) == 1L) {
nm <- rownames(Y)
dim(Y) <- NULL
if (!is.null(nm)) {
names(Y) <- nm
}
}
mt <- attr(mf, "terms")
if (!is.empty.model(mt)) {
X <- model.matrix(mt, mf)
} else {
X <- matrix(, NROW(Y), 0L)
}
# 3. Pass into sgd.matrix().
return(sgd.matrix(X, Y, model, model.control, sgd.control))
}
#' @export
#' @rdname sgd
sgd.matrix <- function(x, y, model,
model.control=list(),
sgd.control=list(...),
...) {
call <- match.call() # set call function to match on arguments
if (missing(x)) {
stop("'x' not specified")
}
if (missing(y)) {
stop("'y' not specified")
}
if (missing(model)) {
stop("'model' not specified")
}
if (!is.list(model.control)) {
stop("'model.control' is not a list")
}
model.control <- do.call("valid_model_control",
c(model.control, model=model, d=ncol(x)))
if (!is.list(sgd.control)) {
stop("'sgd.control' is not a list")
}
sgd.control <- do.call("valid_sgd_control",
c(sgd.control, N=NROW(y), nparams=model.control$nparams))
return(fit(x, y, model, model.control, sgd.control))
}
#' @export
#' @rdname sgd
# TODO y should be allowed to be a big matrix too; it should be any combination
# (x is a big matrix, y is, etc.)
sgd.big.matrix <- function(x, y, model,
model.control=list(),
sgd.control=list(...),
...) {
return(sgd.matrix(x, y, model, model.control, sgd.control))
}
################################################################################
# Helper functions
################################################################################
fit <- function(x, y, model,
model.control,
sgd.control) {
#time_start <- proc.time()[3] # TODO timer only starts here
# TODO
if (model == "gmm") {
if (sgd.control$method %in% c("implicit", "ai-sgd")) {
stop("implicit methods not implemented yet")
}
}
if (model %in% c("lm", "glm")) {
model.control$transfer <- transfer_name(model.control$family$link)
family <- model.control$family
model.control$family <- family$family
# Enable logistic regression if response is binary factor.
if (is.factor(y) && family == "binomial") {
y <- as.integer(as.character(y))
}
}
dataset <- list(X=x, Y=as.matrix(y))
if ('big.matrix' %in% class(x)) {
dataset$big <- TRUE
dataset[["bigmat"]] <- x@address
} else {
dataset$big <- FALSE
dataset[["bigmat"]] <- new("externalptr")
}
if (sgd.control$verbose) {
print("Completed pre-processing attributes...")
print("Running C++ algorithm...")
}
out <- run(dataset, model.control, sgd.control)
if (sgd.control$verbose) {
print("Completed C++ algorithm...")
}
if (length(out) == 0) {
stop("An error has occured, program stopped")
}
class(out) <- "sgd"
if (model %in% c("lm", "glm")) {
out$model.out$transfer <- model.control$transfer
out$model.out$family <- family
}
out$pos <- as.vector(out$pos)
#out$times <- as.vector(out$times) + (proc.time()[3] - time_start) # C++ time + R time
out$times <- as.vector(out$times)
out$fitted.values <- predict(out, x, type="response")
out$residuals <- y - fitted(out)
return(out)
}
valid_model_control <- function(model, model.control=list(...), ...) {
# Run validity check of arguments passed to model.control given model. It
# passes defaults to those unspecified and converts to the correct type if
# possible; otherwise it errors.
# Check validity of regularization parameters.
lambda1 <- model.control$lambda1
if (is.null(lambda1)) {
lambda1 <- 0
} else if (!is.numeric(lambda1)) {
stop("'lambda1' must be numeric")
} else if (length(lambda1) != 1) {
stop(gettextf("length of 'lambda1' should equal %d", 1), domain=NA)
}
lambda2 <- model.control$lambda2
if (is.null(lambda2)) {
lambda2 <- 0
} else if (!is.numeric(lambda2)) {
stop("'lambda2' must be numeric")
} else if (length(lambda2) != 1) {
stop(gettextf("length of 'lambda2' should equal %d", 1), domain=NA)
}
nparams <- model.control$d
# Set family to gaussian for linear model.
if (model == "lm") {
model.control$family <- gaussian()
}
if (model %in% c("lm", "glm")) {
control.family <- model.control$family
control.rank <- model.control$rank
control.trace <- model.control$trace
control.deviance <- model.control$deviance
# Check validity of family.
if (is.null(control.family)) {
control.family <- gaussian()
} else if (is.character(control.family)) {
control.family <- get(control.family, mode="function", envir=parent.frame())()
} else if (is.function(control.family)) {
control.family <- control.family()
} else if (is.null(control.family$family)) {
print(control.family)
stop("'family' not recognized")
}
# Check validity of rank.
if (is.null(control.rank)) {
control.rank <- FALSE
} else if (!is.logical(control.rank)) {
stop ("'rank' not logical")
}
# Check validity of trace.
if (is.null(control.trace)) {
control.trace <- FALSE
} else if (!is.logical(control.trace)) {
stop ("'trace' not logical")
}
# Check validity of deviance.
if (is.null(control.deviance)) {
control.deviance <- FALSE
} else if (!is.logical(control.deviance)) {
stop ("'deviance' not logical")
}
return(list(
name=model,
family=control.family,
rank=control.rank,
trace=control.trace,
deviance=control.deviance,
nparams=nparams,
lambda1=lambda1,
lambda2=lambda2))
} else if (model == "cox") {
return(list(
name=model,
nparams=nparams,
lambda1=lambda1,
lambda2=lambda2))
} else if (model == "gmm") {
control.fn <- model.control$fn
control.gr <- model.control$gr
control.nparams <- model.control$nparams
control.type <- model.control$type
control.wmatrix <- model.control$wmatrix
# Check validify of moment function and its gradient.
if (is.null(control.fn) && is.null(control.gr)) {
stop("either 'fn' or 'gr' must be specified")
} else if (!is.null(control.fn) && !is.function(control.fn)) {
stop("'fn' not a function")
} else if (!is.null(control.gr) && !is.function(control.gr)) {
stop("'gr' not a function")
} else if (!is.null(control.fn) && is.null(control.gr)) {
# Default to numerical gradient via central differences.
#library(numDeriv)
# TODO probably does not work
control.gr <- function(x, fn=control.fn) {
d <- length(x)
h <- 1e-5
out <- rep(0, d)
for (i in 1:d) {
ei <- c(rep(0, i-1), h, rep(0, d-i))
out[i] <- (fn(x + ei) - fn(x - ei)) / (2*h)
}
return(out)
}
}
# Check validity of nparams.
if (is.null(control.nparams)) {
stop("'nparams' not specified")
} else if (!is.numeric(control.nparams) ||
control.nparams - as.integer(control.nparams) != 0 ||
control.nparams < 1) {
stop("'nparams' must be a positive integer")
}
# Check validity of GMM type.
if (is.null(control.type)) {
control.type <- "iterative"
} else if (!is.character(control.type)) {
stop("'type' must be a string")
# TODO implement cuee
} else if (!(control.type %in% c("twostep", "iterative", "cuee"))) {
stop("'type' not recognized")
}
# Check validity of weighting matrix.
if (is.null(control.wmatrix)) {
# do nothing, since will not store large matrix in R but in C++
} else if (!is.matrix(control.wmatrix)) {
stop("'wmatrix' not a matrix")
# TODO check if dimensions are same as moment conditions
#} else if (identical(dim(control.wmatrix), c(k,k))) {
}
return(list(
name=model,
gr=control.gr,
type=control.type,
nparams=control.nparams,
lambda1=lambda1,
lambda2=lambda2))
} else if (model == "m") {
control.loss <- model.control$loss
if (is.null(control.loss)) {
control.loss <- "huber"
} else if (!is.character(control.loss)) {
stop ("'model.control$loss' must be a string")
} else if (control.loss != "huber") {
stop ("'loss' not available")
}
return(list(
name=model,
loss=control.loss,
nparams=nparams,
lambda1=lambda1,
lambda2=lambda2))
} else {
stop("model not specified")
}
}
valid_sgd_control <- function(method="ai-sgd", lr="one-dim",
lr.control=NULL,
start=rnorm(nparams, mean=0, sd=1e-5),
size=100,
reltol=1e-5, npasses=3, pass=F,
shuffle=F, verbose=F,
truth=NULL, check=F,
N, nparams, ...) {
# The following are internal parameters that can be used but aren't written in
# the documentation for succinctness:
# check: logical, specifying whether to check against \code{truth} for
# convergence instead of using reltol
# truth: true set of parameters
# TODO size isn't the correct thing since reltol means you don't know when it
# ends. user should specify how often to store the iterates (how many per
# iteration)
# Run validity check of arguments passed to sgd.control. It passes defaults to
# those unspecified and converts to the correct type if possible; otherwise it
# errors.
# Check validity of method.
if (!is.character(method)) {
stop("'method' must be a string")
} else if (!(method %in% c("sgd", "implicit", "asgd", "ai-sgd", "momentum",
"nesterov"))) {
stop("'method' not recognized")
}
# Check validity of learning rate.
lrs <- c("one-dim", "one-dim-eigen", "d-dim", "adagrad", "rmsprop")
if (is.numeric(lr)) {
if (lr < 1 | lr > length(lrs)) {
stop("'lr' out of range")
}
lr <- lrs[lr]
} else if (is.character(lr)) {
lr <- tolower(lr)
if (!(lr %in% lrs)) {
stop("'lr' not recognized")
}
} else {
stop("invalid 'lr'")
}
# Check validity of lr.control.
if (!is.null(lr.control) && !is.numeric(lr.control)) {
stop("'lr.control' must be numeric")
} else if (lr == "one-dim") {
if (method %in% c("asgd", "ai-sgd")) {
c <- 2/3
} else {
c <- 1
}
defaults <- c(1, 1, 1, c)
if (is.null(lr.control)) {
lr.control <- defaults
} else if (length(lr.control) != 4) {
stop(gettextf("length of 'lr.control' should equal %d", 4), domain=NA)
}
missing <- which(is.na(lr.control))
lr.control[missing] <- defaults[missing]
} else if (lr == "one-dim-eigen") {
if (is.null(lr.control)) {
lr.control <- 0 # garbage number to store double in C++
} else if (length(lr.control) != 0) {
stop(gettextf("length of 'lr.control' should equal %d", 0), domain=NA)
}
} else if (lr == "d-dim") {
defaults <- 1e-6
if (is.null(lr.control)) {
lr.control <- defaults
} else if (length(lr.control) != 1) {
stop(gettextf("length of 'lr.control' should equal %d", 1), domain=NA)
}
missing <- which(is.na(lr.control))
lr.control[missing] <- defaults[missing]
} else if (lr == "adagrad") {
defaults <- c(1, 1e-6)
if (is.null(lr.control)) {
lr.control <- defaults
} else if (length(lr.control) != 2) {
stop(gettextf("length of 'lr.control' should equal %d", 2), domain=NA)
}
missing <- which(is.na(lr.control))
lr.control[missing] <- defaults[missing]
} else if (lr == "rmsprop") {
defaults <- c(1, 0.9, 1e-6)
if (is.null(lr.control)) {
lr.control <- defaults
} else if (length(lr.control) != 3) {
stop(gettextf("length of 'lr.control' should equal %d", 3), domain=NA)
}
missing <- which(is.na(lr.control))
lr.control[missing] <- defaults[missing]
}
# Check validity of start.
if (!is.numeric(start)) {
stop("'start' must be numeric")
} else if (length(start) != nparams) {
stop(gettextf("length of 'start' should equal %d", nparams), domain=NA)
}
# Check validity of size.
if (!is.numeric(size) || size - as.integer(size) != 0 || size < 1) {
stop("'size' must be positive integer")
}
# Check validity of reltol
if (!is.numeric(reltol)) {
stop("'reltol' must be numeric")
} else if (length(reltol) != 1) {
stop("'reltol' must be scalar")
}
# Check validity of npasses.
if (!is.numeric(npasses) || npasses - as.integer(npasses) != 0 || npasses < 1) {
stop("'npasses' must be positive integer")
}
# Check validity of pass.
if (!is.logical(pass)) {
stop("'pass' must be logical")
}
# Check validity of shuffle.
if (!is.logical(shuffle)) {
stop("'shuffle' must be logical")
}
# Check validity of verbose.
if (!is.logical(verbose)) {
stop("'verbose' must be logical")
}
# Check validity of additional arguments if the method is implicit.
if (method %in% c("implicit", "ai-sgd")) {
call <- match.call()
implicit.control <- do.call("valid_implicit_control", list(...))
} else {
implicit.control <- NULL
}
# TODO they should be vectors in C++, not requiring conversion
start <- as.matrix(start)
if (check) {
truth <- as.matrix(truth)
}
return(c(list(method=method,
lr=lr,
lr.control=lr.control,
start=start,
size=size,
reltol=reltol,
npasses=npasses,
pass=pass,
shuffle=shuffle,
verbose=verbose,
check=check,
truth=truth,
nparams=nparams),
implicit.control))
}
valid_implicit_control <- function(delta=30L, ...) {
# Maintain control parameters for running implicit SGD. Pass defaults
# if unspecified.
#
# Args:
# delta: convergence criterion for the one-dimensional optimization
args <- list(...)
if (!is.null(names(args))) {
stop("Invalid args passed into sgd.control through dots")
}
if (!is.numeric(delta) || delta - as.integer(delta) != 0 || delta <= 0) {
stop("value of 'delta' must be integer > 0")
}
return(list(delta=delta))
}
transfer_name <- function(link.name) {
if(!is.character(link.name)) {
stop("link name must be a string")
}
link.names <- c("identity", "log", "logit", "inverse")
transfer.names <- c("identity", "exp", "logistic", "inverse")
transfer.idx <- which(link.names == link.name)
if (length(transfer.idx) == 0) {
stop("no match link function founded!")
}
return(transfer.names[transfer.idx])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.