# This function fits sparse additive models for one
# value of the tuning parameters lambda1, lambda2. This function
# solves the optimization problem using a proximal gradient descent
# algorithm. The nice feature is that a general form of the
# algorithm can be applied to both least-squares or logistic loss, or
# really any loss function. Properties of the least-squares loss
# allow us to use a block coordinate descent algorithm instead.
# This is a general function which can be used for either
# trend-filtering or the sobolev norm.
#
# Args:
# y: Response vector of size n.
# x_ord: The ordered/sorted design matrix of size n*p.
# ord: A n*p matrix of the ordering for the covariate matrix x.
# lambda1, lambda2: Tuning parameters.
# init_fhat: Initial value of the estimated functions.
# init_intercept: Initial value for the intercept.
# k: order of the TF.
# max_iter: Maximum iterations of the algorithm.
# tol: Tolerance for stopping criteria.
# step_size: Initial step size for the line search algorithm.
# alpha: Tuning parameter for the line search algorithm.
# family: "gaussian" for least squares loss and "binomial" for logistic loss.
# method: Charecter vector indicating method to use. "tf" for trend-filtering
# and "sobolev" for the sobolev norm smoothness penalty.
# parallel: A boolean, indicating if proximal gradient descent should
# be parallelized. Ignored if coord.desc == TRUE.
# ncores: If parallel == TRUE, specifies size of cluster.
# line_search: A boolean, indicator if line search should be used or
# fixed step size.
# FISTA: A boolean, should the accelerated proximal gradient descent be used?
#
#
# Returns:
# A list with the following components:
# fhat: The estimate of the component functions, an n*p matrix.
# intercept: The estimate of the intercept term, a scalar.
# conv: A boolean indicator of convergence status.
proxGrad_one <- function(y, x_ord, ord, lambda1, lambda2,
init_fhat, init_intercept, k=0,
max_iter = 100, tol = 1e-4,
step_size = 5, alpha = 0.8,
family = "gaussian", method = "tf",
parallel = FALSE, ncores = 8,
line_search = TRUE, FISTA = FALSE,
...) {
# Initialize some objects.
counter <- 1
converged <- FALSE
n <- nrow(x_ord)
p <- ncol(x_ord)
# Initialize parameters using warm starts.
old_ans <- vector("list", 2)
old_ans[[1]] <- init_intercept
old_ans[[2]] <- init_fhat
if(FISTA) {
# While old_ans will be used to store
# iterate k, we have another object to store
# iterate k-1.
# This is for Nesterov style acceleration.
old_old_ans <- vector("list", 2)
old_old_ans[[1]] <- init_intercept
old_old_ans[[2]] <- init_fhat
}
if(line_search == FALSE) {
# If not doing a line search we need to specify a step size in terms of the
# Lipchitz constant.
# Specifcally we use step size ((pa+1)*L)^-1 where L is
# the lipschitz constant, and pa is the number of non-zero functions on the
# current iterate.
if(family == "binomial") {
L.const <- 1/4
} else if(family == "gaussian") {
L.const <- 1
}
}
while(counter < max_iter & !converged) {
if(line_search == FALSE) {
sparse_f <- 1 * (colMeans(abs(old_ans[[2]])) != 0)
step_size <- (L.const * (1 + sum(sparse_f)))^(-1)
}
if(FISTA) {
# Evaluate extrapolation step of fista.
wk <- counter/(counter+3)
fista_fhat <- old_ans[[2]] + wk*(old_ans[[2]] - old_old_ans[[2]])
fista_inter <- old_ans[[1]] + wk*(old_ans[[1]] - old_old_ans[[1]])
new_ans <- LineSearch(alpha, step_size, y, fista_fhat,
fista_inter,
x_ord, ord, k, lambda1, lambda2, family,
method = method, parallel = parallel,
ncores = ncores,
line_search = line_search, ...)
} else {
new_ans <- LineSearch(alpha, step_size, y,
old_ans[[2]], old_ans[[1]],
x_ord, ord, k, lambda1, lambda2, family,
method = method,parallel = parallel,
ncores = ncores,
line_search = line_search, ...)
}
temp_res1 <- mean((new_ans[[2]] - old_ans[[2]])^2) +
(new_ans[[1]] - old_ans[[1]])^2
temp_res2 <- mean((old_ans[[2]])^2) + (old_ans[[1]])^2
# Compare the relative change in parameter values and compare to
# tolerance to check convergence. Else update iteration.
if(temp_res1/(temp_res2+1e-30) <= tol) {
converged <- TRUE
} else {
counter <- counter + 1;
if(FISTA){
old_old_ans <- old_ans;
}
old_ans <- new_ans;
}
}
if(converged == FALSE) {
expr <- paste0("Algorithm did not converge for Lambda1 = ", signif(lambda1, 2),
" and Lambda2 = ", signif(lambda2,2));
warning(expr)
}
list("fhat" = new_ans[[2]],
"intercept" = new_ans[[1]],
"conv" = converged)
}
# This function performs sparse additive trend-filtering for one
# value of the tuning parameters for the case of a least squares loss.
# This is done via a block-coordinate descent method. This function is
# used when family="gaussian".
#
# Args:
# y: Response vector of size n.
# x: The design matrix of size n*p.
# ord: A n*p matrix of the ordering for the covariate matrix x.
# lambda1, lambda2: Tuning parameters.
# init_fhat: Initial value of the estimated functions.
# k: order of the Trend filtering problem, ignored for sobolev.
# max_iter: Maximum iterations of the algorithm.
# tol: Tolerance for stopping criteria.
# method: Charecter vector indicating method to use. "tf" for trend-filtering
# and "sobolev" for the sobolev norm smoothness penalty.
#
# Returns:
# A list with the following components:
# fhat: The estimate of the component functions, an n*p matrix.
# intercept: The estimate of the intercept term, a scalar.
# conv: A boolean indicator of convergence status.
blockCoord_one <- function(y, x, ord,init_fhat, k=0,
max_iter = 100, tol = 1e-4,
lambda1, lambda2, method = "tf",...) {
n <- length(y)
p <- ncol(x)
counter <- 1
converged <- FALSE
old.fhat <- init_fhat
# Begin loop for block coordinate descent
while(counter < max_iter & !converged) {
for(j in 1:p) {
res <- y - apply(init_fhat[, -j], 1, sum)- mean(y)
if(method == "tf") {
init_fhat[ord[,j], j] <- solve.prox.tf(res[ord[, j]] - mean(res),
x[ord[, j], j],
k = k, lambda1, lambda2, ...)
} else if(method == "sobolev"){
init_fhat[ord[,j], j] <- cpp_solve_prox(res[ord[,j]] - mean(res),
x[ord[, j], j], lambda1, lambda2,
n = n, n_grid = 1e+3,
lam_tilde_old = 0.5)
}
}
temp_res1 <- mean((init_fhat - old.fhat)^2)
temp_res2 <- mean((old.fhat)^2)
#cat("Iteration: ", counter,", Tol:", temp_res1/(temp_res2+1e-30),"\n")
# Compare the relative change in parameter values and compare to
# tolerance to check convergence. Else update iteration.
if(temp_res1/(temp_res2+1e-30) <= tol) {
converged <- TRUE
} else {
old.fhat <- init_fhat
counter <- counter+1
}
}
list("fhat" = init_fhat,
"intercept" = mean(y),
"conv" = converged)
}
# This function solves the full problem for a sequence of lambda
# values.
#
# Args:
# y: Response vector of size n.
# x: An n*p design matrix.
# max.iter: Maximum number of iterations for the algorithm
# tol: The tolerance for the algorithm
# initpars: Initial parameter values, defaults to NULL f^hat = 0.
# lambda.max: maximum lambda value
# lambda.min.ratio: Ratio between largest and smallest lambda value.
# nlam: Number of lambda values.
# zeta: If NULL (default) the lambda_1 = lambda and lambda2 = lambda^2. Otherwise
# a double in [0,1] so that lambda_1 = zeta*lambda and lambda_2 = (1-zeta)*lambda.
# k: Order of trend-filter
# family: "gaussian" for least squares norm, and "binomial" for logistic loss.
# ininitintercept: Initial value for the intercept term.
# step: Step size for the proximal graident descent algorithm.
# alpha: Alpha value for the line search algorithm used in proximal gradient descent.
# coord.desc: A boolean, indicating if block coordinate descent
# should be used for family=="gaussian"
# method: Charecter vector indicating method to use. "tf" for trend-filtering
# and "sobolev" for the sobolev norm smoothness penalty.
# parallel: A boolean, indicating if proximal gradient descent should
# be parallelized. Ignored if coord.desc == TRUE.
# ncores: If parallel == TRUE, specifies size of cluster.
# line_search: A boolean, indicator if line search should be used or
# fixed step size.
# FISTA: A boolean, should the accelerated proximal gradient descent be used?
# return_x: A boolean, should the design matrix be returned.
# for large design matrices we recommend return_x == FALSE.
fit.additive <- function(y, x, max.iter = 100, tol = 1e-4,
initpars = NULL, lambda.max = NULL,
lambda.min.ratio = 1e-3,
nlam = 50, zeta = NULL,
k = 0, family = "binomial",
initintercept = NULL, step = 1, alpha = 0.8,
coord.desc = TRUE, method = "tf",
parallel = FALSE, ncores = 8,
line_search = FALSE,
FISTA = TRUE, verbose = FALSE,
return_x = TRUE, ...) {
method <- tolower(method)
if(family != "binomial" & family != "gaussian") {
stop("Currenlty, only 'gaussian' and 'binomial' are acceptable values for
function parameter: family.")
}
if(method != "tf" & method != "sobolev") {
stop("Currenlty, only 'tf' and 'sobolev' are acceptable values for
function parameter: method.")
}
# Initialize some values.
x <- as.matrix(x)
n <- length(y)
p <- ncol(x)
# Generate matrix of orders.
ord <- apply(x, 2, order)
x_ord <- sapply(1:p, function(i){
x[ord[,i], i]
})
### Deprecated, no need to calculate ranks,
### especially when working with high-dimensional data.
#ranks <- apply(x, 2, rank)
if(is.null(initpars)) {
initpars <- matrix(0, ncol = p, nrow = n)
}
if(family == "binomial"){
y <- factor(y)
if(length(levels(y))!=2) {
stop("For binomial family response must be binary or a factor with only two
levels.")
}
levs.y <- levels(y)
y <- ifelse(y == levels(y)[2], 1, -1)
}
if(is.null(initintercept) & family == "binomial") {
y2 <- y
y2[y==-1] <- 0
mp <- mean(y2)
initintercept <- log(mp/(1-mp))
}
if(is.null(initintercept) & family == "gaussian") {
initintercept <- mean(y)
}
# If we are paralleling some computations.
if(parallel) {
require(doParallel)
require(parallel)
# Begin cluster
#cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cores = ncores)
if(verbose) {
cat("Registering Parallel Backend with", ncores, "Cores.\n")
}
}
if(is.null(lambda.max)) {
lambda.max <- find.lambdamax(x_ord, ord, k,
y, family = family,
method = method, parallel = parallel,
ncores = ncores, zeta = zeta)
if(verbose){
cat("Lambda Max found: ", lambda.max, "\n")
}
}
lam.seq <- seq(log10(lambda.max), log10(lambda.max*lambda.min.ratio),
length = nlam)
lam.seq <- 10^lam.seq
if(is.null(zeta)) {
lam1.seq <- lam.seq
lam2.seq <- lam.seq^2
} else {
if(zeta > 1 | zeta < 0) {
stop("zeta must be within [0,1]")
}
lam1.seq <- lam.seq * zeta
lam2.seq <- lam.seq * (1 - zeta)
}
ans <- array(0, dim = c(n, p, nlam))
ans.inters <- numeric(nlam)
conv.vec <- c()
for(i in 1:nlam) {
if(verbose) {
cat("Lambda value: ", i, "\n");
}
if(family=="gaussian" & coord.desc) {
temp <- blockCoord_one(y, x, ord,init_fhat = initpars,
k=k, max_iter = max.iter, tol = tol,
lambda1 = lam1.seq[i],lambda2 = lam2.seq[i],
method = method,...)
} else {
#print(i)
temp <- proxGrad_one(y, x_ord, ord, lam1.seq[i], lam2.seq[i],
init_fhat = initpars, init_intercept = initintercept,
k=k, max_iter = max.iter, tol = tol,
step_size = step, alpha = alpha,
family = family, method = method,
parallel = parallel, ncores = ncores,
line_search = line_search, FISTA = FISTA,
...)
}
ans[, , i] <- temp$fhat
ans.inters[i] <- temp$intercept
initintercept <- ans.inters[i]
initpars <- ans[, , i]
conv.vec <- c(conv.vec, temp$conv)
}
if(return_x == TRUE) {
obj <- list("f_hat" = ans,
"intercept" = ans.inters,
"x" = x,
"ord" = ord,
"lam" = lam.seq,
"k" = k,
"family" = family,
"conv" = conv.vec)
} else {
obj <- list("f_hat" = ans,
"intercept" = ans.inters,
"lam" = lam.seq,
"k" = k,
"family" = family,
"conv" = conv.vec)
}
class(obj) <- "add_mod"
return(obj)
}
# The Generic predict function our framework based on linear
# interpolation.
#
# Args:
# obj: A fitted additive model, object of type "add_mod"
# new.data: A new x_matrix which we wish to fit.
# type: "function" will return the fitted components f_1,...,f_p.
predict.add_mod <- function(obj, new.data, type = "function",
x.mat = obj$x) {
nlam <- length(obj$lam)
p <- dim(obj$f_hat)[2]
obj$x <- x.mat
ans <- array(0, dim = c(dim(new.data), nlam) )
for(i in 1:nlam) {
for(j in 1:p) {
ans[,j,i] <- suppressWarnings({
approx(obj$x[, j], obj$f_hat[,j,i], new.data[, j],
rule = 2)$y
})
}
}
if(type == "function") {
return(ans)
} else {
if(obj$family == "gaussian"){
temp <- apply(ans, c(1,3),sum)
temp <- t(apply(temp, 1, "+", obj$intercept))
return(temp)
} else {
temp <- apply(ans, c(1,3),sum)
temp <- t(apply(temp, 1, "+", obj$intercept))
return(1/(1 + exp(-temp)))
}
}
return(ans)
}
plot.add_mod <- function(obj, f_ind = 1, lam_ind = 1,
x.mat = obj$x, ...) {
y <- obj$f_hat[,f_ind,lam_ind]
plot(x = sort(x.mat[,f_ind]), y=y[order(x.mat[,f_ind])],
xlab = paste0("X_", f_ind), ylab = paste0("f_", f_ind), ...)
}
# Another generic function to find the lambda_max, based on the proximal
# gradient descent algorithm.
find.lambdamax <- function(x_mat_ord, ord_mat, k,
y, family = "gaussian",
method = "tf", parallel = FALSE,
ncores = 8, zeta = NULL, ...) {
n <- length(y)
if(family == "binomial") {
y.bar <- sum(y==1)/n
inter <- log(y.bar/(1-y.bar))
vecR <- (-y)/(1 + exp(y * inter))
lam.max <- sqrt(mean(vecR^2))
} else if(family == "gaussian") {
inter <- mean(y)
lam.max <- sqrt(mean(y^2))
}
f_hat <- matrix(0, ncol = ncol(x_mat_ord), nrow = nrow(x_mat_ord))
#print(zeta)
convg <- FALSE
while(!convg){
lam <- lam.max*0.9
if(is.null(zeta)){
f_hat_new <- GetZ(f_hat, inter, step_size = 1,
x_mat_ord, ord_mat, k=k,
lam, lam^2, y, family = family,
method = method, parallel = parallel,
ncores = ncores, ...)
} else {
f_hat_new <- GetZ(f_hat, inter, step_size = 1,
x_mat_ord, ord_mat,k=k,
zeta*lam, (1-zeta)*lam, y,
family = family,
method = method, parallel = parallel,
ncores = ncores, ...)
}
temp_res <- mean((f_hat_new[[2]] - f_hat)^2)
if(temp_res < 1e-30){
lam.max <- lam
} else {
convg <- TRUE
}
}
return(lam.max)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.