## Kernel methods for Generalized Elastic Nets
##
## by Artem Sokolov
#' Kernel models for linear regression, binary classification and one-class problems.
#'
#' Infers the problem type and learns the appropriate kernel model.
#'
#' The entries in the kernel matrix K can be interpreted as dot products
#' in some feature space \eqn{\phi}. The corresponding weight vector can be
#' retrieved via \eqn{w = \sum_i v_i \phi(x_i)}. However, new samples can be
#' classified without explicit access to the underlying feature space:
#' \deqn{w^T \phi(x) + b = \sum_i v_i \phi^T (x_i) \phi(x) + b = \sum_i v_i K( x_i, x ) + b}
#'
#' The method determines the problem type from the labels argument y.
#' If y is a numeric vector, then a ridge regression model is trained by optimizing the following objective function:
#' \deqn{ \frac{1}{2n} \sum_i a_i (y_i - (w^T x_i + b))^2 + \frac{\lambda}{2} w^T w }
#'
#' If y is a factor with two levels, then the function returns a binary classification model, obtained by optimizing the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i [ y_i s_i - \log( 1 + \exp(s_i) ) ] + \frac{\lambda}{2} w^T w }
#' where
#' \deqn{ s_i = w^T x_i + b }
#'
#' Finally, if no labels are provided (y == NULL), then a one-class model is constructed using the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i [ s_i - \log( 1 + \exp(s_i) ) ] + \frac{\lambda}{2} w^T w }
#' where
#' \deqn{ s_i = w^T x_i }
#'
#' In all cases, \eqn{w = \sum_i v_i \phi(x_i)} and the method solves for \eqn{v_i}.
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of response values. Must be a numeric vector for regression, a factor with 2 levels for binary classification, or NULL for a one-class task.
#' @param lambda scalar, regularization parameter
#' @param a n-by-1 vector of sample weights (regression only)
#' @param max.iter maximum number of iterations (binary classification and one-class problems only)
#' @param eps convergence precision (binary classification and one-class problems only)
#' @param v.init initial parameter estimate for the kernel weights (binary classification and one-class problems only)
#' @param b.init initial parameter estimate for the bias term (binary classification only)
#' @param fix.bias set to TRUE to prevent the bias term from being updated (regression only) (default: FALSE)
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained (binary classification only) (default: FALSE)
#' @return A list with two elements:
#' \describe{
#' \item{v}{n-by-1 vector of kernel weights}
#' \item{b}{scalar, bias term for the linear model (omitted for one-class models)}
#' }
#' @seealso \code{\link{gelnet}}
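#' @examples
#' ## A minimal usage sketch; the simulated data and the Gaussian (RBF) kernel
#' ## below are illustrative assumptions, not requirements of the method
#' set.seed( 100 )
#' X <- matrix( rnorm( 20*5 ), 20, 5 )
#' D2 <- as.matrix( dist(X) )^2
#' K <- exp( -D2 / median(D2) )    ## full-rank RBF kernel over the samples
#' ## Regression on a numeric response (sample weights a must be provided)
#' mdl.reg <- gelnet.ker( K, rnorm(20), lambda = 0.1, a = rep(1,20) )
#' ## Binary classification on a two-level factor
#' yf <- factor( rep( c("tumor","normal"), each = 10 ) )
#' mdl.bin <- gelnet.ker( K, yf, lambda = 0.1, a = rep(1,20) )
#' ## One-class model (y = NULL)
#' mdl.oc <- gelnet.ker( K, NULL, lambda = 0.1, a = rep(1,20) )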
#' @export
gelnet.ker <- function( K, y, lambda, a, max.iter = 100, eps = 1e-5, v.init=rep(0,nrow(K)),
b.init=0, fix.bias=FALSE, silent=FALSE, balanced=FALSE )
{
## Determine the problem type
if( is.null(y) )
{
if( !silent ) cat( "Training a one-class model\n" )
gelnet.kor( K, lambda, max.iter, eps, v.init, silent )
}
else if( is.factor(y) )
{
if( nlevels(y) == 1 )
stop( "All labels are identical\nConsider training a one-class model instead" )
if( nlevels(y) > 2 )
stop( "Labels belong to a multiclass task\nConsider training a set of one-vs-one or one-vs-rest models" )
if( !silent ) cat( "Training a logistic regression model\n" )
gelnet.klr( K, y, lambda, max.iter, eps, v.init, b.init, silent, balanced )
}
else if( is.numeric(y) )
{
if( !silent ) cat( "Training a linear regression model\n" )
gelnet.krr( K, y, lambda, a, fix.bias )
}
else
{ stop( "Unknown label type\ny must be a numeric vector, a 2-level factor or NULL" ) }
}
#' Kernel ridge regression
#'
#' Learns a kernel ridge regression model.
#'
#' The entries in the kernel matrix K can be interpreted as dot products
#' in some feature space \eqn{\phi}. The corresponding weight vector can be
#' retrieved via \eqn{w = \sum_i v_i \phi(x_i)}. However, new samples can be
#' classified without explicit access to the underlying feature space:
#' \deqn{w^T \phi(x) + b = \sum_i v_i \phi^T (x_i) \phi(x) + b = \sum_i v_i K( x_i, x ) + b}
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of response values
#' @param lambda scalar, regularization parameter
#' @param a n-by-1 vector of sample weights
#' @param fix.bias set to TRUE to force the bias term to 0 (default: FALSE)
#' @return A list with two elements:
#' \describe{
#' \item{v}{n-by-1 vector of kernel weights}
#' \item{b}{scalar, bias term for the model}
#' }
#' @noRd
gelnet.krr <- function( K, y, lambda, a, fix.bias=FALSE )
{
## Argument verification
n <- nrow(K)
stopifnot( length(y) == n )
stopifnot( length(a) == n )
stopifnot( is.null(dim(a)) ) ## a must not be in matrix form
## Set up the sample weight matrix
A <- diag(a)
if( fix.bias )
{
## Solve the optimization problem in closed form
m <- solve( K %*% A %*% t(K) + lambda * n * K, K %*% A %*% y )
list( v = drop(m), b = 0 )
}
else
{
## Set up the kernelization of the bias term
K1 <- rbind( K, 1 )
K0 <- cbind( rbind( K, 0 ), 0 )
## Solve the optimization problem in closed form
m <- solve( K1 %*% A %*% t(K1) + lambda * n * K0, K1 %*% A %*% y )
## Retrieve the weights and the bias term
list( v = m[1:n], b = m[n+1] )
}
}
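## A minimal sanity check for gelnet.krr, wrapped in if( FALSE ) so that it is
## never executed when this file is sourced. The simulated data, the RBF
## kernel, and the train/test split are illustrative assumptions. Prediction
## on new samples uses the kernel trick described above:
## yhat = K(x.test, x.train) %*% v + b
if( FALSE )
{
  set.seed( 100 )
  X <- matrix( rnorm( 30*5 ), 30, 5 )
  y <- X[,1] - 2*X[,2] + rnorm( 30, sd = 0.1 )
  D2 <- as.matrix( dist(X) )^2
  K <- exp( -D2 / median(D2) )
  itr <- 1:20                        ## training samples
  ite <- 21:30                       ## test samples
  mdl <- gelnet.krr( K[itr,itr], y[itr], lambda = 0.1, a = rep(1,20) )
  yhat <- drop( K[ite,itr] %*% mdl$v + mdl$b )
  print( cor( yhat, y[ite] ) )       ## expect a positive correlation
}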
#' Kernel logistic regression
#'
#' Learns a kernel logistic regression model for a binary classification task
#'
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the kernel ridge regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
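#'
#' Specifically, given the current fit \eqn{s_i = w^T x_i + b} and
#' \eqn{p_i = 1/(1+\exp(-s_i))}, each iteration solves a weighted kernel
#' ridge regression problem with sample weights
#' \deqn{ a_i = p_i (1 - p_i) }
#' and working response
#' \deqn{ z_i = s_i + (y_i - p_i) / a_i }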
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of binary response labels
#' @param lambda scalar, regularization parameter
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param v.init initial parameter estimate for the kernel weights
#' @param b.init initial parameter estimate for the bias term
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained
#' @return A list with two elements:
#' \describe{
#' \item{v}{n-by-1 vector of kernel weights}
#' \item{b}{scalar, bias term for the model}
#' }
#' @seealso \code{\link{gelnet.krr}}, \code{\link{gelnet.logreg.obj}}
#' @noRd
gelnet.klr <- function( K, y, lambda, max.iter = 100, eps = 1e-5,
v.init=rep(0,nrow(K)), b.init=0, silent=FALSE, balanced=FALSE )
{
## Argument verification
stopifnot( nrow(K) == ncol(K) )
stopifnot( nrow(K) == length(y) )
## Convert the labels to {0,1}
y <- factor(y)
if( !silent )
cat( "Treating", levels(y)[1], "as the positive class\n" )
y <- as.integer( y == levels(y)[1] )
## Objective function to minimize
fobj <- function( v, b )
{
s <- K %*% v + b
R2 <- lambda * t(v) %*% K %*% v / 2
LL <- mean( y * s - log(1+exp(s)) )
R2 - LL
}
## Compute the initial objective value
v <- v.init
b <- b.init
fprev <- fobj( v, b )
## Iteratively re-weighted least squares: reduce kernel logistic regression
## to weighted kernel ridge regression via a Taylor expansion about the
## current parameter estimates
for( iter in 1:max.iter )
{
if( !silent )
{
cat( "Iteration", iter, ": " )
cat( "f =", fprev, "\n" )
}
## Compute the current fit
s <- drop(K %*% v + b)
pr <- 1 / (1 + exp(-s))
## Snap probabilities to 0 and 1 to avoid division by small numbers
j0 <- which( pr < eps )
j1 <- which( pr > (1-eps) )
pr[j0] <- 0; pr[j1] <- 1
## Compute the sample weights and the working response
a <- pr * (1-pr)
a[c(j0,j1)] <- eps
z <- s + (y - pr) / a
## Rebalance the sample weights according to the class counts
if( balanced )
{
jpos <- which( y==1 )
jneg <- which( y==0 )
a[jpos] <- a[jpos] * length(y) / ( length(jpos)*2 )
a[jneg] <- a[jneg] * length(y) / ( length(jneg)*2 )
}
## Solve the resulting weighted regression problem in closed form
m <- gelnet.krr( K, z, lambda, a )
v <- m$v
b <- m$b
## Evaluate the objective function and check convergence criteria
f <- fobj( v, b )
if( abs(f - fprev) / abs(fprev) < eps ) break
else fprev <- f
}
list( v = v, b = b )
}
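## A minimal sanity check for gelnet.klr, wrapped in if( FALSE ) so that it is
## never executed when this file is sourced. The two simulated classes and the
## RBF kernel are illustrative assumptions; training-set probabilities are
## recovered by applying the logistic transform to K %*% v + b.
if( FALSE )
{
  set.seed( 100 )
  X <- rbind( matrix( rnorm( 10*5, mean =  1 ), 10, 5 ),
              matrix( rnorm( 10*5, mean = -1 ), 10, 5 ) )
  yf <- factor( rep( c("pos","neg"), each = 10 ), levels = c("pos","neg") )
  D2 <- as.matrix( dist(X) )^2
  K <- exp( -D2 / median(D2) )
  mdl <- gelnet.klr( K, yf, lambda = 0.1 )
  pr <- drop( 1 / (1 + exp( -(K %*% mdl$v + mdl$b) )) )
  print( table( pr > 0.5, yf ) )     ## confusion counts on the training set
}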
#' Kernel one-class regression
#'
#' Learns a kernel one-class model for a given kernel matrix
#'
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the kernel ridge regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
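#'
#' Specifically, given the current fit \eqn{s_i = w^T x_i} and
#' \eqn{p_i = 1/(1+\exp(-s_i))}, each iteration solves a weighted kernel
#' ridge regression problem (with the bias term fixed at 0) using sample weights
#' \deqn{ a_i = p_i (1 - p_i) }
#' and working response
#' \deqn{ z_i = s_i + 1 / p_i }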
#'
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param lambda scalar, regularization parameter
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param v.init initial parameter estimate for the kernel weights
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @return A list with one element:
#' \describe{
#' \item{v}{n-by-1 vector of kernel weights}
#' }
#' @seealso \code{\link{gelnet.krr}}, \code{\link{gelnet.oneclass.obj}}
#' @noRd
gelnet.kor <- function( K, lambda, max.iter = 100, eps = 1e-5,
v.init = rep(0,nrow(K)), silent= FALSE )
{
## Argument verification
stopifnot( nrow(K) == ncol(K) )
## Define the objective function to optimize
fobj <- function( v )
{
s <- K %*% v
LL <- mean( s - log( 1 + exp(s) ) )
R2 <- lambda * t(v) %*% K %*% v / 2
R2 - LL
}
## Set the initial parameter estimates
v <- v.init
fprev <- fobj( v )
## Run Newton's method
for( iter in 1:max.iter )
{
if( !silent )
{
cat( "Iteration", iter, ": " )
cat( "f =", fprev, "\n" )
}
## Compute the current fit
s <- drop(K %*% v)
pr <- 1 / (1 + exp(-s))
## Compute the sample weights and the working response (implicit labels y_i = 1)
a <- pr * (1-pr)
z <- s + 1/pr
## Solve the resulting regression problem
mm <- gelnet.krr( K, z, lambda, a, fix.bias=TRUE )
v <- mm$v
## Compute the new objective function value
f <- fobj( v )
if( abs(f - fprev) / abs(fprev) < eps ) break
else fprev <- f
}
list( v = v )
}
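## A minimal sanity check for gelnet.kor, wrapped in if( FALSE ) so that it is
## never executed when this file is sourced. The data and the RBF kernel are
## illustrative assumptions. A one-class model has no bias term; the fit
## s = K %*% v yields a per-sample membership probability 1 / (1 + exp(-s)).
if( FALSE )
{
  set.seed( 100 )
  X <- matrix( rnorm( 20*5 ), 20, 5 )
  D2 <- as.matrix( dist(X) )^2
  K <- exp( -D2 / median(D2) )
  mdl <- gelnet.kor( K, lambda = 0.1 )
  s <- drop( K %*% mdl$v )
  print( summary( 1 / (1 + exp(-s)) ) )
}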