R/gelnet.R
In gelnet: Generalized Elastic Nets

Documented in adj2lapl adj2nlapl gelnet gelnet.cv gelnet.ker gelnet.lin.obj gelnet.logreg.obj gelnet.oneclass.obj L1.ceiling

## gelnet.R - Generalized Elastic Nets
##
## by Artem Sokolov

#' Generate a graph Laplacian
#'
#' Generates a graph Laplacian from the graph adjacency matrix.
#'
#' A graph Laplacian is defined as:
#' \eqn{ l_{i,j} = deg( v_i ) }, if \eqn{ i = j };
#' \eqn{ l_{i,j} = -1 }, if \eqn{ i \neq j } and \eqn{v_i} is adjacent to \eqn{v_j};
#' and \eqn{ l_{i,j} = 0 }, otherwise
#'
#' @param A n-by-n adjacency matrix for a graph with n nodes
#' @return The n-by-n Laplacian matrix of the graph
#' @seealso \code{\link{adj2nlapl}}
#' @export
adj2lapl <- function( A )
  {
    n <- nrow(A)
    stopifnot( ncol(A) == n )
    
    ## Compute the off-diagonal entries
    L <- -A
    diag(L) <- 0

    ## Compute the diagonal entries
    ## Degree of a node: sum of weights on the edges
    s <- apply( L, 2, sum )
    diag(L) <- -s	## Negative because L == -A
    L
  }

#' Generate a normalized graph Laplacian
#'
#' Generates a normalized graph Laplacian from the graph adjacency matrix.
#'
#' A normalized graph Laplacian is defined as:
#' \eqn{ l_{i,j} = 1 }, if \eqn{ i = j };
#' \eqn{ l_{i,j} = - 1 / \sqrt{ deg(v_i) deg(v_j) } }, if \eqn{ i \neq j } and \eqn{v_i} is adjacent to \eqn{v_j};
#' and \eqn{ l_{i,j} = 0 }, otherwise
#'
#' @param A n-by-n adjacency matrix for a graph with n nodes
#' @return The n-by-n Laplacian matrix of the graph
#' @seealso \code{\link{adj2nlapl}}
#' @export
adj2nlapl <- function(A)
  {
    n <- nrow(A)
    stopifnot( ncol(A) == n )

    ## Zero out the diagonal
    diag(A) <- 0
    
    ## Degree of a node: sum of weights on the edges
    d <- 1 / apply( A, 2, sum )
    d <- sqrt(d)

    ## Account for "free-floating" nodes
    j <- which( is.infinite(d) )
    d[j] <- 0

    ## Compute the non-normalized Laplacian
    L <- adj2lapl( A )

    ## Normalize
    res <- t( L*d ) * d
    rownames(res) <- rownames(A)
    colnames(res) <- rownames(res)
    res
  }

#' Linear regression objective function value
#'
#' Evaluates the linear regression objective function value for a given model.
#' See details.
#'
#' Computes the objective function value according to
#' \deqn{ \frac{1}{2n} \sum_i a_i (z_i - (w^T x_i + b))^2 + R(w) }
#'  where
#' \deqn{ R(w) = \lambda_1 \sum_j d_j |w_j| + \frac{\lambda_2}{2} (w-m)^T P (w-m) }
#'
#' @param w p-by-1 vector of model weights
#' @param b the model bias term
#' @param X n-by-p matrix of n samples in p dimensions
#' @param z n-by-1 response vector
#' @param l1 L1-norm penalty scaling factor \eqn{\lambda_1}
#' @param l2 L2-norm penalty scaling factor \eqn{\lambda_2}
#' @param a n-by-1 vector of sample weights
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature-feature penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @return The objective function value.
#' @seealso \code{\link{gelnet}}
#' @export
gelnet.lin.obj <- function( w, b, X, z, l1, l2, a = rep(1,nrow(X)),
                           d = rep(1,ncol(X)), P = diag(ncol(X)), m = rep(0,ncol(X)) )
  {
    n <- nrow(X)
    p <- ncol(X)

    ## Compute the residual and loss
    r <- z - (X %*% w + b)
    L <- mean( a * r * r) / 2
    R1 <- l1 * t(d) %*% abs(w)
    R2 <- l2 * t(w-m) %*% P %*% (w-m) / 2

    L + R1 + R2
  }

#' Logistic regression objective function value
#'
#' Evaluates the logistic regression objective function value for a given model.
#' See details.
#
#' Computes the objective function value according to
#' \deqn{ -\frac{1}{n} \sum_i y_i s_i - \log( 1 + \exp(s_i) ) + R(w) }
#'  where
#' \deqn{ s_i = w^T x_i + b }
#' \deqn{ R(w) = \lambda_1 \sum_j d_j |w_j| + \frac{\lambda_2}{2} (w-m)^T P (w-m) }
#' When balanced is TRUE, the loss average over the entire data is replaced with averaging
#' over each class separately. The total loss is then computes as the mean over those
#' per-class estimates.
#'
#' @param w p-by-1 vector of model weights
#' @param b the model bias term
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 binary response vector sampled from {0,1}
#' @param l1 L1-norm penalty scaling factor \eqn{\lambda_1}
#' @param l2 L2-norm penalty scaling factor \eqn{\lambda_2}
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature-feature penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param balanced boolean specifying whether the balanced model is being evaluated
#' @return The objective function value.
#' @seealso \code{\link{gelnet}}
#' @export
gelnet.logreg.obj <- function( w, b, X, y, l1, l2, d = rep(1,ncol(X)),
                              P = diag(ncol(X)), m = rep(0,ncol(X)), balanced = FALSE )
  {
    ## Compute the regularization terms
    stopifnot( sort(unique(y)) == c(0,1) )
    s <- X %*% w + b
    R1 <- l1 * t(d) %*% abs(w)
    R2 <- l2 * t(w-m) %*% P %*% (w-m) / 2

    ## Compute log-likelihood
    v <- y * s - log(1+exp(s))
    if( balanced == TRUE )
      LL <- (mean( v[y==0] ) + mean( v[y==1] )) / 2
    else
      LL <- mean( v )
    
    R1 + R2 - LL
  }

#' One-class regression objective function value
#'
#' Evaluates the one-class objective function value for a given model
#' See details.
#'
#' Computes the objective function value according to
#' \deqn{ -\frac{1}{n} \sum_i s_i - \log( 1 + \exp(s_i) ) + R(w) }
#'  where
#' \deqn{ s_i = w^T x_i }
#' \deqn{ R(w) = \lambda_1 \sum_j d_j |w_j| + \frac{\lambda_2}{2} (w-m)^T P (w-m) }
#'
#' @param w p-by-1 vector of model weights
#' @param X n-by-p matrix of n samples in p dimensions
#' @param l1 L1-norm penalty scaling factor \eqn{\lambda_1}
#' @param l2 L2-norm penalty scaling factor \eqn{\lambda_2}
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature-feature penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @return The objective function value.
#' @seealso \code{\link{gelnet}}
#' @export
gelnet.oneclass.obj <- function( w, X, l1, l2, d = rep(1,ncol(X)),
                                P = diag(ncol(X)), m = rep(0,ncol(X)) )
  {
    s <- X %*% w
    LL <- mean( s - log( 1 + exp(s) ) )
    R1 <- l1 * t(d) %*% abs(w)
    R2 <- l2 * t(w-m) %*% P %*% (w-m) / 2
    R1 + R2 - LL
  }

#' The largest meaningful value of the L1 parameter
#'
#' Computes the smallest value of the LASSO coefficient L1 that leads to an
#'  all-zero weight vector for a given linear regression problem.
#'
#' The cyclic coordinate descent updates the model weight \eqn{w_k} using a soft threshold operator
#' \eqn{ S( \cdot, \lambda_1 d_k ) } that clips the value of the weight to zero, whenever the absolute
#' value of the first argument falls below \eqn{\lambda_1 d_k}. From here, it is straightforward to compute
#' the smallest value of \eqn{\lambda_1}, such that all weights are clipped to zero.
#'
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 vector of response values. Must be numeric vector for regression, factor with 2 levels for binary classification, or NULL for a one-class task.
#' @param a n-by-1 vector of sample weights (regression only)
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param l2 coefficient for the L2-norm penalty
#' @param balanced boolean specifying whether the balanced model is being trained (binary classification only) (default: FALSE)
#' @return The largest meaningful value of the L1 parameter (i.e., the smallest value that yields a model with all zero weights)
#' @export
L1.ceiling <- function( X, y, a = rep(1,nrow(X)), d = rep(1,ncol(X)),
                       P = diag(ncol(X)), m = rep(0,ncol(X)), l2 = 1, balanced = FALSE )
  {
    if( is.null(y) )
      {
        ## One-class model
        xy <- (apply( X/2, 2, mean ) + l2 * P %*% m) / d
        return( max( abs(xy) ) )
      }
    else if( is.factor(y) )
      {
        ## Binary model
        ## Convert the labels to {0,1}
        y <- as.integer( y == levels(y)[1] )
        
        a <- rep( 0.25, nrow(X) )
        if( balanced )
          {
            jpos <- which( y==1 ); jneg <- which( y==0 )
            a[jpos] <- a[jpos] * length(y) / ( length(jpos)*2 )
            a[jneg] <- a[jneg] * length(y) / ( length(jneg)*2 )
          }
        z <- (y - 0.5) / 0.25
        return( L1.ceiling( X, z, a, d, P, m, l2 ) )
      }
    else if( is.numeric(y) )
      {
        ## Liner regression model
        stopifnot( nrow(X) == length(y) )
        b1 <- sum( a*y ) / sum(a)
        xy <- (apply( a*X*(y - b1), 2, mean ) + l2 * P %*% m) / d
        return( max( abs(xy) ) )
      }
    else
      { stop( "Unknown label type\ny must be a numeric vector, a 2-level factor or NULL" ) }
  }

#' GELnet for linear regression, binary classification and one-class problems.
#'
#' Infers the problem type and learns the appropriate GELnet model via coordinate descent.
#'
#' The method determines the problem type from the labels argument y.
#' If y is a numeric vector, then a regression model is trained by optimizing the following objective function:
#' \deqn{ \frac{1}{2n} \sum_i a_i (y_i - (w^T x_i + b))^2 + R(w) }
#'
#' If y is a factor with two levels, then the function returns a binary classification model, obtained by optimizing the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i y_i s_i - \log( 1 + \exp(s_i) ) + R(w) }
#'  where
#' \deqn{ s_i = w^T x_i + b }
#'
#' Finally, if no labels are provided (y == NULL), then a one-class model is constructed using the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i s_i - \log( 1 + \exp(s_i) ) + R(w) }
#'  where
#' \deqn{ s_i = w^T x_i }
#'
#' In all cases, the regularizer is defined by
#' \deqn{ R(w) = \lambda_1 \sum_j d_j |w_j| + \frac{\lambda_2}{2} (w-m)^T P (w-m) }
#'
#' The training itself is performed through cyclical coordinate descent, and the optimization is terminated after the desired tolerance is achieved or after a maximum number of iterations.
#'
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 vector of response values. Must be numeric vector for regression, factor with 2 levels for binary classification, or NULL for a one-class task.
#' @param l1 coefficient for the L1-norm penalty
#' @param l2 coefficient for the L2-norm penalty
#' @param nFeats alternative parameterization that returns the desired number of non-zero weights. Takes precedence over l1 if not NULL (default: NULL)
#' @param a n-by-1 vector of sample weights (regression only)
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param w.init initial parameter estimate for the weights
#' @param b.init initial parameter estimate for the bias term
#' @param fix.bias set to TRUE to prevent the bias term from being updated (regression only) (default: FALSE)
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained (binary classification only) (default: FALSE)
#' @param nonneg set to TRUE to enforce non-negativity constraints on the weights (default: FALSE )
#' @return A list with two elements:
#' \describe{
#'   \item{w}{p-by-1 vector of p model weights}
#'   \item{b}{scalar, bias term for the linear model (omitted for one-class models)}
#' }
#' @seealso \code{\link{gelnet.lin.obj}}, \code{\link{gelnet.logreg.obj}}, \code{\link{gelnet.oneclass.obj}}
#' @export
gelnet <- function( X, y, l1, l2, nFeats=NULL, a=rep(1,n), d=rep(1,p), P=diag(p), m=rep(0,p),
                   max.iter=100, eps=1e-5, w.init=rep(0,p), b.init=NULL,
                   fix.bias=FALSE, silent=FALSE, balanced=FALSE, nonneg=FALSE )
  {
    n <- nrow(X)
    p <- ncol(X)
    
    ## Determine the problem type
    if( is.null(y) )
      {
        if( !silent ) cat( "Training a one-class model\n" )
        f.gel <- function(L1) {gelnet.oneclass( X, L1, l2, d, P, m, max.iter, eps, w.init, silent, nonneg )}
      }
    else if( is.factor(y) )
      {
        if( nlevels(y) == 1 )
          stop( "All labels are identical\nConsider training a one-class model instead" )
        if( nlevels(y) > 2 )
          stop( "Labels belong to a multiclass task\nConsider training a set of one-vs-one or one-vs-rest models" )
        if( !silent ) cat( "Training a logistic regression model\n" )
        if( is.null(b.init) ) b.init <- 0
        f.gel <- function(L1) {gelnet.logreg( X, y, L1, l2, d, P, m, max.iter, eps, w.init, b.init, silent, balanced, nonneg )}
      }
    else if( is.numeric(y) )
      {
        if( !silent ) cat( "Training a linear regression model\n" )
        if( is.null(b.init) ) b.init <- sum(a*y) / sum(a)
        f.gel <- function(L1) {gelnet.lin( X, y, L1, l2, a, d, P, m, max.iter, eps, w.init, b.init, fix.bias, silent, nonneg )}
      }
    else
      { stop( "Unknown label type\ny must be a numeric vector, a 2-level factor or NULL" ) }

    ## Train a model with the required number of features (if requested)
    if( !is.null(nFeats) )
      {
        L1s <- L1.ceiling( X, y, a, d, P, m, l2, balanced )
        return( gelnet.L1bin( f.gel, nFeats, L1s ) )        
      }
    else
      { return( f.gel(l1) ) }
  }

#' Kernel models for linear regression, binary classification and one-class problems.
#'
#' Infers the problem type and learns the appropriate kernel model.
#'
#' The entries in the kernel matrix K can be interpreted as dot products
#' in some feature space \eqn{\phi}. The corresponding weight vector can be
#' retrieved via \eqn{w = \sum_i v_i \phi(x_i)}. However, new samples can be
#' classified without explicit access to the underlying feature space:
#' \deqn{w^T \phi(x) + b = \sum_i v_i \phi^T (x_i) \phi(x) + b = \sum_i v_i K( x_i, x ) + b}
#'
#' The method determines the problem type from the labels argument y.
#' If y is a numeric vector, then a ridge regression model is trained by optimizing the following objective function:
#' \deqn{ \frac{1}{2n} \sum_i a_i (z_i - (w^T x_i + b))^2 + w^Tw }
#'
#' If y is a factor with two levels, then the function returns a binary classification model, obtained by optimizing the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i y_i s_i - \log( 1 + \exp(s_i) ) + w^Tw }
#'  where
#' \deqn{ s_i = w^T x_i + b }
#'
#' Finally, if no labels are provided (y == NULL), then a one-class model is constructed using the following objective function:
#' \deqn{ -\frac{1}{n} \sum_i s_i - \log( 1 + \exp(s_i) ) + w^Tw }
#'  where
#' \deqn{ s_i = w^T x_i }
#'
#' In all cases, \eqn{w = \sum_i v_i \phi(x_i)} and the method solves for \eqn{v_i}.
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of response values. Must be numeric vector for regression, factor with 2 levels for binary classification, or NULL for a one-class task.
#' @param lambda scalar, regularization parameter
#' @param a n-by-1 vector of sample weights (regression only)
#' @param max.iter maximum number of iterations (binary classification and one-class problems only)
#' @param eps convergence precision (binary classification and one-class problems only)
#' @param v.init initial parameter estimate for the kernel weights (binary classification and one-class problems only)
#' @param b.init initial parameter estimate for the bias term (binary classification only)
#' @param fix.bias set to TRUE to prevent the bias term from being updated (regression only) (default: FALSE)
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained (binary classification only) (default: FALSE)
#' @return A list with two elements:
#' \describe{
#'   \item{v}{n-by-1 vector of kernel weights}
#'   \item{b}{scalar, bias term for the linear model (omitted for one-class models)}
#' }
#' @seealso \code{\link{gelnet}}
#' @export
gelnet.ker <- function( K, y, lambda, a, max.iter = 100, eps = 1e-5, v.init=rep(0,nrow(K)),
                       b.init=0, fix.bias=FALSE, silent=FALSE, balanced=FALSE )
  {
    ## Determine the problem type
    if( is.null(y) )
      {
        if( !silent ) cat( "Training a one-class model\n" )
        gelnet.kor( K, lambda, max.iter, eps, v.init, silent )
      }
    else if( is.factor(y) )
      {
        if( nlevels(y) == 1 )
          stop( "All labels are identical\nConsider training a one-class model instead" )
        if( nlevels(y) > 2 )
          stop( "Labels belong to a multiclass task\nConsider training a set of one-vs-one or one-vs-rest models" )
        if( !silent ) cat( "Training a logistic regression model\n" )
        gelnet.klr( K, y, lambda, max.iter, eps, v.init, b.init, silent, balanced )
      }
    else if( is.numeric(y) )
      {
        if( !silent ) cat( "Training a linear regression model\n" )
        if( b.init == 0 ) b.init <- sum(a*y) / sum(a)
        gelnet.krr( K, y, lambda, a, fix.bias )
      }
    else
      { stop( "Unknown label type\ny must be a numeric vector, a 2-level factor or NULL" ) }
  }

#' k-fold cross-validation for parameter tuning.
#'
#' Performs k-fold cross-validation to select the best pair of the L1- and L2-norm penalty values.
#'
#' Cross-validation is performed on a grid of parameter values. The user specifies the number of values
#' to consider for both the L1- and the L2-norm penalties. The L1 grid values are equally spaced on
#' [0, L1s], where L1s is the smallest meaningful value of the L1-norm penalty (i.e., where all the model
#' weights are just barely zero). The L2 grid values are on a logarithmic scale centered on 1.
#'
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 vector of response values. Must be numeric vector for regression, factor with 2 levels for binary classification, or NULL for a one-class task.
#' @param nL1 number of values to consider for the L1-norm penalty
#' @param nL2 number of values to consider for the L2-norm penalty
#' @param nFolds number of cross-validation folds (default:5)
#' @param a n-by-1 vector of sample weights (regression only)
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param w.init initial parameter estimate for the weights
#' @param b.init initial parameter estimate for the bias term
#' @param fix.bias set to TRUE to prevent the bias term from being updated (regression only) (default: FALSE)
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained (binary classification only) (default: FALSE)
#' @return A list with the following elements:
#' \describe{
#'   \item{l1}{the best value of the L1-norm penalty}
#'   \item{l2}{the best value of the L2-norm penalty}
#'   \item{w}{p-by-1 vector of p model weights associated with the best (l1,l2) pair.}
#'   \item{b}{scalar, bias term for the linear model associated with the best (l1,l2) pair. (omitted for one-class models)}
#'   \item{perf}{performance value associated with the best model. (Likelihood of data for one-class, AUC for binary classification, and -RMSE for regression)}
#' }
#' @seealso \code{\link{gelnet}}
#' @export
gelnet.cv <- function( X, y, nL1, nL2, nFolds=5, a=rep(1,n), d=rep(1,p), P=diag(p), m=rep(0,p),
                   max.iter=100, eps=1e-5, w.init=rep(0,p), b.init=0,
                   fix.bias=FALSE, silent=FALSE, balanced=FALSE )
  {
    n <- nrow(X)
    p <- ncol(X)
    
    ## One-class
    if( is.null(y) )
      {
        ## Assign every k^th sample to its fold
        vfa <- rep( 1:nFolds, length=n )

        ## The evaluation function computes the likelihood of the test data given the model
        f.eval <- function( mm, X.te, y.te )
          {
            s <- drop( X.te %*% mm$w )
            pr <- exp(s) / (1 + exp(s))
            sum( log( pr ) )
          }
      }

    ## Binary classification
    else if( is.factor(y) )
      {
        if( nlevels(y) == 1 )
          stop( "All labels are identical\nConsider training a one-class model instead" )
        if( nlevels(y) > 2 )
          stop( "Labels belong to a multiclass task\nConsider training a set of one-vs-one or one-vs-rest models" )

        ## Sample each class separately
        vfa <- rep( 0, length=n )
        jpos <- which( y == levels(y)[1] )
        jneg <- which( y == levels(y)[2] )
        vfa[jpos] <- rep(1:nFolds, length.out = length(jpos) )
        vfa[jneg] <- rep(1:nFolds, length.out = length(jneg) )

        ## The evaluation function computes AUC on the test data/labels
        f.eval <- function( mm, X.te, y.te )
          {
            s <- drop( X.te %*% mm$w + mm$b )
            jp <- which( y.te == levels(y)[1] ); np <- length( jp )
            jn <- which( y.te == levels(y)[2] ); nn <- length( jn )
            s0 <- sum( rank(s)[jp] )
            (s0 - np*(np+1) / 2) / (np*nn)
          }
      }

    ## Regression
    else if( is.numeric(y) )
      {
        ## Assign every k^th sample to a fold to maintain data representation across folds
        vfa <- rep( 0, length=n )
        vfa[order(y)] <- rep( 1:nFolds, length=n )

        ## The evaluation function computes -RMSE on the test data/labels
        ## Negative RMSE is used for consistency with other "higher is better" measures
        f.eval <- function( mm, X.te, y.te )
          {
            s <- drop( X.te %*% mm$w + mm$b )
            -sqrt( mean( (s - y.te)^2 ) )
          }
      }    
    else
      { stop( "Unknown label type\ny must be a numeric vector, a 2-level factor or NULL" ) }

    ## Generate a vecotr of values for the L2-norm penalty
    v1 <- 1:(as.integer( nL2 / 2 ))
    if( nL2 %% 2 == 0 )
      vL2 <- 10 ^ c(rev(1-v1), v1)
    else
      vL2 <- 10 ^ c(rev(-v1), 0, v1)
    stopifnot( length(vL2) == nL2 )

    ## Traverse the L2-norm penalty values
    mm.best <- list( perf = -Inf )
    for( l2 in vL2 )
      {
        ## Generate a vector of values for the L1-norm penalty
        L1s <- L1.ceiling( X, y, a, d, P, m, l2, balanced )
        vL1 <- seq( 0, L1s, length.out = nL1+1 )[1:nL1]
        stopifnot( length(vL1) == nL1 )

        ## Traverse the L1-norm penalty values
        for( l1 in vL1 )
          {
            if( !silent )
              {
                cat( "===================================================\n" )
                cat( "Evaluating the choice of L2 =", l2, "; L1 =", l1, "\n" )
              }
            
            ## Traverse the folds
            res <- rep( 0, nFolds )
            for( k in 1:nFolds )
              {
                if( !silent )
                  cat( "== Fold", k, "==\n" )
                
                ## Isolate the training and test data
                j.te <- which( vfa == k )
                j.tr <- which( vfa != k )

                ## Train and evaluate the model
                mm <- gelnet( X[j.tr,], y[j.tr], l1, l2, NULL, a[j.tr], d, P, m,
                             max.iter, eps, w.init, b.init,
                             fix.bias, silent, balanced )
                res[k] <- f.eval( mm, X[j.te,], y[j.te] )

                if( !silent )
                cat( "Performance:", res[k], "\n" )
              }
            cat( "Average performance across all folds: ", mean(res), "\n" )

            ## Compare to and store the best model
            if( mean(res) > mm.best$perf )
              {
                mm.best <- mm
                mm.best$l1 <- l1
                mm.best$l2 <- l2
                mm.best$perf <- mean(res)
              }
          }
      }

    mm.best
  }

#' GELnet for linear regression
#'
#' Constructs a GELnet model for linear regression using coordinate descent.
#'
#' The method operates through cyclical coordinate descent.
#' The optimization is terminated after the desired tolerance is achieved, or after a maximum number of iterations.
#' 
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 vector of response values
#' @param l1 coefficient for the L1-norm penalty
#' @param l2 coefficient for the L2-norm penalty
#' @param a n-by-1 vector of sample weights
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param w.init initial parameter estimate for the weights
#' @param b.init initial parameter estimate for the bias term
#' @param fix.bias set to TRUE to prevent the bias term from being updated (default: FALSE)
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param nonneg set to TRUE to enforce non-negativity constraints on the weights (default: FALSE )
#' @return A list with two elements:
#' \describe{
#'   \item{w}{p-by-1 vector of p model weights}
#'   \item{b}{scalar, bias term for the linear model}
#' }
#' @useDynLib gelnet gelnet_lin_opt
#' @noRd
gelnet.lin <- function( X, y, l1, l2, a = rep(1,n), d = rep(1,p), P = diag(p),
                       m=rep(0,p), max.iter = 100, eps = 1e-5, w.init = rep(0,p),
                       b.init = sum(a*y)/sum(a), fix.bias=FALSE, silent=FALSE, nonneg=FALSE )
  {
    n <- nrow(X)
    p <- ncol(X)

    ## Verify argument dimensionality
    stopifnot( length(y) == n )
    stopifnot( length(a) == n )
    stopifnot( length(d) == p )
    stopifnot( all( dim(P) == c(p,p) ) )
    stopifnot( length(m) == p )
    stopifnot( length(w.init) == p )
    stopifnot( length(b.init) == 1 )
    stopifnot( length(l1) == 1 )
    stopifnot( length(l2) == 1 )

    ## Verify name matching (if applicable)
    if( is.null(colnames(X)) == FALSE && is.null(colnames(P)) == FALSE )
      {
        stopifnot( is.null( rownames(P) ) == FALSE )
        stopifnot( all( colnames(X) == rownames(P) ) )
        stopifnot( all( colnames(X) == colnames(P) ) )
      }

    ## Set the initial parameter estimates
    S <- X %*% w.init + b.init
    Pw <- P %*% (w.init - m)

    ## Call the C routine
    res <- .C( "gelnet_lin_opt",
              as.double(X), as.double(y), as.double(a), as.double(d),
              as.double(P), as.double(m), as.double(l1), as.double(l2),
              as.double(S), as.double(Pw), as.integer(n), as.integer(p),
              as.integer(max.iter), as.double(eps), as.integer(fix.bias),
              w = as.double( w.init ), b = as.double(b.init), as.integer(silent), as.integer(nonneg) )

    res <- res[c("w","b")]
    names( res$w ) <- colnames(X)

    res
  }

#' GELnet for logistic regression
#'
#' Constructs a GELnet model for logistic regression using the Newton method.
#'
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the linear regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
#'
#' @param X n-by-p matrix of n samples in p dimensions
#' @param y n-by-1 vector of binary response labels
#' @param l1 coefficient for the L1-norm penalty
#' @param l2 coefficient for the L2-norm penalty
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param w.init initial parameter estimate for the weights
#' @param b.init initial parameter estimate for the bias term
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being trained
#' @param nonneg set to TRUE to enforce non-negativity constraints on the weights (default: FALSE )
#' @return A list with two elements:
#' \describe{
#'   \item{w}{p-by-1 vector of p model weights}
#'   \item{b}{scalar, bias term for the linear model}
#' }
#' @seealso \code{\link{gelnet.lin}}
#' @useDynLib gelnet gelnet_logreg_opt
#' @noRd
gelnet.logreg <- function( X, y, l1, l2, d = rep(1,p), P = diag(p), m = rep(0,p),
                          max.iter = 100, eps = 1e-5, w.init = rep(0,p),
                          b.init = 0, silent = FALSE, balanced = FALSE, nonneg = FALSE )
  {
    n <- nrow(X)
    p <- ncol(X)

    ## Verify argument dimensionality
    stopifnot( length( unique(y) ) == 2 )
    stopifnot( length(y) == n )
    stopifnot( length(d) == p )
    stopifnot( all( dim(P) == c(p,p) ) )
    stopifnot( length( w.init ) == p )
    stopifnot( length( b.init ) == 1 )
    stopifnot( length(l1) == 1 )
    stopifnot( length(l2) == 1 )

    ## Verify name matching (if applicable)
    if( is.null(colnames(X)) == FALSE && is.null(colnames(P)) == FALSE )
      {
        stopifnot( is.null( rownames(P) ) == FALSE )
        stopifnot( all( colnames(X) == rownames(P) ) )
        stopifnot( all( colnames(X) == colnames(P) ) )
      }

    ## Convert the labels to {0,1}
    y <- factor(y)
    if( !silent )
      cat( "Treating", levels(y)[1], "as the positive class\n" )
    y <- as.integer( y == levels(y)[1] )
    
    ## Set the initial parameter estimates
    S <- X %*% w.init + b.init
    Pw <- P %*% (w.init - m)

    res <- .C( "gelnet_logreg_opt",
              as.double(X), as.integer(y), as.double(d), as.double(P),
              as.double(m), as.double(l1), as.double(l2),
              as.double(S), as.double(Pw), as.integer(n), as.integer(p),
              as.integer(max.iter), as.double(eps), w = as.double( w.init ),
              b = as.double(b.init), as.integer(silent), as.integer(balanced), as.integer(nonneg) )

    res <- res[c("w","b")]
    names( res$w ) <- colnames(X)
    
    res
  }

#' GELnet for one-class regression
#'
#' Constructs a GELnet model for one-class regression using the Newton method.
#'
#' The function optimizes the following objective:
#' \deqn{ -\frac{1}{n} \sum_i s_i - \log( 1 + \exp(s_i) ) + R(w) }
#'  where
#' \deqn{ s_i = w^T x_i }
#' \deqn{ R(w) = \lambda_1 \sum_j d_j |w_j| + \frac{\lambda_2}{2} (w-m)^T P (w-m) }
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the linear regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
#'
#' @param X n-by-p matrix of n samples in p dimensions
#' @param l1 coefficient for the L1-norm penalty
#' @param l2 coefficient for the L2-norm penalty
#' @param d p-by-1 vector of feature weights
#' @param P p-by-p feature association penalty matrix
#' @param m p-by-1 vector of translation coefficients
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param w.init initial parameter estimate for the weights
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param nonneg set to TRUE to enforce non-negativity constraints on the weights (default: FALSE )
#' @return A list with one element:
#' \describe{
#'   \item{w}{p-by-1 vector of p model weights}
#' }
#' @seealso \code{\link{gelnet.lin}}, \code{\link{gelnet.oneclass.obj}}
#' @noRd
gelnet.oneclass <- function( X, l1, l2, d = rep(1,p), P = diag(p), m = rep(0,p),
                            max.iter = 100, eps = 1e-5,
                            w.init = rep(0,p), silent= FALSE, nonneg=FALSE )
  {
    n <- nrow(X)
    p <- ncol(X)

    ## Verify argument dimensionality
    stopifnot( length(d) == p )
    stopifnot( all( dim(P) == c(p,p) ) )
    stopifnot( length(m) == p )
    stopifnot( length(w.init) == p )
    stopifnot( length(l1) == 1 )
    stopifnot( length(l2) == 1 )

    ## Verify name matching (if applicable)
    if( is.null(colnames(X)) == FALSE && is.null(colnames(P)) == FALSE )
      {
        stopifnot( is.null( rownames(P) ) == FALSE )
        stopifnot( all( colnames(X) == rownames(P) ) )
        stopifnot( all( colnames(X) == colnames(P) ) )
      }

    ## Set the initial parameter estimates
    w <- w.init

    ## Run Newton's method
    fprev <- gelnet.oneclass.obj( w, X, l1, l2, d, P, m )
    for( iter in 1:max.iter )
      {
        if( silent == FALSE )
          {
            cat( "Iteration", iter, ": " )
            cat( "f =", fprev, "\n" )
          }

        ## Compute the current fit
        s <- X %*% w
        pr <- 1 / ( 1 + exp(-s) )

        ## Compute the sample weights and active response
        a <- pr * (1-pr)
        z <- s + 1/pr

        ## Run coordinate descent for the resulting regression problem
        mm <- gelnet.lin( X, z, l1, l2, a, d, P, m, iter*2, eps, w, 0, TRUE, TRUE, nonneg )
        w <- mm$w

        f <- gelnet.oneclass.obj( w, X, l1, l2, d, P, m )
        if( abs(f - fprev) / abs(fprev) < eps ) break
        else fprev <- f
      }
    
    list( w = w )
  }

#' A GELnet model with a requested number of non-zero weights
#'
#' Binary search to find an L1 penalty parameter value that yields the desired
#'   number of non-zero weights in a GELnet model.
#'
#' The method performs simple binary search starting in [0, l1s] and iteratively
#' training a model using the provided \code{f.gelnet}. At each iteration, the
#' method checks if the number of non-zero weights in the model is higher or lower
#' than the requested \code{nF} and adjusts the value of the L1 penalty term accordingly.
#' For linear regression problems, it is recommended to initialize \code{l1s} to the output
#' of \code{L1.ceiling}.
#' 
#' @param f.gelnet a function that accepts one parameter: L1 penalty value,
#'    and returns a typical GELnets model (list with w and b as its entries)
#' @param nF the desired number of non-zero features
#' @param l1s the right side of the search interval: search will start in [0, l1s]
#' @param max.iter the maximum number of iterations of the binary search
#'
#' @return The model with the desired number of non-zero weights and the corresponding value of the
#' L1-norm parameter. Returned as a list with three elements:
#' \describe{
#'   \item{w}{p-by-1 vector of p model weights}
#'   \item{b}{scalar, bias term for the linear model}
#'   \item{l1}{scalar, the corresponding value of the L1-norm parameter}
#' }
#' @seealso \code{\link{L1.ceiling}}
#' @noRd
gelnet.L1bin <- function( f.gelnet, nF, l1s, max.iter=10 )
  {
    ## Set up the search region
    L1top <- l1s
    L1bot <- 0

    ## Perform binary search
    for( i in 1:max.iter )
      {
        cat( "Binary search iteration", i, "\n" )
        l1 <- (L1top + L1bot) / 2
        m <- f.gelnet( l1 )
        k <- sum( m$w != 0 )
        cat( "Learned a model with", k, "non-zero features\n" )
        if( k == nF ) break
        if( k < nF ) {L1top <- l1} else {L1bot <- l1}
      }

    ## Store the selected L1 parameter value into the model
    m$l1 <- l1
    m
  }

#' Kernel ridge regression
#'
#' Learns a kernel ridge regression model.
#'
#' The entries in the kernel matrix K can be interpreted as dot products
#' in some feature space \eqn{\phi}. The corresponding weight vector can be
#' retrieved via \eqn{w = \sum_i v_i \phi(x_i)}. However, new samples can be
#' classified without explicit access to the underlying feature space:
#' \deqn{w^T \phi(x) + b = \sum_i v_i \phi^T (x_i) \phi(x) + b = \sum_i v_i K( x_i, x ) + b}
#'
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of response values
#' @param lambda scalar, regularization parameter
#' @param a n-by-1 vector of samples weights
#' @param fix.bias set to TRUE to force the bias term to 0 (default: FALSE)
#' @return A list with two elements:
#' \describe{
#'   \item{v}{n-by-1 vector of kernel weights}
#'   \item{b}{scalar, bias term for the model}
#' }
#' @seealso \code{\link{gelnet.lin.obj}}
#' @noRd
gelnet.krr <- function( K, y, lambda, a, fix.bias=FALSE )
  {
    ## Argument verification
    n <- nrow(K)
    stopifnot( length(y) == n )
    stopifnot( length(a) == n )
    stopifnot( is.null(dim(a)) )	## a must not be in matrix form

    ## Set up the sample weights and kernalization of the bias term
    A <- diag(a)

    if( fix.bias )
      {
        ## Solve the optimization problem in closed form
        m <- solve( K %*% A %*% t(K) + lambda * n * K, K %*% A %*% y )
        list( v = drop(m), b = 0 )
      }
    else
      {
        ## Set up kernalization of the bias term
        K1 <- rbind( K, 1 )
        K0 <- cbind( rbind( K, 0 ), 0 )
        
        ## Solve the optimization problem in closed form
        m <- solve( K1 %*% A %*% t(K1) + lambda * n * K0, K1 %*% A %*% y )

        ## Retrieve the weights and the bias term
        list( v = m[1:n], b = m[n+1] )
      }
  }

#' Kernel logistic regression
#'
#' Learns a kernel logistic regression model for a binary classification task
#'
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the kernel ridge regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
#' 
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param y n-by-1 vector of binary response labels
#' @param lambda scalar, regularization parameter
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param v.init initial parameter estimate for the kernel weights
#' @param b.init initial parameter estimate for the bias term
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @param balanced boolean specifying whether the balanced model is being evaluated
#' @return A list with two elements:
#' \describe{
#'   \item{v}{n-by-1 vector of kernel weights}
#'   \item{b}{scalar, bias term for the model}
#' }
#' @seealso \code{\link{gelnet.krr}}, \code{\link{gelnet.logreg.obj}}
#' @noRd
gelnet.klr <- function( K, y, lambda, max.iter = 100, eps = 1e-5,
                       v.init=rep(0,nrow(K)), b.init=0, silent=FALSE, balanced=FALSE )
  {
    ## Argument verification
    stopifnot( nrow(K) == ncol(K) )
    stopifnot( nrow(K) == length(y) )

    ## Convert the labels to {0,1}
    y <- factor(y)
    if( !silent )
      cat( "Treating", levels(y)[1], "as the positive class\n" )
    y <- as.integer( y == levels(y)[1] )
    
    ## Objective function to minimize
    fobj <- function( v, b )
      {
        s <- K %*% v + b
        R2 <- lambda * t(v) %*% K %*% v / 2
        LL <- mean( y * s - log(1+exp(s)) )
        R2 - LL
      }

    ## Compute the initial objective value
    v <- v.init
    b <- b.init
    fprev <- fobj( v, b )

    ## Reduce kernel logistic regression to kernel regression
    ##  about the current Taylor method estimates
    for( iter in 1:max.iter )
      {
        if( silent == FALSE )
          {
            cat( "Iteration", iter, ": " )
            cat( "f =", fprev, "\n" )
          }
        
        ## Compute the current fit
        s <- drop(K %*% v + b)
        pr <- 1 / (1 + exp(-s))

        ## Snap probabilities to 0 and 1 to avoid division by small numbers
        j0 <- which( pr < eps )
        j1 <- which( pr > (1-eps) )
        pr[j0] <- 0; pr[j1] <- 1

        ## Compute the sample weights and the response
        a <- pr * (1-pr)
        a[c(j0,j1)] <- eps
        z <- s + (y - pr) / a

        ## Rebalance the sample weights according to the class counts
        if( balanced )
          {
            jpos <- which( y==1 )
            jneg <- which( y==0 )
            a[jpos] <- a[jpos] * length(y) / ( length(jpos)*2 )
            a[jneg] <- a[jneg] * length(y) / ( length(jneg)*2 )
          }
        
        ## Run the coordinate descent for the resulting regression problem
        m <- gelnet.krr( K, z, lambda, a )
        v <- m$v
        b <- m$b

        ## Evaluate the objective function and check convergence criteria
        f <- fobj( v, b )
        if( abs(f - fprev) / abs(fprev) < eps ) break
        else fprev <- f
      }

    list( v = v, b = b )
  }

#' Kernel one-class regression
#'
#' Learns a kernel one-class model for a given kernel matrix
#'
#' The method operates by constructing iteratively re-weighted least squares approximations
#' of the log-likelihood loss function and then calling the kernel ridge regression routine
#' to solve those approximations. The least squares approximations are obtained via the Taylor series
#' expansion about the current parameter estimates.
#'
#' 
#' @param K n-by-n matrix of pairwise kernel values over a set of n samples
#' @param lambda scalar, regularization parameter
#' @param max.iter maximum number of iterations
#' @param eps convergence precision
#' @param v.init initial parameter estimate for the kernel weights
#' @param silent set to TRUE to suppress run-time output to stdout (default: FALSE)
#' @return A list with one element:
#' \describe{
#'   \item{v}{n-by-1 vector of kernel weights}
#' }
#' @seealso \code{\link{gelnet.krr}}, \code{\link{gelnet.oneclass.obj}}
#' @noRd
gelnet.kor <- function( K, lambda, max.iter = 100, eps = 1e-5,
                       v.init = rep(0,nrow(K)), silent= FALSE )
  {
    ## Argument verification
    stopifnot( nrow(K) == ncol(K) )

    ## Define the objective function to optimize
    fobj <- function( v )
      {
        s <- K %*% v
        LL <- mean( s - log( 1 + exp(s) ) )
        R2 <- lambda * t(v) %*% K %*% v / 2
        R2 - LL
      }

    ## Set the initial parameter estimates
    v <- v.init
    fprev <- fobj( v )

    ## Run Newton's method
    for( iter in 1:max.iter )
      {
        if( silent == FALSE )
          {
            cat( "Iteration", iter, ": " )
            cat( "f =", fprev, "\n" )
          }

        ## Compute the current fit
        s <- drop(K %*% v)
        pr <- 1 / (1 + exp(-s))

        ## Compute the sample weights and active response
        a <- pr * (1-pr)
        z <- s + 1/pr

        ## Solve the resulting regression problem
        mm <- gelnet.krr( K, z, lambda, a, fix.bias=TRUE )
        v <- mm$v

        ## Compute the new objective function value
        f <- fobj( v )
        if( abs(f - fprev) / abs(fprev) < eps ) break
        else fprev <- f
      }

    list( v = v )
  }
Any scripts or data that you put into this service are public.
gelnet documentation built on May 2, 2019, 2:10 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
gelnet
Generalized Elastic Nets

R/gelnet.R
In gelnet: Generalized Elastic Nets

Defines functions adj2lapl adj2nlapl gelnet.lin.obj gelnet.logreg.obj gelnet.oneclass.obj L1.ceiling gelnet gelnet.ker gelnet.cv gelnet.lin gelnet.logreg gelnet.oneclass gelnet.L1bin gelnet.krr gelnet.klr gelnet.kor

Documented in adj2lapl adj2nlapl gelnet gelnet.cv gelnet.ker gelnet.lin.obj gelnet.logreg.obj gelnet.oneclass.obj L1.ceiling

Try the gelnet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gelnet Generalized Elastic Nets

R/gelnet.R In gelnet: Generalized Elastic Nets

Defines functions adj2lapl adj2nlapl gelnet.lin.obj gelnet.logreg.obj gelnet.oneclass.obj L1.ceiling gelnet gelnet.ker gelnet.cv gelnet.lin gelnet.logreg gelnet.oneclass gelnet.L1bin gelnet.krr gelnet.klr gelnet.kor

Documented in adj2lapl adj2nlapl gelnet gelnet.cv gelnet.ker gelnet.lin.obj gelnet.logreg.obj gelnet.oneclass.obj L1.ceiling

Try the gelnet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gelnet
Generalized Elastic Nets

R/gelnet.R
In gelnet: Generalized Elastic Nets