In jaredhuling/ordinis: Fits penalized regression models with coordinate descent

library(knitr)
opts_chunk$set(message = FALSE)

Introduction to `ordinis'

The 'ordinis' package provides computation for penalized regression problems via coordinate descent. It is mostly for my own experimentation at this stage, however it is fairly efficient and reliable.

Install using the devtools package:

devtools::install_github("jaredhuling/ordinis")

or by cloning and building

Example

library(ordinis)

# compute the full solution path, n > p
set.seed(123)
n <- 500
p <- 50000
m <- 50
b <- matrix(c(runif(m), rep(0, p - m)))
x <- matrix(rnorm(n * p, sd = 3), n, p)
y <- drop(x %*% b) + rnorm(n)

mod <- ordinis(x, y, 
               penalty = "mcp",
               lower.limits = rep(0, p), # force all coefficients to be positive
               penalty.factor = c(0, 0, rep(1, p-2)), # don't penalize first two coefficients
               alpha = 0.95)  # use elastic net with alpha = 0.95

plot(mod)

## show likelihood
logLik(mod)

## compute AIC
AIC(mod)

## BIC
BIC(mod)

Performance

Lasso (linear regression)

library(microbenchmark)
library(glmnet)

b <- matrix(c(runif(m, min = -1), rep(0, p - m)))
x <- matrix(rnorm(n * p, sd = 3), n, p)
y <- drop(x %*% b) + rnorm(n)

lambdas = glmnet(x, y)$lambda

microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, thresh = 1e-10,  # thresh must be very small 
                                      lambda = lambdas)},    # for comparable precision
    "ordinis[lasso]" = {reso <- ordinis(x, y, lambda = lambdas, 
                                       tol = 1e-3)},
    times = 5
)


# difference of results
max(abs(coef(resg) - reso$beta))

microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, thresh = 1e-15,  # thresh must be very low for comparable precision
                                      lambda = lambdas)},
    "ordinis[lasso]" = {reso <- ordinis(x, y, lambda = lambdas, 
                                            tol = 1e-3)},
    times = 5
)

# difference of results
max(abs(coef(resg) - reso$beta))

Lasso (logistic regression)

glmnet is clearly faster for logistic regression for the same precision

library(MASS)

set.seed(123)
n <- 200
p <- 10000
m <- 20
b <- matrix(c(runif(m, min = -0.5, max = 0.5), rep(0, p - m)))
x <- matrix(rnorm(n * p, sd = 3), n, p)
y <- 1 *(drop(x %*% b) + rnorm(n) > 0)

lambdas = glmnet(x, y, family = "binomial", lambda.min.ratio = 0.02)$lambda

microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, family = "binomial",
                                      thresh = 1e-10,  
                                      lambda = lambdas)},    
    "ordinis[lasso]"     = {reso <- ordinis(x, y, family = "binomial", 
                                            lambda = lambdas, 
                                            tol = 1e-3, tol.irls = 1e-3)},
    times = 5
)

# difference of results
max(abs(coef(resg) - reso$beta))


microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, family = "binomial",
                                      thresh = 1e-15,  
                                      lambda = lambdas)},    
    "ordinis[lasso]"     = {reso <- ordinis(x, y, family = "binomial", 
                                            lambda = lambdas, 
                                            tol = 1e-3, tol.irls = 1e-3)},
    times = 5
)

# difference of results
max(abs(coef(resg) - reso$beta))

Lasso (linear regression, ill-conditioned)

library(MASS)

set.seed(123)
n <- 500
p <- 1000
m <- 50
b <- matrix(c(runif(m, min = -1), rep(0, p - m)))
sig <- matrix(0.5, ncol=p,nrow=p); diag(sig) <- 1
x <- mvrnorm(n, mu=rep(0, p), Sigma = sig)
y <- drop(x %*% b) + rnorm(n)

lambdas = glmnet(x, y)$lambda[1:65]

microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, thresh = 1e-9,  # thresh must be very small 
                                      lambda = lambdas)},    # for comparable precision
    "ordinis[lasso]" = {reso <- ordinis(x, y, lambda = lambdas, 
                                       tol = 1e-5)},
    times = 5
)


# difference of results
max(abs(coef(resg) - reso$beta))

microbenchmark(
    "glmnet[lasso]" = {resg <- glmnet(x, y, thresh = 1e-11,  # thresh must be very low for comparable precision
                                      lambda = lambdas)},
    "ordinis[lasso]" = {reso <- ordinis(x, y, lambda = lambdas, 
                                            tol = 1e-5)},
    times = 5
)

# difference of results
max(abs(coef(resg) - reso$beta))

Validity of solutions with various bells and whistles

Due to internal differences in standardization, we now compare with glmnet when using observation weights, penalty scaling factors, and parameter box constraints

set.seed(123)
n = 200
p = 1000
m <- 15
b = c(runif(m, min = -0.5, max = 0.5), rep(0, p - m))
x = (matrix(rnorm(n * p, sd = 3), n, p))
y = drop(x %*% b) + rnorm(n)
y2 <- 1 * (y > rnorm(n, mean = 0.5, sd = 3))


wts <- runif(nrow(x))
wts <- wts / mean(wts) # re-scale like glmnet does, so we can compare

penalty.factor <- rbinom(ncol(x), 1, 0.99) * runif(ncol(x)) * 5
penalty.factor <- (penalty.factor / sum(penalty.factor)) * ncol(x)  # re-scale like glmnet does, so we can compare

system.time(resb <- ordinis(x, y2, family = "binomial", tol = 1e-7, tol.irls = 1e-5,
                            penalty = "lasso",
                            alpha = 0.5,  #elastic net term
                            lower.limits = 0, upper.limits = 0.02, # box constraints on all parameters
                            standardize = FALSE, intercept = TRUE,
                            weights = wts, # observation weights
                            penalty.factor = penalty.factor)) # penalty scaling factors

system.time(resg <- glmnet(x,y2, family = "binomial",
                           lambda = resb$lambda,
                           alpha = 0.5, #elastic net term
                           weights = wts, # observation weights
                           penalty.factor = penalty.factor, # penalty scaling factors
                           lower.limits = 0, upper.limits = 0.02, # box constraints on all parameters
                           standardize = FALSE, intercept = TRUE,
                           thresh = 1e-16))

## compare solutions
max(abs(resb$beta[-1,] - resg$beta))


# now with no box constraints
system.time(resb <- ordinis(x, y2, family = "binomial", tol = 1e-7, tol.irls = 1e-5,
                            penalty = "lasso",
                            alpha = 0.5,  #elastic net term
                            standardize = FALSE, intercept = TRUE,
                            weights = wts, # observation weights
                            penalty.factor = penalty.factor)) # penalty scaling factors

system.time(resg <- glmnet(x,y2, family = "binomial",
                           lambda = resb$lambda,
                           alpha = 0.5, #elastic net term
                           weights = wts, # observation weights
                           penalty.factor = penalty.factor, # penalty scaling factors
                           standardize = FALSE, intercept = TRUE,
                           thresh = 1e-16))

## compare solutions
max(abs(resb$beta[-1,] - resg$beta))

A Note on the Elastic Net and linear models

Due to how scaling of the response is handled different in glmnet, it yields slightly different solutions than both ordinis and ncvreg for Gaussian models with a ridge penalty term

library(ncvreg)

## I'm setting all methods to have high precision just so solutions are comparable.
## differences in computation time may be due in part to the arbitrariness of the 
## particular precisions chosen
system.time(resg <- glmnet(x, y, family = "gaussian", alpha = 0.25, 
                           thresh = 1e-15))

system.time(res <- ordinis(x, y, family = "gaussian", penalty = "lasso", alpha = 0.25,
                            tol = 1e-10, lambda = resg$lambda))

system.time(resn <- ncvreg(x, y, family="gaussian", penalty = "lasso",
                           lambda = resg$lambda, alpha = 0.25, max.iter = 100000,
                           eps = 1e-10))

resgg <- res; resgg$beta[-1,] <- resg$beta

# compare ordinis and glmnet
max(abs(res$beta[-1,] - resg$beta))

# compare ordinis and ncvreg
max(abs(res$beta - resn$beta))

# compare ncvreg and glmnet
max(abs(resn$beta[-1,] - resg$beta))