#.First.lib <- function(lib, pkg)
.onLoad <- function(lib, pkg)
{
library.dynam("mpath", pkg, lib)
}
#coordinate descent algorithm
#ref: Regularization paths for generalized linear models via coordinate descent, Friedman et al., JSS, 2010, 33(1)
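### Illustrative sketch (not from the package, not executed): the univariate
### elastic-net coordinate update of Friedman et al. (2010). `zj` (weighted inner
### product of x_j with the partial residual) and `xvj` (weighted sum of squares
### of x_j) are hypothetical inputs; the real work below is done in Fortran.
if(FALSE){
    soft <- function(z, g) sign(z) * pmax(abs(z) - g, 0)        # soft-thresholding S(z, g)
    enet_update <- function(zj, xvj, lambda, alpha)
        soft(zj, lambda * alpha) / (xvj + lambda * (1 - alpha)) # updated beta_j
    enet_update(zj = 0.8, xvj = 1, lambda = 0.3, alpha = 1)     # lasso case: 0.5
}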
glmreg <- function(x, ...) UseMethod("glmreg")
glmreg.default <- function(x, ...) {
if (extends(class(x), "Matrix"))
return(glmreg.matrix(x = x, ...))
stop("no method for objects of class ", sQuote(class(x)),
" implemented")
}
glmreg.formula <- function(formula, data, weights, offset=NULL, contrasts=NULL,
x.keep=FALSE, y.keep=TRUE, ...){
## extract x, y, etc from the model formula and frame
#if(!attr(terms(formula, data=data), "intercept"))
# stop("non-intercept model is not implemented")
if(missing(data)) data <- environment(formula)
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "weights",
"offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms") # allow model.frame to have updated it
Y <- model.response(mf, "any") # e.g. factors are allowed
## avoid problems with 1D arrays, but keep names
if(length(dim(Y)) == 1L) {
nm <- rownames(Y)
dim(Y) <- NULL
if(!is.null(nm)) names(Y) <- nm
}
## null model support
X <- if (!is.empty.model(mt)) model.matrix(mt, mf, contrasts) else matrix(,NROW(Y), 0L)
## avoid any problems with 1D or nx1 arrays by as.vector.
weights <- as.vector(model.weights(mf))
if(!length(weights)) weights <- rep(1, nrow(mf))
else if(any(weights < 0)) stop("negative weights not allowed")
if(!is.null(weights) && !is.numeric(weights))
stop("'weights' must be a numeric vector")
if(length(weights) != length(Y))
stop("'weights' must be the same length as response variable")
offset <- as.vector(model.offset(mf))
if(!is.null(offset)) {
if(length(offset) != NROW(Y))
stop(gettextf("number of offsets is %d should equal %d (number of observations)", length(offset), NROW(Y)), domain = NA)
}
### End of addition 08/07/2012
RET <- glmreg_fit(X[,-1], Y, weights=weights, offset=offset,...)
RET$call <- match.call()
RET <- c(RET, list(formula=formula, terms = mt, data=data,
contrasts = attr(X, "contrasts"),
xlevels = .getXlevels(mt, mf)))
if(x.keep) RET$x <- data[,colnames(data)%in%attr(mt, "term.labels")]
if(y.keep) RET$y <- Y
class(RET) <- "glmreg"
RET
}
glmreg.matrix <- function(x, y, weights, offset=NULL, ...){
RET <- glmreg_fit(x, y, weights=weights, offset=offset,...)
RET$call <- match.call()
return(RET)
}
glmreg_fit <- function(x, y, weights, start=NULL, etastart=NULL, mustart=NULL, offset=NULL, nlambda=100, lambda=NULL, lambda.min.ratio=ifelse(nobs<nvars,.05, .001),alpha=1, gamma=3, rescale=TRUE, standardize=TRUE, intercept=TRUE, penalty.factor = rep(1, nvars),thresh=1e-6, eps.bino=1e-5, maxit=1000, eps=.Machine$double.eps, theta, family=c("gaussian", "binomial", "poisson", "negbin"), penalty=c("enet","mnet","snet"), convex=FALSE, x.keep=FALSE, y.keep=TRUE, trace=FALSE){
#if(!is.null(start) && !is.null(etastart) && !is.null(mustart))
#stop("start, etastart and mustart is for testing only\n")
family <- match.arg(family)
penalty <- match.arg(penalty)
if(!family %in% c("gaussian", "binomial", "poisson", "negbin")){
print(family)
stop("'family' not recognizied\n")
}
if(family == "gaussian") rescale <- FALSE
#if(family!="gaussian" && y < 0)
# stop("except for gaussian family, response should be nonnegative")
if (gamma <= 1 && penalty=="mnet") stop("gamma must be greater than 1 for the mnet penalty")
if (gamma <= 2 && penalty=="snet") stop("gamma must be greater than 2 for the snet penalty")
if (alpha < 0 || alpha > 1){
cat("alpha=", alpha)
stop("alpha must be equal or greater than 0; and less than or equal to 1")
}
if(!is.null(etastart) && !is.null(mustart)){
if((length(etastart) != length(mustart)) || length(etastart) != length(y))
stop("length of etastart and mustart should be the same as y\n")
}
## avoid any problems with 1D or nx1 arrays by as.vector.
nm <- dim(x)
nobs <- n <- nm[1]
nvars <- m <- nm[2]
if (is.null(lambda) && is.null(lambda.min.ratio)){
stop("lambda or lambda.min.ratio must be provided\n")
}
if(missing(weights)) weights=rep(1,nobs)
weights <- as.vector(weights)
if(!is.null(weights) && !is.numeric(weights))
stop("'weights' must be a numeric vector")
## check weights and offset
if( !is.null(weights) && any(weights < 0) ){
stop("negative weights not allowed")
}
if (is.null(offset)){
is.offset <- FALSE
offset <- rep.int(0, nobs)
}else is.offset <- TRUE
if(family=="binomial"){
if(is.factor(y))
y <- as.integer(y) - 1
if(any(y > 1) || any(y<0))
stop("response should be between 0 and 1 for family=binomial")
}
if(family=="negbin"){
if(missing(theta))
stop("theta has to be provided for family='negbin'\n")
else if(length(theta) > 1)
stop("theta must be a single scalar value for family='negbin'\n")
else if(theta <= 0){
cat("theta=", theta, "\n")
stop("theta has to be positive for family='negbin'\n")
}
}
if(family != "negbin") theta <- 1 ### this theta is not useful but required as input in Fortran glmlink subroutine
tracetype <- 0
if(trace)
tracetype <- 1
famtype <- switch(family,
"gaussian"=1,
"binomial"=2,
"poisson"=3,
"negbin"=4)
pentype <- switch(penalty,
"enet"=1,
"mnet"=2,
"snet"=3)
stantype <- as.numeric(standardize)
# warning("alp = 1/theta in glm negative.binomial function\n")
if(length(penalty.factor) != nvars) stop("length of penalty.factor should equal the number of predictor variables (nvars)")
if(all(penalty.factor==0)){
lambda <- rep(0, nvars)
penalty.factor <- rep(1, nvars)
}
### ref: McCullagh and Nelder, 2nd edition, 1989, page 121
### compute intercept-only model
if(family!="negbin") tmpres <- glm(y~rep(1, nobs)-1, weights=weights, offset=offset, family=family)
else tmpres <- glm(y~rep(1, nobs)-1, weights=weights, offset=offset, family=negative.binomial(theta))
eta0 <- predict(tmpres, type="link")
mu0 <- predict(tmpres, type="response")
nulldev <- .Fortran("deveval",
n=as.integer(n),
y=as.double(y),
mu=as.double(rep(mu0, n)),
theta=as.double(theta),
weights=as.double(weights),
family=as.integer(famtype),
dev=as.double(0),
PACKAGE="mpath")$dev
### compute the pathwise coordinate descent, cf: section 2.5 in Friedman et al., JSS 2010, 33(1)
if(is.null(weights)) weights <- rep(1, n)
wt <- weights/sum(weights)
if(is.null(mustart) || is.null(etastart) || is.null(lambda)){
eta <- eta0
mu <- mu0
}
else{
mu <- mustart
eta <- etastart
}
if(is.null(lambda)){
w <- .Fortran("glmlink",
n=as.integer(1),
mu=as.double(mu),
family=as.integer(famtype),
theta=as.double(theta),
w = as.double(0),
ep = as.double(eps.bino),
PACKAGE="mpath")$w
z <- .Fortran("zeval",
n=as.integer(n),
y=as.double(y),
eta=as.double(eta),
mu=as.double(rep(mu,n)),
w=as.double(rep(w,n)),
family=as.integer(famtype),
z=as.double(rep(0,n)),
PACKAGE="mpath")$z
z <- z - offset
lmax <- findlam(x=x, y=y, weights=weights, family=family, theta=theta, w=w, z=z, alpha=ifelse(alpha > 0, alpha, 1e-3), penalty.factor=penalty.factor, standardize=standardize)
# if(penalty %in% c("mnet", "snet") && !rescale) lmax <- 0.5 * sqrt(lmax)
lpath <- seq(log(lmax), log(lambda.min.ratio * lmax), length.out=nlambda)
lambda <- exp(lpath)
}
else nlambda <- length(lambda)
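### Illustrative sketch (not executed): the lambda path above is equally spaced on
### the log scale from lmax down to lambda.min.ratio * lmax (Friedman et al. 2010,
### section 2.5); the numbers below are made up.
if(FALSE){
    lmax <- 2; nlambda <- 5; lambda.min.ratio <- 0.01
    exp(seq(log(lmax), log(lambda.min.ratio * lmax), length.out = nlambda))
    # approximately 2.000 0.632 0.200 0.063 0.020
}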
beta <- matrix(0, ncol=nlambda, nrow=m)
b0 <- rep(0, nlambda)
if(is.null(start))
startv <- 0
if(!is.null(start)){
if(length(start) != (m+1))
stop("length of start doesn't match x dimension\n")
else{
startv <- 1
#if(length(start) > 1){
# for(j in 1: nlambda)
# beta[,j] <- start[-1]
#}
#b0 <- rep(start[1], nlambda)
}
}
else{
b0 <- rep(0, nlambda)
start <- rep(0, dim(x)[2]+1)
}
resdev <- rep(0, nlambda)
yhat <- matrix(0, nobs, nlambda)
penfac <- penalty.factor/sum(penalty.factor) * nvars
lam <- penfac %o% lambda
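### Note: `lam` is an nvars x nlambda matrix; entry [j, k] is penalty.factor[j]
### (rescaled so the factors sum to nvars) times lambda[k], i.e. the penalty
### applied to variable j at the k-th lambda value.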
if(family=="gaussian") {
innermaxit <- maxit
maxit <- 1
}
else innermaxit <- 1
wtnew <- weights/sum(weights)
meanx <- drop(wtnew %*% x)
if(standardize){
xx <- scale(x, meanx, FALSE) # centers x
xx <- sqrt(wtnew) * xx
one <- rep(1,n)
normx <- sqrt(one %*% xx^2)
xx <- scale(x, meanx, normx)
}
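### Note: `xx` (the weighted, centered and scaled design) is only used by convexMin()
### below; the raw x is what gets passed to the Fortran routine, which presumably
### handles standardization internally (hence the back-scaling of beta after the call).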
tmp <- .Fortran("outloop",
x=as.double(x),
y=as.double(y),
weights=as.double(weights),
wt=as.double(wt),
n=as.integer(n),
m=as.integer(m),
penalty=as.integer(pentype),
nlambda=as.integer(nlambda),
lam=as.double(lam),
alpha=as.double(alpha),
gam=as.double(gamma),
theta=as.double(theta),
rescale=as.integer(rescale),
mu=as.double(mu),
eta=as.double(eta),
offset=as.double(offset),
family=as.integer(famtype),
standardize=as.integer(stantype),
intercept=as.integer(intercept),
nulldev=as.double(nulldev),
thresh=as.double(thresh),
maxit=as.integer(maxit),
innermaxit=as.integer(innermaxit),
eps=as.double(eps),
trace=as.integer(tracetype),
start=as.double(start),
startv=as.integer(startv),
b=as.double(beta),
bz=as.double(b0),
resdev=as.double(resdev),
ypre=as.double(yhat),
convout=as.integer(rep(0,nlambda)),
satu=as.integer(0),
good=as.integer(nlambda),
ep = as.double(eps.bino),
outpll = as.double(matrix(0, maxit, nlambda)),
PACKAGE="mpath")
if(nlambda > 1 || tmp$satu==0)
good <- 1:tmp$good # only those non-saturated models
else if(tmp$satu==1)
return(RET <- list(satu=1))
### Names
if(is.null(colnames(x))) varnames <- paste("V", 1:ncol(x), sep="")
else varnames <- colnames(x)
beta <- matrix(tmp$b, ncol=nlambda)[,good]
beta <- as.matrix(beta)
b0 <- tmp$bz[good]
### note: pll was from standardized beta values if standardize=TRUE
if(trace)
pll <- matrix(tmp$outpll, ncol=nlambda)
else pll <- NULL
convex.min <- if (convex && all(weights==1)) convexMin(rbind(b0, beta), xx, penalty, gamma, lambda*(1-alpha), family, theta=theta) else NULL
if(convex && any(weights!=1)) warning("code for convex region implemented for weights=1 only\n")
if (standardize){
#if (family != "gaussian" && standardize){
beta <- as.matrix(beta/as.vector(normx))
if(intercept){
b0 <- b0 - crossprod(meanx,beta)
if (family == "gaussian"){
b0 <- mean(y) + b0 ### changed 4/22/2015
b0 <- b0 - mean(offset)
}
}
}
else normx <- NULL
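### Note: the back-transformation above relies on the identity
### scale(x, meanx, normx) %*% b_std + b0_std == x %*% (b_std/normx) + (b0_std - meanx %*% (b_std/normx)),
### so the linear predictor is unchanged when coefficients are mapped back to the original scale.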
resdev <- tmp$resdev[good]
yhat <- matrix(tmp$ypre, ncol=nlambda)[,good]
theta <- rep(theta, nlambda)
if(is.null(dim(beta)))
names(beta) <- varnames
else{
rownames(beta) <- varnames ### use varnames so names are set even when colnames(x) is NULL
colnames(beta) <- round(lambda,digits=4)[good]
}
lambda <- lambda[good]
nlambda <- length(lambda) ### lambda was already subset by good above
b0 <- matrix(b0, ncol=nlambda)
RET <- list(family=family,standardize=standardize, satu=tmp$satu, lambda=lambda, nlambda=nlambda, beta=beta, b0=b0, meanx=meanx, normx=normx, theta=theta[good], nulldev=nulldev, resdev=resdev, pll = pll, fitted.values=yhat, converged=tmp$convout[good], convex.min=convex.min, penalty.factor=penalty.factor, gamma=gamma, alpha=alpha, is.offset=is.offset, offset=offset)
if(x.keep) RET$x <- x
if(y.keep) RET$y <- y
class(RET) <- "glmreg"
RET$twologlik <- try(2*logLik(RET, newx=x, y=y, weights=weights, offset=offset))
###penalized log-likelihood function value for rescaled beta
penval <- rep(NA, nlambda)
for(j in (1:nlambda)){
penval[j] <- .Fortran("penGLM",
start=as.double(beta[,j]),
m=as.integer(m),
lambda=as.double(RET$lambda[j]*penalty.factor),
alpha=as.double(alpha),
gam=as.double(gamma),
penalty=as.integer(pentype),
pen=as.double(0.0),
PACKAGE="mpath")$pen
}
RET$penval <- penval
if(standardize)
RET$pllres <- RET$twologlik/2 - n*penval
else RET$pllres <- RET$twologlik/2 - penval
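### Note: df counts the nonzero coefficients (intercept excluded); with an intercept
### the effective number of parameters is df + 1, so AIC = -2*logLik + 2*(df+1) and
### BIC = -2*logLik + log(n)*(df+1).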
if(nlambda == 1){
RET$df <- sum(abs(beta) > 0) ### df = number of nonzero coefficients (intercept excluded)
}
else{
RET$df <- apply(abs(beta) > 0, 2, sum) ### number of nonzero coefficients for each lambda
}
npar <- if(intercept) 1+RET$df else RET$df ### df must be computed before npar/aic/bic
RET$aic <- -RET$twologlik + 2*npar
RET$bic <- -RET$twologlik + log(n)*npar
RET
}
g <- function(mu, family, eps.bino=1e-5){
switch(family,
"gaussian"={
eta <- mu
},
"binomial"={
ep <- eps.bino
n <- length(mu)
eta <- rep(0, n)
for(i in 1:n){
if(1-mu[i] > ep && mu[i] > ep)
eta[i] <- log(mu[i]/(1-mu[i]))
}
},
"poisson"={
eta <- log(mu)
},
"negbin"={
eta <- log(mu)
}
)
return(eta)
}
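### Hedged example (not executed): g() applies the link on the mean scale, e.g.
### the logit for "binomial" and the log for "poisson"/"negbin".
if(FALSE){
    g(c(0.25, 0.5, 0.75), family = "binomial")   # -1.0986  0.0000  1.0986
    g(2, family = "poisson")                     # log(2) = 0.6931
}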
### eta is the estimated beta_0 in the intercept-only model, need to check 2/29/2020
init <- function(wt, y, offset, family){
mu <- as.vector(wt %*% y) + offset
### is the above mu wrong such that mu < 0?
switch(family,
"gaussian"={
eta <- mu
},
"binomial"={
eta <- log(mu/(1-mu))
},
"poisson"={
eta <- log(pmax(1, mu))
},
"negbin"={ ###Negative binomial regression by Hilbe, page 52, Table 4.1. Note, Table 8.3 has errors.
eta <- log(pmax(1, mu))
}
)
list(mu=mu, eta=eta)
}
findlam <- function(x, y, weights, family, theta=NULL, w, z, alpha, penalty.factor, eps.bino=1e-5, standardize=TRUE){
if(all(penalty.factor==0)) return(0)
if (alpha <= 0 || alpha > 1){
stop("alpha must be greater than 0 and less than or equal to 1, but alpha=", alpha)
}
if(family=="gaussian" || standardize) #changed 8/12/2016
weights <- weights/sum(weights)
penfac <- penalty.factor/sum(penalty.factor) * dim(x)[2]
acvar <- which(penfac > 0)
### regress out the unpenalized variables (those with penalty.factor = 0)
if(length(acvar) < length(penalty.factor)){
xunpen <- x[, -acvar, drop=FALSE] ### keep the unpenalized columns for the residual step below
x <- x[, acvar, drop=FALSE]
penfac <- penfac[acvar]
}
if(family == "negbin" && is.null(theta)){
fit <- glm.nb(y ~ 1, weights = weights)
mu <- fit$fitted.values
w <- .Fortran("glmlink",
n = as.integer(length(y)),
mu = as.double(mu),
family = as.integer(4),
theta = as.double(fit$theta),
w = as.double(rep(0, length(y))),
ep = as.double(eps.bino),
PACKAGE="mpath")$w
resid <- .Fortran("zeval",
n = as.integer(length(y)),
y = as.double(y),
eta = as.double(rep(0, length(y))),
mu = as.double(mu),
w = as.double(w),
family = as.integer(4),
z = as.double(rep(0, length(y))),
PACKAGE="mpath")$z
}
else{
if(length(acvar) < length(penalty.factor))
resid <- lm(z ~ xunpen, weights=weights)$resid
else resid <- z-weighted.mean(z, weights)
}
if(standardize){
xx <- x
meanx <- drop(weights %*% x)
x <- scale(x, meanx, FALSE) # centers x
x <- sqrt(weights) * x
one <- rep(1, length(y))
normx <- sqrt(drop(one %*% (x^2)))
x <- scale(xx, meanx, normx)
lmax <- max(abs(crossprod(x,weights*w*resid))/(penfac*alpha))
}
else
lmax <- max(abs(crossprod(x,weights*w*resid))/(penfac*alpha))
lmax
}
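### Note: findlam() returns the entry point of the regularization path: (an estimate
### of) the smallest lambda at which all penalized coefficients are zero, taken as
### max_j |x_j' W r| / (penfac_j * alpha); cf. section 2.5 of Friedman et al. (2010).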
deviance.glmreg <- function(object, newx, y, weights, na.action=na.pass, ...){
if(missing(newx) && missing(y)) return(object$resdev)
family <- object$family
if(missing(weights)) weights <- rep(1, length(y))
if(!is.null(object$terms)){
mf <- model.frame(delete.response(object$terms), newx, na.action = na.action, xlev = object$xlevels)
newx <- model.matrix(delete.response(object$terms), mf, contrasts = object$contrasts)
newx <- newx[,-1] ### remove the intercept
}
object$terms <- NULL ### newx is now a matrix; the predict method expects a data frame for newx unless terms is NULL, so drop terms here
mu <- predict(object, newx, type="response")
famtype <- switch(family,
"gaussian"=1,
"binomial"=2,
"poisson"=3,
"negbin"=4)
dev <- rep(NA, object$nlambda)
for(j in 1:object$nlambda)
dev[j] <- .Fortran("deveval",
n=as.integer(length(y)),
y=as.double(y),
mu=as.double(mu[,j]),
theta=as.double(object$theta[j]),
weights=as.double(weights),
family= as.integer(famtype),
dev=as.double(0),
PACKAGE="mpath")$dev
return(dev)
}
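### Hedged usage sketch (not from the package, not executed): deviance along the
### path, on the training data and on new data; `dat`, `newdat` and `newy` are
### hypothetical objects.
if(FALSE){
    fit <- glmreg(y ~ ., data = dat, family = "poisson")
    deviance(fit)                          # residual deviance on the training data
    deviance(fit, newx = newdat, y = newy) # deviance evaluated on new data
}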
### convert glm object to class glmreg, thus can be used to predict newdata
conv2glmreg <- function(object, family=c("poisson", "negbin")){
family <- match.arg(family)
if(class(object)=="glm") class(object) <- "glmreg"
object$family <- family
object$nlambda <- 1
namecount <- names(object$coefficients)
object$coefficients <- matrix(object$coefficients, ncol=1)
rownames(object$coefficients) <- namecount
object
}
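### Hedged usage sketch (not executed): wrap a fitted glm as a one-lambda "glmreg"
### object so downstream glmreg methods can be reused; `dat` is hypothetical.
if(FALSE){
    fit <- glm(y ~ x1 + x2, data = dat, family = poisson())
    fit2 <- conv2glmreg(fit, family = "poisson")
    fit2$coefficients   # one-column matrix of coefficients, nlambda = 1
}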