# R/rgam.R In relgam: Reluctant Generalized Additive Models

#### Documented in rgam

#' Fit reluctant generalized additive model
#'
#' Fits a reluctant generalized additive model (RGAM) for an entire regularization
#' path indexed by the parameter \code{lambda}. Fits linear, logistic, Poisson
#' and Cox regression models. RGAM is a three-step algorithm: Step 1 fits the
#' lasso and computes residuals, Step 2 constructs the non-linear features, and
#' Step 3 fits a lasso of the response on both the linear and non-linear features.
#'
#' If there are variables which the user definitely wants to compute non-linear
#' versions for in Step 2 of the algorithm, they can be specified as a vector for
#' the \code{init_nz} option. The algorithm will compute non-linear versions for
#' these features as well as the features suggested by Step 1 of the algorithm.
#'
#' If \code{standardize = TRUE}, the standard deviation of the linear and
#' non-linear features would be 1 and gamma respectively. If
#' \code{standardize = FALSE}, linear features will remain on their original
#' scale while non-linear features would have standard deviation gamma times
#' the mean standard deviation of the linear features.
#'
#' For \code{family="gaussian"}, \code{rgam} standardizes \code{y} to have unit
#' variance (using \code{1/n} rather than \code{1/(n-1)} formula).
#'
#' @param x Input matrix, of dimension \code{nobs x nvars}; each row is
#' an observation vector.
#' @param y Response variable. Quantitative for \code{family = "gaussian"} or
#' \code{family = "poisson"} (non-negative counts). For \code{family="binomial"},
#' should be a numeric vector consisting of 0s and 1s. For \code{family="cox"},
#' y should be a two-column matrix with columns named 'time' and 'status'.
#' The latter is a binary variable, with '1' indicating death, and '0'
#' indicating right-censored.
#' @param lambda A user-supplied \code{lambda} sequence. Typical usage is to
#' have the program compute its own \code{lambda} sequence; supplying a value of
#' lambda overrides this.
#' @param lambda.min.ratio Smallest value for lambda as a fraction of the
#' largest lambda value. The default depends on the sample size nobs relative to
#' the number of variables nvars. If nobs > nvars, the default is 0.0001,
#' close to zero. If nobs < nvars, the default is 0.01.
#' @param standardize If \code{TRUE} (default), the columns of the input matrix
#' are standardized before the algorithm is run. See details section for more
#' information.
#' @param family Response type. Either \code{"gaussian"} (default) for linear
#' regression, \code{"binomial"} for logistic regression, \code{"poisson"} for
#' Poisson regression or \code{"cox"} for Cox regression.
#' @param offset A vector of length nobs. Useful for the "poisson" family (e.g.
#' log of exposure time), or for refining a model by starting at a current fit.
#' Default is NULL. If supplied, then values must also be supplied to the predict
#' function.
#' @param init_nz A vector specifying which features we must include when
#' computing the non-linear features. Default is to construct non-linear features
#' for all given features.
#' @param removeLin When constructing the non-linear features, do we remove
#' the linear component from them? Default is \code{TRUE}.
#' @param nfolds Number of folds for CV in Step 1 (default is 5). Although
#' \code{nfolds} can be as large as the sample size (leave-one-out CV), it is
#' not recommended for large datasets. Smallest value allowable is \code{nfolds = 3}.
#' @param foldid An optional vector of values between 1 and \code{nfolds}
#' identifying what fold each observation is in. If supplied, \code{nfolds} can
#' be missing.
#' @param df Degrees of freedom for the non-linear fit in Step 2. Default is 4.
#' @param gamma Scale factor for non-linear features (vs. original features), to
#' be between 0 and 1. Default is 0.8 if \code{init_nz = c()}, 0.6 otherwise.
#' @param tol Parameter to be passed to \code{smooth.spline}: a tolerance for
#' same-ness or uniqueness of the x values. Default is 0.01. See
#' \code{smooth.spline} documentation for more details.
#' @param parallel If TRUE, the \code{cv.glmnet()} call in Step 1 is
#' parallelized. Must register parallel before hand, such as doMC or others.
#' Default is FALSE.
#' @param verbose If \code{TRUE} (default), model-fitting is tracked with a
#' progress bar.
#'
#' @return An object of class \code{"rgam"}.
#' \item{full_glmfit}{The glmnet object resulting from Step 3: fitting a \code{glmnet}
#' model for the response against the linear & non-linear features.}
#' \item{spline_fit}{List of spline fits for residual against each response.
#' Needed for predicting on new data.}
#' \item{lin_comp_fit}{If \code{removeLin = TRUE}, a list of coefficients for
#' simple linear regression of non-linear feature on original feature. Needed
#' for predicting on new data.}
#' \item{init_nz}{Column indices for the features which we allow to have
#' non-linear relationship with the response.}
#' \item{step1_nz}{Indices of features which CV in Step 1 chose.}
#' \item{removeLin}{Did we remove the linear components when constructing the
#' non-linear features? Needed for predicting on new data.}
#' \item{mxf}{Means of the features (both linear and non-linear).}
#' \item{sxf}{Scale factors of the features (both linear and non-linear).}
#' \item{feat}{Column indices of the non-zero features for each value of
#' \code{lambda}.}
#' \item{linfeat}{Column indices of the non-zero linear components for each value of
#' \code{lambda}.}
#' \item{nonlinfeat}{Column indices of the non-zero non-linear components for each value
#' of \code{lambda}.}
#' \item{nzero_feat}{The number of non-zero features for each value of
#' \code{lambda}.}
#' \item{nzero_lin}{The number of non-zero linear components for each value of
#' \code{lambda}.}
#' \item{nzero_nonlin}{The number of non-zero non-linear components for each value
#' of \code{lambda}.}
#' \item{lambda}{The actual sequence of \code{lambda} values used.}
#' \item{p}{The number of features in the original data matrix.}
#' \item{family}{Response type.}
#' \item{call}{The call that produced this object.}
#'
#' @examples
#' set.seed(1)
#' n <- 100; p <- 20
#' x <- matrix(rnorm(n * p), n, p)
#' beta <- matrix(c(rep(2, 5), rep(0, 15)), ncol = 1)
#' y <- x %*% beta + rnorm(n)
#'
#' fit <- rgam(x, y)
#'
#' # construct non-linear features for only those selected by Step 1
#' fit <- rgam(x, y, init_nz = c())
#'
#' # specify scale factor gamma and degrees of freedom
#' fit <- rgam(x, y, gamma = 1, df = 6)
#'
#' # binomial family
#' bin_y <- ifelse(y > 0, 1, 0)
#' fit2 <- rgam(x, bin_y, family = "binomial")
#'
#' # Poisson family
#' poi_y <- rpois(n, exp(x %*% beta))
#' fit3 <- rgam(x, poi_y, family = "poisson")
#' # Poisson with offset
#' offset <- rnorm(n)
#' fit3 <- rgam(x, poi_y, family = "poisson", offset = offset)
#'
#' @import utils
#' @export
rgam <- function(x, y, lambda = NULL, lambda.min.ratio = ifelse(nrow(x) < ncol(x),
0.01, 1e-04), standardize = TRUE,
family = c("gaussian","binomial", "poisson", "cox"), offset = NULL,
init_nz, removeLin = TRUE, nfolds = 5, foldid = NULL,
df = 4, gamma, tol = 0.01, parallel = FALSE, verbose = TRUE) {
this.call <- match.call()

n <- nrow(x); p <- ncol(x)
if (is.null(offset)) {
offset <- y * 0
is.offset = FALSE
} else {
is.offset = TRUE
}

family <- match.arg(family)
if (family == "binomial" && any(!(unique(y) %in% c(0, 1)))) {
stop("If family is binomial, y can only contain 0s and 1s")
}
if (family == "poisson" && any(y < 0)) {
stop("If family is Poisson, y values cannot be negative")
}

# specify init_nz and gamma
if (missing(init_nz)) {
message("init_nz not specified: setting to default (all features)")
init_nz <- 1:p
}
if (missing(gamma)) {
if (length(init_nz) == 0) {
message("using default value of gamma for RGAM_SEL: 0.8")
gamma <- 0.8
} else {
message("using default value of gamma for RGAM: 0.6")
gamma <- 0.6
}
}

# center (and scale) columns of x and y
mx <- colMeans(x)
if (standardize) {
sx <- apply(x, 2, sd) * sqrt((n-1) / n)
} else {
sx <- rep(1, ncol(x))
}
x <- scale(x, mx, sx)
if (family == "gaussian") {
my <- mean(y); sy <- sd(y) * sqrt((n-1) / n)
y <- scale(y, my, sy)
}

# remove features with sx == 0 from consideration
exclude <- which(sx == 0)
x[, exclude] <- 0

# if fold IDs not given, randomly create them
# if given, count the number of folds
if (is.null(foldid)) {
foldid <- sample(rep(seq(nfolds), length = n))
} else {
nfolds <- length(unique(foldid))
}
if (nfolds < 3) {
stop("nfolds must be 3 or larger; nfolds = 5 recommended")
}

# start progress bar
if (verbose)  pb <- utils::txtProgressBar(min = 0, max = 3, style = 3)

# Step 1: Get residuals after "best linear fit"
cv <- glmnet::cv.glmnet(x, y, standardize = FALSE, foldid = foldid,
family = family, parallel = parallel, exclude = exclude,
lambda.min.ratio = lambda.min.ratio, offset = offset)

if (family %in% c("gaussian", "binomial", "poisson")) {
yhat0 <- predict(cv, x, s = cv$lambda.min, type = "response", newoffset = offset) r <- y - yhat0 } else if (family == "cox") { yhat0 <- predict(cv, x, s = cv$lambda.min, type = "link", newoffset = offset)
}

# get index of features which have non-zero coefficients for step 2
step1_nz <- predict(cv, s = cv$lambda.min, type="nonzero") # to handle special case of nz_feat being empty if (is.null(dim(step1_nz))) { step1_nz <- numeric() } else { step1_nz <- step1_nz[, 1] } init_nz <- sort(unique(c(step1_nz, init_nz))) # update progress bar if (verbose) utils::setTxtProgressBar(pb, 1) # Step 2: make the non-linear features x out <- makef(x[, init_nz, drop = F], r, df, tol = tol, removeLin = removeLin) f <- out$f

# standardize non-linear features
mf <- colMeans(f)
if (standardize) {
sf <- apply(f, 2, sd) * sqrt((n-1) / n) / gamma
} else {
sf <- (apply(f, 2, sd) / mean(apply(x, 2, sd))) / gamma
}
f <- scale(f, mf, sf)

# update the mean and sd vectors, and exclude
mxf <- c(mx, mf)
sxf <- c(sx, sf)
exclude <- c(exclude, p + which(sf == 0))
f[, which(sf == 0)] <- 0

# update progress bar
if (verbose) utils::setTxtProgressBar(pb, 2)

# Step 3: fit lasso with both linear and non-linear features
full_glmfit <- glmnet::glmnet(cbind(x, f), y, lambda = lambda, standardize = FALSE,
family = family, exclude = exclude,
lambda.min.ratio = lambda.min.ratio, offset = offset)

# rescale intercept values and beta values
beta <- matrix(full_glmfit$beta, nrow = p + length(init_nz)) if (family == "gaussian") { full_glmfit$a0 <- full_glmfit$a0 * sy - colSums(beta * mxf / sxf) * sy + my full_glmfit$beta <- t(scale(t(beta), F, sxf)) * sy
} else {
full_glmfit$a0 <- full_glmfit$a0 - colSums(beta * mxf / sxf)
full_glmfit$beta <- t(scale(t(beta), F, sxf)) } # get no. of nonzeros for linear and non-linear components nz <- predict(full_glmfit, type = "nonzero") linfeat <- lapply(nz, function(x) x[x <= p]) nonlinfeat <- lapply(nz, function(x) init_nz[x[x > p] - p]) nzero_lin <- unlist(lapply(linfeat, length)) nzero_nonlin <- unlist(lapply(nonlinfeat, length)) # figure out no. of nonzero features (whether linear or non-linear) feat <- lapply(1:length(nz), function(i) unique(c(linfeat[[i]], nonlinfeat[[i]]))) nzero_feat <- unlist(lapply(feat, length)) full_glmfit$offset <- is.offset

# update progress bar
if (verbose) utils::setTxtProgressBar(pb, 3)

out <- list(full_glmfit = full_glmfit, spline_fit = out$spline_fit, lin_comp_fit = out$lin_comp_fit,
init_nz = init_nz, step1_nz = step1_nz, removeLin = removeLin,
mxf = mxf, sxf = sxf,
feat = feat, linfeat = linfeat, nonlinfeat = nonlinfeat,
nzero_feat = nzero_feat, nzero_lin = nzero_lin,
nzero_nonlin = nzero_nonlin, lambda = full_glmfit\$lambda,
p = p, family = family, call = this.call)
class(out) <- "rgam"
return(out)
}


## Try the relgam package in your browser

Any scripts or data that you put into this service are public.

relgam documentation built on Jan. 13, 2020, 5:06 p.m.