#' @useDynLib xrnet, .registration = TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom stats predict
#' @importFrom bigmemory is.big.matrix
#' @importFrom methods is
NULL
#' Fit hierarchical regularized regression model
#'
#' @description Fits hierarchical regularized regression model that enables the
#' incorporation of external data for predictor variables. Both the predictor
#' variables and external data can be regularized by the most common penalties
#' (lasso, ridge, elastic net). Solutions are computed across a two-dimensional
#' grid of penalties (a separate penalty path is computed for the predictors and
#' external variables). Currently support regularized linear and logistic
#' regression, future extensions to other outcomes (i.e. Cox regression) will be
#' implemented in the next major update.
#'
#' @param x predictor design matrix of dimension \eqn{n x p}, matrix options
#' include:
#' \itemize{
#' \item matrix
#' \item big.matrix
#' \item filebacked.big.matrix
#' \item sparse matrix (dgCMatrix)
#' }
#' @param y outcome vector of length \eqn{n}
#' @param external (optional) external data design matrix of dimension
#' \eqn{p x q},
#' matrix options include:
#' \itemize{
#' \item matrix
#' \item sparse matrix (dgCMatrix)
#' }
#' @param unpen (optional) unpenalized predictor design matrix, matrix options
#' include:
#' \itemize{
#' \item matrix
#' }
#' @param family error distribution for outcome variable, options include:
#' \itemize{
#' \item "gaussian"
#' \item "binomial"
#' }
#' @param penalty_main specifies regularization object for x. See
#' \code{\link{define_penalty}} for more details.
#' @param penalty_external specifies regularization object for external. See
#' \code{\link{define_penalty}} for more details.
#' @param weights optional vector of observation-specific weights. Default is 1
#' for all observations.
#' @param standardize indicates whether x and/or external should be
#' standardized. Default is c(TRUE, TRUE).
#' @param intercept indicates whether an intercept term is included for x
#' and/or external. Default is c(TRUE, FALSE).
#' @param control specifies xrnet control object. See
#' \code{\link{xrnet_control}} for more details.
#'
#' @details This function extends the coordinate descent algorithm of the
#' R package \code{glmnet} to allow the type of regularization (i.e. ridge,
#' lasso) to be feature-specific. This extension is used to enable fitting
#' hierarchical regularized regression models, where external information for
#' the predictors can be included in the \code{external=} argument. In addition,
#' elements of the R package \code{biglasso} are utilized to enable the use of
#' standard R matrices, memory-mapped matrices from the \code{bigmemory}
#' package, or sparse matrices from the \code{Matrix} package.
#'
#' @references
#' Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010).
#' Regularization Paths for Generalized Linear Models via Coordinate Descent.
#' Journal of Statistical Software, 33(1), 1-22. URL
#' http://www.jstatsoft.org/v33/i01/.
#'
#' @references
#' Zeng, Y., and Breheny, P. (2017).
#' The biglasso Package: A Memory- and Computation-Efficient Solver for Lasso
#' Model Fitting with Big Data in R. arXiv preprint arXiv:1701.05936. URL
#' https://arxiv.org/abs/1701.05936.
#'
#' @references
#' Michael J. Kane, John Emerson, Stephen Weston (2013).
#' Scalable Strategies for Computing with Massive Data.
#' Journal of Statistical Software, 55(14), 1-19. URL
#' http://www.jstatsoft.org/v55/i14/.
#'
#' @return A list of class \code{xrnet} with components:
#' \item{beta0}{matrix of first-level intercepts indexed by penalty values}
#' \item{betas}{3-dimensional array of first-level penalized coefficients
#' indexed by penalty values}
#' \item{gammas}{3-dimensional array of first-level non-penalized coefficients
#' indexed by penalty values}
#' \item{alpha0}{matrix of second-level intercepts indexed by penalty values}
#' \item{alphas}{3-dimensional array of second-level external data coefficients
#' indexed by penalty values}
#' \item{penalty}{vector of first-level penalty values}
#' \item{penalty_ext}{vector of second-level penalty values}
#' \item{family}{error distribution for outcome variable}
#' \item{num_passes}{total number of passes over the data in the coordinate
#' descent algorithm}
#' \item{status}{error status for xrnet fitting}
#' \itemize{
#' \item 0 = OK
#' \item 1 = Error/Warning
#' }
#' \item{error_msg}{description of error}
#'
#' @examples
#' ### hierarchical regularized linear regression ###
#' data(GaussianExample)
#'
#' ## define penalty for predictors and external variables
#' ## default is ridge for predictors and lasso for external
#' ## see define_penalty() function for more details
#'
#' penMain <- define_penalty(0, num_penalty = 20)
#' penExt <- define_penalty(1, num_penalty = 20)
#'
#' ## fit model with defined regularization
#' fit_xrnet <- xrnet(
#' x = x_linear,
#' y = y_linear,
#' external = ext_linear,
#' family = "gaussian",
#' penalty_main = penMain,
#' penalty_external = penExt
#' )
#' @export
xrnet <- function(x,
y,
external = NULL,
unpen = NULL,
family = c("gaussian", "binomial"),
penalty_main = define_penalty(),
penalty_external = define_penalty(),
weights = NULL,
standardize = c(TRUE, TRUE),
intercept = c(TRUE, FALSE),
control = list()) {
this_call <- match.call()
family <- match.arg(family)
# check type of x matrix
if (is(x, "matrix")) {
if (typeof(x) != "double") {
stop("x must be of type double")
}
mattype_x <- 1
}
else if (is.big.matrix(x)) {
if (bigmemory::describe(x)@description$type != "double") {
stop("x must be of type double")
}
mattype_x <- 2
} else if ("dgCMatrix" %in% class(x)) {
if (typeof(x@x) != "double") {
stop("x must be of type double")
}
mattype_x <- 3
} else {
stop(
"x must be a standard R matrix,
big.matrix, filebacked.big.matrix, or dgCMatrix"
)
}
y <- as.double(drop(y))
# check dimensions of x and y
nr_x <- NROW(x)
nc_x <- NCOL(x)
y_len <- NROW(y)
if (y_len != nr_x) {
stop(
paste(
"Length of y (", y_len,
") not equal to the number of rows of x (", nr_x, ")",
sep = ""
)
)
}
## Prepare external ##
is_sparse_ext <- FALSE
if (!is.null(external)) {
# check if external is a sparse matrix
if (is(external, "sparseMatrix")) {
is_sparse_ext <- TRUE
} else {
# convert to matrix
if (!("matrix" %in% class(external))) {
external <- as.matrix(external)
}
if (typeof(external) != "double") {
stop("external must be of type double")
}
}
nr_ext <- NROW(external)
nc_ext <- NCOL(external)
if (nc_x != nr_ext) {
stop(
paste("Number of columns in x (", nc_x,
") not equal to the number of rows in external (", nr_ext,
")",
sep = ""
)
)
}
} else {
external <- matrix(vector("numeric", 0), 0, 0)
nr_ext <- as.integer(0)
nc_ext <- as.integer(0)
}
## Prepare unpenalized covariates ##
if (!is.null(unpen)) {
# check dimensions
nc_unpen <- NCOL(unpen)
if (y_len != NROW(unpen)) {
stop(
paste(
"Length of y (", y_len,
") not equal to the number of rows of unpen (", NROW(unpen),
")",
sep = ""
)
)
}
if (!("matrix" %in% class(unpen))) {
unpen <- as.matrix(unpen)
}
if (typeof(unpen) != "double") {
stop("unpen must be a numeric matrix of type 'double'")
}
} else {
unpen <- matrix(vector("numeric", 0), 0, 0)
nc_unpen <- as.integer(0)
}
# set weights
if (is.null(weights)) {
weights <- as.double(rep(1, nr_x))
} else if (length(weights) != y_len) {
stop(
paste(
"Length of weights (", length(weights),
") not equal to length of y (", y_len,
")",
sep = ""
)
)
} else if (any(weights < 0)) {
stop("weights can only contain non-negative values")
} else {
weights <- as.double(weights)
}
penalty <- initialize_penalty(
penalty_main = penalty_main,
penalty_external = penalty_external,
nr_x = nr_x,
nc_x = nc_x,
nc_unpen = nc_unpen,
nr_ext = nr_ext,
nc_ext = nc_ext,
intercept = intercept
)
control <- do.call("xrnet_control", control)
control <- initialize_control(
control_obj = control,
nc_x = nc_x,
nc_unpen = nc_unpen,
nc_ext = nc_ext,
intercept = intercept
)
fit <- fitModelRcpp(
x = x,
mattype_x = mattype_x,
y = y,
ext = external,
is_sparse_ext = is_sparse_ext,
fixed = unpen,
weights_user = weights,
intr = intercept,
stnd = standardize,
penalty_type = penalty$ptype,
cmult = penalty$cmult,
quantiles = c(penalty$quantile, penalty$quantile_ext),
num_penalty = c(penalty$num_penalty, penalty$num_penalty_ext),
penalty_ratio = c(penalty$penalty_ratio, penalty$penalty_ratio_ext),
penalty_user = penalty$user_penalty,
penalty_user_ext = penalty$user_penalty_ext,
lower_cl = control$lower_limits,
upper_cl = control$upper_limits,
family = family,
thresh = control$tolerance,
maxit = control$max_iterations,
ne = control$dfmax,
nx = control$pmax
)
# check status of model fit
if (fit$status %in% c(0, 1)) {
if (fit$status == 0) {
fit$status <- "0 (OK)"
}
else if (fit$status == 1) {
fit$status <- "1 (Error/Warning)"
fit$error_msg <- "Max number of iterations reached"
warning("Max number of iterations reached")
}
# Create arrays ordering coefficients by 1st level / 2nd level penalty
fit$beta0 <- matrix(
fit$beta0,
nrow = penalty$num_penalty,
ncol = penalty$num_penalty_ext,
byrow = TRUE
)
dim(fit$betas) <- c(nc_x, penalty$num_penalty_ext, penalty$num_penalty)
fit$betas <- aperm(fit$betas, c(1, 3, 2))
if (intercept[2]) {
fit$alpha0 <- matrix(
fit$alpha0,
nrow = penalty$num_penalty,
ncol = penalty$num_penalty_ext, byrow = TRUE
)
} else {
fit$alpha0 <- NULL
}
if (nc_ext > 0) {
dim(fit$alphas) <- c(nc_ext, penalty$num_penalty_ext, penalty$num_penalty)
fit$alphas <- aperm(fit$alphas, c(1, 3, 2))
} else {
fit$alphas <- NULL
fit$penalty_ext <- NULL
}
if (nc_unpen > 0) {
dim(fit$gammas) <- c(
nc_unpen, penalty$num_penalty_ext, penalty$num_penalty
)
fit$gammas <- aperm(fit$gammas, c(1, 3, 2))
} else {
fit$gammas <- NULL
}
}
fit$call <- this_call
class(fit) <- "xrnet"
return(fit)
}
#' Control function for xrnet fitting
#'
#' @description Control function for \code{\link{xrnet}} fitting.
#'
#' @param tolerance positive convergence criterion. Default is 1e-08.
#' @param max_iterations maximum number of iterations to run coordinate
#' gradient descent across all penalties before returning an error.
#' Default is 1e+05.
#' @param dfmax maximum number of variables allowed in model. Default is
#' \eqn{ncol(x) + ncol(unpen) + ncol(external) + intercept[1] + intercept[2]}.
#' @param pmax maximum number of variables with nonzero coefficient estimate.
#' Default is \eqn{min(2 * dfmax + 20, ncol(x) + ncol(unpen) + ncol(external)
#' + intercept[2])}.
#' @param lower_limits vector of lower limits for each coefficient. Default is
#' -Inf for all variables.
#' @param upper_limits vector of upper limits for each coefficient. Default is
#' Inf for all variables.
#'
#' @return A list object with the following components:
#' \item{tolerance}{The coordinate descent stopping criterion.}
#' \item{dfmax}{The maximum number of variables that will be allowed in the
#' model.}
#' \item{pmax}{The maximum number of variables with nonzero coefficient
#' estimate.}
#' \item{lower_limits}{Feature-specific numeric vector of lower bounds for
#' coefficient estimates}
#' \item{upper_limits}{Feature-specific numeric vector of upper bounds for
#' coefficient estimates}
#' @export
xrnet_control <- function(tolerance = 1e-08,
max_iterations = 1e+05,
dfmax = NULL,
pmax = NULL,
lower_limits = NULL,
upper_limits = NULL) {
if (tolerance <= 0) {
stop("tolerance must be greater than 0")
}
if (max_iterations <= 0 || as.integer(max_iterations) != max_iterations) {
stop("max_iterations must be a positive integer")
}
control_obj <- list(
tolerance = tolerance,
max_iterations = max_iterations,
dfmax = dfmax,
pmax = pmax,
lower_limits = lower_limits,
upper_limits = upper_limits
)
}
initialize_penalty <- function(penalty_main,
penalty_external,
nr_x,
nc_x,
nc_unpen,
nr_ext,
nc_ext,
intercept) {
names(penalty_external) <- c(
"penalty_type_ext",
"quantile_ext",
"num_penalty_ext",
"penalty_ratio_ext",
"user_penalty_ext",
"custom_multiplier_ext"
)
penalty_obj <- c(penalty_main, penalty_external)
# check penalty object for x
if (length(penalty_obj$penalty_type) > 1) {
if (length(penalty_obj$penalty_type) != nc_x) {
stop(
"Length of penalty_type (",
length(penalty_obj$penalty_type),
") not equal to number of columns in x (",
nc_x, ")"
)
}
} else {
penalty_obj$penalty_type <- rep(penalty_obj$penalty_type, nc_x)
}
if (is.null(penalty_obj$penalty_ratio)) {
if (penalty_obj$user_penalty[1] == 0) {
if (nr_x > nc_x) {
penalty_obj$penalty_ratio <- 1e-04
} else {
penalty_obj$penalty_ratio <- 0.01
}
if (penalty_obj$num_penalty < 3) {
penalty_obj$num_penalty <- 3
stop("num_penalty must be at least 3
when automatically computing penalty path")
}
} else {
penalty_obj$user_penalty <- rev(sort(penalty_obj$user_penalty))
penalty_obj$penalty_ratio <- 0.0
}
}
if (is.null(penalty_obj$custom_multiplier)) {
penalty_obj$custom_multiplier <- rep(1.0, nc_x)
} else if (length(penalty_obj$custom_multiplier) != nc_x) {
stop(
"Length of custom_multiplier (",
length(penalty_obj$custom_multiplier),
") not equal to number of columns in x (",
nc_x, ")"
)
}
# check penalty object for external
if (nc_ext > 0) {
if (length(penalty_obj$penalty_type_ext) > 1) {
if (length(penalty_obj$penalty_type_ext) != nc_ext) {
stop(
"Length of penalty_type_ext (",
length(penalty_obj$penalty_type_ext),
") not equal to number of columns in external (",
nc_ext,
")"
)
}
} else {
penalty_obj$penalty_type_ext <- rep(penalty_obj$penalty_type_ext, nc_ext)
}
if (is.null(penalty_obj$penalty_ratio_ext)) {
if (penalty_obj$user_penalty_ext[1] == 0) {
if (nr_ext > nc_ext) {
penalty_obj$penalty_ratio_ext <- 1e-04
} else {
penalty_obj$penalty_ratio_ext <- 0.01
}
if (penalty_obj$num_penalty_ext < 3) {
penalty_obj$num_penalty_ext <- 3
stop("num_penalty_ext must be at least
3 when automatically computing penalty path")
}
} else {
penalty_obj$user_penalty_ext <- rev(sort(penalty_obj$user_penalty_ext))
penalty_obj$penalty_ratio_ext <- 0.0
}
}
if (is.null(penalty_obj$custom_multiplier_ext)) {
penalty_obj$custom_multiplier_ext <- rep(1.0, nc_ext)
} else if (
length(penalty_obj$custom_multiplier_ext) != nc_ext && nc_ext > 0
) {
stop(
"Length of custom_multiplier_ext (",
length(penalty_obj$custom_multiplier_ext),
") not equal to number of columns in external (",
nc_ext, ")"
)
}
} else {
penalty_obj$penalty_type_ext <- NULL
penalty_obj$num_penalty_ext <- 1
penalty_obj$penalty_ratio_ext <- 0
penalty_obj$custom_multiplier_ext <- numeric(0)
}
# vectors holding penalty type and multipliers across all variables
if (intercept[2]) {
penalty_obj$ptype <- c(
penalty_obj$penalty_type,
rep(0.0, nc_unpen),
0.0,
penalty_obj$penalty_type_ext
)
penalty_obj$cmult <- c(
penalty_obj$custom_multiplier,
rep(0.0, nc_unpen),
0.0,
penalty_obj$custom_multiplier_ext
)
} else {
penalty_obj$ptype <- c(
penalty_obj$penalty_type,
rep(0.0, nc_unpen),
penalty_obj$penalty_type_ext
)
penalty_obj$cmult <- c(
penalty_obj$custom_multiplier,
rep(0.0, nc_unpen),
penalty_obj$custom_multiplier_ext
)
}
return(penalty_obj)
}
initialize_control <- function(control_obj,
nc_x,
nc_unpen,
nc_ext,
intercept) {
if (is.null(control_obj$dfmax)) {
control_obj$dfmax <- as.integer(
nc_x + nc_ext + nc_unpen + intercept[1] + intercept[2]
)
} else if (
control_obj$dfmax <= 0 || as.integer(control_obj$dfmax) != control_obj$dfmax
) {
stop("dfmax can only contain postive integers")
}
if (is.null(control_obj$pmax)) {
control_obj$pmax <- as.integer(
min(2 * control_obj$dfmax + 20, nc_x + nc_ext + nc_unpen + intercept[2])
)
} else if (
control_obj$pmax <= 0 || as.integer(control_obj$pmax) != control_obj$pmax
) {
stop("pmax can only contain positive integers")
}
if (is.null(control_obj$lower_limits)) {
control_obj$lower_limits <- rep(
-Inf, nc_x + nc_ext + nc_unpen + intercept[2]
)
} else if (length(control_obj$lower_limits) != nc_x + nc_ext + nc_unpen) {
stop(
"Length of lower_limits (",
length(control_obj$lower_limits),
") not equal to sum of number of columns in x, unpen, and external (",
nc_x + nc_ext + nc_unpen, ")"
)
} else if (intercept[2]) {
control_obj$lower_limits <- c(
control_obj$lower_limits[1:(nc_x + nc_unpen)],
-Inf,
control_obj$lower_limits[
(nc_x + nc_unpen + 1):length(control_obj$lower_limits)
]
)
}
if (is.null(control_obj$upper_limits)) {
control_obj$upper_limits <- rep(
Inf, nc_x + nc_ext + nc_unpen + intercept[2]
)
} else if (length(control_obj$upper_limits) != nc_x + nc_ext + nc_unpen) {
stop(
"Length of upper_limits (",
length(control_obj$upper_limits),
") not equal to sum of number of columns in x, unpen, and external (",
nc_x + nc_ext + nc_unpen, ")"
)
} else if (intercept[2]) {
control_obj$upper_limits <- c(
control_obj$upper_limits[1:(nc_x + nc_unpen)],
-Inf,
control_obj$upper_limits[
(nc_x + nc_unpen + 1):length(control_obj$upper_limits)
]
)
}
return(control_obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.