Nothing
#' Fit a model with principal components lasso
#'
#' Fit a model using the principal components lasso for an entire regularization
#' path indexed by the parameter \code{lambda}. Fits linear and logistic regression
#' models.
#'
#' The objective function for \code{"gaussian"} is
#' \deqn{1/2 RSS/nobs + \lambda*||\beta||_1 + \theta/2 \sum quadratic
#' penalty for group k,}
#' where the sum is over the feature groups \eqn{1, ..., K}. The objective function
#' for \code{"binomial"} is
#' \deqn{-loglik/nobs + \lambda*||\beta||_1 + \theta/2 \sum quadratic
#' penalty for group k.}
#'
#' \code{pcLasso} can handle overlapping groups. In this case, the original
#' \code{x} matrix is expanded to a \code{nobs x p_1+...+p_K} matrix (where
#' \code{p_k} is the number of features in group k) such that columns
#' \code{p_1+...+p_{k-1}+1} to \code{p_1+...+p_k} represent the feature matrix for
#' group k. \code{pcLasso} returns the model coefficients for both the expanded
#' feature space and the original feature space.
#'
#' One needs to specify the strength of the quadratic penalty either by
#' specifying \code{ratio}, which is the ratio of shrinkage between the second
#' and first principal components in the absence of the \eqn{\ell_1} penalty,
#' or by specifying the multiplier \code{theta}. \code{ratio} is unitless and is
#' more convenient.
#'
#' \code{pcLasso} always mean centers the columns of the \code{x} matrix. If
#' \code{standardize=TRUE}, \code{pcLasso} will also scale the columns to have
#' standard deviation 1. In all cases, the \code{beta} coefficients returned are
#' for the original \code{x} values (i.e. uncentered and unscaled).
#'
#' @param x Input matrix, of dimension \code{nobs} x \code{nvars}; each row is
#' an observation vector.
#' @param y Response variable. Quantitative for \code{family = "gaussian"}. For
#' \code{family="binomial"}, should be a numeric vector consisting of 0s and 1s.
#' @param w Observation weights. Default is 1 for each observation.
#' @param family Response type. Either \code{"gaussian"} (default) for linear
#' regression or \code{"binomial"} for logistic regression.
#' @param ratio Ratio of shrinkage between the second and first principal components
#' in the absence of the \eqn{\ell_1} penalty. More convenient way to specify the
#' strength of the quadratic penalty. A value between 0 and 1 (only 1 included).
#' \code{ratio = 1} corresponds to the lasso, 0.5-0.9 are good values to use.
#' Default is \code{NULL}. Exactly one of \code{ratio} or \code{theta} must be
#' specified.
#' @param theta Multiplier for the quadratic penalty: a non-negative real number.
#' \code{theta = 0} corresponds to the lasso, and larger \code{theta} gives
#' strong shrinkage toward the top principal components. Default is \code{NULL}.
#' Exactly one of \code{ratio} or \code{theta} must be specified.
#' @param groups A list describing which features belong in each group. The
#' length of the list should be equal to the number of groups, with
#' \code{groups[[k]]} being a vector of feature/column numbers which belong to
#' group k. Each feature must be assigned to at least one group. Features can
#' belong to more than one group. By default, all the features belong to a
#' single group.
#' @param lambda.min.ratio Smallest value for \code{lambda}, as a fraction of the
#' largest \code{lambda} value. The default depends on the sample size \code{nobs}
#' relative to the number of variables \code{nvars}. If \code{nobs >= nvars},
#' the default is \code{0.0001}, close to zero. If \code{nobs < nvars}, the default is
#' \code{0.01}. This is only used when the user does not specify a \code{lambda}
#' sequence.
#' @param nlam Number of \code{lambda} values; default is 100.
#' @param lambda A user supplied \code{lambda} sequence. Typical usage is to
#' have the program compute its own \code{lambda} sequence based on \code{nlam}
#' and \code{lambda.min.ratio}; supplying a value of lambda overrides this.
#' @param standardize If \code{TRUE}, the columns of the feature matrices are
#' standardized before the algorithm is run. Default is \code{FALSE}.
#' @param SVD_info A list containing SVD information. Usually this should not
#' be specified by the user: the function will compute it on its own by default.
#' \code{SVD_info} is a list with three elements:
#' \enumerate{
#' \item \code{aa}: A row-wise concatenation of the k quadratic penalty matrices.
#' \item \code{d}: A list where the kth element is the vector of squared singular
#' values of the input matrix corresponding to group k.
#' \item \code{dd}: A list where the kth element is the vector of
#' \eqn{d_{k1}^2 - d_{kj}^2} for the input matrix corresponding to group k.
#' }
#' Since the initial SVD of \code{x} can be the largest part of the overall
#' computation, we allow the user to compute it once and then re-use. See
#' example below.
#' @param nv Number of singular vectors to use in the singular value
#' decompositions. If not specified, the full SVD is used.
#' @param propack If \code{TRUE} (default), uses \code{propack.svd} from the
#' \code{svd} package to perform the singular value decompositions. If not, uses
#' \code{svd} from base R.
#' @param thr Convergence threhold for the coordinate descent algorithm. Default
#' is \code{1e-4}.
#' @param maxit Maximum number of passes over the data for all lambda values;
#' default is \code{1e5}.
#' @param verbose Print out progess along the way? Default is \code{FALSE}.
#' @return An object of class \code{"pcLasso"}.
#' \item{beta}{If the groups overlap, a \code{p_1+..._p_K x length(lambda)}
#' matrix of coefficients in the expanded feature space. If not, a \code{nvars}
#' x \code{length(lambda)} matrix of coefficients in the original feature space.}
#' \item{origbeta}{If the groups overlap, a \code{nvars} x
#' \code{length(lambda)} matrix of coefficients in the original feature space.
#' \code{NULL} if the groups do not overlap.}
#' \item{a0}{Intercept sequence of length \code{length(lambda)}.}
#' \item{lambda}{The actual sequence of \code{lambda} values used.}
#' \item{nzero}{If the groups overlap, the number of non-zero coefficients in the
#' expanded feature space for each value of \code{lambda}. Otherwise, the number
#' of non-zero coefficients in the original feature space.}
#' \item{orignzero}{If the groups are overlapping, this is the number of
#' non-zero coefficients in the original feature space of the model for each
#' \code{lambda}. If groups are not overlapping, it is \code{NULL}.}
#' \item{jerr}{Error flag for warnings and errors (largely for internal
#' debugging).}
#' \item{theta}{Value of \code{theta} used in model fitting.}
#' \item{origgroups}{If the \code{groups} parameter was passed to the
#' function call, this is a copy of that parameter. Otherwise, it is a list of
#' length 1, with the first element being a vector of integers from 1 to
#' \code{nvars}.}
#' \item{groups}{If the groups are not overlapping, this has the same
#' value as \code{groups}. If the groups are overlapping, then
#' \code{groups[[k]]} is the vector from \code{p_1 + ... + p_{k-1} + 1} to
#' \code{p_1 + ... p_k}, where \code{p_k} is the number of features in group k.}
#' \item{SVD_info}{A list containing SVD information. See param \code{SVD_info}
#' for more information.}
#' \item{mx}{If groups overlap, column means of the expanded \code{x}
#' matrix. Otherwise, column means of the original \code{x} matrix.}
#' \item{origmx}{Column means of the original \code{x} matrix.}
#' \item{my}{If \code{family = "gaussian"}, mean of the responses \code{y}.
#' Otherwise, it is \code{NA}.}
#' \item{overlap}{A logical flag indicating if the feature groups were
#' overlapping or not.}
#' \item{nlp}{Actual number of passes over the data for all lambda values.}
#' \item{family}{Response type.}
#' \item{call}{The call that produced this object.}
#'
#' @examples
#' set.seed(1)
#' x <- matrix(rnorm(100 * 20), 100, 20)
#' y <- rnorm(100)
#'
#' # all features in one group by default
#' fit1 <- pcLasso(x, y, ratio = 0.8)
#' # print(fit1) # Not run
#' # features in groups
#' groups <- vector("list", 4)
#' for (k in 1:4) {
#' groups[[k]] <- 5 * (k-1) + 1:5
#' }
#' fit2 <- pcLasso(x, y, groups = groups, ratio = 0.8)
#' # groups can be overlapping
#' groups[[1]] <- 1:8
#' fit3 <- pcLasso(x, y, groups = groups, ratio = 0.8)
#'
#' # specify ratio or theta, but not both
#' fit4 <- pcLasso(x, y, groups = groups, theta = 10)
#'
#' # family = "binomial"
#' y2 <- sample(0:1, 100, replace = TRUE)
#' fit5 <- pcLasso(x, y2, ratio = 0.8, family = "binomial")
#'
#' # example where SVD is computed once, then re-used
#' fit1 <- pcLasso(x, y, ratio = 0.8)
#' fit2 <- pcLasso(x, y, ratio = 0.8, SVD_info = fit1$SVD_info)
#'
#' @export
#' @importFrom svd propack.svd
pcLasso <- function(x, y, w = rep(1,length(y)), family = c("gaussian", "binomial"),
ratio = NULL, theta = NULL, groups = vector("list", 1),
lambda.min.ratio = ifelse(nrow(x) < ncol(x), 0.01, 1e-04),
nlam = 100, lambda = NULL, standardize = F, SVD_info = NULL,
nv = NULL, propack = T, thr = 1e-4, maxit = 1e5, verbose=FALSE) {
this.call <- match.call()
n <- nrow(x)
p <- ncol(x)
y <- as.vector(y)
if (length(y) != n) {
stop("length of y is not equal to number of rows of x")
}
family <- match.arg(family)
if (family == "binomial" && any(!(unique(y) %in% c(0, 1)))) {
stop("if family is binomial, y can only contain 0s and 1s")
}
# check that each feature appears in some group
if (length(groups) == 1) groups[[1]] <- 1:p
if (length(unique(unlist(groups))) < p) {
stop("Some features not assigned to a group")
}
ngroups <- length(groups)
sizes <- unlist(lapply(groups, length))
overlap <- F
origx <- x
origmx <- colMeans(x)
origp <- ncol(x)
origgroups <- groups
# if there is overlap, redefine x such that features which belong to multiple
# groups now appear as duplicated columns
# redefine groups as well (group[[k]] = (p_1+...+p_{k-1}+1):(p_1+...+p_k))
if (length(origgroups) > 1) {
nc <- length(unlist(origgroups))
if (nc > p) { # there is overlap
groups <- vector("list", length(origgroups))
overlap <- T
x <- matrix(NA, n, nc)
i1 <- 1
for(k in 1:ngroups) {
i2 <- i1 + length(origgroups[[k]]) - 1
x[, i1:i2] <- origx[, origgroups[[k]]]
groups[[k]] <- i1:i2
i1 <- i2 + 1
}
}
p <- ncol(x)
}
# scale x and y
mx <- colMeans(x)
if (standardize) {
sx <- apply(x, 2, sd) * sqrt((n-1) / n)
} else {
sx <- rep(1, ncol(x))
}
x <- scale(x, mx, sx)
my <- NA
if (family == "gaussian") {
my <- mean(y)
y <- y - my
}
# if SVD_info not provided, do the SVD for the feature matrices
if (is.null(SVD_info)) {
v = d = dd = vector("list", ngroups)
if(verbose) cat("Starting SVD computation", fill = T)
for (k in 1:ngroups) {
nvv <- nv
if (is.null(nv)) {
nvv <- min(nrow(x[, groups[[k]], drop = F]),
ncol(x[, groups[[k]], drop = F]))
}
if (nrow(x) > length(groups[[k]])) {
xtemp <- t(x[, groups[[k]]]) %*% x[, groups[[k]]]
eig <- eigen(xtemp)
v[[k]] <- eig$vec
d[[k]] <- eig$val
dd[[k]] <- max(d[[k]]) - d[[k]] + .01
}
else {
if (propack) {
sv <- svd::propack.svd(x[, groups[[k]], drop = F],
neig = nvv)
} else {
sv <- svd(x[, groups[[k]], drop = F], nv = nvv)
}
v[[k]] <- sv$v
d[[k]] <- sv$d^2
dd[[k]] <- max(d[[k]]) - d[[k]] + .01
}
}
if(verbose) cat("SVD completed", fill = T)
} else {
aa <- SVD_info$aa
d <- SVD_info$d
dd <- SVD_info$dd
}
# check that exactly one of ratio, theta is provided (also check bounds)
# if theta is NULL, derive it from ratio
if (missing(ratio) && missing(theta)) {
stop("Provide ratio or theta")
} else if (!missing(ratio) && !missing(theta) &&
!is.null(ratio) && !is.null(theta)) {
stop("Provide only ratio or theta, not both")
} else if (is.null(theta)) {
if (ratio < 0 || ratio > 1) {
stop("ratio must be in [0, 1]")
}
thetanew <- rep(NA, ngroups)
for (k in 1:ngroups) {
thetanew[k] <- d[[k]][2] * (1 - ratio) / (ratio * (d[[k]][1] - d[[k]][2]))
}
theta <- mean(thetanew)
} else if (theta < 0) {
stop("theta must be non-negative")
}
# if SVD_info not provided, compute the VD_{d1^2 - dj^2}V^T matrices for each group
# and concatenate row-wise
if (is.null(SVD_info)) {
aa <- matrix(0, p, max(sizes))
i1 <- 1
for(k in 1:ngroups) {
i2 <- i1 + sizes[k] - 1
aa[i1:i2, 1:sizes[k]] <- scale(v[[k]], FALSE, 1/(dd[[k]])) %*% t(v[[k]])
i1 <- i2 + 1
}
# build up SVD_info for returned output
SVD_info <- list()
SVD_info$aa <- aa
SVD_info$d <- d
SVD_info$dd <- dd
}
# compute mg: vector of length length(sizes)+1. Denotes the number for the
# first feature in group k, with an additional last entry = p + 1.
i1 <- 1
mg <- c(1, cumsum(sizes) + 1)
# get lambda sequence
ulam <- lambda * n # we need this multiplication by n because documentation has RSS/(2n),
# but fortran code assumes RSS/2
if (is.null(lambda)) {
maxlam <- max(abs(t(x) %*% (y - mean(y))))
if (family == "binomial") {
maxlam <- 4 * maxlam
}
ulam <- exp(seq(log(maxlam), log(maxlam * lambda.min.ratio), length = nlam))
}
nlam <- length(ulam)
out <- pcLassof(x, y, w, mg, aa, ulam, theta, ngroups, thr=thr, maxit=maxit,
family=family, verbose=verbose)
nzero <- colSums(out$beta != 0)
# get intercept sequence
if (family == "gaussian") {
a0 <- rep(my, nlam)
} else {
# if here, family must be binomial
a0 <- out$a0
}
# rescale intercept values
a0 <- a0 - colSums(out$beta * mx / sx)
# rescale beta values (beta will only change when standardize = TRUE)
out$beta <- t(scale(t(out$beta), F, sx))
# convert beta values in expanded space to beta values in original space
origbeta <- NULL
orignzero <- NULL
if (overlap) {
origbeta <- matrix(0, ncol(origx), ncol(out$beta))
for(k in 1:ngroups) {
origbeta[origgroups[[k]], ] <- origbeta[origgroups[[k]], ] +
out$beta[groups[[k]], ]
}
orignzero <- colSums(origbeta != 0)
}
lambda <- out$ulam / n
if (family == "binomial") {
lambda=lambda / 4
}
out <- list(beta=out$beta, origbeta=origbeta, a0=a0, lambda=lambda,
nzero=nzero, orignzero=orignzero, jerr=out$jerr, theta=theta,
origgroups=origgroups, groups=groups, SVD_info = SVD_info, mx=mx,
origmx=origmx, my=my, overlap=overlap, nlp=out$nlp,
family=family, call=this.call)
yhat <- predict.pcLasso(out, origx)
if (family == "gaussian") dev <- colSums(apply(yhat,2,msefun, y))
if (family == "binomial") dev <- colSums(apply(yhat,2,binfun, y))
out$dev <- dev
class(out) <- "pcLasso"
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.