Nothing
#' Hybrid Bootstrap Confidence Intervals
#'
#' Performs a hybrid bootstrapping approach to construct quantile based
#' confidence intervals around the original lasso/MCP/SCAD estimator.
#' Specifically, a traditional pairs bootstrap is performed with 1 adjustment:
#' if the bootstrap sample for a given covariate is zero, a random sample from
#' the full conditional posterior is used as the bootstrap sample instead.
#' This avoids the creation of intervals with endpoints exactly equal to zero.
#'
#' The resulting intervals WILL NOT have exact nominal coverage for all
#' covariates. They are instead constructed in a way that overall coverage will
#' be approximately equal to nominal so long as the true distribution of betas
#' is Laplace and the covariates are independent. That said, in practice,
#' average coverage is fairly robust to these assumptions.
#'
#' Note: Draws from the full conditional posterior are approximations for
#' MCP/SCAD or when \code{alpha} is not 1.
#'
#' @param X The design matrix, without an intercept. \code{boot_ncvreg}
#' standardizes the data and includes an intercept by default.
#' @param y The response vector.
#' @param fit (optional) An object of class \code{ncvreg} or
#' \code{cv.ncvreg}. An object of class \code{ncvreg}
#' provides data, penalty choices, and \code{lambda} sequence to
#' \code{boot_ncvreg}. An object of class \code{cv.ncvreg} can in
#' addition can provide information for selecting \code{lambda}
#' and estimating \code{sigma2}. If provided, \code{y} should not
#' be provided and \code{X} should only be provided if \code{fit}
#' does not contain \code{X}.
#' @param lambda (optional) The value of lambda to provide interval estimates
#' for. If left missing will be selected using CV. If user wants
#' to set the lambda sequence used to select \code{lambda} via
#' cross validation, they should call \code{cv.ncvreg} separately
#' and pass the resulting object to \code{fit}.
#' @param sigma2 (optional) The variance to use for the Hybrid sampling. If
#' left missing will be set using the estimator suggested by Reid
#' et. al. (2016) using CV.
#' @param cluster Bootstrapping and \code{cv.ncvreg} (if applicable) can be run
#' in parallel across a cluster using the **parallel** package.
#' The cluster must be set up in advance using the
#' [parallel::makeCluster()] function from that package. The
#' cluster must then be passed to \code{boot_ncvreg}.
#' @param seed You may set the seed of the random number generator in order
#' to obtain reproducible results. This is set for the overall
#' process. If the user wishes to set a seed specifically for
#' \code{cv.ncvreg} they should call it separately then pass the
#' fitted object as an argument to \code{fit}.
#' @param nboot The number of bootstrap replications to use.
#' @param penalty The penalty to be applied to the model. Either "lasso" (the
#' default), "MCP", or "SCAD".
#' @param level The confidence level required.
#' @param gamma The tuning parameter of the MCP/SCAD penalty
#' (see \code{ncvreg} for details). Default is 3 for MCP and 3.7
#' for SCAD. Ignored if fit is provided.
#' @param alpha Tuning parameter for the Elastc net estimator which controls
#' the relative contributions from the lasso/MCP/SCAD penalty and
#' the ridge, or L2 penalty. `alpha=1` is equivalent to
#' lasso/MCP/SCAD penalty, while `alpha=0` would be equivalent to
#' ridge regression. However, `alpha=0` is not supported; `alpha`
#' may be arbitrarily small, but not exactly 0. Ignored if fit is
#' provided.
#' @param returnCV If \code{TRUE}, the \code{cv.ncvreg} fit will be returned
#' (if applicable).
#' @param return_boot If \code{TRUE}, the bootstrap draws will be returned.
#' @param verbose If \code{FALSE}, non-essential messages are suppressed.
#' @param ... named arguments to be passed to \code{ncvreg} and
#' \code{cv.ncvreg}.
#'
#' @returns A list with:
#' \describe{
#' \item{confidence_intervals}{A \code{data.frame} with the original point estimates along with lower and upper bounds of Hybrid CIs.}
#' \item{lambda}{The value of \code{lambda} the \code{confidence_intervals} were constructed at.}
#' \item{sigma2}{The value of \code{sigma2} used for the Hybrid bootstrap sampling.}
#' \item{penalty}{The penalty the intervals correspond to.}
#' \item{alpha}{The tuning parameter for the Enet estimator used.}
#' \item{level}{The confidence level the intervals correspond to.}
#' }
#' If a penalty other than "lasso" is used,
#' \describe{
#' \item{gamma}{The tuning parameter for MCP/SCAD penalty.}
#' }
#' If \code{returnCV} is \code{TRUE} and a \code{cv.ncvreg} object was fit or supplied
#' \describe{
#' \item{cv.ncvreg}{The \code{cv.ncvreg} fit used to estimate \code{lambda} and \code{sigma2} (if applicable).}
#' }
#' If \code{return_boot} is \code{TRUE}
#' \describe{
#' \item{boot_draws}{A \code{data.frame} of the Hybrid bootstrap draws are returned.}
#' }
#' @export boot_ncvreg
#'
#' @examples
#' data(Prostate)
#' X <- Prostate$X
#' y <- Prostate$y
#' boot_ncvreg(X, y, level = 0.8)
boot_ncvreg <- function(X, y, fit, lambda, sigma2, cluster, seed, nboot = 1000,
penalty = "lasso", level = 0.95,
gamma = switch(penalty, SCAD = 3.7, 3), alpha = 1,
returnCV = FALSE, return_boot = FALSE, verbose = FALSE,
...) {
if ((missing(X) | missing(y)) & (missing(fit) || !inherits(fit, c("cv.ncvreg", "ncvreg")))) {
stop("Either X and y or a fit of class ncvreg or cv.ncvreg must be supplied.")
}
if (!missing(X) & missing(fit)) { # After &, just makes sure warning isn't duplicated as this is checked below as well
if (any(attr(ncvreg::std(X), "scale") == 0)) warning("Some columns in X are singular. Intervals cannot be produced for corresponding covariates.")
}
cv_fit <- NULL
if (!missing(fit)) {
original_object_class <- class(fit)[1]
if (inherits(fit, "cv.ncvreg")) {
cv_fit <- fit
fit <- cv_fit$fit
}
if (is.null(fit$X) & missing(X)) {
stop(paste0("fit object missing X, please rerun ", original_object_class, " with returnX = TRUE or supply X directly along with the ", original_object_class, " object."))
}
if (!missing(X) && is.null(attr(X, "scale"))) X <- std(X)
if (!is.null(fit$X) && !missing(X) && !identical(fit$X, X)) {
stop("X supplied along with ", original_object_class, " object which also contains X and they are not the same. It is unclear which should be used.")
}
if (missing(X)) X <- fit$X
if (any(attr(X, "scale") == 0)) warning("Some columns in X are singular. Intervals cannot be produced for corresponding covariates.")
if (!missing(y) && !identical(fit$y, y)) {
stop(paste0("y supplied along with ", original_object_class, " object which also contains y and they are not the same. It is unclear which should be used."))
}
if (missing(y)) y <- fit$y
if (fit$family != "gaussian") {
stop(paste0("fit of object class ", original_object_class, " was fit with ", fit$family, " family, but only 'gaussian' family is currently supported for boot_ncvreg"))
}
if (all(fit$penalty.factor != 1)) stop("Alternate penalty factors are currently not supported.")
## Use the values from ncvreg or cv.ncvreg object
penalty <- fit$penalty
gamma <- fit$gamma
alpha <- fit$alpha
}
# Coercion
if (!is.matrix(X)) {
tmp <- try(X <- stats::model.matrix(~0+., data=X), silent=TRUE)
if (inherits(tmp, "try-error")) stop("X must be a matrix or able to be coerced to a matrix", call.=FALSE)
}
if (storage.mode(X)=="integer") storage.mode(X) <- "double"
if (!is.double(y)) {
tmp <- try(y <- as.double(y), silent=TRUE)
if (inherits(tmp, "try-error")) stop("y must be numeric or able to be coerced to numeric", call.=FALSE)
}
## Seed handling
if (!missing(seed)) {
original_seed <- .GlobalEnv$.Random.seed
on.exit(.GlobalEnv$.Random.seed <- original_seed)
set.seed(seed)
}
args <- list(...)
cv.args <- args[names(args) %in% c("nfolds", "fold", "returnY", "trace")] ## returnY is depricated
ncvreg.args <- args[names(args) %in% c("lambda.min", "nlambda", "eps", "max.iter", "dfmax", "warn")]
if (length(cv.args) > 0 & !is.null(cv_fit)) {
warning("Additional arguments for cv.ncvreg are ignored when a cv.ncvreg object is supplied")
}
if ("penalty.factor" %in% names(args)) {
stop("Sorry, specification of alternative penality factors is not supported")
}
if (verbose & any(c("returnX", "convex") %in% names(args))) {
message(paste0("Ignoring argument(s) ", paste0(names(args)[names(args) %in% c("returnX", "convex")], collapse = ", "), " they are set to FALSE in cv.ncvreg and any ncvreg objects fit are not accessible to user."))
}
if ("family" %in% names(args)) {
stop("Ignoring argument 'family', only guassian family is currently supported")
}
original_coefs <- NULL
if (is.null(cv_fit) & (missing(lambda) | missing(sigma2))) {
## Not needed unless we don't have sigma2 and lambda
cv.args$X <- X
cv.args$y <- y
cv.args$penalty <- penalty
cv.args$alpha <- alpha
cv.args$gamma <- gamma
if (!missing(fit)) cv.args$lambda <- fit$lambda
if (!missing(cluster)) cv.args$cluster <- cluster
cv_fit <- do.call("cv.ncvreg", c(cv.args, ncvreg.args))
if (missing(fit)) fit <- cv_fit$fit
}
if (verbose) {
if (missing(lambda)) {
message("Using cross validation to select lambda.")
} else {
message("Using user specified value for lambda.")
}
if (missing(sigma2)) {
message("Estimating variance using Reid estimator.")
} else {
message("Using user specified value for sigma2.")
}
}
if (missing(lambda)) lambda <- cv_fit$lambda.min
if (missing(sigma2)) {
## Using fit from cv_fit here to ensure consistency
yhat <- cv_fit$fit$linear.predictors[,cv_fit$min]
reid_coefs <- coef(cv_fit$fit, lambda = cv_fit$lambda.min)[-1]
sh_lh <- sum(reid_coefs != 0)
sigma2 <- (max(length(y) - sh_lh, 1))^(-1) * sum((y - yhat)^2)
}
## If X, y, lambda, sigma2 specified, still need an ncvreg fit
if (missing(fit)) {
ncvreg.args$X <- X
ncvreg.args$y <- y
ncvreg.args$penalty <- penalty
ncvreg.args$alpha <- alpha
ncvreg.args$gamma <- gamma
fit <- do.call("ncvreg", ncvreg.args)
}
original_coefs <- coef(fit, lambda = lambda)[-1]
if (!missing(cluster)) {
if (!inherits(cluster, "cluster")) stop("cluster is not of class 'cluster'; see ?makeCluster", call.=FALSE)
parallel::clusterExport(cluster, c("X", "y", "lambda", "sigma2", "ncvreg.args", "penalty", "gamma", "alpha"), envir=environment())
parallel::clusterCall(cluster, function() library(ncvreg))
results <- parallel::parLapply(
cl=cluster, X=1:nboot, fun=bootf, XX = X, yy=y, lambda = lambda,
sigma2 = sigma2, ncvreg.args = ncvreg.args,
penalty = penalty, alpha = alpha, gamma = gamma,
verbose = FALSE
)
}
if (is.null(attr(X, "nonsingular"))) {
ns <- 1:length(original_coefs)
} else {
ns <- attr(X, "nonsingular")
}
boot_draws <- matrix(nrow = nboot, ncol = length(original_coefs))
for (i in 1:nboot) {
if (!missing(cluster)) {
boot_draws[i,ns] <- results[[i]]
} else {
boot_draws[i,ns] <- bootf(XX=X, yy=y, lambda = lambda, sigma2 = sigma2,
ncvreg.args = ncvreg.args,
penalty = penalty, alpha = alpha, gamma = gamma,
verbose = verbose)
}
}
colnames(boot_draws) <- names(original_coefs)
cis <- compute_intervals(boot_draws, alpha = 1 - level)
cis <- data.frame(
estimates = original_coefs,
cis
)
val <- list(
confidence_intervals = cis,
lambda = lambda,
sigma2 = sigma2,
penalty = penalty,
alpha = alpha,
level = level
)
if (penalty != "lasso") val$gamma <- gamma
if (return_boot) val$boot_draws <- boot_draws
if (returnCV) val$cv.ncvreg <- cv_fit
return(val)
}
bootf <- function(XX, yy, lambda, sigma2, ncvreg.args,
penalty = c("lasso", "MCP", "SCAD"),
alpha = 1, gamma = switch(penalty, SCAD = 3.7, 3),
verbose = FALSE,
...) {
p <- ncol(XX)
n <- length(yy)
idx_new <- sample(1:n, replace = TRUE)
ynew <- yy[idx_new]
ynew <- ynew - mean(ynew)
xnew <- ncvreg::std(XX[idx_new,,drop=FALSE])
nonsingular <- attr(xnew, "nonsingular")
p_nonsingular <- length(nonsingular)
if (p_nonsingular == 0) {
warning("No non-singular columns in bootstrap draw, returning NAs.")
return(rep(NA_real_, p))
}
rescale <- (attr(xnew, "scale")[nonsingular])^(-1)
if (!is.null(attr(XX, "scale"))) {
XXnonsingular <- attr(XX, "nonsingular")
rescaleXX <- (attr(XX, "scale")[XXnonsingular][nonsingular])^(-1)
} else {
rescaleXX <- 1
}
full_rescale_factor <- rescale * rescaleXX
nlambda <- ifelse(!is.null(ncvreg.args$nlambda), ncvreg.args$nlambda, 100)
lambda_seq <- setupLambda(
xnew, ynew, "gaussian", alpha = alpha,
nlambda, lambda.min = ifelse(n > p, .001, .05),
penalty.factor=rep(1, ncol(xnew))
)
if (lambda < min(lambda_seq)) {
if (verbose) message("Lambda too small, extending lambda sequence for bootstrap sample.")
lambda_min <- lambda - (lambda / 100)
lambda_seq <- setupLambda(
xnew, ynew, "gaussian", alpha = alpha,
lambda.min = lambda_min / max(lambda_seq), nlambda,
penalty.factor=rep(1, ncol(xnew))
)
}
if (lambda >= max(lambda_seq)) {
if (verbose) message("Lambda too large, setting to lambda max for bootstrap sample.")
lambda <- max(lambda_seq) - (max(lambda_seq) / 100)
}
ncvreg.args$X <- xnew
ncvreg.args$y <- ynew
ncvreg.args$lambda <- lambda_seq
ncvreg.args$penalty <- penalty
ncvreg.args$alpha <- alpha
ncvreg.args$gamma <- gamma
## Ignore user specified lambda.min, nlambda here since lambda sequence is being specified
fit <- do.call("ncvreg", ncvreg.args[!(names(ncvreg.args) %in% c("lambda.min", "nlambda"))])
modes <- coef(fit, lambda = lambda)[-1]
## For "elastic net" idea
if (alpha < 1) {
ynew <- c(ynew, rep(0, p_nonsingular))
xnew <- rbind(xnew, sqrt(n*(1 - alpha)*lambda)*diag(p_nonsingular))
xnew <- ncvreg::std(xnew)
xnew <- xnew * sqrt(n / (n + p_nonsingular))
lambda <- lambda * alpha
}
partial_residuals <- ynew - (
as.numeric(xnew %*% modes) - (xnew * matrix(modes, nrow = nrow(xnew), ncol = ncol(xnew), byrow=TRUE))
)
z <- (1/n)*colSums(xnew * partial_residuals)
if (sum(modes == 0) > 0) {
draws <- draw_full_cond(z[modes == 0], lambda, sigma2, n) ## Only where modes are 0
if (penalty == "MCP") {
draws <- sapply(draws, firm_threshold_c, lambda, gamma)
} else if (penalty == "SCAD") {
draws <- sapply(draws, scad_threshold_c, lambda, gamma)
}
}
boot_draws <- numeric(p)
if (sum(modes == 0) > 0) modes[modes == 0] <- draws
boot_draws[nonsingular] <- modes * full_rescale_factor
if (p_nonsingular < p) boot_draws[!(1:p %in% nonsingular)] <- NA_real_
return(boot_draws)
}
draw_full_cond <- function(z, lambda, sigma2, n) {
## Tails being transferred on to (log probability in each tail)
se <- sqrt(sigma2 / n)
obs_lw <- pnorm(0, z + lambda, se, log.p = TRUE)
obs_up <- pnorm(0, z - lambda, se, lower.tail = FALSE, log.p = TRUE)
obs_p_lw <- obs_lw + ((z*lambda*n) / sigma2)
obs_p_up <- obs_up - ((z*lambda*n) / sigma2)
## Find the proportion of each to the overall probability
frac_lw_log <- ifelse(is.infinite(exp(obs_p_lw - obs_p_up)), 0, obs_p_lw - obs_p_up - log(1 + exp(obs_p_lw - obs_p_up)))
frac_up_log <- ifelse(is.infinite(exp(obs_p_up - obs_p_lw)), 0, obs_p_up - obs_p_lw - log(1 + exp(obs_p_up - obs_p_lw)))
ps <- runif(length(z))
log_ps <- log(ps)
log_one_minus_ps <- log(1 - ps)
draws <- ifelse(
frac_lw_log >= log_ps,
qnorm(log_ps + obs_lw - frac_lw_log, z + lambda, se, log.p = TRUE),
qnorm(log_one_minus_ps + obs_up - frac_up_log, z - lambda, se, lower.tail = FALSE, log.p = TRUE)
)
return(draws)
}
soft_threshold <- function(z_j, lambda) {
if (z_j > lambda) {
return(z_j - lambda)
} else if (abs(z_j) <= lambda) {
return(0)
} else if (z_j < -lambda) {
return(z_j + lambda)
}
}
firm_threshold_c <- function(z_j, lambda, gamma) {
z_j <- z_j + sign(z_j)*lambda
if (abs(z_j) <= gamma*lambda) {
return((gamma / (gamma - 1))*soft_threshold(z_j, lambda))
} else {
return(z_j)
}
}
scad_threshold_c <- function(z_j, lambda, gamma) {
z_j <- z_j + sign(z_j)*lambda
if (abs(z_j) <= 2*lambda) {
return(soft_threshold(z_j, lambda))
} else if (abs(z_j) > 2*lambda & abs(z_j) <= gamma*lambda) {
lambda_alt <- (gamma*lambda) / (gamma - 1)
return(((gamma - 1) / (gamma - 2)) * soft_threshold(z_j, lambda_alt))
} else {
z_j
}
}
compute_intervals <- function(draws, alpha = 0.2, quiet = FALSE) {
any_nas <- any(as.logical(apply(draws, 2, function(x) sum(is.na(x)) > 0)))
if (any_nas & !quiet) {
warning("NAs in draws, this usually means a column in the bootstrapped X was singular.")
}
lowers <- apply(draws, 2, function(x) quantile(x, alpha / 2, na.rm = TRUE))
uppers <- apply(draws, 2, function(x) quantile(x, 1 - (alpha / 2), na.rm = TRUE))
return(data.frame(lower = lowers, upper = uppers))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.