#' Stability selection for a quadrupen fit.
#'
#' Compute the stability path of a (possibly randomized) fitting
#' procedure as introduced by Meinshausen and Buhlmann (2010).
#'
#' @param x matrix of features, possibly sparsely encoded
#' (experimental). Do NOT include intercept.
#'
#' @param y response vector.
#'
#' @param penalty a string for the fitting procedure used for
#' cross-validation. Either \code{\link{elastic.net}} or
#' \code{"bounded.reg"}.
#'
#' @param subsamples integer indicating the number of subsamplings
#' used to estimate the selection probabilities. Default is 100.
#'
#' @param sample.size integer indicating the size of each subsamples.
#' Default is \code{floor(n/2)}.
#'
#' @param randomize Should a randomized version of the fitting
#' procedure by used? Default is \code{TRUE}. See details below.
#'
#' @param weakness Coefficient used for randomizing. Default is
#' \code{0.5}. Ignored when \code{randomized} is \code{FALSE}. See
#' details below.
#'
#' @param folds list with \code{subsamples} entries with vectors
#' describing the folds to use for the stability procedure. By
#' default, the folds are randomly sampled with the specified
#' \code{subsamples} argument.
#'
#' @param verbose logical; indicates if the progression should be
#' displayed. Default is \code{TRUE}.
#'
#' @param mc.cores the number of cores to use. The default uses 2 cores.
#'
#' @param ... additional parameters to overwrite the defaults of the
#' fitting procedure. See the corresponding documentation
#' (\code{\link{elastic.net}} or \code{\link{bounded.reg}})
#'
#' @return An object of class \code{\linkS4class{stability.path}}.
#'
#' @note When \code{randomized = TRUE}, the \code{penscale} argument
#' that weights the penalty tuned by \eqn{\lambda_1}{lambda1} is
#' perturbed (divided) for each subsample by a random variable
#' uniformly distributed on
#' \if{latex}{\eqn{[\alpha,1]}}\if{html}{[α,1]}\if{text}{\eqn{[alpha,1]}},
#' where
#' \if{latex}{\eqn{\alpha}}\if{html}{α}\if{text}{\eqn{alpha}} is
#' the weakness parameter.
#'
#' If the user runs the fitting method with option
#' \code{'bulletproof'} set to \code{FALSE}, the algorithm may stop
#' at an early stage of the path. Early stops of the underlying
#' fitting function are handled internally, in the following way: we
#' chose to simply skip the results associated with such runs, in
#' order not to bias the stability selection procedure. If it occurs
#' too often, a warning is sent to the user, in which case you should
#' reconsider the grid of \code{lambda1} for stability selection. If
#' \code{bulletproof} is \code{TRUE} (the default), there is nothing
#' to worry about, except a possible slow down when any switching to
#' the proximal algorithm is required.
#'
#' @references N. Meinshausen and P. Buhlmann (2010). Stability
#' Selection, JRSS(B).
#'
#' @seealso \code{\linkS4class{stability.path}} and
#' \code{\link{plot,stability.path-method}}.
#'
#' @examples
#' ## Simulating multivariate Gaussian with blockwise correlation
#' ## and piecewise constant vector of parameters
#' beta <- rep(c(0,1,0,-1,0), c(25,10,25,10,25))
#' Soo <- matrix(0.75,25,25) ## bloc correlation between zero variables
#' Sww <- matrix(0.75,10,10) ## bloc correlation between active variables
#' Sigma <- bdiag(Soo,Sww,Soo,Sww,Soo) + 0.2
#' diag(Sigma) <- 1
#' n <- 100
#' x <- as.matrix(matrix(rnorm(95*n),n,95) %*% chol(Sigma))
#' y <- 10 + x %*% beta + rnorm(n,0,10)
#'
#' ## Build a vector of label for true nonzeros
#' labels <- rep("irrelevant", length(beta))
#' labels[beta != 0] <- c("relevant")
#' labels <- factor(labels, ordered=TRUE, levels=c("relevant","irrelevant"))
#'
#' ## Call to stability selection function, 200 subsampling
#' stab <- stability(x,y, subsamples=200, lambda2=1, min.ratio=1e-2)
#' ## Recover the selected variables for a given cutoff
#' ## and per-family error rate, without producing any plot
#' stabpath <- plot(stab, cutoff=0.75, PFER=1, plot=FALSE)
#'
#' cat("\nFalse positives for the randomized Elastic-net with stability selection: ",
#' sum(labels[stabpath$selected] != "relevant"))
#' cat("\nDONE.\n")
#'
#' @keywords models regression
#'
#' @importFrom stats runif
#' @importFrom utils modifyList
#' @name stability
#' @aliases stability
#' @rdname stability
#'
#' @export
stability <- function(x,
y,
penalty = c("elastic.net", "bounded.reg"),
subsamples = 100,
sample.size = floor(n/2),
randomize = TRUE,
weakness = 0.5,
verbose = TRUE,
folds = replicate(subsamples, sample(1:nrow(x), sample.size), simplify=FALSE),
mc.cores = 2,
...) {
## =============================================================
## INITIALIZATION & PARAMETERS RECOVERY
if (Sys.info()[['sysname']] == "Windows") {
warning("\nWindows does not support fork, enforcing mc.cores to '1'.")
mc.cores <- 1
}
penalty <- match.arg(penalty)
get.lambda1 <- switch(penalty,
"elastic.net" = get.lambda1.l1,
"bounded.reg" = get.lambda1.li)
p <- ncol(x)
n <- nrow(x)
user <- list(...)
defs <- default.args(penalty,nrow(x),ncol(x),user)
args <- modifyList(defs, user)
## overwrite parameters irrelevant in and resampling context
args$control$verbose <- 0
args$max.feat <- p
## Compute a grid of lambda1 (the smae for each fold)
if (is.null(args$lambda1)) {
input <- standardize(x,y,args$intercept,args$normalize,args$penscale)
args$lambda1 <- get.lambda1(input$xty,args$nlambda1,args$min.ratio)
rm(input)
}
nlambda1 <- length(args$lambda1)
## record the basic penscale value for randomizing
penscale <- args$penscale
## Prepare blocs of sub samples to run jobs parallely
blocs <- split(1:subsamples, 1:mc.cores)
if (verbose) {
cat(paste("\n\nSTABILITY SELECTION ",ifelse(randomize,"with","without")," randomization (weakness = ",weakness,")",sep=""))
cat(paste("\nFitting procedure:",penalty," with lambda2 = ",args$lambda2," and an ",nlambda1,"-dimensional grid of lambda1.", sep=""))
cat("\nRunning",length(blocs),"jobs parallely (1 per core)")
cat("\nApprox.", length(blocs[[1]]),"subsamplings for each job for a total of",subsamples)
}
## function to run on each core
bloc.stability <- function(subsets) {
select <- Matrix(0,length(args$lambda1),p)
subsamples.ok <- 0
for (s in 1:length(subsets)) {
if (randomize) {args$penscale <- penscale / runif(p,weakness,1)}
active <- do.call(quadrupen, c(list(x=x[folds[[subsets[s]]], ], y=y[folds[[subsets[s]]]]), args))@active.set
if (nrow(active) == length(args$lambda1)) {
subsamples.ok <- subsamples.ok + 1
select <- select + active
}
}
if (subsamples.ok < 0.5*length(subsets)) {
cat("\nWarning: more than 50% of the run were discarded in that core due to early stops of the fitting procedure. You should consider largest 'min.ratio' or strongest 'lambda2'.")
}
return(select/(subsamples.ok*length(blocs)))
}
## Now launch the B jobs...
prob.bloc <- mclapply(blocs, bloc.stability, mc.cores=mc.cores)
## Construct the probability path
path <- Matrix(0,nlambda1,p)
for (b in 1:length(prob.bloc)) {
path <- path + prob.bloc[[b]]
}
return(new("stability.path",
probabilities = path ,
penalty = penalty ,
naive = args$naive ,
lambda1 = args$lambda1,
lambda2 = args$lambda2,
folds = folds ))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.