#' (Iterative) Sure Independence Screening ((I)SIS) and Fitting in Generalized
#' Linear Models and Cox's Proportional Hazards Models
#'
#' This function first performs variable screening via one of the (iterative)
#' sure independence screening ((I)SIS) variants, and then fits the final
#' regression model using the R packages \pkg{ncvreg}, \pkg{glmnet},
#' \pkg{msaenet}, and \pkg{Coxnet} for the SCAD/MCP/LASSO/ENET/AENET/MSAENET
#' regularized loglikelihood on the variables picked by (I)SIS.
#'
#' @export
#' @importFrom glmnet glmnet
#' @importFrom msaenet aenet
#' @importFrom ncvreg ncvreg
#' @importFrom glmnet cv.glmnet
#' @importFrom ncvreg cv.ncvreg
#' @importFrom survival coxph
#' @importFrom survival Surv
#' @importFrom stats cor glm.fit
#' @importFrom stats quantile
#' @importFrom stats binomial
#' @importFrom stats coef
#' @importFrom stats gaussian
#' @importFrom stats poisson
#' @importFrom stats deviance
#' @importFrom stats predict
#' @importFrom nnet multinom
#' @import doParallel
#' @importFrom foreach foreach
#' @importFrom Coxnet Coxnet
#'
#' @param x The design matrix, of dimensions n * p, without an intercept. Each
#' row is an observation vector. \code{SIS} standardizes the data and includes
#' an intercept by default.
#' @param y The response vector of dimension n * 1. Quantitative for
#' \code{family='gaussian'}, non-negative counts for \code{family='poisson'},
#' binary (0-1) for \code{family='binomial'}, factor for \code{family='multinom'}.
#' For \code{family='cox'}, \code{y} should be an object of class \code{Surv},
#' as provided by the function \code{Surv()} in the package \pkg{survival}.
#' @param family Response type (see above).
#' @param penalty The penalty to be applied in the regularized likelihood
#' subproblems. 'SCAD', 'MCP', 'lasso', 'enet', 'aenet', and 'msaenet' are
#' provided, with 'SCAD' as the default; 'lasso' is always used for
#' family = 'multinom'. 'aenet' and 'msaenet' require \code{tune='cv'}.
#' @param concavity.parameter The tuning parameter used to adjust the concavity
#' of the SCAD/MCP penalty. Default is 3.7 for SCAD and 3 for MCP.
#' @param tune Method for tuning the regularization parameter of the penalized
#' likelihood subproblems and of the final model selected by (I)SIS. Options
#' include \code{tune='bic'}, \code{tune='ebic'}, \code{tune='aic'}, and
#' \code{tune='cv'}.
#' @param nfolds Number of folds used in cross-validation. The default is 10.
#' @param type.measure Loss to use for cross-validation. Currently five
#' options, not all available for all models. The default is
#' \code{type.measure='deviance'}, which uses squared-error for gaussian models
#' (also equivalent to \code{type.measure='mse'} in this case), deviance for
#' logistic and poisson regression, and partial-likelihood for the Cox model.
#' Both \code{type.measure='class'} and \code{type.measure='auc'} apply only to
#' logistic regression and give misclassification error and area under the ROC
#' curve, respectively. \code{type.measure='mse'} or \code{type.measure='mae'}
#' (mean absolute error) can be used by all models except \code{'cox'};
#' they measure the deviation from the fitted mean to the response. For
#' \code{penalty='SCAD'} and \code{penalty='MCP'}, only
#' \code{type.measure='deviance'} is available.
#' @param gamma.ebic Specifies the parameter in the Extended BIC criterion
#' penalizing the size of the corresponding model space. The default is
#' \code{gamma.ebic=1}. See references at the end for details.
#' @param nsis Number of predictors recruited by (I)SIS.
#' @param iter Specifies whether to perform iterative SIS. The default is
#' \code{iter=TRUE}.
#' @param iter.max Maximum number of iterations for (I)SIS and its variants.
#' @param varISIS Specifies whether to perform either of the two ISIS variants
#' based on randomly splitting the sample into two groups. The variant
#' \code{varISIS='aggr'} is an aggressive variable screening procedure, while
#' \code{varISIS='cons'} is a more conservative approach. The default is
#' \code{varISIS='vanilla'}, which performs the traditional vanilla version of
#' ISIS. See references at the end for details.
#' @param perm Specifies whether to impose a data-driven threshold in the size
#' of the active sets calculated during the ISIS procedures. The threshold is
#' calculated by first decoupling the predictors \eqn{x_i} and response
#' \eqn{y_i} through a random permutation \eqn{\pi} of \eqn{(1,...,n)} to form
#' a null model. For this newly permuted data, marginal regression
#' coefficients for each predictor are recalculated. As the marginal
#' regression coefficients of the original data should be larger than most
#' recalculated coefficients in the null model, the data-driven threshold is
#' given by the \eqn{q}th quantile of the null coefficients. This data-driven
#' threshold only allows a \eqn{1-q} proportion of inactive variables to enter
#' the model when \eqn{x_i} and \eqn{y_i} are not related (in the null model).
#' The default is \code{perm=FALSE}. See the references at the end for
#' details; an illustrative sketch of this threshold appears in the examples.
#' @param q Quantile for calculating the data-driven threshold in the
#' permutation-based ISIS. The default is \code{q=1} (i.e., the maximum
#' absolute value of the permuted estimates).
#' @param greedy Specifies whether to run the greedy modification of the
#' permutation-based ISIS. The default is \code{greedy=FALSE}.
#' @param greedy.size Maximum size of the active sets in the greedy
#' modification of the permutation-based ISIS. The default is
#' \code{greedy.size=1}.
#' @param seed Random seed used for sample splitting, random permutation, and
#' cross-validation sampling of training and test sets.
#' @param standardize Logical flag for x variable standardization, prior to
#' performing (iterative) variable screening. The resulting coefficients are
#' always returned on the original scale. Default is \code{standardize=TRUE}.
#' If variables are in the same units already, you might not wish to
#' standardize.
#' @param covars Character vector with the names of the factor variables in
#' \code{x}; currently only used when \code{boot_ci=TRUE}.
#' @param boot_ci Logical flag for computing bootstrap confidence intervals. Default = FALSE.
#' @param parallel Logical flag specifying whether to use parallel computing
#' in the tuning and bootstrap steps. Default is \code{parallel=TRUE}.
#' @return Returns an object with \describe{
#' \item{sis.ix0}{The vector of indices selected by SIS alone.}
#' \item{ix}{The vector of indices selected by (I)SIS with the regularization
#' step.}
#' \item{coef.est}{The vector of coefficients of the final model selected by
#' (I)SIS.}
#' \item{fit}{A fitted object of type \code{ncvreg}, \code{cv.ncvreg},
#' \code{glmnet}, or \code{cv.glmnet} for the final model selected by the
#' (I)SIS procedure. If \code{tune='cv'}, the returned fitted object is of
#' type \code{cv.ncvreg} if \code{penalty='SCAD'} or \code{penalty='MCP'};
#' otherwise, the returned fitted object is of type \code{cv.glmnet}. For the
#' remaining options of \code{tune}, the returned object is of type
#' \code{glmnet} if \code{penalty='lasso'}, and \code{ncvreg} otherwise.}
#' \item{lambda}{The regularization parameter of the final fit, as selected by
#' the criterion specified in \code{tune}.}
#' \item{ix0}{The vector of indices ordered by decreasing importance.}
#' \item{ix_list}{A list containing, for each screening step, the vector of
#' indices ordered by decreasing importance.}
#' \item{cis}{Only returned if \code{boot_ci=TRUE}: a data frame with effect
#' estimates, containing \describe{
#' \item{coef}{Coefficient estimate.}
#' \item{CI_low}{Lower confidence limit for the coefficient.}
#' \item{CI_up}{Upper confidence limit for the coefficient.}
#' \item{Est}{Effect estimate (mean difference for the Gaussian family, hazard
#' ratio for the Cox family, and odds ratio for the binomial family) when
#' comparing the two quantiles specified in \code{probs}.}
#' \item{CI_low_perc}{Lower confidence limit for the effect estimate when
#' comparing the quantiles specified in \code{probs}.}
#' \item{CI_up_perc}{Upper confidence limit for the effect estimate when
#' comparing the quantiles specified in \code{probs}.}
#' }
#' }
#' }
#' @author Jianqing Fan, Yang Feng, Diego Franco Saldana, Richard Samworth, Arce Domingo-Relloso and
#' Yichao Wu
#' @seealso \code{\link{predict.SIS}}
#' @references
#' Diego Franco Saldana and Yang Feng (2018) SIS: An R package for Sure Independence Screening in
#' Ultrahigh Dimensional Statistical Models, \emph{Journal of Statistical Software}, \bold{83}, 2, 1-25.
#'
#' Jianqing Fan and Jinchi Lv (2008) Sure Independence Screening
#' for Ultrahigh Dimensional Feature Space (with discussion). \emph{Journal of
#' Royal Statistical Society B}, \bold{70}, 849-911.
#'
#' Jianqing Fan and Rui Song (2010) Sure Independence Screening in Generalized
#' Linear Models with NP-Dimensionality. \emph{The Annals of Statistics},
#' \bold{38}, 3567-3604.
#'
#' Jianqing Fan, Richard Samworth, and Yichao Wu (2009) Ultrahigh Dimensional
#' Feature Selection: Beyond the Linear Model. \emph{Journal of Machine
#' Learning Research}, \bold{10}, 2013-2038.
#'
#' Jianqing Fan, Yang Feng, and Yichao Wu (2010) High-dimensional Variable
#' Selection for Cox Proportional Hazards Model. \emph{IMS Collections},
#' \bold{6}, 70-86.
#'
#' Jianqing Fan, Yang Feng, and Rui Song (2011) Nonparametric Independence
#' Screening in Sparse Ultrahigh Dimensional Additive Models. \emph{Journal of
#' the American Statistical Association}, \bold{106}, 544-557.
#'
#' Jiahua Chen and Zehua Chen (2008) Extended Bayesian Information Criteria for
#' Model Selection with Large Model Spaces. \emph{Biometrika}, \bold{95},
#' 759-771.
#' @keywords models
#' @examples
#'
#' set.seed(0)
#' n <- 400
#' p <- 50
#' rho <- 0.5
#' corrmat <- diag(rep(1 - rho, p)) + matrix(rho, p, p)
#' corrmat[, 4] <- sqrt(rho)
#' corrmat[4, ] <- sqrt(rho)
#' corrmat[4, 4] <- 1
#' corrmat[, 5] <- 0
#' corrmat[5, ] <- 0
#' corrmat[5, 5] <- 1
#' cholmat <- chol(corrmat)
#' x <- matrix(rnorm(n * p, mean = 0, sd = 1), n, p)
#' x <- x %*% cholmat
#'
#' # gaussian response
#' set.seed(1)
#' b <- c(4, 4, 4, -6 * sqrt(2), 4 / 3)
#' y <- x[, 1:5] %*% b + rnorm(n)
#'
#'
#' # SIS without regularization
#' model10 <- SIS(x, y, family = "gaussian", iter = FALSE)
#' model10$sis.ix0
#' # The top 10 selected variables
#' model10$ix0[1:10]
#' # The top 10 selected variables for each step
#' lapply(model10$ix_list, function(x) x[1:10])
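#'
#' # Illustrative sketch (not SIS itself) of the data-driven threshold behind
#' # perm = TRUE: marginal correlations recomputed after permuting y form a
#' # null distribution, and its q-th quantile serves as the screening cutoff.
#' null.cor <- abs(cor(x, y[sample(n)]))
#' cutoff <- quantile(null.cor, 0.9) # corresponds to q = 0.9
#' which(abs(cor(x, y)) > cutoff)
#'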
#' # ISIS with regularization
#' model11 <- SIS(x, y, family = "gaussian", tune = "bic")
#' model12 <- SIS(x, y, family = "gaussian", tune = "bic", varISIS = "aggr", seed = 11)
#' model11$ix
#' model12$ix
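#' # ISIS tuned by the extended BIC; gamma.ebic penalizes the model-space size
#' model13 <- SIS(x, y, family = "gaussian", tune = "ebic", gamma.ebic = 0.5)
#' model13$ix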
#' \dontrun{
#' # binary response
#' set.seed(2)
#' feta <- x[, 1:5] %*% b
#' fprob <- exp(feta) / (1 + exp(feta))
#' y <- rbinom(n, 1, fprob)
#' model21 <- SIS(x, y, family = "binomial", tune = "bic")
#' model22 <- SIS(x, y, family = "binomial", tune = "bic", varISIS = "aggr", seed = 21)
#' model21$ix
#' model22$ix
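#'
#' # logistic regression tuned by cross-validated misclassification error
#' # (type.measure = 'class' is only available for the binomial family)
#' model23 <- SIS(x, y,
#'   family = "binomial", penalty = "lasso", tune = "cv",
#'   type.measure = "class", seed = 23
#' )
#' model23$ix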
#'
#' # poisson response
#' set.seed(3)
#' b <- c(0.6, 0.6, 0.6, -0.9 * sqrt(2))
#' myrates <- exp(x[, 1:4] %*% b)
#' y <- rpois(n, myrates)
#' model31 <- SIS(x, y,
#' family = "poisson", penalty = "lasso", tune = "bic", perm = TRUE, q = 0.9,
#' greedy = TRUE, seed = 31
#' )
#' model32 <- SIS(x, y,
#' family = "poisson", penalty = "lasso", tune = "bic", varISIS = "aggr",
#' perm = TRUE, q = 0.9, seed = 32
#' )
#' model31$ix
#' model32$ix
#'
#'
#' # Cox model
#' set.seed(4)
#' b <- c(4, 4, 4, -6 * sqrt(2), 4 / 3)
#' myrates <- exp(x[, 1:5] %*% b)
#' Sur <- rexp(n, myrates)
#' CT <- rexp(n, 0.1)
#' Z <- pmin(Sur, CT)
#' ind <- as.numeric(Sur <= CT)
#' y <- survival::Surv(Z, ind)
#' model41 <- SIS(x, y,
#' family = "cox", penalty = "lasso", tune = "bic",
#' varISIS = "aggr", seed = 41
#' )
#' model42 <- SIS(x, y,
#' family = "cox", penalty = "lasso", tune = "bic",
#' varISIS = "cons", seed = 41
#' )
#' model41$ix
#' model42$ix
#'
#' # SIS with bootstrap confidence intervals
#' sis <- SIS(x, y,
#'   family = "cox", penalty = "aenet", tune = "cv", varISIS = "cons",
#'   seed = 41, boot_ci = TRUE
#' )
#' sis$cis
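#'
#' # multinomial response (illustrative sketch: three classes from a softmax model)
#' set.seed(5)
#' b1 <- c(2, 2, 2, -3 * sqrt(2), 2 / 3)
#' eta1 <- x[, 1:5] %*% b1
#' eta2 <- -eta1
#' denom <- 1 + exp(eta1) + exp(eta2)
#' prob <- cbind(1 / denom, exp(eta1) / denom, exp(eta2) / denom)
#' y <- factor(apply(prob, 1, function(pr) sample(1:3, 1, prob = pr)))
#' model51 <- SIS(x, y, family = "multinom", tune = "cv", seed = 51)
#' model51$ix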
#'}
#'
SIS <- function(x, y, family = c("gaussian", "binomial", "poisson", "cox", "multinom"), penalty = c("SCAD", "MCP", "lasso", "enet", "aenet", "msaenet"),
concavity.parameter = switch(penalty, SCAD = 3.7, 3), tune = c("bic", "ebic", "aic", "cv"), nfolds = 10,
type.measure = c("deviance", "class", "auc", "mse", "mae"), gamma.ebic = 1, nsis = NULL, iter = TRUE, iter.max = ifelse(greedy ==
FALSE, 10, floor(nrow(x) / log(nrow(x)))), varISIS = c("vanilla", "aggr", "cons"), perm = FALSE, q = 1,
greedy = FALSE, greedy.size = 1, seed = NULL, standardize = TRUE, covars=NULL, boot_ci = FALSE, parallel=TRUE) {
this.call <- match.call()
family <- match.arg(family)
penalty <- match.arg(penalty)
tune <- match.arg(tune)
type.measure <- match.arg(type.measure)
varISIS <- match.arg(varISIS)
set.seed(seed)
if (is.null(x) || is.null(y)) {
stop("The data is missing!")
}
  if (!is.numeric(concavity.parameter)) {
    stop("concavity.parameter must be numeric!")
  }
  if (!is.numeric(nfolds)) {
    stop("nfolds must be numeric!")
  }
  if (!is.null(seed) && !is.numeric(seed)) {
    stop("seed must be numeric!")
  }
  if (family == "multinom") {
    penalty <- "lasso"
  }
if (tune != "cv" && penalty %in% c("aenet", "msaenet")) {
stop("Model currently not implemented with selected tuning option")
}
if (family == "multinom" && penalty %in% c("SCAD", "MCP", "aenet", "msaenet")) {
stop("Multinom model currently not implemented with selected penalty")
}
if (type.measure %in% c("class", "auc") && family %in% c("gaussian", "poisson", "cox")) {
stop("'class' and 'auc' type measures are only available for logistic regression")
}
if (type.measure %in% c("class", "auc", "mse", "mae") && penalty %in% c("SCAD", "MCP")) {
stop("Only 'deviance' is available as type.measure for non-convex penalties")
}
  if (is.null(colnames(x))) {
    colnames(x) <- paste0("V", seq_len(ncol(x)))
  }
  # family has already been validated by match.arg, so dispatch directly.
  fit <- sisglm(
    x, y, family, penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, parallel
  )
fit$call <- this.call
class(fit) <- c(class(fit), "SIS")
return(fit)
}
sisglm <- function(x, y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, nsis,
iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed, standardize, boot_ci, covars, parallel, s1 = NULL, s2 = NULL, split.tries = 0) {
storage.mode(x) <- "numeric"
n <- dim(x)[1]
p <- dim(x)[2]
models <- vector("list")
  if (is.null(nsis)) {
nsis <- calculate.nsis(family = family, varISIS = varISIS, n = n, p = p)
}
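  # Randomly split the sample into two halves; the split is passed to the
  # screening routines for the sample-splitting ISIS variants ('aggr', 'cons').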
  if (is.null(s1)) {
if (!is.null(seed)) {
set.seed(seed)
}
split.sample <- sample(1:n)
s1 <- split.sample[1:ceiling(n / 2)]
s2 <- setdiff(split.sample, s1)
}
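  # Keep the original-scale design matrix for the penalized fits below;
  # screening operates on the (optionally) standardized copy.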
old.x <- x
  if (standardize) {
x <- scale(x)
}
ix_list <- NULL
iterind <- 0
  if (iter) {
ix0_list <- obtain.ix0(
x = x, y = y, s1 = s1, s2 = s2, family = family, nsis = nsis, iter = iter, varISIS = varISIS,
perm = perm, q = q, greedy = greedy, greedy.size = greedy.size, iterind = iterind
)
ix0 <- ix0_list$ix0
ix0all <- ix0_list$ix0all
sis.ix0 <- ix0
ix_list[[1]] <- ix0all
repeat {
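      # Alternate a penalized selection step on the screened set with a
      # conditional screening step on the remaining predictors, until nsis
      # variables are selected, iter.max is reached, or the model repeats.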
iterind <- iterind + 1
cat("Iter", iterind, ", screening: ", ix0, "\n")
      if (length(ix0) == 1 && penalty == "lasso") {
ix0 <- c(ix0, p + 1 - ix0)
}
pen.ind <- ix0
selection.fit <- tune.fit(old.x[, ix0, drop = FALSE], y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, parallel, seed)
coef.beta <- selection.fit$beta
a0 <- selection.fit$a0
lambda <- selection.fit$lambda
lambda.ind <- selection.fit$lambda.ind
ix1 <- ix0[selection.fit$ix]
if (length(ix1) == 0) {
split.tries <- split.tries + 1
split.sample <- sample(1:n)
s1 <- split.sample[1:ceiling(n / 2)]
s2 <- setdiff(split.sample, s1)
cat("Sample splitting attempt: ", split.tries, "\n")
if (split.tries >= 20) {
cat("No variables remaining after ", split.tries, " sample splitting attempts! \n")
cat("You can try a more conservative variable screening approach! \n")
} else {
return(sisglm(
old.x, y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic,
nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed, standardize, boot_ci, covars, parallel, s1 = s1, s2 = s2, split.tries
))
}
}
cat("Iter", iterind, ", selection: ", ix1, "\n")
if (length(ix1) >= nsis || iterind >= iter.max) {
ix0 <- ix1
if (length(ix1) >= nsis) {
cat("Maximum number of variables selected \n")
}
if (iterind >= iter.max) {
cat("Maximum number of iterations reached \n")
}
break
}
models[[iterind]] <- ix1
flag.models <- 0
if (iterind > 1) {
for (j in 1:(iterind - 1)) {
          if (identical(models[[j]], ix1)) {
flag.models <- 1
}
}
}
if (flag.models == 1) {
cat("Model already selected \n")
break
}
candind <- setdiff(1:p, ix1)
pleft <- nsis - length(ix1)
newix_list <- obtain.newix(
x = x, y = y, candind = candind, ix1 = ix1, s1 = s1, s2 = s2, family = family,
pleft = pleft, varISIS = varISIS, perm = perm, q = q, greedy = greedy, greedy.size = greedy.size,
iterind = iterind
)
newix <- newix_list$newix
newixall <- newix_list$newixall
cat("Iter", iterind, ", conditional-screening: ", newix, "\n")
ix_list[[iterind + 1]] <- c(ix1, newixall)
      # Stop if conditional screening adds nothing new: the expanded set equals
      # the previous active set, so the next selection step would repeat itself.
      if (setequal(c(ix1, newix), ix0)) break
      ix1 <- c(ix1, newix)
      ix0 <- ix1
if (length(ix1) == 0) break
} # end repeat
} else {
# end if(iter==TRUE)
ix0_list <- obtain.ix0(
x = x, y = y, s1 = s1, s2 = s2, family = family, nsis = nsis, iter = iter, varISIS = varISIS,
perm = perm, q = q, greedy = greedy, greedy.size = greedy.size, iterind = iterind
)
ix0 <- ix0_list$ix0
ix0all <- ix0_list$ix0all
ix_list[[1]] <- ix0all
sis.ix0 <- ix0
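    # As above: glmnet needs at least two columns, so pad a lone index.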
    if (length(ix0) == 1 && penalty == "lasso") {
ix0 <- c(ix0, p + 1 - ix0)
}
pen.ind <- ix0
selection.fit <- tune.fit(old.x[, ix0, drop = FALSE], y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, parallel, seed)
coef.beta <- selection.fit$beta
a0 <- selection.fit$a0
lambda <- selection.fit$lambda
lambda.ind <- selection.fit$lambda.ind
ix1 <- ix0[selection.fit$ix]
}
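  # Cox models carry no intercept; for all other families, prepend the
  # intercept a0 before naming the coefficient vector.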
if (family == "cox") {
if (length(ix1) > 0) {
names(coef.beta) <- paste("X", ix1, sep = "")
}
} else {
coef.beta <- c(a0, coef.beta)
if (length(ix1) > 0) {
names(coef.beta) <- c("(Intercept)", paste("X", ix1, sep = ""))
}
}
  if (boot_ci) {
    # drop = FALSE keeps a one-column matrix when a single variable is selected.
    cis <- boot_sis(x = old.x[, ix1, drop = FALSE], y = y, family = family, penalty = penalty, covars = covars, parallel = parallel)
    return(list(sis.ix0 = sis.ix0, ix = ix1, coef.est = coef.beta, fit = selection.fit$fit, lambda = lambda, ix0 = pen.ind, ix_list = ix_list, cis = cis))
  } else {
    return(list(sis.ix0 = sis.ix0, ix = ix1, coef.est = coef.beta, fit = selection.fit$fit, lambda = lambda, ix0 = pen.ind, ix_list = ix_list))
  }
}