SIS: Sure Independence Screening

Documented in SIS

#' (Iterative) Sure Independence Screening ((I)SIS) and Fitting in Generalized
#' Linear Models and Cox's Proportional Hazards Models
#'
#' This function first implements the Iterative Sure Independence Screening for
#' different variants of (I)SIS, and then fits the final regression model using
#' the R packages \pkg{ncvreg}, \pkg{glmnet}, \pkg{msaenet} and \pkg{Coxnet} for the SCAD/MCP/LASSO/ENET/AENET
#' regularized loglikelihood for the variables picked by (I)SIS.
#'
#' @export
#' @importFrom glmnet glmnet
#' @importFrom msaenet aenet
#' @importFrom ncvreg ncvreg
#' @importFrom glmnet cv.glmnet
#' @importFrom ncvreg cv.ncvreg
#' @importFrom survival coxph
#' @importFrom survival Surv
#' @importFrom stats cor glm.fit
#' @importFrom stats glm.fit
#' @importFrom stats quantile
#' @importFrom stats binomial
#' @importFrom stats coef
#' @importFrom stats gaussian
#' @importFrom stats poisson
#' @importFrom stats deviance
#' @importFrom stats predict
#' @importFrom nnet multinom
#' @import doParallel
#' @importFrom foreach foreach
#' @importFrom Coxnet Coxnet
#'
#' @param x The design matrix, of dimensions n * p, without an intercept. Each
#' row is an observation vector.  \code{SIS} standardizes the data and includes
#' an intercept by default.
#' @param y The response vector of dimension n * 1. Quantitative for
#' \code{family='gaussian'}, non-negative counts for \code{family='poisson'},
#' binary (0-1) for \code{family='binomial'}, factor for \code{family='multinom'}.
#' For \code{family='cox'}, \code{y} should be an object of class \code{Surv}, 
#' as provided by the function \code{Surv()} in the package \pkg{survival}.
#' @param family Response type (see above).
#' @param penalty The penalty to be applied in the regularized likelihood
#' subproblems. 'SCAD', 'MCP', or 'lasso' are provided. 'lasso' is the default
#' for family = 'multinom' or 'cox', 'SCAD' is the default for other families. 
#' @param concavity.parameter The tuning parameter used to adjust the concavity
#' of the SCAD/MCP penalty. Default is 3.7 for SCAD and 3 for MCP.
#' @param tune Method for tuning the regularization parameter of the penalized
#' likelihood subproblems and of the final model selected by (I)SIS. Options
#' include \code{tune='bic'}, \code{tune='ebic'}, \code{tune='aic'}, and
#' \code{tune='cv'}.
#' @param nfolds Number of folds used in cross-validation. The default is 10.
#' @param type.measure Loss to use for cross-validation. Currently five
#' options, not all available for all models. The default is
#' \code{type.measure='deviance'}, which uses squared-error for gaussian models
#' (also equivalent to \code{type.measure='mse'} in this case), deviance for
#' logistic and poisson regression, and partial-likelihood for the Cox model.
#' Both \code{type.measure='class'} and \code{type.measure='auc'} apply only to
#' logistic regression and give misclassification error and area under the ROC
#' curve, respectively. \code{type.measure='mse'} or \code{type.measure='mae'}
#' (mean absolute error) can be used by all models except the \code{'cox'};
#' they measure the deviation from the fitted mean to the response. For
#' \code{penalty='SCAD'} and \code{penalty='MCP'}, only
#' \code{type.measure='deviance'} is available.
#' @param gamma.ebic Specifies the parameter in the Extended BIC criterion
#' penalizing the size of the corresponding model space. The default is
#' \code{gamma.ebic=1}. See references at the end for details.
#' @param nsis Number of pedictors recuited by (I)SIS.
#' @param iter Specifies whether to perform iterative SIS. The default is
#' \code{iter=TRUE}.
#' @param iter.max Maximum number of iterations for (I)SIS and its variants.
#' @param varISIS Specifies whether to perform any of the two ISIS variants
#' based on randomly splitting the sample into two groups. The variant
#' \code{varISIS='aggr'} is an aggressive variable screening procedure, while
#' \code{varISIS='cons'} is a more conservative approach. The default is
#' \code{varISIS='vanilla'}, which performs the traditional vanilla version of
#' ISIS. See references at the end for details.
#' @param perm Specifies whether to impose a data-driven threshold in the size
#' of the active sets calculated during the ISIS procedures. The threshold is
#' calculated by first decoupling the predictors \eqn{x_i} and response
#' \eqn{y_i} through a random permutation \eqn{\pi} of \eqn{(1,...,n)} to form
#' a null model.  For this newly permuted data, marginal regression
#' coefficients for each predictor are recalculated.  As the marginal
#' regression coefficients of the original data should be larger than most
#' recalculated coefficients in the null model, the data-driven threshold is
#' given by the \eqn{q}th quantile of the null coefficients. This data-driven
#' threshold only allows a \eqn{1-q} proportion of inactive variables to enter
#' the model when \eqn{x_i} and \eqn{y_i} are not related (in the null model).
#' The default is here is \code{perm=FALSE}. See references at the end for
#' details.
#' @param q Quantile for calculating the data-driven threshold in the
#' permutation-based ISIS. The default is \code{q=1} (i.e., the maximum
#' absolute value of the permuted estimates).
#' @param greedy Specifies whether to run the greedy modification of the
#' permutation-based ISIS. The default is \code{greedy=FALSE}.
#' @param greedy.size Maximum size of the active sets in the greedy
#' modification of the permutation-based ISIS. The default is
#' \code{greedy.size=1}.
#' @param seed Random seed used for sample splitting, random permutation, and
#' cross-validation sampling of training and test sets.
#' @param standardize Logical flag for x variable standardization, prior to
#' performing (iterative) variable screening.  The resulting coefficients are
#' always returned on the original scale. Default is \code{standardize=TRUE}.
#' If variables are in the same units already, you might not wish to
#' standardize.
#' @param covars Names of the factor variables.
#' @param boot_ci Logical flag for computing bootstrap confidence intervals. Default = FALSE.
#' @param parallel Specifies whether to conduct parallel computing
#' @return Returns an object with \itemize{
#' \item{sis.ix0}{The vector of indices selected by
#' only SIS.} 
#' \item{ix}{ The vector of indices selected by
#' (I)SIS with regularization step.} 
#' \item{coef.est}{The vector of coefficients of the final model selected by (I)SIS.} 
#' \item{fit}{A fitted object of type \code{ncvreg},
#' \code{cv.ncvreg}, \code{glmnet}, or \code{cv.glmnet} for the final model selected by the (I)SIS procedure. If \code{tune='cv'}, the returned fitted object is of type \code{cv.ncvreg} if \code{penalty='SCAD'} or
#' \code{penalty='MCP'}; otherwise, the returned fitted object is of type
#' \code{cv.glmnet}. For the remaining options of \code{tune}, the returned
#' object is of type \code{glmnet} if \code{penalty='lasso'}, and \code{ncvreg}
#' otherwise.  } 
#' \item{path.index}{ The index along the solution path of
#' \code{fit} for which the criterion specified in \code{tune} is minimized.  }
#' \item{ix0}{The vector of indices ordering by decreasing importance.}
#' \item{ix_list}{The list of vectors of indices ordering by decreasing importance, for each screening step.}
#' \item{cis}{Data frame with effect estimates: \itemize{ 
#' \item{coef}{Coefficient} 
#' \item{CI_low}{Lower confidence interval of the coefficient}
#' \item{CI_up}{Upper confidence interval of the coefficient} 
#' \item{Est}{Effect estimate (mean difference for Gaussian family, hazard ratio for Cox
#' family and odds ratio for binomial family) when comparing the two specified quantiles in \code{probs}}
#' \item{CI_low_perc}{Lower confidence interval of the effect estimate when comparing the quantiles specified in \code{probs}}
#' \item{CI_up_perc}{Upper confidence interval of the effect estimate when comparing the quantiles specified in \code{probs}}
#' }
#' }
#' }
#' @author Jianqing Fan, Yang Feng, Diego Franco Saldana, Richard Samworth, Arce Domingo-Relloso and
#' Yichao Wu
#' @seealso \code{\link{predict.SIS}}
#' @references
#' Diego Franco Saldana and Yang Feng (2018) SIS: An R package for Sure Independence Screening in
#' Ultrahigh Dimensional Statistical Models, \emph{Journal of Statistical Software}, \bold{83}, 2, 1-25.
#'
#' Jianqing Fan and Jinchi Lv (2008) Sure Independence Screening
#' for Ultrahigh Dimensional Feature Space (with discussion). \emph{Journal of
#' Royal Statistical Society B}, \bold{70}, 849-911.
#'
#' Jianqing Fan and Rui Song (2010) Sure Independence Screening in Generalized
#' Linear Models with NP-Dimensionality.  \emph{The Annals of Statistics},
#' \bold{38}, 3567-3604.
#'
#' Jianqing Fan, Richard Samworth, and Yichao Wu (2009) Ultrahigh Dimensional
#' Feature Selection: Beyond the Linear Model. \emph{Journal of Machine
#' Learning Research}, \bold{10}, 2013-2038.
#'
#' Jianqing Fan, Yang Feng, and Yichao Wu (2010) High-dimensional Variable
#' Selection for Cox Proportional Hazards Model. \emph{IMS Collections},
#' \bold{6}, 70-86.
#'
#' Jianqing Fan, Yang Feng, and Rui Song (2011) Nonparametric Independence
#' Screening in Sparse Ultrahigh Dimensional Additive Models. \emph{Journal of
#' the American Statistical Association}, \bold{106}, 544-557.
#'
#' Jiahua Chen and Zehua Chen (2008) Extended Bayesian Information Criteria for
#' Model Selection with Large Model Spaces. \emph{Biometrika}, \bold{95},
#' 759-771.
#' @keywords models
#' @examples
#'
#'
#' set.seed(0)
#' n <- 400
#' p <- 50
#' rho <- 0.5
#' corrmat <- diag(rep(1 - rho, p)) + matrix(rho, p, p)
#' corrmat[, 4] <- sqrt(rho)
#' corrmat[4, ] <- sqrt(rho)
#' corrmat[4, 4] <- 1
#' corrmat[, 5] <- 0
#' corrmat[5, ] <- 0
#' corrmat[5, 5] <- 1
#' cholmat <- chol(corrmat)
#' x <- matrix(rnorm(n * p, mean = 0, sd = 1), n, p)
#' x <- x %*% cholmat
#' 
#' # gaussian response
#' set.seed(1)
#' b <- c(4, 4, 4, -6 * sqrt(2), 4 / 3)
#' y <- x[, 1:5] %*% b + rnorm(n)
#' 
#' 
#' # SIS without regularization
#' model10 <- SIS(x, y, family = "gaussian", iter = FALSE)
#' model10$sis.ix0
#' # The top 10 selected variables
#' model10$ix0[1:10]
#' # The top 10 selected variables for each step
#' lapply(model10$ix_list, f <- function(x) {
#'   x[1:10]
#' })
#' # ISIS with regularization
#' model11 <- SIS(x, y, family = "gaussian", tune = "bic")
#' model12 <- SIS(x, y, family = "gaussian", tune = "bic", varISIS = "aggr", seed = 11)
#' model11$ix
#' model12$ix
#' \dontrun{
#' # binary response
#' set.seed(2)
#' feta <- x[, 1:5] %*% b
#' fprob <- exp(feta) / (1 + exp(feta))
#' y <- rbinom(n, 1, fprob)
#' model21 <- SIS(x, y, family = "binomial", tune = "bic")
#' model22 <- SIS(x, y, family = "binomial", tune = "bic", varISIS = "aggr", seed = 21)
#' model21$ix
#' model22$ix
#'
#' # poisson response
#' set.seed(3)
#' b <- c(0.6, 0.6, 0.6, -0.9 * sqrt(2))
#' myrates <- exp(x[, 1:4] %*% b)
#' y <- rpois(n, myrates)
#' model31 <- SIS(x, y,
#'   family = "poisson", penalty = "lasso", tune = "bic", perm = TRUE, q = 0.9,
#'   greedy = TRUE, seed = 31
#' )
#' model32 <- SIS(x, y,
#'   family = "poisson", penalty = "lasso", tune = "bic", varISIS = "aggr",
#'   perm = TRUE, q = 0.9, seed = 32
#' )
#' model31$ix
#' model32$ix
#'
#'
#' # Cox model
#' set.seed(4)
#' b <- c(4, 4, 4, -6 * sqrt(2), 4 / 3)
#' myrates <- exp(x[, 1:5] %*% b)
#' Sur <- rexp(n, myrates)
#' CT <- rexp(n, 0.1)
#' Z <- pmin(Sur, CT)
#' ind <- as.numeric(Sur <= CT)
#' y <- survival::Surv(Z, ind)
#' model41 <- SIS(x, y,
#'   family = "cox", penalty = "lasso", tune = "bic",
#'   varISIS = "aggr", seed = 41
#' )
#' model42 <- SIS(x, y,
#'   family = "cox", penalty = "lasso", tune = "bic",
#'   varISIS = "cons", seed = 41
#' )
#' model41$ix
#' model42$ix
#' 
#' # SIS with bootstrap confidence intervals
#' sis <- SIS(x, y, family = "cox", penalty='aenet', tune='cv', varISIS='cons',
#' seed = 41, boot_ci=TRUE)
#' sis$cis
#'}
#'

SIS <- function(x, y, family = c("gaussian", "binomial", "poisson", "cox", "multinom"), penalty = c("SCAD", "MCP", "lasso", "enet", "aenet", "msaenet"),
                      concavity.parameter = switch(penalty, SCAD = 3.7, 3), tune = c("bic", "ebic", "aic", "cv"), nfolds = 10,
                      type.measure = c("deviance", "class", "auc", "mse", "mae"), gamma.ebic = 1, nsis = NULL, iter = TRUE, iter.max = ifelse(greedy ==
                                                                                                                                                FALSE, 10, floor(nrow(x) / log(nrow(x)))), varISIS = c("vanilla", "aggr", "cons"), perm = FALSE, q = 1,
                      greedy = FALSE, greedy.size = 1, seed = NULL, standardize = TRUE, covars=NULL, probs=c(0.1,0.9), boot_ci = FALSE, parallel=TRUE) {
  this.call <- match.call()
  family <- match.arg(family)
  penalty <- match.arg(penalty)
  tune <- match.arg(tune)
  type.measure <- match.arg(type.measure)
  varISIS <- match.arg(varISIS)
  set.seed(seed)
  
  if (is.null(x) || is.null(y)) {
    stop("The data is missing!")
  }
  if (class(concavity.parameter) != "numeric") {
    stop("concavity.parameter must be numeric!")
  }
  if (class(nfolds) != "numeric") {
    stop("nfolds must be numeric!")
  }
  if (!is.null(seed) & class(seed) != "numeric") {
    stop("seed must be numeric!")
  }
  if (family == 'multinom' | is.null(penalty)){
    penalty = 'lasso'
  }
  if (tune != "cv" && penalty %in% c("aenet", "msaenet")) {
    stop("Model currently not implemented with selected tuning option")
  }
  if (family == "multinom" && penalty %in% c("SCAD", "MCP", "aenet", "msaenet")) {
    stop("Multinom model currently not implemented with selected penalty")
  }
  if (type.measure %in% c("class", "auc") && family %in% c("gaussian", "poisson", "cox")) {
    stop("'class' and 'auc' type measures are only available for logistic regression")
  }
  
  if (type.measure %in% c("class", "auc", "mse", "mae") && penalty %in% c("SCAD", "MCP")) {
    stop("Only 'deviance' is available as type.measure for non-convex penalties")
  }
  if (is.null(colnames(x))){
    colnames(x) <- unlist(lapply(seq(1:dim(x)[2]), function(y) paste0('V',y)))
    
  }
  
  fit <- switch(family, gaussian = sisglm(
    x, y, "gaussian", penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, probs, parallel
  ),
  binomial = sisglm(
    x, y, "binomial", penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, probs, parallel
  ), poisson = sisglm(
    x, y, "poisson", penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, probs, parallel
  ), cox = sisglm(
    x, y, "cox", penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, probs, parallel
  ), multinom = sisglm(
    x, y, "multinom", penalty, concavity.parameter, tune,
    nfolds, type.measure, gamma.ebic, nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed,
    standardize, boot_ci, covars, probs, parallel
  )
  )
  fit$call <- this.call
  class(fit) <- c(class(fit), "SIS")
  return(fit)
}

sisglm <- function(x, y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, nsis,
                   iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed, standardize, boot_ci, covars, probs, parallel, s1 = NULL, s2 = NULL, split.tries = 0) {
  storage.mode(x) <- "numeric"
  n <- dim(x)[1]
  p <- dim(x)[2]
  models <- vector("list")
  if (is.null(nsis) == TRUE) {
    nsis <- calculate.nsis(family = family, varISIS = varISIS, n = n, p = p)
  }
  if (is.null(s1) == TRUE) {
    if (!is.null(seed)) {
      set.seed(seed)
    }
    split.sample <- sample(1:n)
    s1 <- split.sample[1:ceiling(n / 2)]
    s2 <- setdiff(split.sample, s1)
  }
  old.x <- x
  if (standardize == TRUE) {
    x <- scale(x)
  }
  ix_list <- NULL
  iterind <- 0
  
  if (iter == TRUE) {
    ix0_list <- obtain.ix0(
      x = x, y = y, s1 = s1, s2 = s2, family = family, nsis = nsis, iter = iter, varISIS = varISIS,
      perm = perm, q = q, greedy = greedy, greedy.size = greedy.size, iterind = iterind
    )
    ix0 <- ix0_list$ix0
    ix0all <- ix0_list$ix0all
    sis.ix0 <- ix0
    ix_list[[1]] <- ix0all
    repeat {
      iterind <- iterind + 1
      cat("Iter", iterind, ", screening: ", ix0, "\n")
      if (length(ix0) == 1 & penalty == "lasso") {
        ix0 <- c(ix0, p + 1 - ix0)
      }
      pen.ind <- ix0
      
      selection.fit <- tune.fit(old.x[, ix0, drop = FALSE], y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, parallel, seed)
      coef.beta <- selection.fit$beta
      a0 <- selection.fit$a0
      
      lambda <- selection.fit$lambda
      lambda.ind <- selection.fit$lambda.ind
      ix1 <- ix0[selection.fit$ix]
      if (length(ix1) == 0) {
        split.tries <- split.tries + 1
        split.sample <- sample(1:n)
        s1 <- split.sample[1:ceiling(n / 2)]
        s2 <- setdiff(split.sample, s1)
        cat("Sample splitting attempt: ", split.tries, "\n")
        if (split.tries >= 20) {
          cat("No variables remaining after ", split.tries, " sample splitting attempts! \n")
          cat("You can try a more conservative variable screening approach! \n")
        } else {
          return(sisglm(
            old.x, y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic,
            nsis, iter, iter.max, varISIS, perm, q, greedy, greedy.size, seed, standardize, boot_ci, covars, parallel, s1 = s1, s2 = s2, split.tries
          ))
        }
      }
      
      
      cat("Iter", iterind, ", selection: ", ix1, "\n")
      if (length(ix1) >= nsis || iterind >= iter.max) {
        ix0 <- ix1
        if (length(ix1) >= nsis) {
          cat("Maximum number of variables selected \n")
        }
        if (iterind >= iter.max) {
          cat("Maximum number of iterations reached \n")
        }
        break
      }
      
      models[[iterind]] <- ix1
      flag.models <- 0
      if (iterind > 1) {
        for (j in 1:(iterind - 1)) {
          if (identical(models[[j]], ix1) == TRUE) {
            flag.models <- 1
          }
        }
      }
      if (flag.models == 1) {
        cat("Model already selected \n")
        break
      }
      
      candind <- setdiff(1:p, ix1)
      pleft <- nsis - length(ix1)
      newix_list <- obtain.newix(
        x = x, y = y, candind = candind, ix1 = ix1, s1 = s1, s2 = s2, family = family,
        pleft = pleft, varISIS = varISIS, perm = perm, q = q, greedy = greedy, greedy.size = greedy.size,
        iterind = iterind
      )
      newix <- newix_list$newix
      newixall <- newix_list$newixall
      cat("Iter", iterind, ", conditional-screening: ", newix, "\n")
      ix_list[[iterind + 1]] <- c(ix1, newixall)
      ix1 <- c(ix1, newix)
      if (setequal(ix1, ix0)) {
        flag.models <- 1
      }
      ix0 <- ix1
      if (length(ix1) == 0) break
    } # end repeat
  } else {
    # end if(iter==TRUE)
    ix0_list <- obtain.ix0(
      x = x, y = y, s1 = s1, s2 = s2, family = family, nsis = nsis, iter = iter, varISIS = varISIS,
      perm = perm, q = q, greedy = greedy, greedy.size = greedy.size, iterind = iterind
    )
    ix0 <- ix0_list$ix0
    ix0all <- ix0_list$ix0all
    ix_list[[1]] <- ix0all
    sis.ix0 <- ix0
    if (length(ix0) == 1 & penalty == "lasso") {
      ix0 <- c(ix0, p + 1 - ix0)
    }
    pen.ind <- ix0
    selection.fit <- tune.fit(old.x[, ix0, drop = FALSE], y, family, penalty, concavity.parameter, tune, nfolds, type.measure, gamma.ebic, parallel, seed)
    coef.beta <- selection.fit$beta
    a0 <- selection.fit$a0
    lambda <- selection.fit$lambda
    lambda.ind <- selection.fit$lambda.ind
    ix1 <- ix0[selection.fit$ix]
  }
  
  
  
  if (family == "cox") {
    if (length(ix1) > 0) {
      names(coef.beta) <- paste("X", ix1, sep = "")
    }
  } else {
    coef.beta <- c(a0, coef.beta)
    if (length(ix1) > 0) {
      names(coef.beta) <- c("(Intercept)", paste("X", ix1, sep = ""))
    }
  }
  
  if (boot_ci == TRUE){
    cis <- boot_sis(x=old.x[,ix1], y=y, family=family, penalty=penalty, covars=covars, probs=probs, parallel=parallel)
    return(list(sis.ix0 = sis.ix0, ix = ix1, coef.est = coef.beta, fit = selection.fit$fit, lambda = lambda, ix0 = pen.ind, ix_list = ix_list, cis=cis))
  } else{
    return(list(sis.ix0 = sis.ix0, ix = ix1, coef.est = coef.beta, fit = selection.fit$fit, lambda = lambda, ix0 = pen.ind, ix_list = ix_list))
  }
}