R/stability.R
In MLGL: Multi-Layer Group-Lasso

Documented in stability.MLGL

#' Stability selection for \code{\link{MLGL}}
#'
#' @title Stability Selection for Multi-Layer Group-lasso
#'
#' @author Quentin Grimonprez
#' @param X matrix of size n*p
#' @param y vector of size n. If loss = "logit", elements of y must be in \{-1,1\}
#' @param B number of bootstrap sample
#' @param fraction Fraction of data used at each of the \code{B} sub-samples
#' @param lambda lambda values for group lasso. If not provided, the function generates its own values of lambda
#' @param hc output of \code{\link{hclust}} function. If not provided, \code{\link[fastcluster]{hclust}} is run
#' with ward.D2 method
#' @param weightLevel a vector of size p for each level of the hierarchy. A zero indicates that the level will be ignored.
#' If not provided, use 1/(height between 2 successive levels)
#' @param weightSizeGroup a vector
#' @param loss a character string specifying the loss function to use, valid options are: "ls" least squares loss
#' (regression) and "logit" logistic loss (classification)
#' @param intercept should an intercept be included in the model ?
#' @param verbose print some information
#' @param ... Others parameters for \code{\link[gglasso]{gglasso}} function
#'
#' @return a stability.MLGL object containing:
#' \describe{
#' \item{lambda}{sequence of \code{lambda}.}
#' \item{B}{Number of bootstrap samples.}
#' \item{stability}{A matrix of size length(lambda)*number of groups containing the probability of selection of each group}
#' \item{var}{vector containing the index of covariates}
#' \item{group}{vector containing the index of associated groups of covariates}
#' \item{time}{computation time}
#' }
#'
#' @references Meinshausen and Buhlmann (2010). Stability selection.
#' Journal of the Royal Statistical Society: Series B (Statistical Methodology) 72.4, p. 417-473.
#'
#' @details
#' Hierarchical clustering is performed with all the variables. Then, the partitions from the different
#' levels of the hierarchy are used in the different runs of MLGL for estimating the probability of selection of each group.
#'
#' @examples
#' \donttest{
#' set.seed(42)
#' # Simulate gaussian data with block-diagonal variance matrix containing 12 blocks of size 5
#' X <- simuBlockGaussian(50, 12, 5, 0.7)
#'
#' # Generate a response variable
#' y <- X[, c(2, 7, 12)] %*% c(2, 2, -2) + rnorm(50, 0, 0.5)
#'
#' # Apply stability.MLGL method
#' res <- stability.MLGL(X, y)
#' }
#'
#' @seealso \link{cv.MLGL}, \link{MLGL}
#'
#' @export
stability.MLGL <- function(X, y, B = 50, fraction = 0.5, hc = NULL, lambda = NULL, weightLevel = NULL,
                           weightSizeGroup = NULL, loss = c("ls", "logit"), intercept = TRUE, verbose = FALSE, ...) {
  # check parameters
  loss <- match.arg(loss)
  .checkParameters(X, y, hc, lambda, weightLevel, weightSizeGroup, intercept, verbose, loss, NULL)

  # B
  if (!is.numeric(B) | (length(B) != 1)) {
    stop("B must be a positive integer.")
  }
  if (!.is.wholenumber(B)) {
    stop("B must be a positive integer.")
  }
  # fraction
  if (!is.numeric(fraction) | (length(fraction) != 1)) {
    stop("fraction must be a positive real lesser than 1.")
  }
  if ((fraction < 0) | (fraction > 1)) {
    stop("fraction must be a positive real lesser than 1.")
  }

  ################################ same as MLGL function
  # define some useful variables
  n <- nrow(X)
  p <- ncol(X)
  tcah <- rep(NA, 3)

  # if no hc output provided, we make one
  if (is.null(hc)) {
    if (verbose) {
      cat("Computing hierarchical clustering...")
    }

    t1 <- proc.time()
    d <- dist(t(X))
    hc <- hclust(d, method = "ward.D2")
    t2 <- proc.time()
    tcah <- t2 - t1
    if (verbose) {
      cat("DONE in ", tcah[3], "s\n")
    }
  }

  # compute weight, active variables and groups
  if (verbose) {
    cat("Preliminary step...")
  }
  t1 <- proc.time()
  prelim <- preliminaryStep(hc, weightLevel, weightSizeGroup)

  # duplicate data
  Xb <- X[, prelim$var]
  t2 <- proc.time()
  if (verbose) {
    cat("DONE in ", (t2 - t1)[3], "s\n")
  }

  ################################ END same as MLGL function


  ######## stability selection
  if (verbose) {
    cat("Running stability selection...\n")
  }
  t1 <- proc.time()

  # create lambda sequence, if not provided
  if (is.null(lambda)) {
    lambda <- lambdaseq(Xb, y, prelim$group, prelim$weight, intercept = intercept)
  }

  proba <- Matrix(0, nrow = length(lambda), ncol = prelim$group[length(prelim$group)])

  # bootstrap
  for (b in 1:B)
  {
    t1b <- proc.time()
    if (verbose) {
      cat("   Running sample", b, "...")
    }
    # sample of size n/2
    testind <- sample(1:n, floor(n * fraction))

    res <- gglasso(Xb[testind, ], y[testind], prelim$group,
      lambda = lambda, pf = prelim$weight,
      intercept = intercept, loss = loss, ...
    )

    # record selected groups
    for (iLambda in seq_along(lambda)) {
      non0 <- which(res$beta[, iLambda] != 0) # non 0 coefficients
      non0gr <- unique(prelim$group[non0]) # associated non 0 groups
      proba[iLambda, non0gr] <- proba[iLambda, non0gr] + 1 # increment counter of selected groups
    } # fin for lambda
    t2b <- proc.time()

    if (verbose) {
      cat("DONE in", (t2b - t1b)[3], "s \r")
    }
  } # fin for B

  # divide by B for having probabilities
  proba <- proba / B
  t2 <- proc.time()

  if (verbose) {
    cat("\nStability selection DONE in", (t2 - t1)[3], "s")
  }

  # create output
  res <- list()
  res$lambda <- lambda
  res$stability <- proba
  res$B <- B
  res$var <- prelim$var
  res$group <- prelim$group
  res$time <- c(tcah[3], (t2 - t1)[3])
  names(res$time) <- c("hclust", "stability")
  class(res) <- "stability.MLGL"


  return(res)
}


#
# generate lambda sequence for group lasso
#
# @param X data matrix
# @param y response
# @param group vector indicates the grouping of covariates
# @param lambda.factor the ratio lambdamin/lambdamax
# @param length length of the lambda sequence
#
# @return lambda sequence
#
lambdaseq <- function(X, y, group, weight, lambda.factor = NULL, length = 100, intercept = TRUE) {
  # dimension of X
  n <- nrow(X)
  p <- ncol(X)

  # compute the max lambda by KKT
  if (intercept) {
    y <- y - mean(y)
  }
  l <- tapply(1:p, group, FUN = function(x) {
    sqrt(sum((t(X[, x]) %*% y)^2)) / n
  })
  l <- l / weight
  lambdamax <- max(l)

  # heuristic min lambda
  if (is.null(lambda.factor)) {
    lambda.factor <- ifelse(n < p, 0.05, 0.001)
  }

  lambdamin <- lambdamax * lambda.factor

  # generate sequence on a log scale
  lambda <- exp(seq(log(lambdamax), log(lambdamin), length = length))

  return(lambda)
}