R/cv.MLGL.R

Defines functions cv.MLGL

Documented in cv.MLGL

#' V-fold cross validation for \code{\link{MLGL}} function
#'
#' @title  Multi-Layer Group-Lasso with cross V-fold validation
#' @author Quentin Grimonprez
#' @param X matrix of size n*p
#' @param y vector of size n. If loss = "logit", elements of y must be in {-1,1}
#' @param nfolds number of folds
#' @param lambda lambda values for group lasso. If not provided, the function generates its own values of lambda
#' @param hc output of \code{\link{hclust}} function. If not provided, \code{\link{hclust}} is run with ward.D2 method
#' @param weightLevel a vector of size p for each level of the hierarchy. A zero indicates that the level will be ignored. 
#' If not provided, use 1/(height between 2 successive levels)
#' @param weightSizeGroup a vector
#' @param loss a character string specifying the loss function to use, valid options are: "ls" least squares 
#' loss (regression) and "logit" logistic loss (classification)
#' @param sizeMaxGroup maximum size of selected groups. If NULL, no restriction
#' @param intercept should an intercept be included in the model ?
#' @param verbose print some informations
#' @param ... Others parameters for \code{\link{cv.gglasso}} function
#'
#'
#' @return a cv.MLGL object containing:
#' \describe{
#' \item{lambda}{values of \code{lambda}.}
#' \item{cvm}{the mean cross-validated error.}
#' \item{cvsd}{estimate of standard error of \code{cvm}}
#' \item{cvupper}{upper curve = \code{cvm+cvsd}}
#' \item{cvlower}{lower curve = \code{cvm-cvsd}}
#' \item{lambda.min}{The optimal value of \code{lambda} that gives minimum cross validation error \code{cvm}.}
#' \item{lambda.1se}{The largest value of \code{lambda} such that error is within 1 standard error of the minimum.}
#' \item{time}{computation time}
#' }
#'
#'
#' @details
#' Hierarchical clustering is performed with all the variables. Then, the partitions from the different
#'  levels of the hierarchy are used in the different run of MLGL for cross validation.
#'
#' @examples
#' set.seed(42)
#' # Simulate gaussian data with block-diagonal variance matrix containing 12 blocks of size 5
#' X <- simuBlockGaussian(50, 12, 5, 0.7)
#' # Generate a response variable
#' y <- X[, c(2, 7, 12)] %*% c(2, 2, -2) + rnorm(50, 0, 0.5)
#' # Apply cv.MLGL method
#' res <- cv.MLGL(X, y)
#' @seealso \link{MLGL}, \link{stability.MLGL}, \link{predict.cv.gglasso}, \link{coef.cv.MLGL}, \link{plot.cv.MLGL}
#'
#' @export
cv.MLGL <- function(X, y, nfolds = 5, lambda = NULL, hc = NULL, weightLevel = NULL, weightSizeGroup = NULL, 
                    loss = c("ls", "logit"), intercept = TRUE, sizeMaxGroup = NULL, verbose = FALSE, ...) {

  # check parameters
  loss <- match.arg(loss)
  .checkParameters(X, y, hc, lambda, weightLevel, weightSizeGroup, intercept, verbose, loss, sizeMaxGroup)

  # nfolds
  if (!is.numeric(nfolds) | (length(nfolds) != 1)) {
    stop("nfolds must be an integer greater than 3.")
  }
  if (!.is.wholenumber(nfolds) | nfolds <= 2 | nfolds > nrow(X)) { # restriction >=3 comes from cv.gglasso function
    stop("nfolds must be an integer greater than 3.")
  }

  ################################ same as MLGL function
  # define some useful variables
  n <- nrow(X)
  p <- ncol(X)
  tcah <- rep(NA, 3)

  # if no hc output provided, we make one
  if (is.null(hc)) {
    if (verbose) {
      cat("Computing hierarchical clustering...")
    }

    t1 <- proc.time()
    d <- dist(t(X))
    hc <- hclust(d, method = "ward.D2")
    t2 <- proc.time()
    tcah <- t2 - t1
    if (verbose) {
      cat("DONE in ", tcah[3], "s\n")
    }
  }

  # compute weight, active variables and groups
  if (verbose) {
    cat("Preliminary step...")
  }
  t1 <- proc.time()
  prelim <- preliminaryStep(hc, weightLevel, weightSizeGroup, sizeMaxGroup)

  # duplicate data
  Xb <- X[, prelim$var]
  t2 <- proc.time()
  if (verbose) {
    cat("DONE in ", (t2 - t1)[3], "s\n")
  }

  ################################ END same as MLGL function



  ######## group lasso
  if (verbose) {
    cat("Computing group-lasso...")
  }
  t1 <- proc.time()
  res <- cv.gglasso(Xb, y, prelim$group, pf = prelim$weight, nfolds = nfolds, lambda = lambda, 
                    intercept = intercept, loss = loss, ...)
  t2 <- proc.time()
  tgglasso <- t2 - t1
  if (verbose) {
    cat("DONE in ", tgglasso[3], "s\n")
  }

  # delete some gglasso output
  res$name <- NULL
  res$gglasso.fit <- NULL

  # rename output
  res$cvlower <- res$cvlo
  res$cvlo <- NULL

  res$time <- c(tcah[3], tgglasso[3])
  names(res$time) <- c("hclust", "glasso")
  class(res) <- "cv.MLGL"


  return(res)
}

Try the MLGL package in your browser

Any scripts or data that you put into this service are public.

MLGL documentation built on March 31, 2023, 9:32 p.m.