R/fullProcess.R

Defines functions .checkFullProcess fullProcess.formula fullProcess.default fullProcess

Documented in fullProcess fullProcess.default fullProcess.formula

#' Run hierarchical clustering following by a group-lasso on all the different partition and a hierarchical testing procedure. 
#' Only for linear regression problem.
#'
#' @title Full process of MLGL
#'
#' @author Quentin Grimonprez
#' @param X matrix of size n*p
#' @param y vector of size n.
#' @param hc output of \code{\link{hclust}} function. If not provided, \code{\link{hclust}} is run with ward.D2 method. User 
#' can also provide the desired method: "single", "complete", "average", "mcquitty", "ward.D", "ward.D2", "centroid", "median".
#' @param control either "FDR" or "FWER"
#' @param alpha control level for testing procedure
#' @param test test used in the testing procedure. Default is \link{partialFtest}
#' @param fractionSampleMLGL a real between 0 and 1: the fraction of individuals to use in the sample for MLGL (see Details).
#' @param BHclust number of replicates for computing the distance matrix for the hierarchical clustering tree
#' @param nCore number of cores used for distance computation. Use all cores by default.
#' @param addRoot If TRUE, add a common root containing all the groups
#' @param Shaffer If TRUE, a Shaffer correction is performed (only if control = "FWER")
#' @param ... Others parameters for MLGL
#'
#' @return a list containing:
#' \describe{
#'   \item{res}{output of \link{MLGL} function}
#'   \item{lambdaOpt}{lambda values maximizing the number of rejects}
#'   \item{var}{A vector containing the index of selected variables for the first \code{lambdaOpt} value}
#'   \item{group}{A vector containing the values index of selected groups for the first \code{lambdaOpt} value}
#'   \item{selectedGroups}{Selected groups for the first \code{lambdaOpt} value}
#'   \item{reject}{Selected groups for all lambda values}
#'   \item{alpha}{Control level}
#'   \item{test}{Test used in the testing procedure}
#'   \item{control}{"FDR" or "FWER"}
#'   \item{time}{Elapsed time}
#' }
#'
#' @details
#' Divide the n individuals in two samples. Then the three following steps are done:
#' 1) Bootstrap Hierarchical Clustering of the variables of X
#' 2) MLGL on the second sample of individuals
#' 3) Hierarchical testing procedure on the first sample of individuals.
#'
#' @examples
#' # least square loss
#' set.seed(42)
#' X <- simuBlockGaussian(50, 12, 5, 0.7)
#' y <- X[, c(2, 7, 12)] %*% c(2, 2, -2) + rnorm(50, 0, 0.5)
#' res <- fullProcess(X, y)
#' @seealso \link{MLGL}, \link{hierarchicalFDR}, \link{hierarchicalFWER}, \link{selFDR}, \link{selFWER}
#'
#' @export
fullProcess <- function(X, ...) {
  UseMethod("fullProcess")
}

#' @rdname fullProcess
#' @export
fullProcess.default <- function(X, y, control = c("FWER", "FDR"), alpha = 0.05, test = partialFtest, 
                        hc = NULL, fractionSampleMLGL = 1 / 2, BHclust = 50, nCore = NULL, 
                        addRoot = FALSE, Shaffer = FALSE, ...) {
  loss <- "ls"
  # if(loss == "logit" & identical(test, partialFtest))
  #   test = partialChisqtest
  .checkFullProcess(X, y, hc, alpha, test, fractionSampleMLGL, loss)

  n <- nrow(X)

  # Split the data in 2
  ind2 <- sample(n, floor(n * fractionSampleMLGL))
  ind1 <- (1:n)[-ind2]

  ##### part 1: hierarchical clustering with half of the data
  if (is.null(hc) | is.character(hc)) {
    # center variables and sd = 1
    Xb <- scale(X, center = TRUE, scale = FALSE)
    Xb <- scale(Xb, center = FALSE, scale = sqrt(colSums(Xb^2) / n))


    # hierarchical clustering
    hc <- bootstrapHclust(Xb, frac = 1, B = BHclust, method = ifelse(is.character(hc), hc, "ward.D2"), nCore = nCore)
  }


  ##### part 2: group-lasso
  res <- MLGL(X[ind2, ], y[ind2], hc = hc, loss = loss, ...)


  ##### part 3: testing procedure
  outTest <- HMT(res, X[ind1, ], y[ind1], control, alpha, test, addRoot, Shaffer)

  outObj <- c(list(res = res), outTest)
  class(outObj) <- "fullProcess"

  return(outObj)
}


#' @param formula an object of class "formula" (or one that can be coerced to that class): a symbolic description of the 
#' model to be fitted.
#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing 
#' the variables in the model. If not found in data, the variables are taken from environment (formula)
#'
#'
#' @rdname fullProcess
#'
#' @export
fullProcess.formula <- function(formula, data, control = c("FWER", "FDR"), alpha = 0.05, 
                                test = partialFtest, hc = NULL, fractionSampleMLGL = 1 / 2, 
                                BHclust = 50, nCore = NULL, addRoot = FALSE, Shaffer = FALSE, ...) {
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf$drop.unused.levels <- TRUE
  mf[[1L]] <- quote(stats::model.frame)
  mf <- eval(mf, parent.frame())
  mt <- attr(mf, "terms")
  y <- model.response(mf, "numeric")
  X <- model.matrix(mt, mf)
  X <- as.matrix(X[, -1])

  res <- fullProcess.default(X, y, control, alpha, test, hc, fractionSampleMLGL, BHclust, nCore, addRoot, Shaffer, ...)

  return(res)
}

# check parameters of MLGL function
.checkFullProcess <- function(X, y, hc, alpha, test, fractionSampleMLGL, loss) {
  # check X
  if (!is.matrix(X)) {
    stop("X has to be a matrix.")
  }
  if (any(is.na(X))) {
    stop("Missing values in X not allowed.")
  }
  if (!is.numeric(X)) {
    stop("X has to be a matrix of real.")
  }

  # check y
  if (!is.numeric(y)) {
    stop("y has to be a vector of real.")
  }
  if (any(is.na(y))) {
    stop("Missing values in y not allowed.")
  }
  if (loss == "logit" && any(y %in% c(-1, 1) == FALSE)) {
    stop("Classification method requires the response y to be in {-1,1}")
  }

  # check if X and y are compatible
  if (nrow(X) != length(drop(y))) {
    stop("The length of y and the number of rows of X don't match.")
  }

  # check hc
  if (!is.null(hc)) {
    if (is.character(hc)) {
      if (!(hc %in% c("single", "complete", "average", "mcquitty", "ward.D", "ward.D2", "centroid", "median"))) {
        stop("In character mode, hc must be \"single\", \"complete\", \"average\", \"mcquitty\", \"ward.D\", 
             \"ward.D2\", \"centroid\" or \"median\".")
      }
    } else {
      # check if hc is a hclust object
      if (!inherits(hc, "hclust")) {
        stop("hc must be an hclust object.")
      }
      # check if hc and X are compatible
      if (length(hc$order) != ncol(X)) {
        stop("hc is not a clustering of the p covariates of X.")
      }
    }
  }

  # alpha
  if (length(alpha) != 1) {
    stop("alpha must be a real between 0 and 1.")
  }
  if ((alpha <= 0) || (alpha > 1)) {
    stop("alpha must be a real between 0 and 1.")
  }

  # check if test is a function
  if (!is.function(test)) {
    stop("test must be a function.")
  }

  # fractionSampleMLGL
  if (length(fractionSampleMLGL) != 1) {
    stop("fractionSampleMLGL must be a real between 0 and 1.")
  }
  if ((fractionSampleMLGL <= 0) || (fractionSampleMLGL >= 1)) {
    stop("fractionSampleMLGL must be a real between 0 and 1.")
  }

  invisible(return(NULL))
}

Try the MLGL package in your browser

Any scripts or data that you put into this service are public.

MLGL documentation built on March 31, 2023, 9:32 p.m.