# R/epx.R
# In EPX: Ensemble of Phalanxes

# Documented in epx

#' Fitting an Ensemble of Phalanxes
#'
#' \code{epx} forms phalanxes of variables from training data for
#' binary classification with a rare class. The phalanxes are
#' disjoint subsets of variables, each of which is fit with a base classifier.
#' Together they form an ensemble.
#'
#' @references
#' Tomal, J. H., Welch, W. J., & Zamar, R. H. (2015).
#' Ensembling classification models based on phalanxes of variables with
#' applications in drug discovery.
#' \emph{The Annals of Applied Statistics},
#' \emph{9}(1), 69-93.
#' \doi{10.1214/14-AOAS778}
#'
#' @param x Explanatory variables (predictors, features) contained in a data frame.
#' @param y Binary response variable vector (numeric or integer):
#'   1 for the rare class, 0 for the majority class.
#' @param phalanxes.initial Initial variable group indices; default one group
#'   per variable. Example: vector c(1, 1, 2, 2, 3, ...) puts variables 1 and 2
#'   in group 1, variables 3 and 4 in group 2, etc.  Indices cannot
#'   be skipped, e.g., c(1, 3, 3, 4, 4, 3, 1) skips group 2 and is invalid.
#' @param alpha Lower-tail probability for the critical quantile of the reference
#'   distribution of the \code{performance} measure for a classifier that ranks
#'   at random (i.e., the predictors have no explanatory power); default is 0.95.
#' @param nsim Number of simulations for the reference empirical distribution of
#'   the performance measure; default is 1000.
#' @param rmin.target To merge the pair of groups with the
#' minimum ratio of performance measures (ensemble of models to single model)
#' into a single group their ratio must be less than
#' \code{rmin.target}, otherwise merging stops; default is 1.
#' @param classifier Base classifier, one of
#'   \code{c("random forest", "logistic regression", "neural network")};
#'   default is "random forest", which uses
#'   \code{\link[randomForest]{randomForest}}.
#' @param classifier.args Arguments for the base \code{classifier}
#'   specified in a list as follows: \code{list(argName1 = value1, argName2 =
#'   value2, ...)}. If the list is empty, the classifier will use
#'   its defaults.  For "random forest", user may specify \code{replace, cutoff,
#'   nodesize, maxnodes}. For "logistic regression" there are no options. For
#'   "neural network", user may specify \code{size, trace}.
#' @param performance Performance assessment metric, one of
#'   \code{c("\link{AHR}", "\link{IE}", "\link{TOP1}", "\link{RKL}")};
#'   default is \code{\link{AHR}}.
#' @param performance.args Arguments for the \code{performance} measure
#'   specified in a list as follows:
#'   \code{list(argName1 = value1, argName2 = value2, ...)}.
#'   If the list is empty, the performance measure will use its
#'   defaults. Currently,  only \code{\link{IE}} takes an argument list,
#'   and its only argument is \code{cutoff}.
#' @param computing Whether to compute sequentially or in parallel. Input is one
#'   of \code{c("sequential", "parallel")}; default is "sequential".
#' @param ... Further arguments passed to or from other methods.
#'
#' @details Please see Tomal et al. (2015) for more description of phalanx formation.
#'
#' @return Returns an object of class \code{epx}, which is
#' a list containing the following components:
#' \item{PHALANXES}{List of four vectors, each the same length as the number of
#' explanatory variables (columns in \code{x}): \code{phalanxes.initial},
#' \code{phalanxes.filtered}, \code{phalanxes.merged}, \code{phalanxes.final}.
#' Each vector contains the phalanx membership indices of all explanatory variables
#' at one of the four stages of phalanx-formation. Element \eqn{i} of a vector
#' is the index of the phalanx to which variable \eqn{i} belongs. Phalanx 0 does
#' not exist and so membership in phalanx 0 indicates that the variable does not
#' belong to any phalanx; it has been screened out.}
#' \item{PHALANXES.FINAL.PERFORMANCE}{Vector of \code{performance} measures of
#' the final phalanxes: the first element is for phalanx 1, etc.}
#' \item{PHALANXES.FINAL.FITS}{A matrix with number of rows equal to the number
#' of observations in the training data and number of columns equal to the
#' number of final phalanxes.  Column \eqn{i} contains the predicted
#' probabilities of class 1 from fitting the base \code{classifier} to the
#' variables in phalanx \eqn{i}.}
#' \item{ENSEMBLED.FITS}{The predicted probabilities of class 1 from the
#' ensemble of phalanxes based on \code{phalanxes.final}.}
#' \item{BASE.CLASSIFIER.ARGS}{(Parsed) record of user-specified arguments for
#' \code{classifier}.}
#' \item{PERFORMANCE.ARGS}{(Parsed) record of user-specified arguments for
#' \code{performance}.}
#' \item{X}{User-provided data frame of explanatory variables.}
#' \item{Y}{User-provided binary response vector.}
#'
#' @seealso \code{\link{summary.epx}} prints a summary of the results,
#' and \code{\link{cv.epx}} assesses performance via cross-validation.
#'
#'
#' @examples
#' # Example with data(harvest)
#'
#' ## Phalanx-formation using a base classifier with 50 trees (default = 500)
#' \donttest{
#' set.seed(761)
#' model <- epx(x = harvest[, -4], y = harvest[, 4],
#'              classifier.args = list(ntree = 50))
#'
#' ## Phalanx-membership of explanatory variables at the four stages
#' ## of phalanx formation (0 means not in a phalanx)
#' model$PHALANXES
#'
#' ## Summary of the final phalanxes (matches above)
#' summary(model)
#' \dontrun{
#' ## Parallel computing
#' clusters <- parallel::detectCores()
#' cl <- parallel::makeCluster(clusters)
#' doParallel::registerDoParallel(cl)
#' set.seed(761)
#' model.par <- epx(x = harvest[, -4], y = harvest[, 4],
#'                  computing = "parallel")
#' parallel::stopCluster(cl)
#' }
#' }
#' @export
epx <- function(x, y,
                phalanxes.initial = c(1:ncol(x)),
                alpha = 0.95,
                nsim = 1000,
                rmin.target = 1,
                classifier = "random forest",
                classifier.args = list(),
                performance = "AHR",
                performance.args = list(),
                computing = "sequential",
                ...) {

  ## Error handling ============================================================

  # This guard must run before anything forces 'x' or 'y'; otherwise R's
  # generic "argument is missing, with no default" error pre-empts it.
  if (missing(x) || missing(y)) {
    stop("Arguments 'x', and 'y' are missing.")
  }

  # y is passed on as a numeric or integer vector with values 0 or 1 because
  # AHR requires a numeric vector; it is converted to a factor only where
  # required, e.g., in random forest via .ClassifierFormula.
  # is.numeric() is TRUE for both double and integer vectors and FALSE for
  # factors. (An additional is.integer() test would wrongly let factors
  # through, since factors are stored as integers.)
  if (!is.numeric(y)) {
    stop("'y' must be a numeric or integer vector.")
  }

  # Both classes must be present, and nothing other than 0/1 (NA included).
  if (!all(y %in% c(0, 1)) || !all(c(0, 1) %in% y)) {
    stop("'y' must be a binary response vector; 1 is the rare class, 0 is majority. ")
  }

  # is.data.frame() always yields a single TRUE/FALSE, unlike comparing
  # class(x) to a string, which has length > 1 for matrices and tibbles.
  if (!is.data.frame(x)) {
    stop("'x' is not a dataframe.")
  }

  # nrow() of a data frame is never NULL, so only the zero-row case is checked.
  if (nrow(x) == 0L) {
    stop("0 (non-NA) cases")
  }

  # Group indices must lie in 0..length(phalanxes.initial); NA fails %in%.
  # NOTE(review): 0 is accepted here, but the "no skipped indices" check below
  # counts 0 as a distinct group -- confirm whether 0 is meant to be allowed
  # in the initial grouping.
  if (!all(phalanxes.initial %in% 0:length(phalanxes.initial))) {
    stop("Initial phalaxes not well-defined.")
  }

  # One group index per column of x.
  if (length(phalanxes.initial) != ncol(x)) {
    stop("Initial phalanxes are not defined for all variables.")
  }

  # Indices may not be skipped: the number of distinct indices must equal
  # the largest index.
  if (length(unique(phalanxes.initial)) != max(phalanxes.initial)) {
    stop("Initial phalanxes are not well-defined.")
  }

  # The scalar tuning parameters must be single, non-missing numbers so that
  # the range comparisons below always produce a single TRUE/FALSE.
  if (!is.numeric(alpha) || length(alpha) != 1L || is.na(alpha) ||
      alpha > 1 || alpha < 0) {
    stop("Invalid index of the reference distribution quantile.")
  }
  if (!is.numeric(nsim) || length(nsim) != 1L || is.na(nsim) ||
      nsim <= 0) {
    stop("Invalid number of simulations.")
  }
  # is.finite() rejects NA, NaN and +/-Inf in one test.
  if (!is.numeric(rmin.target) || length(rmin.target) != 1L ||
      !is.finite(rmin.target) || rmin.target <= 0) {
    stop("Invalid rmin.target, the minimum threshold ratio.")
  }

  ## Parallel computing ========================================================
  computing <- match.arg(arg = computing,
                         choices = c("sequential", "parallel"),
                         several.ok = FALSE)

  if (computing == "sequential") { # suppress irritating warning message for seq
    # Side effect: registers the sequential foreach backend for the session.
    foreach::registerDoSEQ()
  } else if (foreach::getDoParWorkers() == 1) {
    message("Selected parallel computing, but only 1 execution worker
        in the currently registered doPar backend.")
  }

  ## epx algorithm =============================================================
  # 'computing' is not forwarded; only the foreach backend registration above
  # depends on it. The result of epxAlgorithm() is returned unchanged.
  epxAlgorithm(x = x,
               y = y,
               phalanxes.initial = phalanxes.initial,
               alpha = alpha,
               nsim = nsim,
               rmin.target = rmin.target,
               classifier = classifier,
               classifier.args = classifier.args,
               performance = performance,
               performance.args = performance.args,
               ...)
}