# Nothing
#' Fitting an Ensemble of Phalanxes
#'
#' \code{epx} forms phalanxes of variables from training data for
#' binary classification with a rare class. The phalanxes are
#' disjoint subsets of variables, each of which is fit with a base classifier.
#' Together they form an ensemble.
#'
#' @references
#' Tomal, J. H., Welch, W. J., & Zamar, R. H. (2015).
#' Ensembling classification models based on phalanxes of variables with
#' applications in drug discovery.
#' \emph{The Annals of Applied Statistics},
#' \emph{9}(1), 69-93.
#' \doi{10.1214/14-AOAS778}
#'
#' @param x Explanatory variables (predictors, features) contained in a data frame.
#' @param y Binary response variable vector (numeric or integer):
#' 1 for the rare class, 0 for the majority class.
#' @param phalanxes.initial Initial variable group indices; default one group
#' per variable. Example: vector c(1, 1, 2, 2, 3, ...) puts variables 1 and 2
#' in group 1, variables 3 and 4 in group 2, etc. Indices cannot
#' be skipped, e.g., c(1, 3, 3, 4, 4, 3, 1) skips group 2 and is invalid.
#' @param alpha Lower-tail probability for the critical quantile of the reference
#' distribution of the \code{performance} measure for a classifier that ranks
#' at random (i.e., the predictors have no explanatory power); default is 0.95.
#' @param nsim Number of simulations for the reference empirical distribution of
#' the performance measure; default is 1000.
#' @param rmin.target To merge the pair of groups with the
#' minimum ratio of performance measures (ensemble of models to single model)
#' into a single group their ratio must be less than
#' \code{rmin.target}, otherwise merging stops; default is 1.
#' @param classifier Base classifier, one of
#' \code{c("random forest", "logistic regression", "neural network")};
#' default is "random forest", which uses
#' \code{\link[randomForest]{randomForest}}.
#' @param classifier.args Arguments for the base \code{classifier}
#' specified in a list as follows: \code{list(argName1 = value1, argName2 =
#' value2, ...)}. If the list is empty, the classifier will use
#' its defaults. For "random forest", user may specify \code{replace, cutoff,
#' nodesize, maxnodes}. For "logistic regression" there are no options. For
#' "neural network", user may specify \code{size, trace}.
#' @param performance Performance assessment metric, one of
#' \code{c("\link{AHR}", "\link{IE}", "\link{TOP1}", "\link{RKL}")};
#' default is \code{\link{AHR}}.
#' @param performance.args Arguments for the \code{performance} measure
#' specified in a list as follows:
#' \code{list(argName1 = value1, argName2 = value2, ...)}.
#' If the list is empty, the performance measure will use its
#' defaults. Currently, only \code{\link{IE}} takes an argument list,
#' and its only argument is \code{cutoff}.
#' @param computing Whether to compute sequentially or in parallel. Input is one
#' of \code{c("sequential", "parallel")}; default is "sequential".
#' @param ... Further arguments passed to or from other methods.
#'
#' @details Please see Tomal et al. (2015) for more description of phalanx formation.
#'
#' @return Returns an object of class \code{epx}, which is
#' a list containing the following components:
#' \item{PHALANXES}{List of four vectors, each the same length as the number of
#' explanatory variables (columns in \code{x}): \code{phalanxes.initial},
#' \code{phalanxes.filtered}, \code{phalanxes.merged}, \code{phalanxes.final}.
#' Each vector contains the phalanx membership indices of all explanatory variables
#' at one of the four stages of phalanx-formation. Element \eqn{i} of a vector
#' is the index of the phalanx to which variable \eqn{i} belongs. Phalanx 0 does
#' not exist and so membership in phalanx 0 indicates that the variable does not
#' belong to any phalanx; it has been screened out.}
#' \item{PHALANXES.FINAL.PERFORMANCE}{Vector of \code{performance} measures of
#' the final phalanxes: the first element is for phalanx 1, etc.}
#' \item{PHALANXES.FINAL.FITS}{A matrix with number of rows equal to the number
#' of observations in the training data and number of columns equal to the
#' number of final phalanxes. Column \eqn{i} contains the predicted
#' probabilities of class 1 from fitting the base \code{classifier} to the
#' variables in phalanx \eqn{i}.}
#' \item{ENSEMBLED.FITS}{The predicted probabilities of class 1 from the
#' ensemble of phalanxes based on \code{phalanxes.final}.}
#' \item{BASE.CLASSIFIER.ARGS}{(Parsed) record of user-specified arguments for
#' \code{classifier}.}
#' \item{PERFORMANCE.ARGS}{(Parsed) record of user-specified arguments for
#' \code{performance}.}
#' \item{X}{User-provided data frame of explanatory variables.}
#' \item{Y}{User-provided binary response vector.}
#'
#' @seealso \code{\link{summary.epx}} prints a summary of the results,
#' and \code{\link{cv.epx}} assesses performance via cross-validation.
#'
#'
#' @examples
#' # Example with data(harvest)
#'
#' ## Phalanx-formation using a base classifier with 50 trees (default = 500)
#' \donttest{
#' set.seed(761)
#' model <- epx(x = harvest[, -4], y = harvest[, 4],
#' classifier.args = list(ntree = 50))
#'
#' ## Phalanx-membership of explanatory variables at the four stages
#' ## of phalanx formation (0 means not in a phalanx)
#' model$PHALANXES
#'
#' ## Summary of the final phalanxes (matches above)
#' summary(model)
#' \dontrun{
#' ## Parallel computing
#' clusters <- parallel::detectCores()
#' cl <- parallel::makeCluster(clusters)
#' doParallel::registerDoParallel(cl)
#' set.seed(761)
#' model.par <- epx(x = harvest[, -4], y = harvest[, 4],
#' computing = "parallel")
#' parallel::stopCluster(cl)
#' }
#' }
#' @export
epx <- function(x, y,
                phalanxes.initial = c(1:ncol(x)),
                alpha = 0.95,
                nsim = 1000,
                rmin.target = 1,
                classifier = "random forest",
                classifier.args = list(),
                performance = "AHR",
                performance.args = list(),
                computing = "sequential",
                ...) {
  # Validates the inputs, configures the foreach backend according to
  # 'computing', and then delegates the actual phalanx formation to
  # epxAlgorithm(), whose result is returned unchanged.

  ## Error handling ============================================================
  # This guard must come before anything evaluates 'x' or 'y'; otherwise R
  # raises its own generic "argument is missing" error and this message is
  # never shown.
  if (missing(x) || missing(y)) {
    stop("Arguments 'x', and 'y' are missing.")
  }
  # y is passed to epxAlgorithm as a numeric or integer vector with
  # values 0 or 1 because AHR requires a numeric vector.
  # i.e. y will be converted to a factor when required, e.g., in random forest
  # due to .ClassifierFormula.
  # NOTE(review): is.integer() is TRUE for factors, so a factor 'y' passes this
  # check even though is.numeric() is FALSE for it -- confirm whether factors
  # are meant to be accepted here.
  if (!is.numeric(y) && !is.integer(y)) {
    stop("'y' must be a numeric or integer vector.")
  }
  # Both classes must be present and no value other than 0/1 may occur.
  if (!all(y %in% c(0, 1)) || !all(c(0, 1) %in% y)) {
    stop("'y' must be a binary response vector; 1 is the rare class, 0 is majority. ")
  }
  # inherits() yields a single TRUE/FALSE even when class(x) has several
  # elements (matrices, subclassed data frames), so the 'if' condition is
  # always a scalar.
  if (!inherits(x, "data.frame")) {
    stop("'x' is not a dataframe.")
  }
  if (is.null(nrow(x))) {
    stop("'x' cannot be empty.")
  }
  if (nrow(x) == 0L) {
    stop("0 (non-NA) cases")
  }
  # Group indices must lie in 0..length(phalanxes.initial), ...
  if (any(!(phalanxes.initial %in% 0:length(phalanxes.initial)))) {
    stop("Initial phalaxes not well-defined.")
  }
  # ... there must be exactly one index per column of 'x', ...
  if (length(phalanxes.initial) != ncol(x)) {
    stop("Initial phalanxes are not defined for all variables.")
  }
  # ... and the number of distinct indices must equal the largest index
  # (this is how skipped group numbers are detected).
  if (length(unique(phalanxes.initial)) != max(phalanxes.initial)) {
    stop("Initial phalanxes are not well-defined.")
  }
  if (alpha > 1 || alpha < 0) {
    stop("Invalid index of the reference distribution quantile.")
  }
  if (nsim <= 0) {
    stop("Invalid number of simulations.")
  }
  if (rmin.target >= Inf || rmin.target <= 0) {
    stop("Invalid rmin.target, the minimum threshold ratio.")
  }

  ## Parallel computing ========================================================
  computing <- match.arg(arg = computing,
                         choices = c("sequential", "parallel"),
                         several.ok = FALSE)
  if (computing == "sequential") { # suppress irritating warning message for seq
    foreach::registerDoSEQ()
  } else if (foreach::getDoParWorkers() == 1) {
    # Parallel was requested but the registered backend has a single worker;
    # tell the user instead of failing.
    message("Selected parallel computing, but only 1 execution worker
in the currently registered doPar backend.")
  }

  ## epx algorithm =============================================================
  # 'computing' is not forwarded: the backend was configured above.
  epxAlgorithm(x = x,
               y = y,
               phalanxes.initial = phalanxes.initial,
               alpha = alpha,
               nsim = nsim,
               rmin.target = rmin.target,
               classifier = classifier,
               classifier.args = classifier.args,
               performance = performance,
               performance.args = performance.args,
               ...)
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.