#' Energy Forests
#'
#' Fits an Energy Forest, in the form of either a bagging of Energy Trees or a
#' Random Energy Forest, depending on the value of the \code{random_covs}
#' parameter.
#'
#' @param ntrees Number of Energy Trees to grow, i.e., the number of
#' bootstrap samples to be generated and used for fitting.
#' @param ncores Number of cores to use, i.e., at most how many child processes
#' will be run simultaneously. Must be exactly 1 on Windows (which uses the
#' master process). \code{ncores} corresponds to \code{mc.cores} in
#' \code{\link[parallel:mclapply]{mclapply()}}, which is used internally to grow
#' the individual Energy Trees in parallel.
#' @param perf_metric Performance metric that is used to compute the Out-Of-Bag
#' score. If \code{NULL}, default choices are used: Balanced Accuracy for
#' classification and Root Mean Square Percentage Error for regression. See
#' Details for further information and possible alternatives.
#' @param random_covs Size of the random subset of covariates to choose from at
#' each split. If set to \code{NULL}, all the covariates are considered each
#' time, resulting in a bagging of Energy Trees. When \code{random_covs} is an
#' integer greater than 1 and less than the total number of covariates, the
#' model is a Random Energy Forest. By default, it is equal to \code{"auto"},
#' which implies the square root of the number of covariates for
#' classification, or one third of the number of covariates for regression (in
#' both cases, rounded down to the nearest integer).
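#' As an illustration, with 12 covariates \code{"auto"} gives
#' \code{floor(sqrt(12)) = 3} covariates per split for classification and
#' \code{floor(12 / 3) = 4} for regression.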
#' @param verbose Logical indicating whether to print a one-line notification
#' when the fitting of each tree is completed.
#' @inheritParams etree
#'
#' @details
#'
#' \code{eforest()} generates \code{ntrees} bootstrap samples and calls
#' \code{\link[etree:etree]{etree()}} on each of them. It then computes the
#' Out-Of-Bag (OOB) score using the performance metric defined through
#' \code{perf_metric}.
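#'
#' As a conceptual sketch of the OOB mechanism (with a hypothetical sample size
#' and number of trees, not the exact internals), an observation is OOB for
#' every tree whose bootstrap sample does not contain it:
#' \preformatted{
#' nobs <- 5
#' boot_idx <- lapply(1:3, function(b) sample.int(nobs, replace = TRUE))
#' # Trees for which observation 1 is Out-Of-Bag:
#' which(sapply(boot_idx, function(b) !any(b == 1)))
#' }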
#'
#' For classification, possible values of \code{perf_metric} are \code{"BAcc"}
#' and \code{"WBAcc"}. Both are general enough to be used in multiclass
#' classification problems, still producing sensible results in the case of
#' binary classification. The two options are based on the calculation of a
#' ground performance metric, the Balanced Accuracy, which is defined as the
#' arithmetic mean between Sensitivity and Specificity. In this framework,
#' Balanced Accuracy is computed using a "One vs. All" approach, i.e.,
#' considering one class at a time: positive instances are those belonging to
#' that class, and negatives are those belonging to any other class. The
#' "One vs. All" Balanced Accuracies obtained for the individual classes are
#' then averaged. When \code{perf_metric = "BAcc"} (default for classification
#' tasks), the average is arithmetic. When \code{perf_metric = "WBAcc"}, the
#' average is weighted using class sizes, hence giving more importance to the
#' "One vs. All" Balanced Accuracy of larger classes.
#'
#' For regression, the default value of \code{perf_metric} is \code{"RMSPE"},
#' namely, Root Mean Square Percentage Error. Other available options are
#' \code{c("MAE", "MAPE", "MedianAE", "MedianAPE", "MSE", "NRMSE", "RAE",
#' "RMSE", "RMLSE")}. Each of these name points to the corresponding homonym
#' function from the package \code{\link[MLmetrics]{MLmetrics}}, whose
#' documentation provides more information about their definition.
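#'
#' For instance, with the default \code{perf_metric = "RMSPE"}, the OOB score is
#' obtained (conceptually) as in the following sketch, where the predicted and
#' true values are hypothetical:
#' \preformatted{
#' MLmetrics::RMSPE(y_pred = c(1.1, 1.9, 3.2), y_true = c(1, 2, 3))
#' }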
#'
#' @returns
#' Object of class \code{"eforest"} with three elements: 1) \code{ensemble},
#' which is a list gathering all the fitted trees; 2) \code{oob_score},
#' an object of class \code{"numeric"} representing the OOB score computed using
#' the performance metric defined through \code{perf_metric}; 3)
#' \code{perf_metric}, an object of class \code{"character"} specifying the
#' performance metric used for the computations.
#'
#' @examples
#'
#' \donttest{
#'
#' ## Covariates
#' set.seed(123)
#' nobs <- 100
#' cov_num <- rnorm(nobs)
#' cov_nom <- factor(rbinom(nobs, size = 1, prob = 0.5))
#' cov_gph <- lapply(1:nobs, function(j) igraph::sample_gnp(100, 0.2))
#' cov_fun <- fda.usc::rproc2fdata(nobs, seq(0, 1, len = 100), sigma = 1)
#' cov_list <- list(cov_num, cov_nom, cov_gph, cov_fun)
#'
#' ## Response variable(s)
#' resp_reg <- cov_num ^ 2
#' y <- round((cov_num - min(cov_num)) / (max(cov_num) - min(cov_num)), 0)
#' resp_cls <- factor(y)
#'
#' ## Regression ##
#' eforest_fit <- eforest(response = resp_reg, covariates = cov_list, ntrees = 12)
#' print(eforest_fit$ensemble[[1]])
#' plot(eforest_fit$ensemble[[1]])
#' mean((resp_reg - predict(eforest_fit)) ^ 2)
#'
#' ## Classification ##
#' eforest_fit <- eforest(response = resp_cls, covariates = cov_list, ntrees = 12)
#' print(eforest_fit$ensemble[[12]])
#' plot(eforest_fit$ensemble[[12]])
#' table(resp_cls, predict(eforest_fit))
#' }
#'
#' @export
eforest <- function(response,
covariates,
weights = NULL,
ntrees = 100,
ncores = 1L,
minbucket = 1,
alpha = 1,
R = 500,
split_type = 'cluster',
coeff_split_type = 'test',
p_adjust_method = 'fdr',
perf_metric = NULL,
random_covs = 'auto',
verbose = FALSE) {
# Check whether covariates is a list
if (!is.list(covariates)) {
stop("Argument 'covariates' must be provided as a list")
}
# Check that response is factor or numeric
if (!is.factor(response) &&
!is.numeric(response)) {
stop("Argument 'response' must be provided either as a factor or as an
object of mode 'numeric'")
}
# Check that minbucket is positive and ensure it is integer
stopifnot((minbucket > 0))
minbucket <- as.integer(minbucket)
# Check that alpha is between 0 and 1
stopifnot((alpha >= 0) && (alpha <= 1))
# Check that ntrees is positive and ensure it is integer
stopifnot((ntrees > 0))
ntrees <- as.integer(ntrees)
# If perf_metric is NULL, set it to default choices
if (is.null(perf_metric)) {
if (is.factor(response)) perf_metric <- 'BAcc' else
if (is.numeric(response)) perf_metric <- 'RMSPE'
} else {
# If perf_metric is given, check that it has an acceptable value
if (is.factor(response)) {
if (isFALSE(perf_metric %in% c('BAcc', 'WBAcc'))) {
stop("The value provided for 'perf_metric' is not available for this
task")
}
} else if (is.numeric(response)) {
if (isFALSE(perf_metric %in% c('MAPE', 'RMSPE', 'NRMSE', 'MAE',
'MedianAE', 'MedianAPE', 'MSE', 'RAE',
'RMSE', 'RMLSE'))) {
stop("The value provided for 'perf_metric' is not available for this
task")
}
}
}
# If random_covs is 'auto', set it to traditional values
if (identical(random_covs, 'auto')) {
if (is.factor(response)) {
random_covs <- floor(sqrt(length(covariates)))
} else if (is.numeric(response)) {
random_covs <- floor(length(covariates) / 3)
}
}
# Set counter
counter <- 0L
# Number of observations
nobs <- length(response)
# New list of covariates
newcovariates <- .create_newcov(covariates = covariates,
response = response,
split_type = split_type)
# Distances
cov_distance <- lapply(covariates, dist_comp)
# Large list with covariates, newcovariates and distances
covariates_large <- list('cov' = covariates,
'newcov' = newcovariates,
'dist' = cov_distance)
# Large list with response and the corresponding distances
response_large <- list('response' = response,
'response_dist' = dist_comp(response))
# Generate ntrees bootstrap samples
boot_idx <- lapply(1:ntrees,
function(b) sample.int(nobs, replace = TRUE))
# Energy Tree fits for each bootstrap sample
etree_boot_fits <- parallel::mclapply(boot_idx, function(b_i) {
# Covariates for this bootstrap sample
boot_cov_large <- list()
boot_cov_large$cov <- lapply(covariates_large$cov,
function(cov) {
if (inherits(cov, 'data.frame')) cov[b_i, ]
else cov[b_i]
}
)
# New covariates for this bootstrap sample
boot_cov_large$newcov <- lapply(covariates_large$newcov, function(ncov) {
if (is.null(attr(ncov, 'cov_type'))) ncov[b_i]
else {
ct <- attr(ncov, 'cov_type')
boot_ncov <- if (inherits(ncov, 'data.frame')) ncov[b_i, ] else ncov[b_i]
attr(boot_ncov, 'cov_type') <- ct
boot_ncov
}
})
# Re-index newcov only if using 'cluster'
if (split_type == 'cluster') {
  boot_cov_large$newcov <- lapply(boot_cov_large$newcov, function(ncov) {
    if (!is.null(attr(ncov, 'cov_type'))) {
      ct <- attr(ncov, 'cov_type')
      ncov <- factor(1:nobs)
      attr(ncov, 'cov_type') <- ct
    }
    ncov
  })
}
# Distances (covariates) for this bootstrap sample
boot_cov_large$dist <- lapply(covariates_large[[3]],
function(cov_dist) {
boot_dist <- usedist::dist_subset(cov_dist,
b_i)
boot_dist <-
usedist::dist_setNames(boot_dist, 1:nobs)
return(boot_dist)
})
# Response and distances (response) for this bootstrap sample
resp_dist <- usedist::dist_subset(response_large$response_dist, b_i)
resp_dist <- usedist::dist_setNames(resp_dist, 1:nobs)
boot_resp_large <- list('response' = response[b_i],
'response_dist' = resp_dist)
# Energy Trees fit
e_fit <- etree(response = boot_resp_large,
covariates = boot_cov_large,
weights = weights,
minbucket = minbucket,
alpha = alpha,
R = R,
split_type = split_type,
coeff_split_type = coeff_split_type,
p_adjust_method = p_adjust_method,
random_covs = random_covs)
# Remove .Environment attribute from terms
attr(e_fit$terms, '.Environment') <- NULL
# Print counter if verbose is TRUE
if (isTRUE(verbose)) {
counter <<- counter + 1
print(paste('Fitted Energy Tree n.', counter))
}
# Return
return(e_fit)
},
mc.cores = ncores,
mc.set.seed = TRUE)
# Delete counter if verbose is TRUE
if (isTRUE(verbose)) rm(counter)
# For each obs, predicted response with its own OOB trees
oob_pred_resp <- lapply(1:nobs, function(i) {
# Covariates for observation i (to be used as newdata in predict())
covs_i <- lapply(covariates, function(cov) cov[i])
# Indices of bootstrap samples for which the observation is OOB
is_oob <- sapply(boot_idx, function(b) !(i %in% b))
is_oob_idx <- which(is_oob)
# Predict response for this obs only with trees with indices in is_oob_idx
oob_pred_resp <- sapply(is_oob_idx, function(o) {
predict(etree_boot_fits[[o]],
newdata = covs_i)
})
# Return predicted response
return(oob_pred_resp)
})
# Predicted responses and OOB performance metric calculation
if (is.factor(response)) {
## Classification ##
# Predicted response: majority voting rule
pred_resp <- factor(sapply(oob_pred_resp,
function (i) {
if (length(i) == 0) return(NA)
else names(which.max(table(i)))
}
))
# OOB performance metric (measured via BAcc or WBAcc)
if (perf_metric == 'BAcc' || perf_metric == 'WBAcc') {
# Ignore NAs
resps <- .ignore_NA(y_pred = pred_resp,
y_true = response)
pred_resp <- resps$y_pred
response <- resps$y_true
# Balanced Accuracy for each class (each given by (TP/P + TN/N) / 2)
bal_accs <- .comp_bal_accs(y_pred = pred_resp,
y_true = response)
# OOB score
#Balanced Accuracy -> arithmetic mean
#Weighted Balanced Accuracy -> weighted mean (with class sizes as weights)
oob_score <- switch(perf_metric,
BAcc = mean(bal_accs),
WBAcc = sum(bal_accs * table(response)) /
length(response))
}
} else if (is.numeric(response)) {
## Regression ##
# Predicted response: average
pred_resp <- sapply(oob_pred_resp,
function(i) {
if (length(i) == 0) return(NA)
else mean(i)
})
# Ignore NAs
resps <- .ignore_NA(y_pred = pred_resp,
y_true = response)
pred_resp <- resps$y_pred
response <- resps$y_true
# OOB performance metric: various choices (default is 'RMSPE')
if (requireNamespace('MLmetrics', quietly = TRUE)) {
oob_score <-
switch(perf_metric,
MAPE = MLmetrics::MAPE(pred_resp, response),
RMSPE = MLmetrics::RMSPE(pred_resp, response),
NRMSE = MLmetrics::RMSE(pred_resp, response) / mean(response),
MAE = MLmetrics::MAE(pred_resp, response),
MedianAE = MLmetrics::MedianAE(pred_resp, response),
MedianAPE = MLmetrics::MedianAPE(pred_resp, response),
MSE = MLmetrics::MSE(pred_resp, response),
RAE = MLmetrics::RAE(pred_resp, response),
RMSE = MLmetrics::RMSE(pred_resp, response),
RMLSE = MLmetrics::RMSLE(pred_resp, response))
} else {
  oob_score <- NA_real_
  warning("Package 'MLmetrics' is needed to compute the OOB score; returning NA.")
}
}
# Create object with etree_boot_fits, oob_score, and perf_metric
eforest_obj <- list(ensemble = etree_boot_fits,
oob_score = oob_score,
perf_metric = perf_metric)
class(eforest_obj) <- 'eforest'
# Return eforest object
return(eforest_obj)
}
#' Predictions for Energy Forests
#'
#' Compute predictions for objects of class \code{"eforest"} (i.e., as returned
#' by \code{\link[etree:eforest]{eforest()}}).
#'
#' @param object A fitted Energy Forest of class \code{"eforest"}.
#' @param newdata Optional set of new covariates used to make predictions. Must
#' be provided as a list, where each element is a different variable.
#' Currently available types and the form they need to have to be correctly
#' recognized are the following:
#' \itemize{
#' \item Numeric: numeric or integer vectors;
#' \item Nominal: factors;
#' \item Functions: objects of class \code{"fdata"};
#' \item Graphs: (lists of) objects of class \code{"igraph"}.
# \item Persistence diagrams: (lists of) objects with
# \code{attributes(x)$names == "diagram"}.
#' }
#' Each element (i.e., variable) in the covariates list must have the same
#' \code{length()}, which corresponds to the (new) sample size. If
#' \code{newdata} is omitted, the fitted values of the individual trees are
#' combined as described in the Details section and returned.
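#'
#' As an illustrative sketch, a valid \code{newdata} for a forest fitted on one
#' numeric, one nominal, one graph-valued, and one functional covariate (for a
#' hypothetical new sample of size 10) could be built as:
#' \preformatted{
#' newdata <- list(rnorm(10),
#'                 factor(rbinom(10, size = 1, prob = 0.5)),
#'                 lapply(1:10, function(j) igraph::sample_gnp(100, 0.2)),
#'                 fda.usc::rproc2fdata(10, seq(0, 1, len = 100), sigma = 1))
#' }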
#' @param ... Additional arguments.
#'
#' @details
#' The \code{predict()} method for \code{"eforest"} objects computes predictions
#' for Energy Forests as returned by \code{\link[etree:eforest]{eforest()}}.
#' Predictions are based either on the fitted values (if \code{newdata} is
#' \code{NULL}) or on the new set of covariates (when \code{newdata} is
#' provided). In both cases, each tree in \code{object$ensemble} is used to make
#' predictions by calling \code{\link[etree:predict.etree]{predict()}} on it
#' (with the same specification of \code{newdata}). Then, individual trees'
#' predictions for any single observation are combined by majority voting rule
#' for classification or by arithmetic mean for regression.
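#'
#' As an illustrative sketch of the aggregation step (hypothetical per-tree
#' predictions for a single observation):
#' \preformatted{
#' preds_cls <- factor(c("a", "b", "a", "a"))  # classification: majority vote
#' names(which.max(table(preds_cls)))
#' preds_reg <- c(1.2, 0.9, 1.1, 1.0)          # regression: arithmetic mean
#' mean(preds_reg)
#' }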
#'
#' @returns
#' Predictions, in the form of a factor for classification or as a numeric
#' vector for regression.
#'
# @examples
#'
#' @method predict eforest
#' @export
predict.eforest <- function(object, newdata = NULL, ...) {
# Individual predictions with base learners
#(newdata check, split retrieval, newcov computation are all done in predict)
ind_pred_resp <- sapply(object$ensemble, function(tree) {
predict(tree, newdata = newdata)
})
# Predict response, differently based on the type of problem (CLS or REG)
response <- object$ensemble[[1]]$fitted$`(response)`
if (is.factor(response)) {
# Majority voting rule
pred_resp <- apply(ind_pred_resp, 1, function(i) names(which.max(table(i))))
pred_resp <- factor(pred_resp)
} else if (is.numeric(response)) {
# Average
pred_resp <- apply(ind_pred_resp, 1, mean)
}
# Return predicted response
return(pred_resp)
}
.comp_bal_accs <- function(y_pred,
y_true) {
bal_accs <- sapply(levels(y_true),
function(lev) {
# Sensitivity
true_pos <- sum(y_pred == lev &
y_true == lev)
pos <- sum(y_true == lev)
sens <- true_pos / pos
# Specificity
true_neg <- sum(y_pred != lev &
y_true != lev)
neg <- sum(y_true != lev)
spec <- true_neg / neg
# Balanced Accuracy (with the current class as pos)
bal_acc_lev <- (sens + spec) / 2
# Return Balanced Accuracy
return(bal_acc_lev)
})
return(bal_accs)
}
.ignore_NA <- function(y_pred,
y_true) {
# Indices of non-missing values
ok_idx <- which(!is.na(y_pred))
# Select from predictions and actual response
y_pred <- y_pred[ok_idx]
y_true <- y_true[ok_idx]
# Return
return(list(y_pred = y_pred,
y_true = y_true))
}