#'Thresholding step of VSURF
#'
#'The thresholding step is dedicated to roughly eliminating irrelevant variables
#'from the dataset. This is the first step of the \code{\link{VSURF}} function.
#'For refined variable selection, see the other VSURF steps:
#'\code{\link{VSURF_interp}} and \code{\link{VSURF_pred}}.
#'
#'First, \code{nfor.thres} random forests are computed using the function
#'\code{randomForest} with the argument \code{importance=TRUE}, and our choice
#'of default values for \code{ntree.thres} and \code{mtry} (which are higher
#'than the \code{\link[randomForest]{randomForest}} defaults, to get more
#'stable variable importance measures). Then variables are sorted according to
#'their mean variable importance (VI), in decreasing order. This order is kept
#'throughout the procedure. Next, a threshold is computed: \code{min.thres},
#'the minimum predicted value of a pruned CART tree fitted to the curve of the
#'standard deviations of VI. Finally, the actual thresholding is performed:
#'only variables with a mean VI larger than \code{nmin} * \code{min.thres} are
#'kept.
#'
#'@param x,formula A data frame or a matrix of predictors whose columns
#' represent the variables, or a formula describing the model to be fitted.
#'@param ntree.thres Number of trees of each forest grown.
#'@param nfor.thres Number of forests grown.
#'@inheritParams VSURF
#'
#'@return An object of class \code{VSURF_thres}, which is a list with the
#' following components:
#'
#' \item{varselect.thres}{A vector of indices of selected variables, sorted
#' according to their mean VI, in decreasing order.}
#'
#' \item{imp.varselect.thres}{A vector of the mean VI of the
#' \code{varselect.thres} variables.}
#'
#' \item{min.thres}{The minimum predicted value of a pruned CART tree fitted to
#' the curve of the standard deviations of VI.}
#'
#' \item{num.varselect.thres}{The number of selected variables.}
#'
#' \item{imp.mean.dec}{A vector of the variable importance means (over
#' \code{nfor.thres} runs), in decreasing order.}
#'
#' \item{imp.mean.dec.ind}{The ordering index vector associated with the
#' sorting of the variable importance means.}
#'
#' \item{imp.sd.dec}{A vector of the standard deviations of all variable
#' importances, ordered according to \code{imp.mean.dec.ind}.}
#'
#' \item{mean.perf}{The mean OOB error rate, obtained from random forests
#' built with all variables.}
#'
#' \item{pred.pruned.tree}{The predictions of the CART tree fitted to the curve
#' of the standard deviations of VI.}
#'
#' \item{nmin}{Value of the \code{nmin} parameter in the call.}
#'
#' \item{comput.time}{Computation time.}
#'
#' \item{RFimplem}{The RF implementation used to run
#' \code{VSURF_thres}.}
#'
#' \item{ncores}{The number of cores used to run \code{VSURF_thres} in parallel
#' (\code{NULL} if \code{VSURF_thres} did not run in parallel).}
#'
#' \item{clusterType}{The type of the cluster used to run \code{VSURF_thres} in
#' parallel (\code{NULL} if \code{VSURF_thres} did not run in parallel).}
#'
#' \item{call}{The original call to \code{VSURF}.}
#'
#' \item{terms}{Terms associated with the formula (only if a formula-type call
#' was used).}
#'
#'@author Robin Genuer, Jean-Michel Poggi and Christine Tuleau-Malot
#'@seealso \code{\link{VSURF}}, \code{\link{tune}}
#'@references Genuer, R. and Poggi, J.M. and Tuleau-Malot, C. (2010),
#' \emph{Variable selection using random forests}, Pattern Recognition Letters
#' 31(14), 2225-2236
#'@references Genuer, R. and Poggi, J.M. and Tuleau-Malot, C. (2015),
#' \emph{VSURF: An R Package for Variable Selection Using Random Forests}, The
#' R Journal 7(2):19-33
#'
#' @examples
#'
#' data(iris)
#' iris.thres <- VSURF_thres(iris[,1:4], iris[,5])
#' iris.thres
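#'
#' # Indices of the selected variables and their mean VI (see the Value
#' # section for the other components of a VSURF_thres object):
#' iris.thres$varselect.thres
#' iris.thres$imp.varselect.thres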
#'
#' \dontrun{
#' # A more interesting example with toys data (see \code{\link{toys}})
#' # (a few minutes to execute)
#' data(toys)
#' toys.thres <- VSURF_thres(toys$x, toys$y)
#' toys.thres}
#'
#'@importFrom randomForest randomForest
#'@importFrom rpart rpart prune
#'@importFrom doParallel registerDoParallel
#'@importFrom foreach foreach %dopar%
#'@importFrom parallel makeCluster stopCluster mclapply detectCores
#'@importFrom utils tail
#'@importFrom stats model.frame model.response na.fail predict reformulate
#'@importFrom stats sd terms na.omit
#'@export
VSURF_thres <- function (x, ...) {
UseMethod("VSURF_thres")
}
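# The helper below is an illustrative sketch of the threshold rule described in
# the documentation above; it is NOT used by VSURF_thres itself and its name and
# arguments are hypothetical. Given the mean variable importances `imp.mean` and
# their standard deviations `imp.sd` (both sorted by decreasing mean VI, with at
# least a few variables), it fits a maximal CART tree to the SD curve, prunes it
# at the complexity minimizing the cross-validated error, and keeps the
# top-ranked variables up to the first one whose mean VI falls below
# nmin times the lowest predicted value of the pruned tree.
threshold_rule_sketch <- function(imp.mean, imp.sd, nmin = 1) {
  p <- length(imp.mean)
  u <- data.frame(imp.sd, rank = seq_len(p))
  # maximal tree fitted to the curve of VI standard deviations
  tree <- rpart::rpart(imp.sd ~ ., data = u, cp = 0, minsplit = 2)
  # prune at the complexity value minimizing the cross-validated error (xerror)
  cp.opt <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
  pruned <- rpart::prune(tree, cp = cp.opt)
  # the threshold is the lowest "stair" of the pruned tree predictions
  min.thres <- min(predict(pruned))
  # indices (in the sorted order) of the variables that are kept
  below <- which(imp.mean < nmin * min.thres)
  if (length(below) == 0) seq_len(p) else seq_len(min(below) - 1)
}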
#' @rdname VSURF_thres
#' @export
VSURF_thres.default <- function(
x, y, mtry = max(floor(ncol(x) / 3), 1), ntree.thres = 500, nfor.thres = 20,
nmin = 1, RFimplem = "randomForest", parallel = FALSE,
clusterType = "PSOCK", ncores = parallel::detectCores() - 1,
verbose = TRUE, ntree = NULL, ...) {
# x: input
# y: output
# nfor.thres: number of forests to compute the mean importance of variables (VI)
# nmin: thresholding parameter (if this step of the procedure keeps too many
# variables, this value can be increased, e.g. to 3 or 5)
start <- Sys.time()
if (!is.null(ntree)) cat(paste(
"\nntree parameter is deprecated, please use ntree.thres instead\n"))
if (verbose == TRUE) cat(paste("Thresholding step\n"))
if (!parallel) {
ncores <- NULL
} else {
ncores <- min(nfor.thres, ncores)
}
# determination of the problem type: classification or regression
# (code gratefully stolen from the randomForest.default function of the
# randomForest package)
classRF <- is.factor(y)
if (!classRF && length(unique(y)) <= 5) {
warning("The response has five or fewer unique values.
Are you sure you want to do regression?")
}
if (classRF && length(unique(y)) < 2)
stop("Need at least two classes to do classification.")
if (classRF) {
type <- "classif"
}
else {
type <- "reg"
}
dat <- cbind(x, "y" = y)
# m: matrix with VI
# perf: matrix with OOB errors
m <- matrix(NA, nrow=nfor.thres, ncol=ncol(x))
perf <- matrix(NA, nrow=nfor.thres, ncol=1)
# if all forests have to be stored in memory, lines involving "rfmem"
# must be uncommented
#rfmem=list()
# filling of matrix m by running nfor.thres forests and keeping VI
# filling of perf with the nfor.thres forests OOB errors
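# the Rborist implementation is not used for this step: fall back to
# randomForest (and to a PSOCK cluster for parallel execution)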
if (RFimplem == "Rborist") RFimplem <- "randomForest"
if (clusterType == "Rborist") clusterType <- "PSOCK"
if (RFimplem == "randomForest") {
rf.classif <- function(i, ...) {
rf <- randomForest::randomForest(x=x, y=y, ntree=ntree.thres, mtry=mtry,
importance=TRUE, ...)
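# in randomForest's importance matrix, the per-class measures come first and
# MeanDecreaseAccuracy is stored in column length(levels(y)) + 1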
m <- rf$importance[, length(levels(y))+1]
perf <- tail(rf$err.rate[,1], n=1)
return(list(m=m, perf=perf))
}
rf.reg <- function(i, ...) {
rf <- randomForest::randomForest(x=x, y=y, ntree=ntree.thres, mtry=mtry,
importance=TRUE, ...)
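# for regression, column 1 of the importance matrix is the permutation
# importance (%IncMSE)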
m <- rf$importance[, 1]
perf <- tail(rf$mse, n=1)
return(list(m=m, perf=perf))
}
}
if (RFimplem == "ranger") {
rf.ranger <- function(i, ...) {
rf <- ranger::ranger(dependent.variable.name = "y", data=dat,
num.trees=ntree.thres, mtry=mtry, importance="permutation",
...)
m <- rf$variable.importance
perf <- rf$prediction.error
return(list(m=m, perf=perf))
}
}
if (verbose == TRUE) {
if (RFimplem == "randomForest") {
if (type=="classif") {
timeOneRF <- system.time(rf.classif(1, ...))
}
if (type=="reg") {
timeOneRF <- system.time(rf.reg(1, ...))
}
}
if (RFimplem == "ranger") {
timeOneRF <- system.time(rf.ranger(1, num.threads = 1, ...))
}
cat(paste("Estimated computational time (on one core):",
round(nfor.thres * timeOneRF[3], 1), "sec.\n"))
}
# initialization of the progress bar
if (verbose == TRUE & (parallel == FALSE | clusterType == "ranger")) {
pb <- utils::txtProgressBar(style = 3)
nBar <- 1
}
if (!parallel) {
if (RFimplem == "randomForest") {
if (type=="classif") {
for (i in 1:nfor.thres){
rf <- rf.classif(i, ...)
m[i,] <- rf$m
perf[i] <- rf$perf
if (verbose == TRUE) {
utils::setTxtProgressBar(pb, nBar/nfor.thres)
nBar <- nBar + 1
}
}
}
if (type=="reg") {
for (i in 1:nfor.thres){
rf <- rf.reg(i, ...)
m[i,] <- rf$m
perf[i] <- rf$perf
if (verbose == TRUE) {
utils::setTxtProgressBar(pb, nBar/nfor.thres)
nBar <- nBar + 1
}
}
}
}
if (RFimplem == "ranger") {
for (i in 1:nfor.thres) {
rf <- rf.ranger(i, num.threads = 1, ...)
m[i,] <- rf$m
perf[i] <- rf$perf
if (verbose == TRUE) {
utils::setTxtProgressBar(pb, nBar/nfor.thres)
nBar <- nBar + 1
}
}
}
} else {
if (clusterType == "ranger") {
if (RFimplem != "ranger") stop("RFimplem must be set to 'ranger' to use clusterType 'ranger'")
for (i in 1:nfor.thres) {
rf <- rf.ranger(i, num.threads = ncores, ...)
m[i,] <- rf$m
perf[i] <- rf$perf
if (verbose == TRUE) {
utils::setTxtProgressBar(pb, nBar/nfor.thres)
nBar <- nBar + 1
}
}
} else {
if (clusterType=="FORK") {
if (RFimplem == "randomForest") {
if (type=="classif") {
res <- parallel::mclapply(X=1:nfor.thres, FUN=rf.classif, ...,
mc.cores=ncores)
}
if (type=="reg") {
res <- parallel::mclapply(X=1:nfor.thres, FUN=rf.reg, ...,
mc.cores=ncores)
}
}
if (RFimplem == "ranger") {
res <- parallel::mclapply(X=1:nfor.thres, FUN=rf.ranger,
num.threads = 1, ..., mc.cores=ncores)
}
} else {
clust <- parallel::makeCluster(spec=ncores, type=clusterType)
doParallel::registerDoParallel(clust)
if (RFimplem == "randomForest") {
if (type=="classif") {
res <- foreach::foreach(i=1:nfor.thres, .packages="randomForest") %dopar% {
out <- rf.classif(i, ...)
}
}
if (type=="reg") {
res <- foreach::foreach(i=1:nfor.thres, .packages="randomForest") %dopar% {
out <- rf.reg(i, ...)
}
}
}
if (RFimplem == "ranger") {
res <- foreach::foreach(i=1:nfor.thres, .packages="ranger") %dopar% {
out <- rf.ranger(i, num.threads = 1, ...)
}
}
parallel::stopCluster(clust)
}
for (i in 1:nfor.thres) {
m[i,] <- res[[i]]$m
perf[i] <- res[[i]]$perf
}
}
}
m_na.omit <- stats::na.omit(m)
if (nrow(m_na.omit) != nrow(m)) {
warning(
paste0(nrow(m) - nrow(m_na.omit), " runs of RF were removed
(among ", nfor.thres, ") because they contained no OOB
observations for some trees")
)
m <- m_na.omit
}
# ord.imp contains the VI means in decreasing order
ord.imp <- sort( colMeans(m), index.return=TRUE, decreasing=TRUE)
imp.mean.dec <- ord.imp$x
imp.mean.dec.ind <- ord.imp$ix
# mean.perf contains the forests mean OOB error
mean.perf <- mean(perf)
# imp.sd.dec contains VI standard deviations of all variables sorted according to imp.mean.dec.ind
sd.imp <- apply(m, 2, sd)
imp.sd.dec <- sd.imp[imp.mean.dec.ind]
# particular case where x has only one variable: no CART thresholding is
# performed (min.pred and pred.pruned.tree are then NULL in the output)
s <- NULL
min.pred <- NULL
pred.pruned.tree <- NULL
if (ncol(as.matrix(x))==1) {
s <- 1
}
else {
p <- ncol(x)
u <- data.frame(imp.sd.dec, 1:p)
# estimation of the standard deviations curve with CART (using "rpart" package)
# construction of the maximal tree and search of optimal complexity
tree <- rpart::rpart(imp.sd.dec ~., data=u, cp=0, minsplit=2)
d <- tree$cptable
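# column 4 of the cptable is the cross-validated error (xerror)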
argmin.cp <- which.min(d[,4])
# pruning
pruned.tree <- rpart::prune(tree, cp=d[argmin.cp, 1])
pred.pruned.tree <- predict(pruned.tree)
# determination of the y-value of the lowest stair: this is the estimation
# of the mean standard deviation of VI
min.pred <- min(pred.pruned.tree)
# thresholding: keep the top-ranked variables, up to the first one whose mean
# VI falls below nmin * min.pred
w <- which(imp.mean.dec < nmin*min.pred)
if (length(w)==0) {
s <- p
}
else {
s <- min(w)-1
}
}
# varselect.thres: indices of the selected variables
# imp.varselect.thres: their corresponding mean VI
varselect.thres <- imp.mean.dec.ind[1:s]
imp.varselect.thres <- imp.mean.dec[1:s]
cl <- match.call()
cl[[1]] <- as.name("VSURF_thres")
if (!parallel) clusterType <- NULL
comput.time <- Sys.time()-start
output <- list('varselect.thres'=varselect.thres,
'imp.varselect.thres'=imp.varselect.thres,
'min.thres'=min.pred,
'num.varselect.thres'=s,
'imp.mean.dec'=imp.mean.dec,
'imp.mean.dec.ind'=imp.mean.dec.ind,
'imp.sd.dec'=imp.sd.dec,
'mean.perf'=mean.perf,
'pred.pruned.tree'=pred.pruned.tree,
'nmin' = nmin,
'comput.time'=comput.time,
'RFimplem'=RFimplem,
'ncores'=ncores,
'clusterType'=clusterType,
'call'=cl)
class(output) <- c("VSURF_thres")
output
}
#' @rdname VSURF_thres
#' @export
VSURF_thres.formula <- function(formula, data, ..., na.action = na.fail) {
### formula interface for VSURF_thres.
### code gratefully stolen from svm.formula (package e1071).
###
if (!inherits(formula, "formula"))
stop("method is only for formula objects")
m <- match.call(expand.dots = FALSE)
## Catch xtest and ytest in arguments.
if (any(c("xtest", "ytest") %in% names(m)))
stop("xtest/ytest not supported through the formula interface")
names(m)[2] <- "formula"
if (is.matrix(eval(m$data, parent.frame())))
m$data <- as.data.frame(data)
m$... <- NULL
m$na.action <- na.action
m[[1]] <- as.name("model.frame")
m <- eval(m, parent.frame())
y <- model.response(m)
Terms <- attr(m, "terms")
attr(Terms, "intercept") <- 0
attr(y, "na.action") <- attr(m, "na.action")
## Drop any "negative" terms in the formula.
## test with:
## randomForest(Fertility~.-Catholic+I(Catholic<50),data=swiss,mtry=2)
m <- model.frame(terms(reformulate(attributes(Terms)$term.labels)),
data.frame(m))
## if (!is.null(y)) m <- m[, -1, drop=FALSE]
## Convert ordered factors to numeric, for every column of the model frame.
for (i in seq_len(ncol(m))) {
if (is.ordered(m[[i]])) m[[i]] <- as.numeric(m[[i]])
}
ret <- VSURF_thres.default(x=m, y=y, ...)
cl <- match.call()
cl[[1]] <- as.name("VSURF")
ret$call <- cl
ret$terms <- Terms
if (!is.null(attr(y, "na.action"))) {
ret$na.action <- attr(y, "na.action")
}
class(ret) <- c("VSURF_thres.formula", class(ret))
warning(
"VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data")
return(ret)
}
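# Usage note: the formula interface can be called, for instance, as
#   VSURF_thres(Species ~ ., data = iris)
# which should match VSURF_thres(iris[, 1:4], iris[, 5]), up to the index
# ordering mentioned in the warning above.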