#' Racing
#'
#' The function implementes the Racing algorithm for selecting the best technique to re-balance or remove noisy instances in unbalanced datasets.
#'
#' @param formula formula describing the model to be fitted.
#' @param data the unbalanced dataset
#' @param algo the classification algorithm to use with the mlr package.
#' @param positive label of the positive (minority) class.
#' @param ncore the number of core to use in the Race. Race is performed with parallel exectuion when ncore > 1.
#' @param nFold number of folds in the cross-validation that provides the subset of data to the Race
#' @param maxFold maximum number of folds to use in the Race
#' @param maxExp maximum number of experiments to use in the Race
#' @param stat.test statistical test to use to remove candidates which perform significantly worse than the best.
#' @param metric metric used to asses the classification (f1, auc or gmean).
#' @param ubConf configuration of the balancing techniques used in the Race.
#' @param threshold threshold used to classify instances. If NULL use default values by mlr package.
#' @param verbose print extra information (TRUE/FALSE)
#' @param \dots dditional arguments pass to train function in mlr package.
#'
#'
#' @details The argument metric can take the following values: "gmean", "f1" (F-score or F-measure), "auc" (Area Under ROC curve). Argument stat.test defines the statistical test used to remove candidates during the race. It can take the following values: "friedman" (Friedman test), "t.bonferroni" (t-test with bonferroni correction), "t.holm" (t-test with holm correction), "t.none" (t-test without correction), "no" (no test, the Race continues until new subsets of data are provided by the cross validation). Argument ubConf is a list passed to function ubBalance that is used for configuration.
#'
#'
#' @return The function returns a list:
#' \item{Race}{matrix containing accuracy results for each technique in the Race.}
#' \item{best}{best technique selected in the Race.}
#' \item{avg}{average of the metric used in the Race for the technique selected.}
#' \item{sd}{standard deviation of the metric used in the Race for the technique selected.}
#' \item{N.test}{number of experiments used in the Race.}
#' \item{Gain}{\% of computational gain with resepct to the maximum number of experiments given by the cross validation.}
#'
#'
#' @note The function ubRacing is a modified version of the race function availble in the race package: \url{http://cran.r-project.org/package=race}.
#' @seealso \code{\link{ubBalance}}, \code{\link{ubOver}}, \code{\link{ubUnder}}, \code{\link{ubSMOTE}}, \code{\link{ubOSS}}, \code{\link{ubCNN}}, \code{\link{ubENN}}, \code{\link{ubNCL}}, \code{\link{ubTomek}}
#'
#' @references 1. Dal Pozzolo, Andrea, et al. "Racing for unbalanced methods selection." Intelligent Data Engineering and Automated Learning - IDEAL 2013. Springer Berlin Heidelberg, 2013. 24-31.\cr 2. Birattari, Mauro, et al. "A Racing Algorithm for Configuring Metaheuristics."GECCO. Vol. 2. 2002.
#'
#' @examples
#' # use Racing to select the best technique for an unbalanced dataset
#' library(unbalanced)
#' data(ubIonosphere)
#'
#' # configure sampling parameters
#' ubConf <- list(percOver=200, percUnder=200, k=2, perc=50, method="percPos", w=NULL)
#'
#' #load the classification algorithm that you intend to use inside the Race
#' #see 'mlr' package for supported algorithms
#' library(randomForest)
#' #use only 5 trees
#' results <- ubRacing(Class ~., ubIonosphere, "randomForest", positive=1, ubConf=ubConf, ntree=5)
#'
#' # try with 500 trees
#' results <- ubRacing(Class ~., ubIonosphere, "randomForest", positive=1, ubConf=ubConf, ntree=500)
#' # let's try with a different algorithm
#' library(e1071)
#' # results <- ubRacing(Class ~., ubIonosphere, "svm", positive=1, ubConf=ubConf)
#'
#' @export
ubRacing <-
function(formula, data, algo, positive=1, ncore = 1, nFold=10, maxFold=10, maxExp=100,
stat.test="friedman", metric="f1", ubConf, threshold=NULL, verbose=FALSE, ...){
stopifnot(class(formula)=="formula", is.data.frame(data), NROW(data)>1, NCOL(data)>1, is.logical(verbose), ncore>0, nFold>1)
metric <- match.arg(metric, c("f1","gmean", "auc"))
stat.test <- match.arg(stat.test, c("friedman","t.bonferroni", "t.holm", "t.none", "no"))
stopifnot(class(ubConf)=="list", names(ubConf) %in% c("percOver", "percUnder", "k", "perc", "method", "w"))
target <- as.character(formula[[2]])
tgt <- which(names(data) == target)
if(length(tgt) == 0)
stop("target variable not defined")
#Function used to make predictions with each candicate of the Race
predCandidate <- function(Xtr, Ytr, Xts, Yts, algo, balType, positive, ubConf, metric, threshold, verbose, ...){
if (balType != "unbal"){
#re-balance the dataset
data <- ubBalance(Xtr, factor(Ytr), type=balType, positive=positive,
ubConf$percOver, ubConf$percUnder, ubConf$k,
ubConf$perc, ubConf$method, ubConf$w, verbose)
TR <- data.frame(data$X, Y=as.factor(data$Y))
} else {
#leave the dataset unbalanced
Ytr <- factor(Ytr == positive, levels = c(FALSE, TRUE), labels = c(0, 1))
TR <- data.frame(Xtr, Y=Ytr)
}
Yts <- factor(Yts == positive, levels = c(FALSE, TRUE), labels = c(0, 1))
TS <- data.frame(Xts, Y=Yts)
#single prediction without parameter tuning
#library(mlr)
lrnTask <- makeClassifTask(id=paste(algo, balType, sep="_"), data=TR, target="Y", positive=positive)
lrnTask <- removeConstantFeatures(lrnTask, show.info=verbose)
L <- lrnTask$task.desc$class.levels
negative <- setdiff(L, positive)
if(length(L) > 2)
stop("only binary classification supported yet")
race.lrn <- paste(lrnTask$task.desc$type, algo, sep=".")
lrn <- mlr::makeLearner(race.lrn, predict.type = "prob", ...)
mod <- mlr::train(lrn, lrnTask)
pred <- stats::predict(mod, newdata=TS)
if(!is.null(threshold))
pred <- setThreshold(pred, threshold)
perf <- mlr::performance(pred, measures = list(gmean, f1, mlr::auc))
res.metric <- as.numeric(perf[metric])
#revert metric since racing is a minimizing algorithm
res <- 1 - res.metric
res
}
#test each methods on the same observations
#return a vector of errors (1 or 0 for each method)
testCandidates <- function(Xtr, Xts, Ytr, Yts, algo, positive, balanceTypes, ubConf, metric, threshold, ncore, verbose, ...){
nBalanceTypes <- length(balanceTypes)
#library(doParallel)
if (ncore > 1) {
doParal <- `%dopar%`
} else {
doParal <- `%do%`
}
j <- NULL
#We assume that a cluster is already registered!
#library(foreach)
#make predictions
error <- doParal( foreach(j=1:nBalanceTypes, .combine=c, .packages=c('mlr'),
.export=c('predCandidate', 'ubBalance', 'ubUnder', 'ubCNN', 'ubENN',
'ubNCL', 'ubOSS', 'ubOver', 'ubSMOTE', 'ubSmoteExs', 'ubTomek')),
predCandidate(Xtr, Ytr, Xts, Yts, algo, balanceTypes[j], positive, ubConf, metric, threshold, verbose, ...))
# # to debug avoid using foreach
# error <- NULL
# for(j in 1:nBalanceTypes) {
# error <- c(error, predCandidate(Xtr, Ytr, Xts, Yts, algo, balanceTypes[j], positive, ubConf, metric, threshold, verbose, ...))
# }
names(error) <- balanceTypes
return(error)
}
#from race package but edited
aux2.friedman.edit<- function(y, I = 1:ncol(y), n=nrow(y), conf.level = 0.95,interactive=F) {
k <- length(I)
r <- t(apply(y[1:n, I, drop=FALSE], 1, rank))
A <- sum(as.vector(r)^2)
R <- apply(r, 2, sum)
J <- I[order(R)]
alpha <- 1 - conf.level
TIES <- tapply(r, row(r), table)
STATISTIC <- ((12 * sum((R - n * (k + 1)/2)^2))/(n *
k * (k + 1) - (sum(unlist(lapply(TIES, function(u) {
u^3 - u
})))/(k - 1))))
PARAMETER <- k - 1
PVAL <- stats::pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
if (!is.nan(PVAL) && (PVAL < alpha)) {
if (interactive)
cat("|-|")
t <- stats::qt(1 - alpha/2, (n - 1) * (k - 1)) * (2 * (n *
A - sum(R^2))/((n - 1) * (k - 1)))^(1/2)
o <- order(R)
J <- I[o[1]]
for (j in 2:k) if (abs(R[o[j]] - R[o[1]]) > t)
break
else J <- c(J, I[o[j]])
}
else {
if (interactive)
cat("|=|")
}
return(J)
}
#from race package but edited
aux.friedman.edit<- function(Results,no.subtasks.sofar,alive,conf.level=0.95,interactive=F) {
which.alive <- which(alive)
no.alive<-length(which.alive)
if (no.alive == 2) {
V1 <- Results[1:(no.subtasks.sofar), which.alive[1]]
V2 <- Results[1:(no.subtasks.sofar), which.alive[2]]
PVAL <- stats::wilcox.test(V1, V2, paired = TRUE, exact = FALSE)$p.value
D <- V1 - V2
w.stat <- sum(sign(D) * rank(abs(D)))
if (!is.nan(PVAL) && !is.na(PVAL) && (PVAL < 1 - conf.level)) {
if (interactive)
cat("|-|")
if (w.stat < 0) {
best <- which.alive[1]
alive[which.alive[2]] <- FALSE
}
else {
best <- which.alive[2]
alive[which.alive[1]] <- FALSE
}
}
else {
if (interactive)
cat("|=|")
if (w.stat < 0) {
best <- which.alive[1]
}
else {
best <- which.alive[2]
}
}
}
else {
J <- aux2.friedman.edit(Results[1:(no.subtasks.sofar), ,drop=FALSE], I=which.alive, conf.level=conf.level, interactive=interactive)
alive[-J] <- FALSE
best <- J[1]
}
return(list(alive=alive,best=best))
}
#from race package but edited
aux.ttest.edit <- function(Results,no.subtasks.sofar,alive,adjust=c("none", "bonferroni", "holm"),conf.level=0.95,interactive=F) {
which.alive <- which(alive)
no.alive<-length(which.alive)
adjust <- match.arg(adjust)
mean.all <- array(0, c(ncol(Results)))
for (j in 1:ncol(Results)) mean.all[j] <- sum(Results[1:no.subtasks.sofar,
j]/no.subtasks.sofar)
best <- match(min(mean.all[alive]), mean.all)
PJ <- array(0, dim = c(2, 0))
for (j in which.alive) {
Vb <- Results[1:no.subtasks.sofar, best]
Vj <- Results[1:no.subtasks.sofar, j]
p <- stats::t.test(Vb, Vj, paired = TRUE)$p.value
if (!is.nan(p) & !is.na(p))
PJ <- array(c(PJ, j, p), dim = dim(PJ) + c(0,
1))
}
PJ[2, ] <- stats::p.adjust(PJ[2, ], method = adjust)
dropped.any <- FALSE
for (j in 1:ncol(PJ)) if (PJ[2, j] < (1 - conf.level)) {
alive[PJ[1, j]]<- FALSE
dropped.any <- TRUE
}
if (interactive) {
if (dropped.any)
cat("|-|")
else cat("|=|")
}
return(list(alive=alive,best=best))
}
#from race package but edited
format.precis <- function(title, value,title.width,value.width) {
dots <- function(n) return(paste(rep(".", n), collapse = ""))
spaces <- function(n) return(paste(rep(" ", n), collapse = ""))
string <- paste(title, dots(title.width - nchar(title)), sep = "")
if (nchar(value) <= value.width) {
string <- paste(string, dots(value.width - nchar(value)),value, sep = "")
}
else {
f.vec <- strwrap(value, width = value.width)
first <- f.vec[1]
first <- paste(dots(title.width - nchar(first)),
first, sep = "")
rest <- format(f.vec[-1])
rest <- paste(spaces(title.width + value.width - max(nchar(rest))), rest, sep = "", collapse = "\n")
string <- paste(string, paste(first, rest, sep = "\n"), sep = "")
}
return(paste(string, "\n"))
}
#from function createFolds in caret package but edited
idFolds <- function(y, k = 10){
stopifnot(k > 0)
if (is.numeric(y)) {
cuts <- floor(length(y)/k)
if (cuts < 2)
cuts <- 2
if (cuts > 5)
cuts <- 5
y <- cut(y, unique(stats::quantile(y, probs = seq(0, 1, length = cuts))), include.lowest = TRUE)
}
if (k < length(y)) {
y <- factor(as.character(y))
numInClass <- table(y)
foldVector <- vector(mode = "integer", length(y))
for (i in 1:length(numInClass)) {
seqVector <- rep(1:k, numInClass[i]%/%k)
if (numInClass[i]%%k > 0)
seqVector <- c(seqVector, sample(1:k, numInClass[i]%%k))
foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
}
} else
foldVector <- seq(along = y)
foldVector
}
foldVector <- idFolds(data[ ,tgt], nFold)
input <- data[ ,-tgt]
output <- data[ ,tgt]
balanceTypes <- c("unbal", "ubOver", "ubUnder", "ubSMOTE", "ubOSS","ubCNN", "ubENN", "ubNCL", "ubTomek")
nBalanceTypes <- length(balanceTypes)
title <- paste("Racing for unbalanced methods selection in", nFold, "fold CV")
title.width<-32
value.width<-title.width
precis <- paste("\n", title,"\n",
format.precis("Number of candidates", nBalanceTypes, title.width, value.width),
format.precis("Max number of folds in the CV", maxFold,title.width,value.width),
format.precis("Max number of experiments", ifelse(maxExp, maxExp, "unlimited"),title.width,value.width),
#format.precis("Metric", metric, title.width, value.width),
format.precis("Statistical test",
switch(stat.test,
friedman = "Friedman test",
t.bonferroni = "t-test with Bonferroni's correction for multiple comparisons",
t.holm = "t-test with Holm's correction for multiple comparisons",
t.none = "t-test with no correction for multiple comparisons",
no = "No test"),title.width,value.width)
)
cat(precis)
if (nBalanceTypes > maxExp & maxExp!=0)
stop("Max number of experiments is smaller than number of candidates")
if (maxFold > nFold)
maxFold <- nFold
Results <- matrix(NA, nrow=nFold, ncol=nBalanceTypes)
colnames(Results) <- balanceTypes
alive <- rep(TRUE,nBalanceTypes)
n.Exp <- 0
n.alive <- nBalanceTypes
types.alive <- balanceTypes
#open a cluster
if (ncore > 1) {
cl <- makeCluster(ncore)
cat(" Parallel execution with", ncore, "cores \n")
registerDoParallel(cl)
} else
makeCluster <- NULL
cat(" \n",
" Markers: \n",
" x No test is performed. \n",
" - The test is performed and \n",
" some candidates are discarded.\n",
" = The test is performed but \n",
" no candidate is discarded. \n",
" \n",
"+-+-----------+-----------+-----------+-----------+-----------+\n",
"| | Fold| Alive| Best| Mean best| Exp so far|\n",
"+-+-----------+-----------+-----------+-----------+-----------+\n")
for(i in 1:nFold){
n.Exp <- n.Exp + n.alive
if ((n.Exp>maxExp | maxFold<i) & maxExp!=0 | n.alive==1)
break()
max.tasks <- i
id.fold <- which(foldVector == i)
Xtr <- input[foldVector == i, ]
Xts <- input[foldVector != i, ]
Ytr <- output[foldVector == i]
Yts <- output[foldVector != i]
if(any(is.na(types.alive)))
stop("NAs in balanceTypes")
err <- testCandidates(Xtr, Xts, Ytr, Yts, algo, positive, types.alive, ubConf, metric, threshold, ncore, verbose, ...)
Results[i, match(types.alive, balanceTypes)] <- err
#the best candidate is the one that obtains the smallest results on the various tasks considered
if (i==1)
best <- as.numeric(which(err==min(err)))[1]
#make a test on the error
if (i>1){
if (stat.test=="friedman")
test <- aux.friedman.edit(Results, no.subtasks.sofar=i, alive, conf.level=0.95, interactive=TRUE)
if (stat.test=="t.none")
test<-aux.ttest.edit(Results,no.subtasks.sofar=i,alive,"none", conf.level=0.95)
if (stat.test=="t.holm")
test<-aux.ttest.edit(Results,no.subtasks.sofar=i,alive,"holm", conf.level=0.95)
if (stat.test=="t.bonferroni")
test<-aux.ttest.edit(Results,no.subtasks.sofar=i,alive,"bonferroni", conf.level=0.95)
if (stat.test=="no"){
test <- list(alive=rep(TRUE, nBalanceTypes), best=as.numeric(which(err==min(err)))[1])
cat("|x|")
}
alive <- test$alive
best <- test$best
}
#remove significantly worse mehtods
which.alive <- which(alive)
n.alive <- length(which.alive)
types.alive <- balanceTypes[which.alive]
if(i==1){
avg.best <- min(err)
cat(" |x|")
}
else
avg.best <- as.numeric(apply(Results, 2, mean, na.rm=TRUE)[best])
#revert metric since racing is a minimizing algorithm
avg.best <- 1 - avg.best
cat(paste(formatC(i, width = 11), "|",
formatC(n.alive, width = 11), "|",
formatC(best, width = 11), "|",
formatC(avg.best, width = 11), "|",
formatC(n.Exp, width = 11), "|\n"," ",
sep = ""))
}
cat("+-+-----------+-----------+-----------+-----------+-----------+\n")
colnames(Results) <- balanceTypes
Results <- Results[1:max.tasks, ]
#revert metric since racing is a minimizing algorithm
Results <- 1 - Results
#write.csv(Results,file="Race.csv")
#close the cluster
if (ncore > 1) {
stopCluster(cl)
rm(cl)
} else
stopCluster <- NULL
if(max.tasks>1){
avg.err <- apply(Results, 2, mean, na.rm=TRUE)
sd.err <- apply(Results, 2, stats::sd, na.rm=TRUE)
}
else { #max.tasks==1
avg.err <- Results
sd.err <- Results
}
avg.best <- as.numeric(avg.err[best])
sd.best <- as.numeric(sd.err[best])
best.desc <- balanceTypes[best]
#percentage of computational gain
percGain <- round((1 - n.Exp/(nFold*nBalanceTypes))*100, 0)
cat("Selected candidate:",best.desc,"\t metric:", metric, "\t mean value:", round(avg.best, 4),"\n")
return(list(best=best.desc, avg=avg.best, sd=sd.best, N.test=n.Exp, Gain=percGain, Race=Results))
}
# #' Illustration of predCandidate function
# #'
# #' Function used to make predictions with each candicate of the Race in \code{\link{ubRacing}}
# #'
# #' @param Xtr input features of the training set.
# #' @param Ytr output feature of the training set.
# #' @param Xts input features of the testing set.
# #' @param Yts output feature of the testing set.
# #' @param algo classification algorithm used to make predictions; see mlr package for supported algorithms.
# #' @param balType the type of balancing technique to use; see argument type in \code{\link{ubBalance}}.
# #' @param positive the majority class of the response variable.
# #' @param ubConf configuration of the balancing techniques used in the Race.
# #' @param metric metric used to asses the classification.
# #' @param verbose print extra information (TRUE/FALSE)
# #' @param ... additional arguments pass to train function in mlr package.
# #'
# #' @return performance in terms of the metric selected
# #'
# #' @seealso \code{\link{brocolors}}
# #'
# #' @examples
# #' library(unbalanced)
# #' data(ubIonosphere)
# #' #make training and testing sets
# #' train <- ubIonosphere[1:200, ]
# #' Xtr <- subset(train, select=-Class)
# #' Ytr <- subset(train, select=Class, drop=TRUE)
# #' test <- ubIonosphere[201:351, ]
# #' Xts <- subset(test, select=-Class)
# #' Yts <- subset(test, select=Class, drop=TRUE)
# #'
# #' #configure sampling parameters
# #' ubConf <- list(percOver=200, percUnder=200, k=2, perc=50, method="percPos", w=NULL)
# #' #load the classification algorithm that you intend to use inside the Race
# #' #see 'mlr' package for supported algorithms
# #' library(randomForest)
# #' #use only 5 trees
# #' predCandidate(Xtr, Ytr, Xts, Yts, "randomForest", "ubUnder", positive=1, ubConf=ubConf, metric="auc", verbose=TRUE, ntree=5)
# #'
# #' @export
# predCandidate <- function(Xtr, Ytr, Xts, Yts, algo, balType, positive, ubConf, metric, verbose, ...){
#
# if (balType != "unbal"){
# #re-balance the dataset
# data <- ubBalance(Xtr, factor(Ytr), type=balType, positive=positive,
# ubConf$percOver, ubConf$percUnder, ubConf$k,
# ubConf$perc, ubConf$method, ubConf$w, verbose)
# TR <- data.frame(data$X, Y=as.factor(data$Y))
# } else {
# #leave the dataset unbalance
# Ytr <- factor(Ytr == positive, levels = c(FALSE, TRUE), labels = c(0, 1))
# TR <- data.frame(Xtr, Y=Ytr)
# }
#
# Yts <- factor(Yts == positive, levels = c(FALSE, TRUE), labels = c(0, 1))
# TS <- data.frame(Xts, Y=Yts)
#
# #single prediction without parameter tuning
# #library(mlr)
# lrnTask <- makeClassifTask(id=paste(algo, balType, sep="_"), data=TR, target="Y", positive=positive)
# lrnTask <- removeConstantFeatures(lrnTask, show.info=verbose)
# L <- lrnTask$task.desc$class.levels
# negative <- setdiff(L, positive)
# if(length(L) > 2)
# stop("only binary classification supported yet")
#
# race.lrn <- paste(lrnTask$task.desc$type, algo, sep=".")
# lrn <- makeLearner(race.lrn, predict.type = "prob", ...)
# mod <- train(lrn, lrnTask)
# pred <- predict(mod, newdata=TS)
#
# perf <- mlr::performance(pred, measures = list(gmean, f1, mlr::auc))
# res.metric <- as.numeric(perf[metric])
# #revert metric since racing is a minimizing algorithm
# res <- 1 - res.metric
#
# res
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.