#*********************************************************************
# This is free software and comes with ABSOLUTELY NO WARRANTY.
# You are welcome to redistribute it under GPL conditions.
# Copyright (C) Andrea Dal Pozzolo, March 2015
# License: GPL (>= 2)
#*********************************************************************
#***********************
# The following code is used to assess the warping effect
# of undersampling on the posterior probability
# use a large number of folds to make sure we use almost all the dataset for training
#***********************
#' @title warpingUnder function
#'
#' @description
#' Function used to assess the warping effect of undersampling on the posterior probability.
#' use a large number of folds to make sure we use almost all the dataset for training
#'
#' @param formula A formula of the form y ~ x1 + x2 + ...
#' @param data Data frame from which variables specified in formula are preferentially to be taken
#' @param algo classification algorithm supported by mlr package
#' @param task.id name of the task
#' @param positive value of the positive (minority) class
#' @param costs cost matrix
#' @param nCV number of repetitions of the Cross Validation (CV)
#' @param B number of models to create for each fold of the CV
#' @param nFold number of folds of the CV
#' @param ncore number of cores to use in multicore computation
#' @param dirPlot directory where to save plots (set dirPlot=NA to avoid plots)
#' @param verbose print extra information (logical variable)
#' @param ... extra parameters passed to mlr train function
#'
#' @return The function returns a list:
#' \item{results}{results of undersampling}
#' \item{probs}{probabilities}
#' \item{mres}{mres}
#' \item{meltResALL}{meltResALL}
#'
#' @examples
#' library(mlbench)
#' data(Ionosphere)
#' library(warping)
#' res <- warpingUnder(Class ~., Ionosphere, "randomForest", task.id="rf_Ionosphere", positive="bad", nCV=3, B=1, nFold=5)
#'
#' @export
warpingUnder <- function(formula, data, algo, task.id="cv", positive=1, costs=NULL, nCV=10, B=1, nFold=100, ncore = 1, dirPlot=NA, verbose=TRUE, ...){
  ## ---- input validation ----------------------------------------------------
  stopifnot(inherits(formula, "formula"),
            NCOL(data) > 1, NROW(data) > 1,
            is.logical(verbose), ncore > 0,
            nCV >= 0, B > 0, nFold > 1)

  ## Derivative of ps w.r.t. p, where ps (p) is the posterior probability
  ## with (without) undersampling at rate beta.
  dps <- function(p, beta) beta / (p + beta * (1 - p))^2

  ## Standard deviation of the posterior probability of the positive class
  ## for each instance across all repetitions of the CV.
  probSd <- function(pred.data, positive) {
    ## rename the probability column so it can be referenced by ddply/summarize
    colnames(pred.data)[which(colnames(pred.data) == paste("prob", positive, sep = "."))] <- "prob.1"
    ## sd of the probability for each instance over all the CV repetitions
    dt <- ddply(pred.data, .(id, truth), summarize, sd.prob = sd(prob.1))
    dt$sd.prob
  }

  if (!is.na(dirPlot))
    if (!file.exists(dirPlot))
      dir.create(dirPlot)

  ## ---- task definition -----------------------------------------------------
  target <- as.character(formula[[2]])
  tgt <- which(names(data) == target)
  if (length(tgt) == 0)
    stop("target variable not defined")
  ## only binary classification is supported: the target must be a factor
  if (!is.factor(data[ ,tgt]))
    stop("target variable must be a factor for classification")
  task <- makeClassifTask(id=task.id, data=data, target=target, positive=positive)
  L <- task$task.desc$class.levels
  negative <- setdiff(L, positive)
  if (length(L) > 2)
    stop("only binary classification supported yet")
  type <- task$task.desc$type
  cv.lrn <- paste(type, algo, sep=".")
  ## define the learner: set it up for predicting probabilities
  lrn <- makeLearner(cv.lrn, predict.type = "prob", ...)
  if (verbose)
    print(lrn)
  task <- removeConstantFeatures(task)

  Y <- data[ ,tgt]
  N <- length(Y)
  N.pos <- sum(Y == positive)
  ## define a cost matrix from the class priors when none is given
  if (is.null(costs))
    costs <- matrix(c(cTP=0, cFP=N.pos/N, cFN=(N-N.pos)/N, cTN=0), 2)
  else
    stopifnot(is.matrix(costs), ncol(costs) == 2, nrow(costs) == 2)
  colnames(costs) <- rownames(costs) <- task$task.desc$class.levels
  if (verbose){
    cat("cost matrix \n")
    print(costs)
  }
  ## theoretical threshold for the positive class: FP cost / (FP cost + FN cost)
  th <- costs[2,1]/(costs[2,1] + costs[1,2])

  metrics <- list(gmean, f1, ppv, tpr) #mlr::auc
  if (nCV > 1) #change the aggregation method in the case of repeated CV
    metrics <- lapply(metrics, function(x) setAggregation(x, testgroup.mean))

  ## ---- model trained once on the full dataset (no CV) ----------------------
  mod <- train(lrn, task)
  ## test on the training set itself
  pred <- predict(mod, newdata=data)
  pred.all <- setThreshold(pred, th)
  res.all <- mlr::performance(pred.all, metrics, task)
  prob.all <- getProbabilities(pred)

  ## ---- cross-validation setup ----------------------------------------------
  if (nCV == 1)
    rdesc <- makeResampleDesc("CV", iters = nFold, stratify = TRUE)
  if (nCV > 1)
    rdesc <- makeResampleDesc("RepCV", folds = nFold, reps = nCV, stratify = TRUE)
  ## Create a resample instance based on the task so that all learners
  ## (unbalanced, undersampling, ...) are evaluated on the same folds.
  rin <- makeResampleInstance(rdesc, task = task)

  if (ncore > 1)
    parallelStart("multicore", ncore) #Starting parallelization in mode=multicore with cpus=ncore.
  ## cross-validation without resampling (dataset unbalanced)
  cv <- resample(lrn, task, rin, measures = metrics, models=FALSE, show.info=FALSE)
  if (ncore > 1)
    parallelStop() #Stop parallelization

  predcv <- cv$pred
  ## posterior probability of the positive class for each CV prediction
  probs <- data.frame(id=factor(predcv$data$id), class=predcv$data$truth, iter=predcv$data$iter, prob.unbal=getProbabilities(predcv))
  pred.th <- setThreshold(predcv, th)
  res.th <- mlr::performance(pred.th, metrics, task)
  if (verbose){
    cat("\n Performance with theoretical threshold", th, "\n")
    print(res.th)
  }
  ## sd of the probability of each instance over all the CV repetitions
  sdUnbal <- probSd(pred.th$data, positive)

  ## ---- undersampling repeated for several values of beta -------------------
  betas.under <- betasUnder(data[ ,tgt], positive, 5, "perc")
  res.under <- res.ucal <- NULL
  preds.under <- preds.ucal <- list()
  predobj.under <- list() #full mlr prediction objects, one per beta
  for (b in betas.under){
    ## produce plots only for the first beta
    dirPlot.b <- if (b == betas.under[1]) dirPlot else NA
    res.b <- undersampling(task, lrn, b, metrics, rin, positive, negative, verbose, dirPlot.b, ncore, B=B)
    preds.under[[length(preds.under) + 1]] <- res.b$pred.under$data
    preds.ucal[[length(preds.ucal) + 1]] <- res.b$pred.cal$data
    ## BUGFIX: keep the whole prediction object per beta; the original reused
    ## `res.b` after the loop, i.e. always the LAST beta's object.
    predobj.under[[length(predobj.under) + 1]] <- res.b$pred.under
    prob.b <- res.b$prob[ ,-1]
    colnames(prob.b) <- paste(colnames(prob.b), b, sep="_")
    probs <- cbind(probs, prob.b)
    res.under <- rbind(res.under, res.b$under)
    res.ucal <- rbind(res.ucal, res.b$cal)
  }
  row.names(res.under) <- paste("res.under.b", betas.under, sep="-")
  row.names(res.ucal) <- paste("res.ucal.b", betas.under, sep="-")
  names(preds.under) <- paste("pred.under.b", betas.under, sep="-")
  names(preds.ucal) <- paste("pred.ucal.b", betas.under, sep="-")
  sdUnder <- sapply(preds.under, function(x) probSd(x, positive))
  sdUcal <- sapply(preds.ucal, function(x) probSd(x, positive))
  sdRatio <- sdUnder/sdUnbal

  ## probability of having dps > sdRatio using the probabilities obtained without CV
  dpsMat <- matrix(NA, nrow=length(prob.all), ncol=length(betas.under))
  for (b in seq_along(betas.under)){
    dpsMat[ ,b] <- dps(prob.all, betas.under[b])
  }
  higher <- dpsMat >= sdRatio
  probHigher <- apply(higher, 2, function(x) mean(x, na.rm=TRUE))
  cat("\n probability of the derivative being higher \n")
  names(probHigher) <- sub("pred.under.b", "beta", names(probHigher))
  print(probHigher)

  ## ---- performances on instances where the derivative is higher/lower ------
  resBoth <- resLow <- resHigh <- NULL
  for (b in seq_along(betas.under)){
    higher.b <- higher[ ,b]
    preds.under.b <- preds.under[[b]]
    ## instances where the derivative dominates the sd ratio
    id.high <- which(higher.b)
    if (length(id.high) > 0){
      pred.under.high <- subset(preds.under.b, id %in% id.high)
      pred.unbal.high <- subset(pred.th$data, id %in% id.high)
      dd.under <- predobj.under[[b]]
      dd.under$data <- pred.under.high
      ## NOTE(review): the original passed an undefined `task.under` here; the
      ## measures in `metrics` never force the task argument, so `task` is safe.
      res.under.high <- mlr::performance(dd.under, metrics, task)
      dd.unbal <- pred.th
      ## BUGFIX: original wrote `dd.under$data <- pred.unbal.high` and scored
      ## `dd.under`, leaving `dd.unbal` unused and scoring the unbalanced
      ## predictions inside the undersampled prediction object.
      dd.unbal$data <- pred.unbal.high
      res.unbal.high <- mlr::performance(dd.unbal, metrics, task)
      res.high <- data.frame(rbind(res.unbal.high, res.under.high))
      res.high$type <- c("unbal", "under")
      res.high$beta <- betas.under[b]
      resHigh <- rbind(resHigh, res.high)
    }
    id.low <- which(!higher.b)
    if (length(id.low) > 0){
      pred.under.low <- subset(preds.under.b, id %in% id.low)
      pred.unbal.low <- subset(pred.th$data, id %in% id.low)
      dd.under <- predobj.under[[b]]
      dd.under$data <- pred.under.low
      res.under.low <- mlr::performance(dd.under, metrics, task)
      dd.unbal <- pred.th
      dd.unbal$data <- pred.unbal.low #BUGFIX: was `dd.under$data <- ...`
      res.unbal.low <- mlr::performance(dd.unbal, metrics, task)
      res.low <- data.frame(rbind(res.unbal.low, res.under.low))
      res.low$type <- c("unbal", "under")
      res.low$beta <- betas.under[b]
      resLow <- rbind(resLow, res.low)
    }
    ## all instances together
    dd.under <- predobj.under[[b]]
    dd.under$data <- preds.under.b
    res.under.both <- mlr::performance(dd.under, metrics, task)
    res.unbal.both <- mlr::performance(pred.th, metrics, task)
    res.both <- data.frame(rbind(res.unbal.both, res.under.both))
    res.both$type <- c("unbal", "under")
    res.both$beta <- betas.under[b]
    resBoth <- rbind(resBoth, res.both)
  }
  ## guard against the degenerate case where every instance fell on one side,
  ## which would make the original unconditional rbind fail on NULL blocks
  resALL <- NULL
  if (!is.null(resHigh))
    resALL <- rbind(resALL, data.frame(resHigh, high=TRUE))
  if (!is.null(resLow))
    resALL <- rbind(resALL, data.frame(resLow, high=FALSE))
  resALL <- rbind(resALL, data.frame(resBoth, high="ALL"))
  resALL$high <- factor(resALL$high)
  meltResALL <- reshape2::melt(resALL, id.vars = c("type", "beta", "high"), variable.name = "metric")

  ## ---- diagnostic plots ----------------------------------------------------
  if (!is.na(dirPlot)){
    ## density of the posterior probability of the positive class
    dd <- predcv$data
    colnames(dd)[which(colnames(dd)==paste("prob", positive, sep = "."))] <- "prob.1"
    pp <- as.numeric(quantile(prob.all, c(0.25, 0.5, 0.75)))
    if (all(pp < 0.001))
      pp <- round(pp, 3)
    d <- ggplot(dd, aes(x=prob.1, fill=truth)) + geom_density()
    d <- d + labs(list(title = paste0(task.id, '\n'), x = "\n p(+|x)"))
    for (q in pp)
      d <- d + geom_vline(xintercept = q, colour="blue", linetype = "longdash")
    ggsave(filename=paste0(dirPlot, "/dens_beta_1.pdf"), plot=d)
    ggsave(filename=paste0(dirPlot, "/dens_beta_1.svg"), plot=d)
    ## derivative dps and mean sd-ratio as a function of beta
    sdRatioMean <- apply(sdRatio, 2, function(x) mean(x[is.finite(x)]))
    vardd <- NULL
    for (z in seq_along(pp))
      vardd <- rbind(vardd, data.frame(beta=betas.under, sdRatioMean, p=pp[z], dps=dps(pp[z], betas.under)))
    vardd$p <- factor(vardd$p)
    p <- ggplot(data = vardd, aes(x=beta, y = dps, colour=p))
    p <- p + geom_line() + ggtitle(paste0(task.id, "\n"))
    p <- p + geom_hline(yintercept = 1, colour="blue", linetype = "longdash")
    p <- p + geom_line(aes(x=beta, y = sdRatioMean), colour="black", linetype = "longdash")
    p <- p + ylim(0, max(vardd$dps, vardd$sdRatioMean)) + theme_bw()
    ggsave(filename=paste0(dirPlot, "/dps_",task.id,".pdf"), plot=p)
    ggsave(filename=paste0(dirPlot, "/dps_",task.id,".svg"), plot=p)
  }

  ## NOTE(review): `mdd` below is computed but never used or returned; kept
  ## for fidelity with the original — confirm whether it was meant to be
  ## added to the returned list.
  m.var <- setdiff(colnames(probs), c("class", "id", "iter"))
  mdd <- reshape2::melt(probs, measure.vars = m.var, variable.name = "prob")
  mdd$beta <- as.numeric(sapply(strsplit(as.character(mdd$prob), "_", fixed=TRUE), function(x) x[2]))
  mdd$type <- sapply(strsplit(as.character(mdd$prob), ".", fixed=TRUE), function(x) x[2]))
  mdd$type <- factor(sapply(strsplit(mdd$type, "_", fixed=TRUE), function(x) x[1]))
  mdd[which(mdd$type=="unbal"), 'beta'] <- 1
  mdd$lambda <- with(mdd, (sqrt(beta)-beta)/(1-beta))

  ## ---- assemble results ----------------------------------------------------
  brier.all <- brierScore(getProbabilities(pred.all), pred.all$data$truth, positive)
  res.all <- c(res.all, beta=1, pi=N.pos/N, brier=brier.all)
  brier.th <- brierScore(getProbabilities(pred.th), pred.th$data$truth, positive)
  res.th <- c(res.th, beta=1, pi=N.pos/N, brier=brier.th)
  results <- rbind(res.all, res.th, res.under, res.ucal)
  m <- colnames(results)
  m <- sub("tpr", "recall", m)
  m <- sub("ppv", "precision", m)
  colnames(results) <- m
  df <- data.frame(results, prob=row.names(results))
  mres <- reshape2::melt(df, id.vars=c("prob", "beta"), variable.name = "metric")
  mres$type <- sapply(strsplit(as.character(mres$prob), ".", fixed=TRUE), function(x) x[2])
  mres$type <- factor(sapply(strsplit(mres$type, "_", fixed=TRUE), function(x) x[1]))
  list(results=results, probs=probs, mres=mres, meltResALL=meltResALL)
}
#' @title Brier Score
#'
#' @description
#' Computes the Brier Score, a measure of probability calibration: the mean
#' squared difference between the predicted probability of the positive class
#' and the observed class indicator (1 = positive, 0 = negative).
#'
#' @param phat probability of an instance belonging to the positive class
#' @param truth class of the instance
#' @param positive value of the positive (minority) class
#'
#' @return Brier Score statistic (lower is better, 0 is perfect calibration)
#'
#' @examples
#' truth <- c(1,1,1,0,0,0,1,0,1,0)
#' phat <- c(0.9,0.8,0.4,0.5,0.3,0.2,0.8,0.3,0.8,0.3)
#' brierScore(phat, truth, 1)
#'
#' @export
brierScore <- function(phat, truth, positive=1) {
  stopifnot(length(phat) == length(truth), positive %in% truth)
  ## 0/1 indicator of the positive class
  indicator <- ifelse(truth == positive, 1, 0)
  squared.error <- (indicator - phat)^2
  mean(squared.error)
}
# NOTE: removed web-page embed boilerplate ("Add the following code to your
# website...") accidentally pasted after the source; it was not R code and
# broke parsing of this file.