# Roxygen documentation
#' Bastiat, please estimate this classifier.
#'
#' @return A list including the classifier object plus a performance data frame.
#' @references www.globaltradealert.org
#' @author Johannes Fritz for GTA.
# Function infos and parameters --------------------------------------------
bt_estimate_classifier = function(training.data=NULL,
training.id="bid",
training.y="evaluation",
estimation.model="XGB",
robustness.turns=0,
train.share=.82,
update.log=F,
detective.name=NULL,
detective.number=NULL,
max.training.rows = 15000
){
library(SuperLearner)
if(is.null(training.data)){stop("Please provide training data.")}
classifier.performance=data.frame()
algorithm.dictionary=data.frame(short=c("XGB","KNN","RNF", "SVM"),
full=c("SL.xgboost", "SL.kernelKnn", "SL.randomForest", "SL.ksvm"),
stringsAsFactors = F)
if(is.null(estimation.model)| (! estimation.model %in% c("XGB","KNN","RNF", "SVM"))){
stop("Unknown estimation model specified.")
}
#to stop my computer dying
#reduces the size of the data according to the param.
if(nrow(training.data) > max.training.rows){
training.data = training.data[sample(nrow(training.data), max.training.rows),]
message(paste("training data sized reduced to", max.training.rows, "rows"))
}
for(model in estimation.model){
sl.model=algorithm.dictionary$full[algorithm.dictionary$short==model]
this.performance=data.frame()
for(round in 1:(robustness.turns+1)){
print(paste("Start estimation round",round,"for model", model))
train.split=sample(unique(training.data[,c(training.id)]), ceiling(nrow(training.data)*train.share))
eval(parse(text = paste("train.x =subset(training.data, ",training.id," %in% train.split)",sep="")))
eval(parse(text = paste("test.x =subset(training.data, !",training.id," %in% train.split)",sep="")))
test.y=test.x[,c(training.y)]
test.x=test.x[,setdiff(names(test.x),c(training.id, training.y))]
train.y=train.x[,c(training.y)]
train.x=train.x[,setdiff(names(train.x),c(training.id, training.y))]
## estimation
classifier = SuperLearner(train.y,train.x, family = binomial(),
SL.library = sl.model)
pred.train=data.frame(obs=train.y, pred=predict(classifier, train.x)$pred[,1])
pred.test= data.frame(obs=test.y, pred=predict(classifier, test.x)$pred[,1])
## stats
c.train=b_cutoff_probability(observations = pred.train$obs, predictions = pred.train$pred)
c.test=b_cutoff_probability(observations = pred.test$obs, predictions = pred.test$pred)
capture=nrow(subset(subset(pred.test, obs==1), pred>=c.train))/nrow(subset(pred.test, obs==1))
reduction=1-nrow(subset(pred.test, obs==0& pred>=c.train))/nrow(subset(pred.test, obs==0))
capture.adjusted=nrow(subset(subset(pred.test, obs==1), pred>=c.test))/nrow(subset(pred.test, obs==1))
reduction.adjusted=1-nrow(subset(pred.test, obs==0 & pred>=c.test))/nrow(subset(pred.test, obs==0))
this.performance=rbind(this.performance,
data.frame(algorithm=sl.model,
score=b_score_me(capture, reduction),
score.adjusted=b_score_me(capture.adjusted, reduction.adjusted),
observed.cutoff=c.train,
assumed.cutoff=c.test,
cutoff.ratio=c.test/c.train,
capture=nrow(subset(subset(pred.test, obs==1), pred>=c.train))/nrow(subset(pred.test, obs==1)),
reduction=1-nrow(subset(pred.test, obs==0& pred>=c.train))/nrow(subset(pred.test, obs==0)),
capture.adjusted=nrow(subset(subset(pred.test, obs==1), pred>=c.test))/nrow(subset(pred.test, obs==1)),
reduction.adjusted=1-nrow(subset(pred.test, obs==0 & pred>=c.test))/nrow(subset(pred.test, obs==0)),
stringsAsFactors = F))
print(paste("Concluded estimation round",round,"for model", model))
}
classifier.performance=rbind(classifier.performance,
data.frame(algorithm=sl.model,
score=mean(this.performance$score),
score.adjusted=mean(this.performance$score.adjusted),
observed.cutoff=mean(this.performance$observed.cutoff),
assumed.cutoff=min(this.performance$observed.cutoff)*mean(this.performance$cutoff.ratio),
cutoff.ratio=mean(this.performance$cutoff.ratio),
cutoff.ratio.sd=sd(this.performance$cutoff.ratio),
capture=mean(this.performance$capture),
capture.sd=sd(this.performance$capture),
reduction=mean(this.performance$reduction),
reduction.sd=sd(this.performance$reduction),
capture.adjusted=mean(this.performance$capture.adjusted),
capture.adjusted.sd=sd(this.performance$capture.adjusted),
reduction.adjusted=mean(this.performance$reduction.adjusted),
reduction.adjusted.sd=sd(this.performance$reduction.adjusted),
stringsAsFactors = F))
if(classifier.performance$score[classifier.performance$algorithm==sl.model]>=classifier.performance$score.adjusted[classifier.performance$algorithm==sl.model]){
cutoff=classifier.performance$observed.cutoff[classifier.performance$algorithm==sl.model]
} else {
cutoff=classifier.performance$assumed.cutoff[classifier.performance$algorithm==sl.model]
}
}
## storing the information and/or classifier, if called for
if(update.log){
if(is.null(detective.name) & is.null(detective.number)){
print("Please specify detective.")
} else {
load("content/0 core/Classifier statistics & history.Rdata")
if(is.null(detective.number)){
detective.number=max(subset(model, name==detective.name)$detective.no)
}
if(is.null(detective.name)){
detective.name=subset(model, detective.no==detective.number)$name
}
}
if(is.null(detective.number)){print("Need detective number to store result. Result NOT stored.")} else {
load("content/0 core/Classifier statistics & history.Rdata")
new.stats=classifier.performance
new.stats$detective.no=detective.number
new.stats$date=Sys.Date()
stats=bt_bind(stats,
new.stats[,names(new.stats)[names(new.stats) %in% names(stats)]])
save(model, stats, file="content/0 core/Classifier statistics & history.Rdata")
}
}
output.list<- list("performance"=classifier.performance,
"classifier"=classifier,
"cutoff"=cutoff)
return(output.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.