# Libraries required by the helpers in this file (assumed from the calls below)
library(dplyr)        # %>%, as_tibble, pull, mutate, group_by, select, arrange
library(data.table)   # data.table, frankv
library(reshape2)     # melt
library(ggplot2)
library(caret)
library(doParallel)   # registerDoParallel; also loads parallel (makePSOCKcluster, detectCores)
library(tictoc)       # tic, toc

# Computes AUC on the training set, excluding the partition that the base
# method was trained on (leave-one-partition-out)
auc_rank_train <- function(scores, y, nmethods, x, class1=NULL) {
  # y is a list of per-partition outcome tables; x indexes the held-out partition
  nrows <- unlist(lapply(1:nmethods, function(i) y[[i]] %>% as_tibble() %>% nrow()))
  # 22 is the total number of partitions; drop those beyond the methods in use
  exclude.folds <- setdiff(1:22, 1:nmethods)
  if (length(exclude.folds) > 0) y <- y[-exclude.folds]
  y <- do.call(rbind, y) %>% as_tibble() %>% pull(y)
  cumsum.nrows <- c(0, cumsum(nrows))
  # remove the rows of the partition that method x was trained on
  remove.idx <- (cumsum.nrows[x] + 1):cumsum.nrows[x + 1]
  scores <- scores[-remove.idx]
  na.values <- which(is.na(scores))
  y <- y[-remove.idx]
  if (length(na.values) > 0) {
    y <- y[-na.values]
    scores <- scores[-na.values]
  }
  # validate inputs
  stopifnot(length(scores) == length(y))
  if (is.null(attr(y, 'rho')) || attr(y, 'rho') == 0) { y <- to_label(y, class1=class1) }
  # class sizes attached by to_label()
  N <- attr(y, 'N')
  N1 <- attr(y, 'N1')
  N2 <- attr(y, 'N2')
  mat <- data.table(scores=scores, y=y)
  mat$rank <- frankv(scores, order=-1, ties.method="random")
  # rank-sum (Mann-Whitney) estimate of the AUC
  res <- abs(sum(mat$rank[y == attr(y, 'class1')])/N1 - sum(mat$rank[y == attr(y, 'class2')])/N2)/N + 0.5
  if (res < 0.5) {
    message('... class label might be wrong.')
    res <- 1 - res
  }
  return(res)
}
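
# A minimal, self-contained sketch of the rank-sum AUC identity used above
# (toy data, not part of the package pipeline): with scores ranked in
# decreasing order, the difference of mean ranks between the two classes,
# scaled by N, recovers the Mann-Whitney AUC.
demo_rank_auc <- function() {
  set.seed(1)
  n1 <- 40; n2 <- 60
  y <- c(rep(1, n1), rep(0, n2))
  scores <- c(rnorm(n1, mean = 1), rnorm(n2, mean = 0))  # class 1 scores higher
  rank <- data.table::frankv(scores, order = -1, ties.method = "random")
  auc_ranksum <- abs(sum(rank[y == 1])/n1 - sum(rank[y == 0])/n2)/(n1 + n2) + 0.5
  # brute-force check: fraction of (class1, class0) pairs ordered correctly
  auc_pairs <- mean(outer(scores[y == 1], scores[y == 0], ">"))
  c(ranksum = auc_ranksum, pairwise = auc_pairs)
}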
# TRUE if a vector contains no NA values
not_any_na <- function(x) all(!is.na(x))
# Predicts the class label using the FiDEL parameters estimated on the training set
predict.mtrainer.train <- function(mtrainer, newdata2=NULL, class1=NULL) {
  message(paste0('... predict using ', mtrainer$nmethods, ' base classifiers'))
  if (is.null(class1)) {
    class1 <- mtrainer$fitlist[[1]]$finalModel$obsLevels[1]
  }
  # 22 is the total number of partitions; keep only the first nmethods
  exclude.folds <- setdiff(1:22, 1:mtrainer$nmethods)
  if (length(exclude.folds) > 0) newdata2 <- newdata2[-exclude.folds]
  nrows <- unlist(lapply(1:mtrainer$nmethods, function(i) newdata2[[i]] %>% as_tibble() %>% nrow()))
  cumsum.nrows <- c(0, cumsum(nrows))
  tot.nrow <- sum(nrows)
  mtrainer$predictions <- matrix(nrow=tot.nrow, ncol=mtrainer$nmethods)
  newdata3 <- do.call(rbind, newdata2) %>% as_tibble()
  for (j in 1:mtrainer$nmethods) {
    # hold out the partition that method j was trained on; its rows stay NA
    remove.idx <- (cumsum.nrows[j] + 1):cumsum.nrows[j + 1]
    newdata <- newdata3[-remove.idx, ]
    if (!is.null(newdata)) {
      mtrainer$test_data <- newdata
    }
    stopifnot(!is.null(mtrainer$test_data))
    tmp <- predict(mtrainer$fitlist[j], newdata=mtrainer$test_data, type='prob')
    pred <- tmp[[1]][, class1]
    mtrainer$predictions[-remove.idx, j] <- pred
  }
  colnames(mtrainer$predictions) <- mtrainer$model_list
  return(mtrainer)
}
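
# A minimal sketch (toy data, not the package pipeline) of the block-wise
# filling pattern above: column j stays NA exactly on the rows of partition j,
# so downstream aggregation never scores a method on its own training partition.
demo_loo_matrix <- function(nrows = c(3, 2, 4)) {
  cumsum.nrows <- c(0, cumsum(nrows))
  M <- matrix(nrow = sum(nrows), ncol = length(nrows))
  for (j in seq_along(nrows)) {
    remove.idx <- (cumsum.nrows[j] + 1):cumsum.nrows[j + 1]
    M[-remove.idx, j] <- runif(sum(nrows) - length(remove.idx))  # stand-in scores
  }
  M
}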
# Plots the overall performance of FiDEL, WoC, and the best individual classifier
overall_performance <- function(fde.obj, woc.obj, nmethod_list, nsample, seed, method){
  cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  fde5 <- fde.obj
  woc5 <- woc.obj
  df <- cal_partial_performance(fde5, nmethod_list=nmethod_list, nsample=nsample, seed=seed)
  df2 <- cal_partial_performance(woc5, nmethod_list=nmethod_list, nsample=nsample, seed=seed)
  # the WoC run stores its result under the FiDEL column; copy it over
  df$WoC <- df2$FiDEL
  df <- melt(df, id.vars = 'nmethod', variable.name='method', value.name='AUC')
  tmp <- df %>% group_by(nmethod, method) %>%
    mutate(Performance=mean(AUC)) %>%
    mutate(sd=sd(AUC)) %>%
    mutate(N=length(AUC))
  # report the standard error instead of the standard deviation if requested
  if (method == 'SE') {
    tmp$sd <- tmp$sd/sqrt(tmp$N)
  }
  # point shapes are assigned per method via scale_shape_manual() below
  g <- ggplot(tmp, aes(x=nmethod, y=Performance)) + theme_classic() +
    geom_line(aes(linetype=method, color=method), linewidth=2) +
    geom_errorbar(width=.1, aes(ymin=Performance-sd, ymax=Performance+sd)) +
    geom_point(aes(shape=method), size=2, fill='white') +
    xlab('Number of methods (M)') +
    ylab('Performance (AUC)') +
    theme(legend.position = c(0.75, 0.17)) +
    scale_color_manual(values=c("FiDEL"=cbPalette[3], "WoC"="black", "Best_Indv"="grey70")) +
    scale_shape_manual(values=c("FiDEL"=21, "Best_Indv"=23, "WoC"=25)) +
    labs(color = "Method", shape = "Method") +
    guides(linetype="none")
  g
}
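
# Hypothetical usage (fde.obj / woc.obj are FiDEL and WoC ensemble objects from
# the package pipeline; the argument values here are assumptions):
#   g <- overall_performance(fde.obj, woc.obj, nmethod_list = 2:10,
#                            nsample = 100, seed = 42, method = 'SE')
#   print(g)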
# Takes a FiDEL object and produces the correlation matrix of base-classifier
# ranks of test-set items, combined across the two classes
corrank <- function(object, type="weight"){
  rankmatrix <- object@rank_matrix
  labels <- as.character(object@actual_label)
  labels <- ifelse(labels=="Yes", 1, 0)
  class1freq <- mean(labels)
  ranker <- rankmatrix %>% as_tibble() %>% mutate(labels=labels)
  # C5.0 produces all-equal ranks; break ties randomly
  ranker$C5.0 <- sample(1:dim(ranker)[1], replace=FALSE, size=dim(ranker)[1])
  split.data <- split(ranker, ranker$labels)
  # per-class correlation matrices (C5.0 and the FiDEL aggregate A_FD excluded)
  mats <- lapply(split.data, function(x) x %>% select(-c(labels, C5.0, A_FD)) %>% cor() %>% round(2))
  if (type=="weight"){
    # weight each per-class matrix by its class frequency
    cor_m <- mats$"0"*(1-class1freq) + mats$"1"*class1freq
  }
  if (type=="equal_weight"){
    cor_m <- (mats$"0" + mats$"1")/2
  }
  #cor_m[upper.tri(cor_m)] <- NA
  #melted_cor_m <- reshape2::melt(cor_m, na.rm=TRUE)
  #melted_cor_m
  cor_m
}
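
# A minimal sketch (toy matrices with hypothetical values) of the two
# combination rules above: frequency weighting pulls the result toward the
# majority class's correlation structure, while equal weighting averages the
# two classes.
demo_combine_cor <- function() {
  m0 <- matrix(c(1, 0.2, 0.2, 1), 2, 2)  # class-0 rank correlations
  m1 <- matrix(c(1, 0.8, 0.8, 1), 2, 2)  # class-1 rank correlations
  rho <- 0.1                             # class-1 frequency
  list(weight = m0 * (1 - rho) + m1 * rho,
       equal_weight = (m0 + m1) / 2)
}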
# Computes the empirical probability of class given rank for each base method.
# Note: compare the output of fidel.fits() with fd.coords() to see how well the
# empirical and analytical probability-of-class-given-rank curves agree.
fidel.fits <- function(t1, traininglist, n.iter=200, n=1000){
  mtrainer <- t1
  # 22 is the total number of partitions; keep only the first nmethods
  exclude.folds <- setdiff(1:22, 1:mtrainer$nmethods)
  folds <- if (length(exclude.folds) > 0) traininglist[-exclude.folds] else traininglist
  # overall class-1 ("Yes") frequency in the training data
  rho <- do.call(rbind, traininglist) %>% as_tibble() %>%
    mutate(p=ifelse(as.character(y)=="Yes", 1, 0)) %>% pull(p) %>% mean()
  pcr.matrix <- matrix(ncol=mtrainer$nmethods, nrow=n)
  for (i in 1:mtrainer$nmethods){
    # leave out the partition that method i was trained on
    newdata3 <- do.call(rbind, folds[-i]) %>% as_tibble()
    split.data <- split(newdata3, newdata3$y)
    yes.data <- split.data$Yes
    no.data <- split.data$No
    pcr <- vector("numeric", n)
    nclass1 <- round(n*rho)
    nclass0 <- n - nclass1
    for (j in 1:n.iter){
      # resample a test set of size n with class-1 frequency rho
      idx.rows.class0 <- sample(x = dim(no.data)[1], size = nclass0, replace = FALSE)
      idx.rows.class1 <- sample(x = dim(yes.data)[1], size = nclass1, replace = FALSE)
      data <- rbind(no.data[idx.rows.class0, ], yes.data[idx.rows.class1, ]) %>% as_tibble()
      true.labels <- data$y
      tmp <- predict(mtrainer$fitlist[i], newdata=data, type='prob')
      # rank 1 = highest predicted probability of "Yes"
      pred <- 1 + n - (tmp[[1]][, "Yes"] %>% frankv(ties.method="random"))
      class_ranked <- data.frame(pred=pred, true=true.labels) %>% as_tibble() %>%
        arrange(pred) %>% mutate(t=ifelse(as.character(true)=="Yes", 1, 0)) %>% pull(t)
      pcr <- pcr + class_ranked
    }
    # average over iterations: empirical P(class 1 | rank)
    pcr <- pcr / n.iter
    pcr.matrix[,i] <- pcr
  }
  pcr.matrix
}
# Fermi-Dirac distribution: P(class 1 | rank r) = 1 / (1 + exp(beta * (r - mu)))
fd <- function(x, beta, mu){
  r <- seq_len(x)
  1 / (1 + exp(beta * (r - mu)))
}
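
# A quick numeric illustration with hypothetical beta/mu values: the curve is
# near 1 for top ranks, crosses 0.5 at r = mu, and decays toward 0; the sum of
# the probabilities estimates the expected number of class-1 items, with mu
# acting like a chemical potential fixed by the class-1 count.
demo_fd <- function(beta = 0.02, mu = 300, n = 1000) {
  p <- fd(x = n, beta = beta, mu = mu)
  round(c(p_first = p[1], p_at_mu = p[mu], p_last = p[n], expected_class1 = sum(p)), 3)
}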
# Computes the Fermi-Dirac rank curve for each base classifier, given the beta
# and mu values estimated from the training set
fd.coords <- function(fde4, pcr.matrix){
  lapply(seq_along(fde4@beta), function(i) fd(x=dim(pcr.matrix)[1], beta=fde4@beta[i], mu=fde4@mu[i]))
}
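
# Hypothetical comparison sketch (t1, traininglist, and the fde4 object are
# assumed to come from the FiDEL pipeline): overlay the empirical
# probability-of-class-given-rank from fidel.fits() on the analytical
# Fermi-Dirac curves from fd.coords() for a visual goodness-of-fit check.
#   pcr.matrix <- fidel.fits(t1, traininglist, n.iter = 200, n = 1000)
#   curves <- fd.coords(fde4, pcr.matrix)
#   plot(pcr.matrix[, 1], col = "grey60", pch = 16, cex = 0.4,
#        xlab = "rank", ylab = "P(class 1 | rank)")
#   lines(curves[[1]], lwd = 2)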
# Helpers for training base classifiers on the Springleaf Marketing dataset
caret_train_slm <- function(method, mtrainer, formula, tr_data, n_cores) {
  message(paste0('Training algorithm: ', method, ' with ', n_cores, ' cores'))
  flush.console()
  cl <- makePSOCKcluster(n_cores, setup_strategy="sequential")
  registerDoParallel(cl)
  # gbm and nnet get extra verbose/threshold arguments passed through to the
  # underlying fitters; nnet also uses the full y ~ . formula
  if (method %in% c('gbm', 'nnet')) {
    if (method == "nnet") {
      fit <- caret::train(y~., data=tr_data, method='nnet', trControl=mtrainer$control, metric="ROC", tuneLength=4, preProc = c("center", "scale"), verbose=FALSE, threshold=0.3)
    } else {
      fit <- caret::train(formula, data=tr_data, method=method, trControl=mtrainer$control, metric="ROC", tuneLength=4, preProc = c("center", "scale"), verbose=FALSE, threshold=0.3)
    }
  } else {
    fit <- caret::train(formula, data=tr_data, method=method, trControl=mtrainer$control, metric="ROC", tuneLength=4, preProc = c("center", "scale"))
  }
  stopCluster(cl)
  return(fit)
}
train.mtrainer.slm <- function(mtrainer, formula, data_list, update=FALSE, n_cores=-1) {
  if (n_cores == -1) n_cores <- detectCores() - 1
  fname <- paste0(mtrainer$dataInfo, '.RData')
  # reuse cached fits unless an update is requested
  if (file.exists(fname) && !update) { mtrainer$fitlist <- readRDS(fname) }
  tic(paste0('... train model with ', mtrainer$nmethods, ' algorithms'))
  for (i in 1:mtrainer$nmethods) {
    if (mtrainer$model_list[i] %in% names(mtrainer$fitlist) && !update) {
      message(paste0('... using cached result: ', mtrainer$model_list[i]))
    } else {
      # a single data set is shared by all methods; otherwise one per method
      if (length(data_list) == 1) {
        fit <- caret_train_slm(mtrainer$model_list[i], mtrainer, formula, data_list[[1]], n_cores)
      } else {
        fit <- caret_train_slm(mtrainer$model_list[i], mtrainer, formula, data_list[[i]], n_cores)
      }
      fitlist <- list(fit)
      names(fitlist) <- c(mtrainer$model_list[i])
      mtrainer$fitlist <- append(mtrainer$fitlist, fitlist)
    }
    # checkpoint after every model so a crash does not lose finished fits
    saveRDS(mtrainer$fitlist, file = fname)
  }
  mtrainer$nmethods <- length(mtrainer$fitlist)
  toc()
  mtrainer
}
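
# Hypothetical end-to-end sketch (the mtrainer constructor and fold_list are
# assumed package-side names): train the base classifiers on per-method data
# partitions, then score them out-of-partition with the helpers above.
#   t1 <- mtrainer(c('rf', 'gbm', 'nnet'), dataInfo = 'slm')
#   t1 <- train.mtrainer.slm(t1, y ~ ., data_list = fold_list, n_cores = 4)
#   t1 <- predict.mtrainer.train(t1, newdata2 = fold_list)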