# ensemble.R - ensemble object using FD distribution
#
# Sungcheol Kim @ IBM
# 2020/03/10
#
# version 1.0
# version 1.1 - revision with beta and mu
# version 1.2 - 2020/3/26 - correlation plot
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(tictoc)
library(data.table)
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
setClass("FiDEL",
representation(predictions = "matrix",
logit_matrix = "matrix",
rank_matrix = "matrix",
beta = "numeric",
mu = "numeric",
rstar = "numeric",
nsamples = "numeric",
nmethods = "numeric",
method_names = "character",
ensemble_auc = "numeric",
actual_performance = "numeric",
actual_prevalence = "numeric",
actual_label = "numeric",
estimated_performance = "numeric",
estimated_prevalence = "numeric",
estimated_logit = "numeric",
estimated_rank = "numeric",
estimated_label = "factor",
estimated_prob = "numeric"
))
setMethod("initialize", "FiDEL", function(.Object, predictions, ...) {
#cat("... FiDEL initializer \n")
.Object@predictions <- predictions
.Object@nsamples <- nrow(predictions)
.Object@nmethods <- ncol(predictions)
# set base classifier names
if (is.null(colnames(predictions))) {
colnames(predictions) <- 1:.Object@nmethods
}
.Object@method_names <- colnames(predictions)
names(.Object@method_names) <- colnames(predictions)
#if ties allowed
#.Object@logit_matrix <- apply(predictions, 2, frankv, ties.method="random")
#.Object@rank_matrix <- apply(predictions, 2, frankv, ties.method="random")
.Object@logit_matrix <- apply(predictions, 2, frankv, ties.method="random")
.Object@rank_matrix <- apply(predictions, 2, frankv, ties.method="random")
.Object@estimated_label <- as.factor(c('class1', rep('class2', .Object@nsamples-1)))
return(.Object)
})
fde <- FiDEL <- function(predictions, actual_label) {
obj <- new (Class="FiDEL", predictions = predictions)
if (!missing(actual_label)) {
message('... FiDEL with actual_label')
calculate_performance(obj, actual_label)
} else {
message('... FiDEL with only predictions')
obj
}
}
setMethod("show", "FiDEL", function(object) {
cat("# of methods: ", object@nmethods, "\n",
"# of samples: ", object@nsamples)
})
setValidity("FiDEL", function(object) {
if (object@nsamples == dim(object@predictions)[1]) {
return (TRUE)
} else {
return ("predictions should have dimensions as nsamples x nmethods")
}
})
setGeneric("calculate_performance", function(.Object, actual_label, ensemble_method="FiDEL", alpha=1) {standardGeneric("calculate_performance")})
setMethod("calculate_performance", "FiDEL", function(.Object, actual_label, ensemble_method="FiDEL", alpha=1) {
tic(sprintf('... N:%d, M:%d', .Object@nsamples, .Object@nmethods))
# save label data
.Object@actual_label <- actual_label
# calculate auc using labels
.Object@actual_performance <- apply(.Object@predictions, 2, auc_rank, actual_label)
for (i in seq_along(.Object@actual_performance)) {
.Object@method_names[i] <- paste0(.Object@method_names[i], '\n', 'A=',
round(.Object@actual_performance[i], digits=3))
}
.Object@actual_performance[round(.Object@actual_performance, 3)==.500] <- .501
#rescaling of fidel parameters
if (ensemble_method=="FiDEL"){
.Object@beta <- .Object@beta / dim(.Object@predictions)[1]
}
if (ensemble_method=="WoC"){
.Object@beta <- rep(1, length(.Object@beta))
}
.Object@mu <- .Object@mu * dim(.Object@predictions)[1]
.Object@rstar <- .Object@rstar * dim(.Object@predictions)[1]
# calculate new rank scores
colnames(.Object@logit_matrix) <- colnames(.Object@predictions)
.Object@logit_matrix <- apply(.Object@predictions, 2, frankv)
for (m in seq(1, .Object@nmethods)) {
.Object@logit_matrix[, m] <- .Object@beta[m]^alpha *(.Object@rstar[m] - .Object@logit_matrix[, m])
}
.Object@estimated_logit <- -rowSums(.Object@logit_matrix)
.Object@estimated_prob <- 1/(1+exp(-.Object@estimated_logit))
#if ties allowed
#.Object@estimated_rank <- frankv(.Object@estimated_logit, ties.method='random')
.Object@estimated_rank <- frankv(.Object@estimated_logit)
if (ncol(.Object@rank_matrix) != ncol(.Object@predictions)) {
.Object@rank_matrix <- .Object@rank_matrix[, -ncol(.Object@rank_matrix)]
}
#colnames(.Object@rank_matrix) <- classifier_names
.Object@rank_matrix <- cbind(.Object@rank_matrix, A_FD=.Object@estimated_rank)
# calculate ensemble AUC
.Object@ensemble_auc <- auc_rank(.Object@estimated_logit, actual_label)
cat('... Ensemble AUC:', .Object@ensemble_auc, '\n')
cat('... AUC list:', .Object@actual_performance, '\n')
cat('... beta list:', .Object@beta, '\n')
cat('... mu list:', .Object@mu, '\n')
toc()
return(.Object)
})
#not loading when sourcing R for some reason!!
setGeneric("predict_performance", function(.Object, actual_performance, p, nrow.train, alpha=1) {standardGeneric("predict_performance")})
setMethod("predict_performance", "FiDEL", function(.Object, actual_performance, p, nrow.train, alpha=1) {
tic(sprintf('... N:%d, M:%d', .Object@nsamples, .Object@nmethods))
# calculate auc using labels
.Object@actual_performance <- actual_performance
.Object@estimated_performance <- actual_performance
for (i in seq_along(.Object@actual_performance)) {
.Object@method_names[i] <- paste0(.Object@method_names[i], '\n', 'A=',
round(.Object@actual_performance[i], digits=3))
}
#any value exactly .5 is set to .501 for numerical stability issues
.Object@actual_performance[round(.Object@actual_performance, 3)==.500] <- .501
.Object@actual_prevalence <- mean(p)
b_list <- unlist(lapply(1:.Object@nmethods, function(x) get_fermi(.Object@actual_performance[[x]], rho=p[x], N=1)))
b_listdataframe <- do.call(rbind, b_list %>% as.list()) %>% as.data.frame()
b_listdataframe$name <- rownames(b_listdataframe)
b_listdataframe <- b_listdataframe %>% as_tibble()
.Object@beta <- b_listdataframe %>% as_tibble() %>% filter(grepl("beta", name)) %>% pull(V1)
.Object@mu <- b_listdataframe %>% as_tibble() %>% filter(grepl("mu", name)) %>% pull(V1)
.Object@rstar <- b_listdataframe %>% as_tibble() %>% filter(grepl("rs", name)) %>% pull(V1)
#cat('... Ensemble AUC:', .Object@ensemble_auc, '\n')
cat('... AUC list:', .Object@actual_performance, '\n')
cat('... beta list:', .Object@beta, '\n')
cat('... mu list:', .Object@mu, '\n')
toc()
return(.Object)
})
setGeneric("plot_FDstatistics", function(.Object) {standardGeneric("plot_FDstatistics")})
setMethod("plot_FDstatistics", "FiDEL", function(.Object) {
df <- data.frame(x=.Object@actual_performance, b=.Object@beta, m=.Object@mu, rs=.Object@rstar)
g1 <- ggplot(data = df) + geom_point(aes(x=x, y=b)) + xlab('AUC') + ylab('beta') + theme_classic()
g2 <- ggplot(data = df) + geom_point(aes(x=x, y=m)) + xlab('AUC') + ylab('mu') + theme_classic()
g3 <- ggplot(data = df) + geom_point(aes(x=x, y=rs)) + xlab('AUC') + ylab('R_star') + theme_classic()
g <- ggarrange(g1, g2, g3, labels=c("A", "B", "C"), ncol=3 , nrow=1)
print(g)
})
setGeneric("plot_cor", function(.Object, filename='cor.pdf', ...) {standardGeneric("plot_cor")})
setMethod("plot_cor", "FiDEL", function(.Object, filename='cor.pdf', class_flag='all', legend_flag=FALSE) {
if (length(.Object@actual_label) == 0) {
colorder <- order(.Object@estimated_performance)
colorder <- c(colorder, .Object@nmethods+1)
if (class_flag == 'all') { cor_m <- cor(.Object@rank_matrix) }
if (class_flag == 'positive') {
cor_m <- cor(.Object@rank_matrix[.Object@estimated_label == attr(.Object@estimated_label, 'class1'), colorder])
}
if (class_flag == 'negative') {
cor_m <- cor(.Object@rank_matrix[.Object@estimated_label != attr(.Object@estimated_label, 'class1'), colorder])
}
} else {
colorder <- order(.Object@actual_performance)
colorder <- c(colorder, .Object@nmethods+1)
if (class_flag == 'all') { cor_m <- cor(.Object@rank_matrix) }
if (class_flag == 'positive') {
cor_m <- cor(.Object@rank_matrix[.Object@actual_label == attr(.Object@actual_label, 'class1'), colorder])
}
if (class_flag == 'negative') {
cor_m <- cor(.Object@rank_matrix[.Object@actual_label != attr(.Object@actual_label, 'class1'), colorder])
}
}
cor_m[upper.tri(cor_m)] <- NA
melted_cor_m <- reshape2::melt(cor_m, na.rm=TRUE)
g <- ggplot(melted_cor_m, aes(Var1, Var2)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(mid = "white", low = cbPalette[6], high=cbPalette[7], midpoint=0, limit=c(-1,1)) +
theme_grey(base_size = 10) + labs(x = "", y = "") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1,size = 9, hjust = 1)) +
geom_text(aes(x=Var1, y=Var2, label=round(value, digits = 3)), size=2) +
ggtitle(class_flag)
if (!legend_flag) {
g <- g + theme(legend.position = "none")
}
ggsave(filename, g, width=8, height=7)
print(g)
#return(cor_m)
})
setGeneric("plot_ensemble", function(.Object, filename='ens.pdf', ...) {standardGeneric("plot_ensemble")})
setMethod("plot_ensemble", "FiDEL", function(.Object, filename='ens.pdf', method='auc', alpha=0.95, amax=0) {
# prepare data
method_list = c('auc', 'asof', 'correlation', 'random', 'invauc')
if (!(method %in% method_list)) {
cat('... possible options for method: ', method_list)
return (FALSE)
}
if (method == 'auc') {
order_index = order(.Object@actual_performance)
} else if (method == 'invauc') {
order_index = order(.Object@actual_performance, decreasing = TRUE)
} else if (method == 'asof') {
order_index = 1:.Object@nmethods
} else if (method == 'correlation') {
order_index = cal_least_cor_list(.Object)
} else if (method == 'random') {
order_index = sample(1:.Object@nmethods)
}
agg_logit <- t(apply(.Object@logit_matrix[,order_index], 1, cumsum))
agg_auc <- apply(agg_logit, 2, auc_rank, .Object@actual_label)
min_auc <- min(.Object@actual_performance)
max_auc <- max(.Object@actual_performance)
df <- data.table(x=1:.Object@nmethods, agg_auc=agg_auc, algorithm=.Object@method_names[order_index])
# check ensemble auc > max base auc
if (max(agg_auc) > max_auc) {
label_x = .Object@nmethods
hjust = 1
} else {
label_x = 1
hjust = 0
}
print(df)
# generate plot
g <- ggplot(df) + geom_line(aes(x, agg_auc)) +
geom_point(aes(x,agg_auc)) + theme_classic() +
geom_label_repel(aes(x=x, y=agg_auc, label=algorithm),
size=3.5, fill=cbPalette[3], color='white', segment.color='gray',
box.padding = 0.75, alpha=alpha) +
xlab('Number of methods') + ylab('Aggregated AUC') +
ggtitle(paste0('Sequentially Aggregated Classifiers ordered by ', method)) +
geom_hline(yintercept = min_auc, linetype='dashed', color=cbPalette[6], alpha=0.7) +
geom_hline(yintercept = max_auc, linetype='dashed', color=cbPalette[2], alpha=0.7) +
annotate(geom='text', x=.Object@nmethods, y=min_auc+0.001, label='min. of base classifiers', hjust=1, vjust=0) +
annotate(geom='text', x=label_x, y=max_auc-0.001, label='max. of base classifiers', hjust=hjust, vjust=1)
if (amax > 0) {
g <- g + ylim(c(min_auc, amax))
}
ggsave(filename, width=8, height=6)
print(g)
})
setGeneric("plot_single", function(.Object, target, ...) {standardGeneric("plot_single")})
setMethod("plot_single", "FiDEL", function(.Object, target, c, n=100, m=100) {
if (missing(c)) { c <- 0 }
if (c > .Object@nmethods) { c <- .Object@nmethods }
if (c < 0) { c <- 0 }
if (target == 'score') {
if (c == 0) { scores <- .Object@estimated_logit }
else { scores <- .Object@predictions[ , c] }
if (length(.Object@actual_label) > 0) {
g <- plot.scores(scores, .Object@actual_label)
} else {
g <- plot.scores(scores)
}
print(g)
}
if (target == 'roc') {
pcr1 <- pcr(.Object@predictions[ , c], .Object@actual_label, sample_size=n, sample_n=m)
g <- plot.curves(pcr1)
print(g)
}
if (target == 'pcr') {
pcr1 <- pcr(.Object@predictions[ , c], .Object@actual_label, sample_size=n, sample_n=m)
g <- plot.pcr(pcr1)
print(g)
}
})
setGeneric("plot_performance", function(.Object, ...) {standardGeneric("plot_performance")})
setMethod("plot_performance", "FiDEL", function(.Object, nmethod_list=5:7, nsample=20, trendline=FALSE, filename='FiDEL_perf.pdf') {
df <- cal_partial_performance(.Object, nmethod_list=nmethod_list, nsample=nsample)
df$nmethod <- as.factor(df$nmethod)
x_all <- max(.Object@actual_performance)
y_all <- .Object@ensemble_auc
y_final <- mean(y_all)
min_y <- min(c(df$Best_Indv, df$FiDEL, x_all, y_all))
max_y <- max(c(df$Best_Indv, df$FiDEL, x_all, y_all))
g <- ggplot(df, aes(x=Best_Indv, y=FiDEL)) + theme_classic() +
geom_point() + facet_wrap(~nmethod) +
xlab('Best AUC from random sampled methods') + xlim(c(min_y, max_y)) +
ylab('FiDEL AUC') + ylim(c(min_y, max_y)) +
geom_abline(slope=1, linetype='dashed', alpha=0.7)
#annotate(geom="curve", x=x_all, y=y_all, xend=x_all, yend=x_all, curvature=-.3, arrow = arrow(length = unit(2, "mm"))) +
#annotate(geom="text", x=max_y, y=x_all, label='All', hjust=0)
if (trendline)
g <- g + geom_smooth(method=loess)
ggsave(filename, width=11, height=4)
return (g)
})
plot_performance_nmethods <- function(.Object, nmethod_list=5:7, nsample=20, seed=100, method='SE', filename='FiDEL_perf_nmethod.pdf') {
df <- cal_partial_performance(.Object, nmethod_list=nmethod_list, nsample=nsample, seed=seed)
df <- melt(df, id.vars = 'nmethod', variable.name='method', value.name='AUC')
tmp <- df %>% group_by(nmethod, method) %>%
mutate(Performance=mean(AUC)) %>%
mutate(sd=sd(AUC)) %>%
mutate(N=length(AUC))
if (method == 'SE') {
tmp$sd <- tmp$sd/sqrt(tmp$N)
}
#tmp$ci <- tmp$se * qt(conf.interval/2 + .5, tmp$N-1)
tmp$shape <- ifelse(tmp$method == 'FiDEL', "21", "23")
g <- ggplot(tmp, aes(x=nmethod, y=Performance)) + theme_classic() +
geom_line(aes(linetype=method, color=method), size=2) +
geom_errorbar(width=.1, aes(ymin=Performance-sd, ymax=Performance+sd)) +
geom_point(aes(shape=method), size=2, fill='white') +
xlab('Number of methods (M)') +
ylab('Performance (AUC)') +
theme(legend.position = c(0.75, 0.7)) + scale_color_manual(values=c("FiDEL"="black", "Best_Indv"="grey70")) + scale_shape_manual(values=c("FiDEL"=21, "Best_Indv"=23))+ labs(color = "Method", shape = "Method") + guides(linetype=FALSE)
ggsave(filename, width=6, height=4)
return (g)
}
cal_partial_performance <- function(.Object, nmethod_list=5:7, nsample=20, seed=100) {
set.seed(seed)
x <- c()
y <- c()
n <- c()
for (j in nmethod_list) {
for (i in 1:nsample) {
modellist <- sample(1:.Object@nmethods, j)
FDAuc <- auc_rank(-rowSums(.Object@logit_matrix[,modellist]), .Object@actual_label)
BestAuc <- max(.Object@actual_performance[modellist])
x <- c(x, BestAuc)
y <- c(y, FDAuc)
n <- c(n, j)
}
}
df <- data.table(FiDEL=y, Best_Indv=x, nmethod=n)
#df$nmethod <- as.factor(df$nmethod)
return(df)
}
cal_least_cor_list <- function(.Object) {
# reorder by AUC
colorder <- order(.Object@actual_performance, decreasing = TRUE)
# create correlation matrix
cor_m <- abs(cor(.Object@rank_matrix[.Object@actual_label == attr(.Object@actual_label, 'class1'),
colorder]))
#print(cor_m)
idx_lc <- c(which.max(cor_m[1,]), which.min(cor_m[1,]))
cor_m <- cor_m[,-idx_lc]
names_lc <- names(idx_lc)
for (i in seq(2:(.Object@nmethods-2))) {
# find least correlated method
corsum <- colSums(cor_m[idx_lc,])
j <- which.min(corsum)
idx_lc <- c(idx_lc, j)
if (ncol(cor_m) > 2) {
names_lc <- c(names_lc, names(j))
} else {
names_lc <- c(names_lc, names(which.min(corsum)), names(which.max(corsum)))
}
cor_m <- cor_m[,-j]
}
#print(idx_lc)
return(names_lc)
}
# calculate ensemble score using fermi-dirac statistics
ensemble.fermi <- function(predictions, y, alpha = 1.0, method='+', debug.flag = FALSE) {
M <- ncol(predictions)
N <- nrow(predictions)
res <- matrix(0, nrow=N, ncol=M)
fd <- matrix(0, nrow=M, ncol=4)
auclist <- apply(predictions, 2, auc_rank, y)
fd[, 2:4] <- t(sapply(auclist, get_fermi, attr(y, 'rho'), N=N))
fd[, 1] <- t(auclist)
for(m in 1:M) {
res[ , m] <- frankv(predictions[, m])
if (method == '+')
res[ , m] <- -fd[m, 2]^alpha *(fd[m, 4] - res[, m])
else
res[ , m] <- 12*N*(fd[m, 1] - 0.5)/(N*N - 1) *((N+1.)/2.- res[, m])
}
if(debug.flag) {
print(fd)
}
return(rowSums(res))
}
# find eigen vector of estimate \hat{R} of the rank-one matrix R
# leading eigen vector is close to the true one
# adapted from r-summa (https://github.com/learn-ensemble/R-SUMMA/blob/master/R/matrix_calculations.R)
find_eigen <- function(covMatrix, tol=1e-6, niter_max=10000) {
eig <- eigen(covMatrix, symmetric=TRUE)
eig_all <- vector()
l <- 0
iter <- 1
while (abs(l - eig$values[1]) > tol & (iter < niter_max)) {
l <- eig$values[1]
r <- (covMatrix - diag(diag(covMatrix)) + diag(l[1]*eig$vectors[, 1]^2))
eig <- eigen(r, symmetric=TRUE)
eig_all <- c(eig_all, eig$values[1])
iter <- iter + 1
}
return (list(eig=eig, eig_all=eig_all))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.