Estimation of the Bayes Error

Procedure

Say $X$ is a data set with a categorical response variable $y$ taking $C$ categories. We will form an ensemble $f^{Ave}$ from base classifiers $f^1, \ldots, f^N$. We want the classifiers' errors to be minimally biased and minimally correlated with one another; the errors do not need to be especially small. In other words, the classifiers need not be especially accurate, but they do need to be distinct and unbiased. We will train these classifiers on the data set, and by comparing their individual errors to the ensemble error, we can estimate the informativeness of the data set.

  1. Partition the data into training and prediction sets. We might use K-fold CV with a relatively large K (or even LOO); again, we want to reduce bias in the errors.
  2. For each fold:
     a. Fit each of the base classifiers $f^1, \ldots, f^N$ on the training set; the ensemble $f^{Ave}$ is determined by these fits.
     b. Make predictions with each base classifier on the prediction set, giving a vector of predictions $P^k_i$ for each classifier $k = 1, \ldots, N$ on fold $i$.
     c. Make predictions for $f^{Ave}$ to get $P^{Ave}_i$: if the base classifiers produce predictive probabilities, average them; if they produce class labels, take the majority vote.
     d. Compute the log-loss (cross-entropy) errors of these predictions, giving $\epsilon^1_i, \ldots, \epsilon^N_i$ and $\epsilon^{Ave}_i$.
     e. Compute the total error of the base classifiers by averaging across classifiers: $\epsilon_{Total, i} = \frac{1}{N} \sum_{k=1}^{N} \epsilon^k_i$.
  3. Average the errors across folds to get $E^1, \ldots, E^{N}$, $E_{Total}$, and $E^{Ave}_{Total}$.
  4. Compute the Normalized Dual Total Correlation of the base classifiers' errors: $$\delta = \operatorname{ND}(E^1, \ldots, E^{N}) = \frac{H(E^1, \ldots, E^{N}) - \sum_k H(E^k \mid E^{-k})}{H(E^{1}, \ldots, E^{N})}$$
  5. Compute the estimated Bayes error (a sketch of steps 4 and 5 in code follows this list): $$E_{Bayes} = \frac{N E^{Ave}_{Total} - ((N - 1) \delta + 1) E_{Total}}{(N - 1)(1 - \delta)}$$
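
As a minimal sketch of steps 4 and 5, suppose the per-fold errors of the base classifiers are collected as the columns of a data frame E. The helper names below (ndtc, estimate_bayes_error) are hypothetical, not the implementation in R/bayeserror.R. The conditional entropies follow from the chain rule, $H(E^k \mid E^{-k}) = H(E^1, \ldots, E^N) - H(E^{-k})$, so everything reduces to joint entropies, which infotheo can estimate after discretization.

## Normalized Dual Total Correlation of the error columns of E (step 4).
ndtc <- function(E) {
    E <- infotheo::discretize(E)            # entropy estimation needs discrete data
    H_joint <- infotheo::entropy(E)         # H(E^1, ..., E^N)
    H_cond <- vapply(seq_along(E), function(k)
        H_joint - infotheo::entropy(E[-k]), # H(E^k | E^{-k}) via the chain rule
        numeric(1))
    (H_joint - sum(H_cond)) / H_joint
}

## Plug-in Bayes error estimate (step 5).
estimate_bayes_error <- function(N, E_ave_total, E_total, delta) {
    (N * E_ave_total - ((N - 1) * delta + 1) * E_total) /
        ((N - 1) * (1 - delta))
}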

Question: When is the best time to compute the error correlation? Once, after the resampling, as here? Alternatives are to compute it from the per-category errors and average, or to compute it from the per-fold errors and average.

source("R/bayeserror.R")

## Binary classification data

## 2D Gaussian
set.seed(31415)
data_2dnormal <- as.data.frame(mlbench::mlbench.2dnormals(5000, 2))
data_2dnormal <- data.matrix(data_2dnormal)
data_2dnormal[, 3] <- data_2dnormal[, 3] - 1 # need 0/1 classes instead of 1/2
x <- data_2dnormal[, 1:2]
y <- data_2dnormal[, 3]

## We will use three classifiers
classifiers <- c(randomForest_classify, naivebayes_classify, svm_classify)
bayeserror(x, y, classifiers)
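
Since the 2D Gaussian problem is synthetic, the true Bayes error is available as a sanity check. Assuming mlbench.2dnormals' default geometry (two unit-variance Gaussians with means at $(\pm\sqrt{2}, 0)$), the Bayes-optimal posterior is $p(y = 1 \mid x) = \operatorname{plogis}(2\sqrt{2}\, x_1)$, so the irreducible log-loss and the misclassification rate can be computed directly. This check is illustrative and not part of the package:

## Monte Carlo estimate of the irreducible log-loss H(Y | X); the
## Bayes-optimal posterior depends only on x1 under the assumed geometry.
x1 <- rnorm(1e6, mean = sample(c(-1, 1), 1e6, replace = TRUE) * sqrt(2))
p <- plogis(2 * sqrt(2) * x1)
mean(-p * log(p) - (1 - p) * log1p(-p)) # irreducible cross-entropy, about 0.2 nats
pnorm(-sqrt(2))                         # Bayes misclassification rate, about 0.079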

set.seed(31415)
data(Sonar, package = "mlbench")
data_sonar <- data.matrix(Sonar)
data_sonar[, ncol(Sonar)] <- data_sonar[, ncol(Sonar)] - 1 # need 0/1 classes instead of 1/2
x <- data_sonar[, -ncol(data_sonar)]
y <- data_sonar[, ncol(data_sonar)]
## We will use three classifiers
classifiers <- c(randomForest_classify, naivebayes_classify, svm_classify)
bayeserror(x, y, classifiers)

Look at the philentropy package for distance measures, and at smartsvm in Python.

print_information_measures <- function(X, ave) {
    ## X: data frame of base-classifier errors; ave: ensemble errors.
    ## infotheo's estimators need discrete data.
    X <- infotheo::discretize(X)
    ave <- infotheo::discretize(ave)
    ## Mutual information between each base classifier's errors and the
    ## ensemble's errors.
    mi <- unlist(lapply(X = X,
                        FUN = function(x)
                            infotheo::mutinformation(x, ave)))
    ent <- infotheo::entropy(X)               # joint entropy of the base errors
    multi_mi <- infotheo::multiinformation(X) # total correlation of the base errors
    sum_mi <- sum(mi)
    avg_mi <- mean(mi)
    ## Candidate normalizations of the mutual information.
    sum_multi <- sum_mi / multi_mi
    avg_multi <- avg_mi / multi_mi
    sum_ent <- sum_mi / ent # Maybe?
    avg_ent <- avg_mi / ent
    message("MI ", paste(mi, collapse = "\n"))
    message(paste("\nEnt", ent,
                  "\nMulti", multi_mi,
                  "\nSum", sum_mi,
                  "\nAvg", avg_mi,
                  "\nSum Multi", sum_multi,
                  "\nAvg Multi", avg_multi,
                  "\nSum Ent", sum_ent,
                  "\nAvg Ent", avg_ent))
}
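
A hypothetical call, with per-fold error vectors E_rf, E_nb, E_svm for the base classifiers and E_ave for the ensemble (illustrative names, not objects defined above):

## Illustrative: per-fold errors gathered for three base classifiers.
errors <- data.frame(rf = E_rf, nb = E_nb, svm = E_svm)
print_information_measures(errors, E_ave)
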
library("zeallot")

## Randomly partition a numeric vector into n roughly equal groups.
split_vec <- function(x, n) split(x, sample(x %% n) + 1)
## Randomly partition the row indices of a table into n roughly equal groups.
split_idx <- function(x, n) split_vec(seq_len(nrow(x)), n)
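
An illustrative check of the splitter (not part of the analysis): the n index sets are disjoint and roughly equal in size.

lengths(split_idx(mtcars, 3)) # e.g. 10 11 11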

library("data.table") # for setDT()
data(titanic3, package = "PASWR")
setDT(titanic3)
c(idx_train, idx_eval) %<-%
    split_idx(titanic3, 2)
Xy_calibrate_train <- titanic3[idx_train]
X_calibrate_train <- titanic3[idx_train, !"survived"]
y_calibrate_train <- titanic3[idx_train][["survived"]]
Xy_calibrate_eval <- titanic3[idx_eval]
X_calibrate_eval <- titanic3[idx_eval, !"survived"]
y_calibrate_eval <- titanic3[idx_eval][["survived"]]

fit_rpart <- rpart::rpart(survived ~ sex+age+pclass+sibsp+parch, Xy_calibrate_train, method = "class")
rpart.plot::prp(fit_rpart, faclen = 10, varlen = 15, cex = 1.2, box.col = c("red", "lightblue")[fit_rpart$frame$yval], extra = 108, type = 5)

pred_rpart <- predict(fit_rpart, Xy_calibrate_eval, type = "class")

## MLmetrics expects predictions first, truth second; log-loss and AUC
## also need predicted probabilities rather than hard labels.
MLmetrics::ConfusionMatrix(pred_rpart, y_calibrate_eval)
MLmetrics::Accuracy(pred_rpart, y_calibrate_eval)
prob_rpart <- predict(fit_rpart, Xy_calibrate_eval, type = "prob")[, 2]
MLmetrics::LogLoss(prob_rpart, y_calibrate_eval)
MLmetrics::AUC(prob_rpart, y_calibrate_eval)

fit_naivebayes <- naivebayes::naive_bayes(x = X_calibrate_train[, .(sex, age, pclass, sibsp, parch)],
                                          y = factor(y_calibrate_train),
                                          laplace = 0.5)

plot(fit_naivebayes, ask = TRUE)

pred_naivebayes <- predict(fit_naivebayes, newdata = X_calibrate_eval)
MLmetrics::ConfusionMatrix(pred_naivebayes, y_calibrate_eval)
MLmetrics::Accuracy(pred_naivebayes, y_calibrate_eval)
## As above, use class probabilities for the probability-based metrics.
prob_naivebayes <- predict(fit_naivebayes, newdata = X_calibrate_eval, type = "prob")[, 2]
MLmetrics::LogLoss(prob_naivebayes, y_calibrate_eval)
MLmetrics::AUC(prob_naivebayes, y_calibrate_eval)

fit_oner <- OneR::OneR(survived ~ sex+age+pclass+sibsp+parch,
                       Xy_calibrate_train)

plot(fit_oner, ask = TRUE)

pred_oner <- predict(fit_oner, newdata = X_calibrate_eval)
MLmetrics::ConfusionMatrix(pred_oner, y_calibrate_eval)
MLmetrics::Accuracy(pred_oner, y_calibrate_eval)
## As above, use class probabilities for the probability-based metrics.
prob_oner <- predict(fit_oner, newdata = X_calibrate_eval, type = "prob")[, 2]
MLmetrics::LogLoss(prob_oner, y_calibrate_eval)
MLmetrics::AUC(prob_oner, y_calibrate_eval)


## Correlation of the classifiers' hard predictions on the evaluation set.
library("magrittr") # for %>%
as.data.frame(list(rpart = as.integer(pred_rpart),
                   nb = as.integer(pred_naivebayes),
                   oner = as.integer(pred_oner))) %>%
  cor()
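
The matrix above correlates the hard predictions, while the procedure is stated in terms of error correlation. Here is the same idea applied to per-observation log-loss contributions, using the probability predictions computed above (per_obs_logloss is a hypothetical helper, not from the package):

per_obs_logloss <- function(p, y) {
    p <- pmin(pmax(p, 1e-15), 1 - 1e-15) # clamp away from 0 and 1 before taking logs
    -(y * log(p) + (1 - y) * log1p(-p))
}
errs <- data.frame(rpart = per_obs_logloss(prob_rpart, y_calibrate_eval),
                   nb    = per_obs_logloss(prob_naivebayes, y_calibrate_eval),
                   oner  = per_obs_logloss(prob_oner, y_calibrate_eval))
cor(errs)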

