Say $X$ is a data set with categorical response variable $y$ with $C$ number of categories. We will form an ensemble of base classifiers $f^1, \ldots, f^N$, which we will call $f^{Ave}$. We desire that the classifiers have errors with minimal bias and that the errors are minimally correlated. It is not necessary that the errors are especially small, however. The classifiers don't need to be especially accurate, but they do need to be distinct and unbiased. We will train these classifiers on the data set, and by comparing their individual errors to the ensemble error, we can get an estimate of the informativeness of the data set.
Question: When is the best time to compute the error correlation? Once, after the resampling, as here? Other options are to use the initial errors on each category and averaging or to use the error produced by each fold and averaging.
source("R/bayeserror.R") ## Binary classification data ## 2D Gaussian set.seed(31415) data_2dnormal <- as.data.frame(mlbench::mlbench.2dnormals(5000, 2)) data_2dnormal <- data.matrix(data_2dnormal) data_2dnormal[, 3] <- data_2dnormal[, 3] - 1 # need 0/1 classes instead of 1/2 x <- data_2dnormal[, 1:2] y <- data_2dnormal[, 3] ## We will use three classifiers classifiers <- c(randomForest_classify, naivebayes_classify, svm_classify) bayeserror(x, y, classifiers) set.seed(31415) data(Sonar, package = "mlbench") data_sonar <- data.matrix(Sonar) data_sonar[, ncol(Sonar)] <- data_sonar[, ncol(Sonar)] - 1 # need 0/1 classes instead of 1/2 x <- data_sonar[, -ncol(data_sonar)] y <- data_sonar[, ncol(data_sonar)] ## We will use three classifiers classifiers <- c(randomForest_classify, naivebayes_classify, svm_classify) bayeserror(x, y, classifiers)
Look at the package philentropy
for distance measures, smartsvm
in Python.
print_information_measures<- function(X, ave) { X <- infotheo::discretize(X) ave <- infotheo::discretize(ave) mi <- unlist(lapply(X = X, FUN = function (x) infotheo::mutinformation(x, ave))) ent <- infotheo::entropy(X) multi_mi <- infotheo::multiinformation(X) sum_mi <- sum(mi) avg_mi <- mean(mi) sum_multi <- sum_mi / multi_mi avg_multi <- avg_mi / multi_mi sum_ent <- sum_mi / ent # Maybe? avg_ent <- avg_mi / ent message("MI ", paste(mi, "\n")) message(paste("\nEnt", ent, "\nMulti", multi_mi, "\nSum", sum_mi, "\nAvg", avg_mi, "\nSum Multi", sum_multi, "\nAvg Multi", avg_multi, "\nSum Ent", sum_ent, "\nAvg Ent", avg_ent)) }
library("zeallot") split_vec <- function(x, n) split(x, sample(x %% n) + 1) split_idx <- function(x, n) split_vec(seq_len(nrow(x)), n) data(titanic3, package="PASWR") setDT(titanic3) c(idx_train, idx_eval) %<-% split_idx(titanic3, 2) c(idx_train, idx_eval) %<-% split_idx(titanic3, 2) Xy_calibrate_train <- titanic3[idx_train] X_calibrate_train <- titanic3[idx_train, !"survived"] y_calibrate_train <- titanic3[idx_train][["survived"]] Xy_calibrate_eval <- titanic3[idx_eval] X_calibrate_eval <- titanic3[idx_eval, !"survived"] y_calibrate_eval <- titanic3[idx_eval][["survived"]] fit_rpart <- rpart::rpart(survived ~ sex+age+pclass+sibsp+parch, Xy_calibrate_train, method = "class") rpart.plot::prp(fit_rpart, faclen = 10, varlen = 15, cex = 1.2, box.col = c("red", "lightblue")[fit_rpart$frame$yval], extra = 108, type = 5) pred_rpart <- predict(fit_rpart, Xy_calibrate_eval, type = "class") MLmetrics::ConfusionMatrix(y_calibrate_eval, pred_rpart) MLmetrics::Accuracy(y_calibrate_eval, pred_rpart) MLmetrics::LogLoss(y_calibrate_eval, as.integer(pred_rpart) - 1) MLmetrics::AUC(y_calibrate_eval, as.integer(pred_rpart) - 1) fit_naivebayes <- naivebayes::naive_bayes(x = X_calibrate_train[, .(sex, age, pclass, sibsp, parch)], y = factor(y_calibrate_train), laplace = 0.5) plot(fit_naivebayes, ask = TRUE) pred_naivebayes <- predict(fit_naivebayes, newdata = X_calibrate_eval) MLmetrics::ConfusionMatrix(y_calibrate_eval, pred_naivebayes) MLmetrics::Accuracy(y_calibrate_eval, pred_naivebayes) MLmetrics::LogLoss(y_calibrate_eval, as.integer(pred_naivebayes) - 1) MLmetrics::AUC(y_calibrate_eval, as.integer(pred_naivebayes) - 1) fit_oner <- OneR::OneR(survived ~ sex+age+pclass+sibsp+parch, Xy_calibrate_train) plot(fit_oner, ask = TRUE) pred_oner <- predict(fit_oner, newdata = X_calibrate_eval) MLmetrics::ConfusionMatrix(y_calibrate_eval, pred_oner) MLmetrics::Accuracy(y_calibrate_eval, pred_oner) MLmetrics::LogLoss(y_calibrate_eval, as.integer(pred_oner) - 1) MLmetrics::AUC(y_calibrate_eval, as.integer(pred_oner) - 1) as.data.frame(list(rpart = as.integer(pred_rpart), nb = as.integer(pred_naivebayes), oner = as.integer(pred_oner))) %>% cor
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.