This vignette explores how the uncertainty of classification results manifests itself in the CBET.
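As a brief reminder of the coordinates involved (following the entropy-triangle formulation that the entropies package implements): each confusion matrix, with joint distribution P_XY of true class X and predicted class Y, satisfies the balance equation

    DeltaH(P_X · P_Y) + 2 · MI(P_XY) + VI(P_XY) = H(U_X · U_Y)

where DeltaH is the divergence from uniform marginals, MI the mutual information and VI the variation of information. Dividing through by the uniform joint entropy H(U_X · U_Y) yields three non-negative coordinates that sum to one — reported by jentropies below as DeltaH_P, M_P and VI_P — so every evaluation becomes a point in a ternary diagram.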
library(dplyr)     # That infamous Mr. Wickham's! ;)
library(tidyr)     # Tidying tall & wide dataframes. Infamous too! ;)
#library(infotheo) # The functionality provided by this has to be rerouted through entropies
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(caret)     # Adopt caret's Weltanschauung for ML
library(datasets)  # Many different datasets for R: iris
library(vcd)       # Categorical benchmarks: Arthritis
library(mlbench)   # ML benchmarks: Ionosphere, Glass, BreastCancer, Sonar
library(candisc)   # datasets: Wine
#knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
#knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(comment=NA, fig.width=7, fig.height=7)
The inventory of databases to be explored:
# the inventory of databases you can access
dsNames <- c("Ionosphere", "iris", "Glass", "Arthritis", "BreastCancer", "Sonar", "Wine")
classNames <- c("Class", "Species", "Type", "Improved", "Class", "Class", "Cultivar") # Name of class attribute
K <- c(2, 3, 7, 3, 2, 2, 3) # No. of classes in the class variable, as per the database
#classVar <- c(35, 5, 10, 5, 11, 61, 1)
idNumber <- c(NaN, NaN, NaN, 1, 1, NaN, NaN) # Other attributes to dispose of: mainly identifiers.
# If there were more to dispose of, the interface would have to change.

# To select a dataset by name
# Caveat: you have to ensure that the containing package has been attached
evalDataset <- function(dsName){
    dsName <- as.character(dsName)
    switch(dsName, # TODO: improve this way of "invoking" the dataset.
           "iris" =         {data(iris); iris},
           "Ionosphere" =   {data(Ionosphere); Ionosphere},
           "Glass" =        {data(Glass); Glass},
           "Arthritis" =    {data(Arthritis); Arthritis},
           "BreastCancer" = {data(BreastCancer); BreastCancer},
           "Sonar" =        {data(Sonar); Sonar},
           "Wine" =         {data(Wine); Wine}
    ) # This value "FALLS THROUGH"
}

# To find the number of the column holding the class
findClassId <- function(ds, className){which(colnames(evalDataset(ds)) == className)}
findClassCardinality <- function(ds, classId){ # e.g. findClassCardinality("iris", 5)
    length(levels(evalDataset(ds)[, classId]))
}

classIds <- mapply(findClassId, dsNames, classNames) # ordinal of the class attribute
Kreal <- as.vector(mapply(findClassCardinality, dsNames, classNames)) # Actual no. of classes in the class variable
# CAVEAT: Glass actually has only 6 classes, since class 4 examples are missing.
m <- sapply(dsNames, function(n){nrow(evalDataset(n))})     # no. of instances in the dataset
n <- sapply(dsNames, function(n){ncol(evalDataset(n))}) - 1 # no. of features, class excepted
datasets <- data.frame(name=dsNames, className=classNames, classId=classIds, idNumber,
                       K=as.integer(K), n=as.integer(n), m,
                       stringsAsFactors=FALSE) # To prevent casting away of names.
datasets
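As a quick sanity check of these helpers (a minimal sketch, not part of the original inventory), the class column of iris should be the fifth one and hold three levels:

stopifnot(findClassId("iris", "Species") == 5)  # 'Species' is the 5th column of iris
stopifnot(findClassCardinality("iris", 5) == 3) # setosa, versicolor, virginica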
Let's print this information to LaTeX:
library(xtable)
ds4latexing <- datasets %>% dplyr::select(-className, -classId, -idNumber)
row.names(ds4latexing) <- NULL
names(ds4latexing) <- c("Dataset Name", "class card.", "num. features", "num. instances")
thisLatex <- xtable(ds4latexing,
                    caption="Some datasets considered in this study",
                    label="tab:datasets")
align(thisLatex) <- xalign(thisLatex)
thisLatex
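To embed the table in a manuscript, the same object can also be written to disk; a minimal sketch, where the file name datasets.tex is made up:

print(thisLatex, type="latex", file="datasets.tex") # then \input{datasets.tex} in the LaTeX source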
The following code is adapted from a caret tutorial (see the URL in the chunk):
# from: http://machinelearningmastery.com/how-to-estimate-model-accuracy-in-r-using-the-caret-package/
# load the library
#library(caret)
# load the iris dataset
#data(iris)
#data(atmosphere)
# define training control
train_control <- trainControl(method="cv", number=10,
                              returnData=TRUE,
                              savePredictions="all")
# # fix the parameters of the algorithm: Naive Bayes
# thisMethod <- "nb" # naive Bayes
# # These parameters depend on Naive Bayes!
# thisGrid <- expand.grid(fL=c(0), usekernel=c(FALSE), adjust=c(0))
# thisPreprocess <- ""
# fix the parameters of the algorithm: knn
thisMethod <- "knn" # k-nearest neighbours
thisGrid <- NULL
thisPreprocess <- c("center", "scale")
# train the model for each dataset
edf <- data.frame()
#for(i in 1:nrow(datasets)){ # go over datasets
#for(i in c(1,2,3,6)){ # iris, Glass
#for(i in c(2,3)){ # iris, Glass
for(i in c(2,3,6,7)){ # iris (??), Glass (well-behaved, so-so), Sonar (well-behaved, bad), Wine (??)
    set.seed(17)
    # process the dataset
    dsRecord <- datasets[i, ] # filter(datasets, name == dsName)
    dsName <- dsRecord$name
    print(paste("processing", dsName, "with", thisMethod))
    theseData <- evalDataset(dsName)
    # The formula for classification can now safely include everything but the class
    thisFormula <- formula(paste0(dsRecord$className, "~."))
    # train models for dataset i and method j
    trainClasses <- theseData[, dsRecord$classId]
    trainData <- theseData[, -dsRecord$classId]
    # For machine learning, it is important to dispose of columns that correlate
    # perfectly, if artificially, with the class, for instance, identifiers. These are
    # unwanted effects of the data sampling, not the data themselves.
    if (!is.nan(dsRecord$idNumber)){
        trainData <- trainData[, -dsRecord$idNumber] # dplyr::select(theseData, formula(paste0("-", idNumber)))
    }
    thisModel <- train(#thisFormula, data=theseData,
                       trainData, trainClasses,
                       preProcess=thisPreprocess,
                       trControl=train_control,
                       method=thisMethod,
                       tuneGrid=thisGrid # These parameters depend on Naive Bayes!
                       #savePredictions="all"
                       )
    # summarize results
    print(thisModel)
    # Evaluation on the test set for each fold
    predictions <- thisModel$pred # This has the preds and obs for each fold and value of the grid
    ct <- table(predictions %>% dplyr::select(pred, obs, Resample)) # 'Resample' identifies the fold
    # obtain the entropies
    edf <- rbind(edf, cbind(jentropies(ct), Classifier=thisMethod, Dataset=dsName))
}
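Before aggregating, caret's per-fold predictions can be inspected directly; a minimal sketch over the last model trained in the loop above (caret labels the resamples "Fold01" ... "Fold10" under 10-fold CV):

# Confusion counts restricted to a single fold of the last-trained model
fold1 <- predictions %>% dplyr::filter(Resample == "Fold01")
table(pred=fold1$pred, obs=fold1$obs)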
In an attempt to define confidence intervals for the results of the classifier on each particular dataset, we examine the entropic characterization of each fold.
First we plot the results of each fold, for reference in the later plots:
fancy <- FALSE
smet <- ggmetern(edf %>% filter(type == "XY"), fancy) +
    geom_point(mapping=aes(colour=Dataset), size=1) +
    facet_wrap(~Dataset)
smet
Next we plot the densities as an approximation to the confidence intervals:
smet1 <- smet + geom_density_tern()
smet1
The densities for iris and Wine seem strange:

- for iris, the dots at the tip of the triangle do not seem to be taken into consideration, while the zone towards the bottom right seems to be over-emphasized;
- for Wine, we can observe the same behaviour with respect to the points at the tip.
This is important, since the tip of the triangle, where the mutual-information coordinate dominates, is the "desired outcome" for any possible classifier.
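To see why, consider a hypothetical check (assuming jentropies accepts a plain two-way contingency table, just as it accepts the three-way one above): an error-free, balanced confusion matrix should land exactly at the tip, with the mutual-information coordinate at its maximum.

# A perfect, balanced 3-class classifier: purely diagonal confusion matrix
perfect <- as.table(diag(50, 3))
dimnames(perfect) <- list(pred=c("a", "b", "c"), obs=c("a", "b", "c"))
jentropies(perfect) # in the "XY" row, expect M_P = 1 and DeltaH_P = VI_P = 0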
This behaviour seems to be reproduced by the geom_confidence_tern geometry too:
smet2 <- smet + geom_confidence_tern()
# stat_confidence_tern(breaks=c(50))
# stat_confidence_tern(breaks=c(99,95,90)) # I don't know what I am really doing here!
smet2
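For reference, current ggtern documents the breaks of this geometry as coverage fractions rather than percentages, so the regions attempted in the commented lines would presumably be requested as below; this is an assumption worth checking against the installed ggtern version:

smet + geom_confidence_tern(breaks=c(0.50, 0.90, 0.95)) # 50%, 90% and 95% regions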
Is the unaccountable spread of the results for iris, and the non-inclusion of the tip for Wine (I cannot see whether this is the case for iris too), an artefact of the calculation of confidence intervals on compositional data?
edf2 <- edf %>% dplyr::filter(Dataset == "iris") %>%
    # dplyr::mutate(DeltaH_P2=DeltaH_P, M_P2=M_P, VI_P2=VI_P) %>%
    # dplyr::mutate(DeltaH_P=M_P2, M_P2=VI_P2, VI_P=DeltaH_P2)
    dplyr::mutate(Dummy=DeltaH_P) %>%
    dplyr::mutate(DeltaH_P=M_P, M_P=VI_P, VI_P=Dummy)
smet02 <- ggmetern(edf2 %>% filter(type == "XY"), fancy) +
    geom_point(mapping=aes(colour=Dataset), # color="red",
               size=1) +
    facet_wrap(~Dataset)
smet02 + geom_confidence_tern()
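A quick assertion (not in the original) that this rotation is a pure permutation of the three coordinates, so any change in the resulting confidence region must come from the geometry, not from the data:

edfIris <- edf %>% dplyr::filter(Dataset == "iris")
stopifnot(all(edf2$DeltaH_P == edfIris$M_P),  # DeltaH_P took M_P's values
          all(edf2$M_P == edfIris$VI_P),      # M_P took VI_P's values
          all(edf2$VI_P == edfIris$DeltaH_P)) # VI_P took DeltaH_P's values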
sessionInfo()