This vignette explores how does uncertainty of classification results manifest itself in the CBET.

Environment construction

library(dplyr)     # That infamous Mr. Wickham's! ;)
library(tidyr)     # Tidying tall & wide dataframes. Infamous too! ;)
#library(infotheo)  # The functionality provided by this has to be rerouted through entropies
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(caret)     # Adopt caret's Weltanschauung for ML
library(datasets) # Many different datasets for R: iris
library(vcd)       # Categorical benchmarks: Arthritis
library(mlbench)   # ml benchmarks: Ionosphere, Glass, BreastCancer, Sonar
library(candisc)   # datasets: Wine
#knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
#knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(comment=NA, fig.width=7, fig.height=7)

Datasets available

The inventory of databases to be explored:

# the inventory of databases you can access
dsNames <- c("Ionosphere", "iris", "Glass", "Arthritis", "BreastCancer", "Sonar", "Wine") # 
classNames <- c("Class","Species", "Type", "Improved", "Class", "Class", "Cultivar")  # Name of class attribute
K <- c(2, 3, 7, 3, 2, 2, 3)  # No. of classes in the class variable, as per the database
#classVar <- c(35, 5, 10, 5, 11, 61, 1)   
idNumber <- c(NaN, NaN, NaN, 1, 1, NaN, NaN)
# Other attributes to dispose of: mainly identifiers.
# If there where more to dispone of, the interface would have to change.
# To select a dataset by name
# Caveat: you have to ensure that the containing package has been attached
evalDataset <- function(dsName){
    dsName <- as.character(dsName)
    switch(dsName, #TODO: improve this way of "invoking" the dataset.
        "iris" =         {data(iris); iris},
        "Ionosphere" =   {data(Ionosphere); Ionosphere},
        "Glass" =        {data(Glass); Glass},
        "Arthritis" =    {data(Arthritis); Arthritis},
        "BreastCancer" = {data(BreastCancer); BreastCancer},
        "Sonar" =        {data(Sonar); Sonar},
        "Wine" =         {data(Wine); Wine}
    ) #This value "FALLS THROUGH"
}
# #To select the #of column of the class
findClassId <- function(ds, className){which(colnames(evalDataset(ds))==className)}
findClassCardinality <-function(ds,classId){#classCardinality("iris",5)
    length(levels(evalDataset(ds)[,classId]))
    }
classIds <-  mapply(findClassId, dsNames, classNames)# ordinal of the class attribute
Kreal <- as.vector(mapply(findClassCardinality, dsNames, classNames))  
# Actaul No. of classes in the class variable
# CAVEAT: Glass has actually only 6 classes, since class 4 examples are missing.
m <- sapply(dsNames, function(n){nrow(evalDataset(n))}) # no. of instances in the dataset
n <- sapply(dsNames, function(n){ncol(evalDataset(n))}) - 1 # class excepted 
datasets <- data.frame(name=dsNames, 
                       className=classNames, 
                       classId=classIds,
                       idNumber, 
                       K=as.integer(K), 
                       n=as.integer(n), 
                       m, 
                       stringsAsFactors=FALSE) # To prevent casting away of names.
datasets

Let's print this information to latex:

library(xtable)
ds4latexing <- datasets %>% dplyr::select(-className, -classId, -idNumber)
row.names(ds4latexing) <- NULL
names(ds4latexing) <- c("Dataset Name", "class card.", "num. features", "num. instances")
thisLatex <- xtable(ds4latexing, 
                    caption="Some datasets considered in this study",
                    label="tab:datasets")
align(thisLatex) <- xalign(thisLatex)
thisLatex

Data preparation

The following code is extracted from caret's main vignette:

# from: http://machinelearningmastery.com/how-to-estimate-model-accuracy-in-r-using-the-caret-package/
# load the library
#library(caret)
# load the iris dataset
#data(iris)
#data(atmosphere)
# define training control
train_control <- trainControl(method="cv", 
                              number=10,
                              returnData = TRUE,                   
                              savePredictions="all"
                            )
# # fix the parameters of the algorithm: Naive Bayes
# thisMethod <- "nb" #naive bay
# These parameters depend on Naive Bayes!
# thisGrid <- expand.grid(fL=c(0), usekernel=c(FALSE), adjust=c(0))
# thisPreprocess <- ""
# fix the parameters of the algorithm: knn
thisMethod <- "knn" #k-nearest neighbours
thisGrid <- NULL
thisPreprocess <- c("center", "scale")

# train the model for each dataset
edf <- data.frame()
#for(i in 1:nrow(datasets)){#go over datasets
#for(i in c(1,2,3,6)){#iris, Glass
#for(i in c(2,3)){#iris, Glass
for(i in c(2,3,6,7)){#iris(??), Glass(well-behaved, so-so), Sonar (well-behaved, bad), Wine (??)
    set.seed(17)
    #process the dataset
    dsRecord <-  datasets[i, ] #filter(datasets, name == dsName)
    dsName <- dsRecord$name
    print(paste("processing", dsName, "with", thisMethod))
    theseData <- evalDataset(dsName)
    # The formula for classification can safely now include everything but the class
    thisFormula <- formula(paste0(dsRecord$className,"~."))
    #train models for the dataset i and the method j
    trainClasses <- theseData[,dsRecord$classId]
    trainData <- theseData[,-dsRecord$classId]
    # For machine learning, it is important to dispose of columns that correlate 
    # perfectly, if artificially, with the class, for instance, identifiers. These are 
    # unwanted effects of the data sampling, not the data themselves. 
    if (!is.nan(dsRecord$idNumber)){
        trainData <- trainData[,-dsRecord$idNumber]#dplyr::select(theseData,formula(paste0("-",idNumber)))
    }
    thisModel <- train(#thisFormula,data=theseData, 
                    trainData, trainClasses, 
                    preProcess=thisPreprocess,
                    trControl=train_control, 
                    method=thisMethod, 
                    tuneGrid=thisGrid # These parameters depend on Naive Bayes!
                    #savePredictions="all"
    )
    # summarize results
    print(thisModel)
# ```
# 
# ## Evaluation on the the test set for each fold
# 
# ```r
    predictions <- thisModel$pred
    #This has the preds and obs for each fold and value of the grid
    ct  <- table(predictions %>% dplyr::select(pred, obs, Resample))
    # 'Resample' is the grid.
    #obtain the informations
    edf <-  rbind(edf,
                  cbind(jentropies(ct),Classifier=thisMethod,Dataset=dsName)
                  )
}

Visualization

In an attempt at define confidence intervals for the results of the classifier on each particular dataset, we print the entropic characterization of each fold.

First we print the results of each fold for reference in later plots:

fancy <- FALSE
smet <-  ggmetern(edf %>% filter(type == "XY"), fancy) + 
    geom_point(mapping=aes(colour=Dataset), size=1) +     
    facet_wrap(~Dataset)
smet 

Next we print the densities as an approximation to the confidence intervals:

smet1 <-  smet + geom_density_tern() 
smet1

The density for iris and wine seem strange:

This is important since the tip of the triangle is the "desired outcome" for any possible classifier.

This behaviour seems to be reproduce in the geom_confidence_tern geometry too:

smet2 <-  smet + geom_confidence_tern()
#    stat_confidence_tern(breaks=c(50)) 
#    stat_confidence_tern(breaks=c(99,95,90)) #I don't know what I am really doing here!
smet2

Are the unaccountable spread of the results for iris and the non-inclusion of the tip for wine (I cannot see whether this is the case for iris too.) a device of the calculation of confidence intervals in compositional data?

edf2 <- edf %>% dplyr::filter(Dataset=="iris") %>% 
#    dplyr::mutate(DeltaH_P2=DeltaH_P, M_P2=M_P, VI_P2=VI_P) %>%
#    dplyr::mutate(DeltaH_P=M_P2, M_P2=VI_P2, VI_P=DeltaH_P2)
    dplyr::mutate(Dummy=DeltaH_P) %>% 
    dplyr::mutate(DeltaH_P=M_P, M_P=VI_P, VI_P=Dummy)
smet02 <-  ggmetern(edf2 %>% filter(type == "XY"), fancy) + 
    geom_point(mapping=aes(colour=Dataset), #color="red", 
               size=1) +     facet_wrap(~Dataset)
smet02 + geom_confidence_tern()

Postscriptum

sessionInfo()


FJValverde/entropies documentation built on Oct. 12, 2023, 10:17 p.m.