This vignette explores how the uncertainty of classification results manifests itself in the CBET.
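As a brief reminder of the coordinates involved (following the entropy-triangle formulation that the entropies package implements): each confusion matrix, with joint distribution P_XY of true class X and predicted class Y, satisfies the balance equation

    DeltaH(P_X · P_Y) + 2 · MI(P_XY) + VI(P_XY) = H(U_X · U_Y)

where DeltaH is the divergence from uniform marginals, MI the mutual information and VI the variation of information. Dividing through by the uniform joint entropy H(U_X · U_Y) yields three non-negative coordinates that sum to one — reported by jentropies below as DeltaH_P, M_P and VI_P — so every evaluation becomes a point in a ternary diagram.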
library(dplyr)     # That infamous Mr. Wickham's! ;)
library(tidyr)     # Tidying tall & wide dataframes. Infamous too! ;)
#library(infotheo) # The functionality provided by this has to be rerouted through entropies
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(caret)     # Adopt caret's Weltanschauung for ML
library(datasets)  # Many different datasets for R: iris
library(vcd)       # Categorical benchmarks: Arthritis
library(mlbench)   # ML benchmarks: Ionosphere, Glass, BreastCancer, Sonar
library(candisc)   # datasets: Wine
#knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
#knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(comment=NA, fig.width=7, fig.height=7)
The inventory of databases to be explored:
# the inventory of databases you can access
dsNames <- c("Ionosphere", "iris", "Glass", "Arthritis", "BreastCancer", "Sonar", "Wine")
classNames <- c("Class", "Species", "Type", "Improved", "Class", "Class", "Cultivar") # Name of class attribute
K <- c(2, 3, 7, 3, 2, 2, 3) # No. of classes in the class variable, as per the database
#classVar <- c(35, 5, 10, 5, 11, 61, 1)
idNumber <- c(NaN, NaN, NaN, 1, 1, NaN, NaN) # Other attributes to dispose of: mainly identifiers.
# If there were more to dispose of, the interface would have to change.

# To select a dataset by name
# Caveat: you have to ensure that the containing package has been attached
evalDataset <- function(dsName){
    dsName <- as.character(dsName)
    switch(dsName, # TODO: improve this way of "invoking" the dataset.
           "iris" =         {data(iris); iris},
           "Ionosphere" =   {data(Ionosphere); Ionosphere},
           "Glass" =        {data(Glass); Glass},
           "Arthritis" =    {data(Arthritis); Arthritis},
           "BreastCancer" = {data(BreastCancer); BreastCancer},
           "Sonar" =        {data(Sonar); Sonar},
           "Wine" =         {data(Wine); Wine}
    ) # This value "FALLS THROUGH"
}

# To find the number of the column holding the class
findClassId <- function(ds, className){which(colnames(evalDataset(ds)) == className)}
findClassCardinality <- function(ds, classId){ # e.g. findClassCardinality("iris", 5)
    length(levels(evalDataset(ds)[, classId]))
}

classIds <- mapply(findClassId, dsNames, classNames) # ordinal of the class attribute
Kreal <- as.vector(mapply(findClassCardinality, dsNames, classNames)) # Actual no. of classes in the class variable
# CAVEAT: Glass actually has only 6 classes, since class 4 examples are missing.
m <- sapply(dsNames, function(n){nrow(evalDataset(n))})     # no. of instances in the dataset
n <- sapply(dsNames, function(n){ncol(evalDataset(n))}) - 1 # no. of features, class excepted
datasets <- data.frame(name=dsNames, className=classNames, classId=classIds, idNumber,
                       K=as.integer(K), n=as.integer(n), m,
                       stringsAsFactors=FALSE) # To prevent casting away of names.
datasets
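As a quick sanity check of these helpers (a minimal sketch, not part of the original inventory), the class column of iris should be the fifth one and hold three levels:

stopifnot(findClassId("iris", "Species") == 5)  # 'Species' is the 5th column of iris
stopifnot(findClassCardinality("iris", 5) == 3) # setosa, versicolor, virginica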
Let's print this information to LaTeX:
library(xtable)
ds4latexing <- datasets %>% dplyr::select(-className, -classId, -idNumber)
row.names(ds4latexing) <- NULL
names(ds4latexing) <- c("Dataset Name", "class card.", "num. features", "num. instances")
thisLatex <- xtable(ds4latexing,
                    caption="Some datasets considered in this study",
                    label="tab:datasets")
align(thisLatex) <- xalign(thisLatex)
thisLatex
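To embed the table in a manuscript, the same object can also be written to disk; a minimal sketch, where the file name datasets.tex is made up:

print(thisLatex, type="latex", file="datasets.tex") # then \input{datasets.tex} in the LaTeX source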
The following code is adapted from a caret tutorial (see the URL in the chunk):
# from: http://machinelearningmastery.com/how-to-estimate-model-accuracy-in-r-using-the-caret-package/
# load the library
#library(caret)
# load the iris dataset
#data(iris)
#data(atmosphere)
# define training control
train_control <- trainControl(method="cv", number=10,
                              returnData=TRUE,
                              savePredictions="all")
# # fix the parameters of the algorithm: Naive Bayes
# thisMethod <- "nb" # naive Bayes
# # These parameters depend on Naive Bayes!
# thisGrid <- expand.grid(fL=c(0), usekernel=c(FALSE), adjust=c(0))
# thisPreprocess <- ""
# fix the parameters of the algorithm: knn
thisMethod <- "knn" # k-nearest neighbours
thisGrid <- NULL
thisPreprocess <- c("center", "scale")
# train the model for each dataset
edf <- data.frame()
#for(i in 1:nrow(datasets)){ # go over datasets
#for(i in c(1,2,3,6)){ # iris, Glass
#for(i in c(2,3)){ # iris, Glass
for(i in c(2,3,6,7)){ # iris (??), Glass (well-behaved, so-so), Sonar (well-behaved, bad), Wine (??)
    set.seed(17)
    # process the dataset
    dsRecord <- datasets[i, ] # filter(datasets, name == dsName)
    dsName <- dsRecord$name
    print(paste("processing", dsName, "with", thisMethod))
    theseData <- evalDataset(dsName)
    # The formula for classification can now safely include everything but the class
    thisFormula <- formula(paste0(dsRecord$className, "~."))
    # train models for dataset i and method j
    trainClasses <- theseData[, dsRecord$classId]
    trainData <- theseData[, -dsRecord$classId]
    # For machine learning, it is important to dispose of columns that correlate
    # perfectly, if artificially, with the class, for instance, identifiers. These are
    # unwanted effects of the data sampling, not the data themselves.
    if (!is.nan(dsRecord$idNumber)){
        trainData <- trainData[, -dsRecord$idNumber] # dplyr::select(theseData, formula(paste0("-", idNumber)))
    }
    thisModel <- train(#thisFormula, data=theseData,
                       trainData, trainClasses,
                       preProcess=thisPreprocess,
                       trControl=train_control,
                       method=thisMethod,
                       tuneGrid=thisGrid # These parameters depend on Naive Bayes!
                       #savePredictions="all"
                       )
    # summarize results
    print(thisModel)
    # Evaluation on the test set for each fold
    predictions <- thisModel$pred # This has the preds and obs for each fold and value of the grid
    ct <- table(predictions %>% dplyr::select(pred, obs, Resample)) # 'Resample' identifies the fold
    # obtain the entropies
    edf <- rbind(edf, cbind(jentropies(ct), Classifier=thisMethod, Dataset=dsName))
}
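Before aggregating, caret's per-fold predictions can be inspected directly; a minimal sketch over the last model trained in the loop above (caret labels the resamples "Fold01" ... "Fold10" under 10-fold CV):

# Confusion counts restricted to a single fold of the last-trained model
fold1 <- predictions %>% dplyr::filter(Resample == "Fold01")
table(pred=fold1$pred, obs=fold1$obs)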
In an attempt to define confidence intervals for the results of the classifier on each particular dataset, we examine the entropic characterization of each fold.
First we plot the results of each fold, for reference in the later plots:
fancy <- FALSE
smet <- ggmetern(edf %>% filter(type == "XY"), fancy) +
    geom_point(mapping=aes(colour=Dataset), size=1) +
    facet_wrap(~Dataset)
smet
Next we plot the densities as an approximation to the confidence intervals:
smet1 <- smet + geom_density_tern()
smet1
The densities for iris and Wine seem strange:

- for iris, the dots at the tip of the triangle do not seem to be taken into consideration, while the zone towards the bottom right seems to be over-emphasized;
- for Wine, we can observe the same behaviour with respect to the points at the tip.
This is important, since the tip of the triangle, where the mutual-information coordinate dominates, is the "desired outcome" for any possible classifier.
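To see why, consider a hypothetical check (assuming jentropies accepts a plain two-way contingency table, just as it accepts the three-way one above): an error-free, balanced confusion matrix should land exactly at the tip, with the mutual-information coordinate at its maximum.

# A perfect, balanced 3-class classifier: purely diagonal confusion matrix
perfect <- as.table(diag(50, 3))
dimnames(perfect) <- list(pred=c("a", "b", "c"), obs=c("a", "b", "c"))
jentropies(perfect) # in the "XY" row, expect M_P = 1 and DeltaH_P = VI_P = 0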
This behaviour seems to be reproduced by the geom_confidence_tern geometry too:
smet2 <- smet + geom_confidence_tern()
# stat_confidence_tern(breaks=c(50))
# stat_confidence_tern(breaks=c(99,95,90)) # I don't know what I am really doing here!
smet2
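For reference, current ggtern documents the breaks of this geometry as coverage fractions rather than percentages, so the regions attempted in the commented lines would presumably be requested as below; this is an assumption worth checking against the installed ggtern version:

smet + geom_confidence_tern(breaks=c(0.50, 0.90, 0.95)) # 50%, 90% and 95% regions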
Is the unaccountable spread of the results for iris, and the non-inclusion of the tip for Wine (I cannot see whether this is the case for iris too), an artefact of the calculation of confidence intervals on compositional data?
edf2 <- edf %>% dplyr::filter(Dataset == "iris") %>%
    # dplyr::mutate(DeltaH_P2=DeltaH_P, M_P2=M_P, VI_P2=VI_P) %>%
    # dplyr::mutate(DeltaH_P=M_P2, M_P2=VI_P2, VI_P=DeltaH_P2)
    dplyr::mutate(Dummy=DeltaH_P) %>%
    dplyr::mutate(DeltaH_P=M_P, M_P=VI_P, VI_P=Dummy)
smet02 <- ggmetern(edf2 %>% filter(type == "XY"), fancy) +
    geom_point(mapping=aes(colour=Dataset), # color="red",
               size=1) +
    facet_wrap(~Dataset)
smet02 + geom_confidence_tern()
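A quick assertion (not in the original) that this rotation is a pure permutation of the three coordinates, so any change in the resulting confidence region must come from the geometry, not from the data:

edfIris <- edf %>% dplyr::filter(Dataset == "iris")
stopifnot(all(edf2$DeltaH_P == edfIris$M_P),  # DeltaH_P took M_P's values
          all(edf2$M_P == edfIris$VI_P),      # M_P took VI_P's values
          all(edf2$VI_P == edfIris$DeltaH_P)) # VI_P took DeltaH_P's values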
sessionInfo()