Environment construction

library(tidyverse)  # That famous Mr. Wickham!
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(vcd)       # Categorical benchmarks
library(mlbench)   # ml benchmarks
library(candisc)   # Wine dataset
#knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
# Chunk defaults: no prefix on printed output, figures of 6 x 4.
knitr::opts_chunk$set(comment = NA, fig.width = 6, fig.height = 4)
# `fancy` is later passed to ggmetern(). Keep exactly one of the two
# assignments active: previously both ran and TRUE was overwritten at once.
#fancy <- TRUE  # set this for nicer on-screen visualization
fancy <- FALSE  # set this for printed matter

Data preparation

Datasets available for entropy analysis in this package

# Table describing the available datasets. The code below reads its `name`,
# `className` and `idNumber` columns.
datasets <- getDatasets()

Let's print this information as a LaTeX table:

library(xtable)
# Build the LaTeX table of datasets: drop the bookkeeping columns, reset the
# row names and give the remaining columns human-readable headers.
latexHeaders <- c("Dataset Name", "class card.", "num. features", "num. instances")
ds4latexing <- dplyr::select(datasets, -className, -idNumber)
row.names(ds4latexing) <- NULL
names(ds4latexing) <- latexHeaders
thisLatex <- xtable(
    ds4latexing,
    caption = "Some datasets considered in this study",
    label = "tab:datasets"
)
# Use the column alignment that xtable suggests for this table.
align(thisLatex) <- xalign(thisLatex)
thisLatex

Obtaining the entropies

Obtain the entropies and some other data for plotting from all datasets.

# Accumulate in `edf` the entropy rows of every dataset, once for the raw
# observations and once for their PCA transform.
edf <- data.frame()
for (dsName in unique(datasets$name)) {
    dsRecord <- dplyr::filter(datasets, name == dsName)
    ds <- evalDataset(dsName)
    print(sprintf("Analyzing dataset with class label: %s", dsName))

    # Split the data into the class column (K) and the remaining features (X).
    # The column is selected by its exact name: matches() would treat the name
    # as a case-insensitive regular expression and could also pick up feature
    # columns whose names merely contain it.
    Kdataset <- dplyr::select(ds, dplyr::all_of(dsRecord$className))
    Xdataset <- dplyr::select(ds, -dplyr::all_of(dsRecord$className))
    edf <- rbind(edf,
                 cbind(dsName,
                       transform = "observation",
                       jentropies(Kdataset, Xdataset) # entropies of observations
                       )
                 )

    # The PCA below only applies to numeric features; skip datasets with none.
    numColumns <- vapply(Xdataset, is.numeric, logical(1))
    if (!any(numColumns)) next

    # drop = FALSE keeps a data frame even when a single column is selected.
    # NOTE: no Box-Cox (log) transformation is applied yet, so the PCA runs on
    # the untransformed numeric columns despite the "PCAonBoxCox:log" label.
    Tdataset <- Xdataset[, numColumns, drop = FALSE]
    pca <- prcomp(Tdataset, center = TRUE, scale. = TRUE)
    # Non-numeric features are carried over unchanged next to the components.
    Ydataset <- cbind(
        Xdataset[, !numColumns, drop = FALSE],
        predict(pca, newdata = Tdataset)
    )
    edf <- rbind(edf,
                 cbind(dsName,
                       transform = "PCAonBoxCox:log",
                       jentropies(Tdataset, Ydataset)
                       )
                 )
}
str(edf)
# show the split entropies
#filter(edf, type != "XY")
# show all entropies for one dataset
print(dplyr::filter(edf, dsName == "iris"))

Visualization

Visualize both the aggregate and the split entropies.

fancy <- TRUE
#fancy <- FALSE
# Select the entropies to visualize: rows of type "XY" are filtered out, so
# only the split entropies are drawn. Colour encodes the transformation (the
# datasets are shown as facets), hence the colour legend title.
cmet <- ggmetern(dplyr::filter(edf, type != "XY"), fancy) +
    geom_point(aes(color = transform, shape = type), size = 3) +
    scale_shape_manual(values = c(4, 20, 1)) +
    labs(color = "Transformation", shape = "Var type")
cmet + facet_wrap(~dsName, ncol = 2)
#dev.off()#Necessary to do the textual plot.
#ggsave(str_interp("entropyEndToEnd_${dsName}_epoch_${thisEpoch}.jpeg"), plot=e2ePlot)

Postscriptum

More information about the evaluation of classifiers with the Channel Multivariate Entropy Triangle can be found in the following reference:

library(bibtex)
# Look up the package's citation entries and print the one keyed
# 'val:pel:18c' as plain text.
entropiesCitations <- citation("entropies")
print(entropiesCitations['val:pel:18c'], style = "text")

Session information

# Report the R session details this document was built with.
sessionInfo()


FJValverde/entropies documentation built on Oct. 12, 2023, 10:17 p.m.