# Setup chunk: attach the packages used throughout this document and set
# the knitr chunk defaults. One statement per line, so that the trailing
# comments no longer swallow the statements that follow them.
library(tidyverse) # That famous Mr. Wickham!
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(vcd)       # Categorical benchmarks
library(mlbench)   # ml benchmarks
library(candisc)   # Wine dataset

# knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
knitr::opts_chunk$set(comment = NA, fig.width = 6, fig.height = 4)

# Visualization toggle:
#   TRUE  - set this for nicer on-screen visualization
#   FALSE - set this for either printing matter or... (original note truncated)
# The original assigned TRUE and then immediately FALSE; only the effective
# value is kept.
# fancy <- TRUE
fancy <- FALSE
# Load the table describing the datasets under study. The code further down
# reads its `name`, `className` and `idNumber` columns.
# NOTE(review): getDatasets() is presumably provided by the entropies
# package attached above - confirm.
datasets <- getDatasets()
Let's print this information as a LaTeX table:
# Render the dataset description table as LaTeX.
library(xtable)

# Drop the columns that are not shown in the printed table.
ds4latexing <- datasets %>%
  dplyr::select(-className, -idNumber)
row.names(ds4latexing) <- NULL

# The printable headers are assigned by position, so the table must have
# exactly four columns left; fail loudly instead of producing NA headers.
stopifnot(ncol(ds4latexing) == 4L)
names(ds4latexing) <- c(
  "Dataset Name", "class card.", "num. features", "num. instances"
)

thisLatex <- xtable(
  ds4latexing,
  caption = "Some datasets considered in this study",
  label = "tab:datasets"
)
align(thisLatex) <- xalign(thisLatex)
thisLatex
Obtain the entropies and some other data for plotting from all datasets.
# Compute entropies for every dataset and gather them in `edf`.
#
# For each dataset named in `datasets$name`:
#   1. split it into the class column (K) and the remaining features (X) and
#      store jentropies(K, X) under transform "observation";
#   2. if X has numeric columns, run a centred and scaled PCA on them and
#      store jentropies(T, Y) under transform "PCAonBoxCox:log", where T is
#      the numeric part of X and Y is the non-numeric part of X plus the PCA
#      scores.
# NOTE(review): no Box-Cox/log transform is actually applied yet (see the
# placeholder below); the "PCAonBoxCox:log" label is kept unchanged because
# it is part of the emitted data.
edfParts <- list() # one data frame per (dataset, transform); bound at the end
for (dsName in unique(datasets$name)) {
  dsRecord <- dplyr::filter(datasets, name == dsName)
  ds <- evalDataset(dsName)
  print(sprintf("Analyzing dataset with class label: %s", dsName))

  # Dropping columns by name, as per:
  # http://stackoverflow.com/questions/5234117/how-to-drop-columns-by-name-in-a-data-frame
  # Don't EVER use subset in PROGRAMS!
  #
  # all_of() selects the class column by its exact name. The previous
  # matches() call treated the name as a case-insensitive regular expression
  # and could also capture feature columns that merely contain it.
  Kdataset <- dplyr::select(ds, dplyr::all_of(dsRecord$className))
  Xdataset <- dplyr::select(ds, -dplyr::all_of(dsRecord$className))

  edfParts[[length(edfParts) + 1L]] <- cbind(
    dsName,
    transform = "observation",
    jentropies(Kdataset, Xdataset) # entropies of observations
  )

  # Select those columns which are numeric; vapply() guarantees a logical
  # vector even when X has no columns.
  numColumns <- vapply(Xdataset, is.numeric, logical(1))
  if (!any(numColumns)) next

  # Until we know how to apply Box-Cox, the numeric features are used as-is.
  # Placeholder kept from the original:
  #   Tdataset <- log(Xdataset) # Box-Cox transformation
  #   ... cbind(dsName, transform = "Box-Cox:log",
  #             jentropies(Xdataset, Tdataset))
  # drop = FALSE keeps a data frame when only one column is selected;
  # otherwise predict() receives a bare vector and fails.
  Tdataset <- Xdataset[, numColumns, drop = FALSE]

  pca <- prcomp(Tdataset, center = TRUE, scale. = TRUE)
  # drop = FALSE again: with a single non-numeric column, cbind() of a bare
  # vector and the score matrix would build a matrix, not a data frame.
  Ydataset <- cbind(
    Xdataset[, !numColumns, drop = FALSE],
    predict(pca, newdata = Tdataset)
  )
  edfParts[[length(edfParts) + 1L]] <- cbind(
    dsName,
    transform = "PCAonBoxCox:log",
    jentropies(Tdataset, Ydataset)
  )
}

# Bind once instead of growing `edf` with rbind() on every iteration.
edf <- if (length(edfParts) > 0L) do.call(rbind, edfParts) else data.frame()
str(edf)

# Show all entropies for one dataset.
# To show only the split entropies: dplyr::filter(edf, type != "XY")
print(dplyr::filter(edf, dsName == "iris"))
Visualize both the aggregate and the split entropies.
# Plot the entropies on the entropy triangle, one facet per dataset.
fancy <- TRUE
# fancy <- FALSE

# Select some entropies to visualize: rows of type "XY" (the aggregate) are
# dropped here; pass `edf` unfiltered to plot the aggregate as well.
cmet <- ggmetern(
  dplyr::filter(edf, type != "XY"),
  fancy
) +
  geom_point(aes(color = transform, shape = type), size = 3) +
  scale_shape_manual(values = c(4, 20, 1)) +
  # Colour encodes the transform (datasets are told apart by the facets), so
  # the legend is titled accordingly; it used to read "Dataset name".
  labs(color = "Transform", shape = "Var type")

cmet + facet_wrap(~dsName, ncol = 2)

# dev.off() # Necessary to do the textual plot.
# ggsave(str_interp("entropyEndToEnd_${dsName}_epoch_${thisEpoch}.jpeg"), plot=e2ePlot)
More information about the evaluation of classifiers with the Channel Multivariate Entropy Triangle can be found in the following reference:
# Print, as plain text, the entry keyed 'val:pel:18c' from the citation
# information of the entropies package.
library(bibtex)
print(citation("entropies")["val:pel:18c"], style = "text")
# Print the R version, platform and attached/loaded packages, so that the
# results in this document can be reproduced.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.