This script gives an example on how to use the Channel Multivariate Entropy Triangle (CMET) to analyze Feature Transformations on a dataset.
We apply any number of transformations to its features (in this case PCA and ICA) and represent the accrued information loss incurred in the prescribed selection order of features in the CMET (each transformation orders the obtained features in some way.)
Finally, we consider the transformed features as data sources in their own right, whence we can compare all of them in the Source Multivariate Entropy Triangle (SMET, upside-down triangle).
Take into consideration that this is NOT an investigation of whether PCA is ''better'' than ICA, whatever that might mean.
library(tidyverse) # That nofamous Mr. Wickham!;) library(RColorBrewer) # Facilities for palettes library(entropies) # This package. Depends heavily on "entropy", "infotheory". #library(ggtern) # Ternary diagrams on ggplot, already attached library(vcd) # Categorical benchmarks library(mlbench) # ml benchmarkss library(candisc) # Wine dataset # Naive transformation from factors to numbers in 0 to num.factors - 1 factor.as.numeric <- function(f){ nums <- as.numeric(f) return(nums - min(nums)) }
Some top level switches and options gathered in one place.
debugLevel <- 0 # Debug level 0-non-existent, 1-minimal, the greater the more verbose. fancy <- TRUE # set this for nicer on-screen visualization. #fancy <- FALSE # Set this for either printing matter or more austere plots. getPlot <- TRUE # Flag to obtain plots for publication. getPlot <- FALSE #Comment to get .jpeg files rather than plots of ets. knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4) if (getPlot) knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
data(datasets) if (getPlot){# For papers, it helps to have the table in latex. library(xtable) print.xtable(xtable(dplyr::select(datasets, name, K, n, m))) }
I suggest something like this if you are developing some algorithm and testing on a number of different databases.
# Uncomment the name of the database to be analyzed # dsName <- "Ionosphere" dsName <- "iris" # dsName <- "BreastCancer"#Supply the name of the database to be analyzed#FACTORS # dsName <- "Wine" # dsName <- "Glass"#Cannot take logarithms for PCA: zeros returns -Inf # dsName <- "Arthritis"#It has non-numeric factors. # dsName <- "Sonar" # Suggestions: choose: "iris", "BreastCancer" or "Glass" to replicate Entropy 2018 paper. dsRecord <- filter(datasets, name == dsName) ds <- loadDataset(dsRecord$name,dsRecord$packName) #id columns, if existent in dsRecord$idNumber # log transform of everything but the class and any id if existant. if (!is.na(dsRecord$idNumber)){ ds <- ds[,-dsRecord$idNumber] } #class column ds.classNum <- which(names(ds)==dsRecord$className) #take away the class, but keep it just in case. class.ds <- ds[, ds.classNum]#saving the class. Warning A FACTOR! ds <- ds[,-ds.classNum] ds <- ds %>% #transform factors to number mutate_if(is.factor,factor.as.numeric) %>% # Dispose of columns with NaN select_if(function(v) !any(is.na(v))) %>% # Dispose of constant columns: they carry no information select_if(function(v)(var(v) > 0)) ncols <- ncol(ds)#Mnemonic shortcut: num of columns dsDiscretized <- infotheo::discretize(ds, disc="equalwidth") if (dsName != "Ionosphere"){ log.ds <- log(ds)#this has to be made conditional on the database log.dsDiscretized <- infotheo::discretize(log.ds) #TODO: try to get rid of annoying warnings each time entropy is called. }
We next choose glyphs, colors, etc. for visualizing with the CMET and SMET.
# The colors for the different feature sets for the plots. if (fancy){#Different color schemes you may try: #orderingColors <- rev(terrain.colors(ncols+1))[1:ncols + 1]#too similar #orderingColors <- rev(terrain.colors(ncols+1))[1:ncols + 1] #orderingColors <- rev(topo.colors(ncols+1))[1:ncols + 1]#too flashy #orderingColors <- rev(cm.colors(ncols+1))[1:ncols + 1]#too washed out #orderingColors <- rev(heat.colors(ncols+1))[1:ncols + 1] orderingPalette <- "BuPu" } else {#These are supposed to be classical. #orderingColors <- rev(gray(0:ncols / ncols))[1:ncols + 1] #orderingColors <- scale_colour_grey(end=0.9)#Not for discrete levels orderingPalette <- "Blues" } # We discard the lightest shade because it is too akin to white,the background. # REcipe from: https://stackoverflow.com/questions/29466239/how-to-set-the-color-range-of-scale-colour-brewer-in-ggplot2-palette-selecte # library(RColorBrewer) # my_orange = brewer.pal(n = 9, "Oranges")[3:9] #there are 9, I exluded the two lighter hues #orderingColors <- brewer.pal(n = ncols + 2, orderingPalette)[1:ncols + 2]#Does not provide enough # But since Brewer color values are only up to 9, we need another mechanims to interpolate # https://stackoverflow.com/questions/16922988/interpolating-a-sequential-brewer-palette-as-a-legend-for-ggplot2 myColorsFun <- colorRampPalette(brewer.pal(9,orderingPalette)) orderingColors <- myColorsFun(ncols+2)[1:ncols + 2] if (debugLevel > 0){#When debugging, have a peek at the colors. pie(rep(1,ncols),col=orderingColors) } # The shapes for the different types of transform transformShapes <- c("log"=4, "PCA"=21, "ICA"=23)#no fill, solid # The shapes to represent the components of observations sourceShapes <- c("none"=8, "log"=4, "PCA"=21, "ICA"=23)#no fill, solid #Classical for channel ETs: X for the x and circle for Y typeShapes <- c("X" = 4, "Y" = 6, "ALL" = 10, "XY" = 20)
First we collect the information transformation from data to log data in entropy triangle data frames for sources and transformations.
etdf <- data.frame() sedf <- data.frame() #The following point of etdf shoul print as a point on a cross, being deterministic. if (dsName != "Ionosphere"){ etdf <- rbind(etdf, cbind(data=paste0("1_",ncols), transform="log", jentropies(dsDiscretized, log.dsDiscretized) ) ) sedf <- rbind(sedf,cbind(data=paste0("1_",ncols), transform="log", sentropies(log.dsDiscretized[,1:ncols]) ) ) } sedf <- rbind(sedf, cbind(data=paste0("1_",ncols), transform="none", sentropies(dsDiscretized[,1:ncols], type="none") ) ) #Reorder the factors for the plots' legends orderedLevels <-sapply((1:ncols), function(i) paste0("1_",i)) sedf <- sedf %>% mutate(data=factor(data,levels=orderedLevels))
Next we obtain the PCA for the log-transformed data.
# Idea for script from: # https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/ # apply PCA - scale. = TRUE is highly # advisable, but default is FALSE. #pca.model <- prcomp(log.ds, center = TRUE, scale. = TRUE) pca.model <- prcomp(ds, center = TRUE, scale. = TRUE) # print method print(pca.model) # plot method plot(pca.model, type = "l") # summary method: describes the importance of the components. summary(pca.model) # Finally we obtained the transformed data, the Y to be analized dsPca <- as.data.frame(predict(pca.model, newdata= ds)) # Why predict as opposed to just working out the entropy? # Because the pca.model is just the set of matrices for carriying out the pca. We need to generate the data! dsPcaDiscretized <- infotheo::discretize(dsPca, disc="equalwidth")
Next we see how the information from the original data is transferred to the principal components cumulatively. Firts we work out the transformation entropies from retaining only the first n features and then we visualize it.
for( i in (1:ncols)){ etdf <- rbind(etdf, cbind(data=paste0("1_",i), transform="PCA", jentropies(dsDiscretized, as.tibble(dsPcaDiscretized[ ,1:i])) ) ) } etdf <- etdf %>% mutate(data=factor(data,levels=orderedLevels)) transfo_cmet <- ggmetern(etdf %>% filter(transform == "PCA"), fancy) + geom_point(aes(color=data, shape=type), size=3) + scale_shape_manual(values=typeShapes) + scale_color_manual(values=orderingColors) + labs(color="Feature set", shape="Entropy type") + theme(legend.key=element_blank()) transfo_cmet if (getPlot){ dev.off()#Necessary to do the textual plot. ggsave(stringr::str_interp("featureSel_CMET_PCA_${dsName}.jpeg"), plot=transfo_cmet) }
The observation of the gain of adopting the PCA has to be seen in the source triangle (see below).
#library(ica)#ICA has to be found for each value of the library(fastICA) set.seed(25)#for reproducibilit for(i in (1:ncols)){ # ica <- icaimax( # ds[,1:ncols], nc=i, # center=TRUE # ) ica <- fastICA(X=ds[,1:ncols], i, alg.typ="parallel", fun="logcosh", alpha=1, method="C", row.norm= FALSE, maxit=200, tol=0.0001, verbose=FALSE ) dsIca <- data.frame(ica$S)#Matrix of source signal estimates (S=Y%*%R) as per ?icaimax dsIcaDiscretized <- infotheo::discretize(dsIca, disc="equalwidth") etdf <- rbind(etdf, cbind(data=paste0("1_",i), transform="ICA", jentropies(dsDiscretized, as.data.frame(dsIcaDiscretized[,1:i])) ) ) }
First we visualize ICA on its own to parallel the previous visualization.
# show the split entropies transfo_cmet <- ggmetern(etdf %>% filter(transform == "ICA"), fancy) + # ggmetern(etdf %>% filter(type != "XY" & transform == "PCA"), fancy) + geom_point(aes(color=data, shape=type), size=3) + scale_shape_manual(values=typeShapes) + #scale_color_brewer(palette=orderingPalette) + #scale_fill_brewer(palette=orderingPalette) + #Shapes are non-fillable. scale_color_manual(values=orderingColors) + labs(color="Feature set", shape="Entropy type") + theme(legend.key=element_blank()) # ggmetern(etdf %>% filter(type != "XY" & transform == "ICA"), fancy) + # geom_point(aes(color=data, shape=type), size=3) + # scale_shape_manual(values=trasnforType) + # scale_color_manual(values=orderingColors) + # labs(color="Feature set", shape="Type") transfo_cmet if (getPlot){ dev.off()#Necessary to do the textual plot. ggsave(stringr::str_interp("featureSel_CMET_ICA_${dsName}.jpeg"), plot=transfo_cmet) }
Next, to compare, PCA, ICA and their absence we plot just the output entropy of the transformation:
# show the split entropies #fancy <- !TRUE#TRUE is better for interactive use. !TRUE for illustrations. comparison_cmet <- ggmetern(etdf %>% filter(type == "Y"), fancy) + geom_point(aes(fill=data, shape=transform), size=3, alpha=.8) + scale_shape_manual(values=transformShapes) + #scale_fill_brewer(palette=orderingPalette) + #scale_color_brewer(palette=orderingPalette) + scale_fill_manual(values=orderingColors) + labs(fill="Feature set", shape="Transform type", panel.background="white") + guides(fill=guide_legend(override.aes=list(shape=22))) + #show a shape than can be filled! theme(legend.key=element_blank()) comparison_cmet if (getPlot){ dev.off()#Necessary to do the textual plot. ggsave(stringr::str_interp("featureSel_CMET_compare_PCA_ICA_${dsName}.jpeg"), plot=comparison_cmet) }
In the case of iris, we see that choosing only the first component in ICA or PCA carries much more information on the input per feature than taking the whole database. This is also true for choosing up to 2 or 3 components, but not for 4 components.
We next plot the source contents of the databases made with and witout the re-parameterization, both for PCA and ICA.
for( i in (1:ncols)){ sedf <- rbind(sedf, cbind(data=paste0("1_",i), transform="PCA", sentropies(as.data.frame(dsPcaDiscretized[,1:i])) ) ) }
#Now get the same approximations as before for the sources: for( i in (1:ncols)){ sedf <- rbind(sedf, cbind(data=paste0("1_",i), transform="ICA", sentropies(as.data.frame(dsIcaDiscretized[,1:i])) ) ) }
In this plot we compare the different subsets of features as entropy sources.
transfo_smet <- ggmetern(filter(sedf, name == "ALL" & transform != "log"), fancy) + geom_point(aes(fill=data, shape=transform), size=3) + scale_shape_manual(values=sourceShapes) + scale_fill_manual(values=orderingColors) + guides(fill=guide_legend(override.aes=list(shape=22))) + #show a shape than can be filled! theme(legend.key=element_blank()) transfo_smet if (getPlot){ dev.off()#Necessary to do the textual plot. ggsave(stringr::str_interp("featureSel_SMET_compare_PCA_ICA_${dsName}.jpeg"), plot=transfo_smet) }
In the case of iris we see that the reason to choose 1, 2, or 3 features is because the carry less redundancy. However, they sample the input space slightly less uniformly than the original database.
Try this script with Glass or Arthritis for completely different behaviours.
More information about the evaluation of feature transformations with the Channel Multivariate Entropy Triangle can be found in
library(bibtex) print(citation("entropies")['val:pel:18c'], style="text")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.