This script gives an example of how to use the Channel Multivariate Entropy Triangle (CMET) to analyze feature transformations on a dataset.

We apply a number of transformations to its features (in this case PCA and ICA) and represent in the CMET the accrued information loss incurred by the prescribed selection order of the features (each transformation orders the features it obtains in some way).

Finally, we consider the transformed features as data sources in their own right, whence we can compare all of them in the Source Multivariate Entropy Triangle (SMET, upside-down triangle).

Bear in mind that this is NOT an investigation of whether PCA is "better" than ICA, whatever that might mean.

Environment construction

library(tidyverse) # That infamous Mr. Wickham! ;)
library(RColorBrewer) # Facilities for palettes
library(entropies) # This package. Depends heavily on "entropy" and "infotheo".
#library(ggtern)    # Ternary diagrams on ggplot, already attached
library(vcd)       # Categorical benchmarks
library(mlbench)   # ML benchmarks
library(candisc)   # Wine dataset
# Naive transformation from factors to numbers in 0 to num.factors - 1
factor.as.numeric <- function(f){
    nums <- as.numeric(f)
    return(nums - min(nums))
}
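
For instance (a quick illustrative check, not part of the pipeline proper), a three-level factor is recoded onto the integers 0 to 2:

factor.as.numeric(factor(c("a", "b", "a", "c")))# Returns 0 1 0 2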

Some top level switches and options gathered in one place.

debugLevel <- 0 # Debug level: 0 non-existent, 1 minimal; the greater, the more verbose.
fancy <- TRUE  # Set this for nicer on-screen visualization.
#fancy <- FALSE # Set this for printed matter or more austere plots.
getPlot <- TRUE # Flag to obtain plots for publication.
getPlot <- FALSE # Comment this line out to get .jpeg files of the ETs rather than on-screen plots.
knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4)
if (getPlot)
    knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication

Datasets available for entropy analysis in this package

data(datasets)
if (getPlot){# For papers, it helps to have the table in latex.
    library(xtable)
    print.xtable(xtable(dplyr::select(datasets, name, K, n, m)))
}

Choosing a database and setting up the visualization

I suggest something like this if you are developing an algorithm and testing it on a number of different databases.

# Uncomment the name of the database to be analyzed
# dsName <- "Ionosphere"
dsName <- "iris"
# dsName <- "BreastCancer"# Has factor-valued features
# dsName <- "Wine"
# dsName <- "Glass"# Cannot take logarithms for PCA: zeros return -Inf
# dsName <- "Arthritis"# Has non-numeric factors.
# dsName <- "Sonar"
# Suggestion: choose "iris", "BreastCancer" or "Glass" to replicate the Entropy 2018 paper.
dsRecord <-  filter(datasets, name == dsName)
ds <- loadDataset(dsRecord$name,dsRecord$packName) 

# Drop id columns, if present in dsRecord$idNumber.
# Then log-transform everything but the class and any id, if present.
if (!is.na(dsRecord$idNumber)){
    ds <- ds[,-dsRecord$idNumber]
}
#class column
ds.classNum <- which(names(ds)==dsRecord$className)
#take away the class, but keep it just in case.
class.ds <- ds[, ds.classNum]#saving the class. Warning: it is a FACTOR!
ds <- ds[,-ds.classNum]
ds <- ds %>%     
    #transform factors to numbers
    mutate_if(is.factor,factor.as.numeric) %>%
    # Dispose of columns with NaN
    select_if(function(v) !any(is.na(v))) %>% 
    # Dispose of constant columns: they carry no information
    select_if(function(v)(var(v) > 0))
ncols <- ncol(ds)#Mnemonic shortcut: num of columns
dsDiscretized <- infotheo::discretize(ds, disc="equalwidth")
if (dsName != "Ionosphere"){
    log.ds <- log(ds)#this has to be made conditional on the database
    log.dsDiscretized <- infotheo::discretize(log.ds)
    #TODO: try to get rid of annoying warnings each time entropy is called. 
}

We next choose glyphs, colors, etc. for visualizing with the CMET and SMET.

# The colors for the different feature sets for the plots. 
if (fancy){#Different color schemes you may try:
    #orderingColors <- rev(terrain.colors(ncols+1))[1:ncols + 1]#too similar
    #orderingColors <- rev(topo.colors(ncols+1))[1:ncols + 1]#too flashy
    #orderingColors <- rev(cm.colors(ncols+1))[1:ncols + 1]#too washed out
    #orderingColors <- rev(heat.colors(ncols+1))[1:ncols + 1]
    orderingPalette <- "BuPu"
} else {#These are supposed to be classical.
    #orderingColors <- rev(gray(0:ncols / ncols))[1:ncols + 1]
    #orderingColors <- scale_colour_grey(end=0.9)#Not for discrete levels
    orderingPalette <- "Blues"
}
# We discard the lightest shade because it is too akin to white, the background.
# Recipe from: https://stackoverflow.com/questions/29466239/how-to-set-the-color-range-of-scale-colour-brewer-in-ggplot2-palette-selecte
# library(RColorBrewer)
# my_orange = brewer.pal(n = 9, "Oranges")[3:9] # there are 9; I excluded the two lightest hues
#orderingColors <-  brewer.pal(n = ncols + 2, orderingPalette)[1:ncols + 2]# Does not provide enough colors
# But since Brewer palettes provide at most 9 values, we need another mechanism to interpolate:
# https://stackoverflow.com/questions/16922988/interpolating-a-sequential-brewer-palette-as-a-legend-for-ggplot2
myColorsFun <- colorRampPalette(brewer.pal(9,orderingPalette))
orderingColors <- myColorsFun(ncols+2)[1:ncols + 2]
if (debugLevel > 0){#When debugging, have a peek at the colors.
    pie(rep(1,ncols),col=orderingColors)
}
# The shapes for the different types of transform
transformShapes <- c("log"=4, "PCA"=21, "ICA"=23)#no fill, solid
# The shapes to represent the components of observations
sourceShapes <- c("none"=8, "log"=4, "PCA"=21, "ICA"=23)#no fill, solid
#Classical for channel ETs: a cross for X and distinct glyphs for Y, ALL and XY
typeShapes <- c("X" = 4, "Y" = 6, "ALL" = 10, "XY" = 20)

Base case characterization

First we collect the entropies of the transformation from the data to the log-transformed data in entropy-triangle data frames, one for the transformation as a channel (etdf) and one for the sources (sedf).

etdf <- data.frame()
sedf <- data.frame()
#The following point of etdf should plot as a point on a cross, since the transformation is deterministic.
if (dsName != "Ionosphere"){
    etdf <- rbind(etdf,
                  cbind(data=paste0("1_",ncols), 
                        transform="log",
                        jentropies(dsDiscretized, log.dsDiscretized)
                  )
    )
    sedf <- rbind(sedf,cbind(data=paste0("1_",ncols),
                             transform="log", 
                             sentropies(log.dsDiscretized[,1:ncols])
                             )
                  )
}
sedf <- rbind(sedf,
              cbind(data=paste0("1_",ncols), transform="none", 
                     sentropies(dsDiscretized[,1:ncols], type="none")
                     )
        )
#Reorder the factors for the plots' legends
orderedLevels <- sapply((1:ncols), function(i) paste0("1_",i))
sedf <- sedf %>% mutate(data=factor(data,levels=orderedLevels))

Transformations

Exploring the PCA transformation

Next we obtain the PCA of the data (a variant on the log-transformed data is left commented out).

# Idea for script from:
# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/
# apply PCA - scale. = TRUE is highly 
# advisable, but default is FALSE. 
#pca.model <- prcomp(log.ds, center = TRUE, scale. = TRUE) 
pca.model <- prcomp(ds, center = TRUE, scale. = TRUE) 
# print method
print(pca.model)
# plot method
plot(pca.model, type = "l")
# summary method: describes the importance of the components.
summary(pca.model)
# Finally we obtain the transformed data, the Y to be analyzed
dsPca <- as.data.frame(predict(pca.model, newdata= ds))
# Why predict, as opposed to just working out the entropy?
# Because pca.model is just the set of matrices for carrying out the PCA. We need to generate the data!
dsPcaDiscretized <- infotheo::discretize(dsPca, disc="equalwidth")
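
As a sanity check on the remark above (a minimal sketch, not part of the pipeline), predict on a prcomp model simply re-applies the stored centering and scaling and multiplies by the rotation matrix:

# Should return TRUE: predict.prcomp is the affine map stored in the model.
manualPca <- scale(as.matrix(ds), center=pca.model$center, scale=pca.model$scale) %*%
    pca.model$rotation
all.equal(manualPca, as.matrix(dsPca), check.attributes=FALSE)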

Next we see how the information from the original data is transferred to the principal components cumulatively. First we work out the transformation entropies from retaining only the first n features, and then we visualize them.

for( i in (1:ncols)){
    etdf <- rbind(etdf,
                  cbind(data=paste0("1_",i), 
                        transform="PCA",
                        jentropies(dsDiscretized,
                                   as_tibble(dsPcaDiscretized[ ,1:i, drop=FALSE]))
                  )
    )
}
etdf <- etdf %>% mutate(data=factor(data,levels=orderedLevels))
transfo_cmet <- 
    ggmetern(etdf %>% filter(transform == "PCA"), fancy) +
    geom_point(aes(color=data, shape=type), size=3) +
    scale_shape_manual(values=typeShapes) +
    scale_color_manual(values=orderingColors) + 
    labs(color="Feature set", shape="Entropy type") +
    theme(legend.key=element_blank())
transfo_cmet
if (getPlot){
    dev.off()#Necessary to do the textual plot.
    ggsave(stringr::str_interp("featureSel_CMET_PCA_${dsName}.jpeg"),
           plot=transfo_cmet)
}

The gain from adopting the PCA has to be assessed in the source triangle (see below).

Exploring the ICA transformation

#library(ica)# The ICA has to be re-fitted for each number of retained components
library(fastICA)
set.seed(25)# For reproducibility
for(i in (1:ncols)){
    # ica <- icaimax(
    #     ds[,1:ncols], nc=i,
    #     center=TRUE
    # )

    ica <- fastICA(X=ds[,1:ncols], i, 
            alg.typ="parallel", fun="logcosh", alpha=1,
            method="C", row.norm= FALSE, maxit=200, tol=0.0001, 
            verbose=FALSE
    )
    dsIca <- data.frame(ica$S)# Matrix of estimated source signals; see ?fastICA (S=Y%*%R as per ?icaimax for the commented alternative)
    dsIcaDiscretized <- infotheo::discretize(dsIca, disc="equalwidth")
    etdf <- rbind(etdf,
                  cbind(data=paste0("1_",i), transform="ICA",
                        jentropies(dsDiscretized,
                                   as.data.frame(dsIcaDiscretized[,1:i, drop=FALSE]))
                  )
    )
}

First we visualize ICA on its own to parallel the previous visualization.

# show the split entropies
transfo_cmet <- 
     ggmetern(etdf %>% filter(transform == "ICA"), fancy) +
#    ggmetern(etdf %>% filter(type != "XY" & transform == "PCA"), fancy) +
    geom_point(aes(color=data, shape=type), size=3) +
    scale_shape_manual(values=typeShapes) +
    #scale_color_brewer(palette=orderingPalette) +
    #scale_fill_brewer(palette=orderingPalette) + #Shapes are non-fillable. 
    scale_color_manual(values=orderingColors) + 
    labs(color="Feature set", shape="Entropy type") +
    theme(legend.key=element_blank())
transfo_cmet
if (getPlot){
    dev.off()#Necessary to do the textual plot.
    ggsave(stringr::str_interp("featureSel_CMET_ICA_${dsName}.jpeg"), plot=transfo_cmet)
}

Comparison between PCA and ICA on the dataset

Comparing the gains in information transmission

Next, to compare PCA, ICA, and their absence, we plot just the output entropy of the transformations:

# show the split entropies
#fancy <- !TRUE#TRUE is better for interactive use. !TRUE for illustrations.
comparison_cmet <- ggmetern(etdf %>% filter(type == "Y"), fancy) +
    geom_point(aes(fill=data, shape=transform), size=3, alpha=.8) +
    scale_shape_manual(values=transformShapes) + 
    #scale_fill_brewer(palette=orderingPalette) +
    #scale_color_brewer(palette=orderingPalette) +
    scale_fill_manual(values=orderingColors) + 
    labs(fill="Feature set", shape="Transform type", panel.background="white") +
    guides(fill=guide_legend(override.aes=list(shape=22))) + #show a shape than can be filled!
    theme(legend.key=element_blank())
comparison_cmet
if (getPlot){
    dev.off()#Necessary to do the textual plot.
    ggsave(stringr::str_interp("featureSel_CMET_compare_PCA_ICA_${dsName}.jpeg"),
           plot=comparison_cmet)
}

In the case of iris, we see that choosing only the first component of ICA or PCA carries much more information about the input per feature than taking the whole database. This also holds when choosing up to 2 or 3 components, but not for 4.
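
This reading can be cross-checked directly with infotheo (an illustrative sketch using the same equal-width discretization, not the CMET coordinates themselves): the mutual information between the joint input and the first i components, normalized per retained feature, is expected to be largest for small i.

# Mutual information (in nats) between the joint input and the first i PCs,
# divided by the number of retained features. Illustrative only.
miPerFeature <- sapply(1:ncols, function(i)
    infotheo::mutinformation(dsDiscretized, dsPcaDiscretized[, 1:i, drop=FALSE]) / i)
round(miPerFeature, 3)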

Comparing the entropy content of the sources

We next plot the source content of the databases obtained with and without the re-parameterization, both for PCA and ICA.

Exploring the PCA features as a data source

for( i in (1:ncols)){
    sedf <- rbind(sedf,
                  cbind(data=paste0("1_",i), transform="PCA",
                        sentropies(as.data.frame(dsPcaDiscretized[,1:i, drop=FALSE]))
                  )
    )
}

Exploring the ICA features as a data source

#Now get the same approximations as before for the sources.
#Note that we subset the final ncols-component ICA fit rather than re-fitting for each i:
for( i in (1:ncols)){
    sedf <- rbind(sedf,
                  cbind(data=paste0("1_",i), transform="ICA", 
                        sentropies(as.data.frame(dsIcaDiscretized[,1:i, drop=FALSE]))
                  )
    )
}

Visualization using the SMET (q.v.).

In this plot we compare the different subsets of features as entropy sources.

transfo_smet <- 
    ggmetern(filter(sedf, name == "ALL" & transform != "log"), fancy) +
    geom_point(aes(fill=data, shape=transform), size=3) +
    scale_shape_manual(values=sourceShapes) +
    scale_fill_manual(values=orderingColors) + 
    guides(fill=guide_legend(override.aes=list(shape=22))) + #show a shape that can be filled!
    theme(legend.key=element_blank())
transfo_smet
if (getPlot){
    dev.off()#Necessary to do the textual plot.
    ggsave(stringr::str_interp("featureSel_SMET_compare_PCA_ICA_${dsName}.jpeg"),
           plot=transfo_smet)
}

In the case of iris we see that the reason to choose 1, 2, or 3 features is that they carry less redundancy. However, they sample the input space slightly less uniformly than the original database.
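
The redundancy claim can be probed directly with the total correlation (multiinformation) of each feature subset, e.g. for the principal components (an illustrative sketch with infotheo; sentropies aggregates related quantities for the SMET):

# Total correlation of the first i PCs: lower values mean the retained
# features are less redundant among themselves. Illustrative only.
sapply(2:ncols, function(i)
    infotheo::multiinformation(dsPcaDiscretized[, 1:i, drop=FALSE]))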

Try this script with Glass or Arthritis for completely different behaviours.

Postscriptum

More information about the evaluation of feature transformations with the Channel Multivariate Entropy Triangle can be found in

library(bibtex)
print(citation("entropies")['val:pel:18c'], style="text")

Session information

sessionInfo()

