This vignette uses the Source Multivariate Entropy Triangle (SMET) to explore the limits of randomness when generating multivariate distributions.

Environment construction

#library(ggtern)   # Excellent package for ternary diagrams in the gg tradition
#library(entropy)  # To work out the appropriate coordinates.
library(dplyr)     # That infamous Mr. Wickham!
library(tidyr)     # Tidying tall & wide dataframes
library(infotheo)  # The functionality provided by this has to be rerouted through entropies
library(entropies) # This package
library(ggtern)    # Ternary diagrams on ggplot
library(vcd)       # Categorical benchmarks
library(mlbench)   # ml benchmarks
#knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
# Chunk defaults for the whole vignette: show the code, print output without
# a comment prefix, and render square 6 x 6 figures.
knitr::opts_chunk$set(
    echo = TRUE,
    comment = NA,
    fig.width = 6,
    fig.height = 6
)

Generating distributions

Some top level switches and options gathered in one place.

# Fix the RNG seed so that every build draws the same random features.
set.seed(29)
# Threshold under which an entropy value is treated as zero.
tol <- 1e-5
# Sample sizes to sweep: each entry is the number of instances (rows) drawn
# for one family of random distributions.
nInstances <- c(10, 50, 100, 500, 1000, 5000, 10000)
# nInstances <- c(10, 50)  # shorter sweep for quick trial runs

Next we generate the distributions and analyze their entropies.

# Sweep over the sample sizes in nInstances. For each number of instances m,
# start from one fair-coin binary feature and keep appending new ones until
# the VI_Pxi value of the aggregate ("ALL") row reported by sentropies()
# drops below tol, or until maxFeatures features have been tried.
maxFeatures <- 200 # hard cap on the number of features tried per m

# Aggregate ("ALL") row of sentropies(ds), tagged with the number of
# instances m and the number of features n that produced it.
allEntropies <- function(ds, m, n) {
    sentropies(ds) %>%
        filter(name == "ALL") %>%
        mutate(m = m, n = n)
}

perSize <- vector("list", length(nInstances)) # one result table per m
for (i in seq_along(nInstances)) {
    m <- nInstances[i]
    n <- 1
    # The construction expressions are kept verbatim so that the seeded
    # random draws and the resulting column naming stay exactly as before.
    dsRandom <- as.data.frame(x=as.factor(rbinom(m, 1, 0.5)))
    names(dsRandom) <- NULL
    steps <- list(allEntropies(dsRandom, m, n)) # steps[[k]]: k features
    repeat {
        n <- n + 1
        dsRandom <- cbind(dsRandom, as.factor(rbinom(m, 1, 0.5)))
        current <- allEntropies(dsRandom, m, n)
        steps[[n]] <- current
        # Scalar condition, hence the short-circuit `||` rather than `|`.
        if (current$VI_Pxi < tol || n >= maxFeatures) {
            break
        }
    }
    # Bind once per m instead of growing a data frame on every iteration.
    perSize[[i]] <- bind_rows(steps)
}
# Every collected row is already an "ALL" row, so no further filtering is
# needed; only coerce the bookkeeping columns to numeric for plotting.
edf <- bind_rows(perSize) %>%
    mutate(m = as.numeric(m), n = as.numeric(n))

The number of features needed to zero the residual entropy almost plateaus as the number of instances grows.

# Keep only the rows whose VI_Pxi is below tol and draw the number of
# features n against the number of instances m on a log-scaled x axis.
edf %>%
    filter(VI_Pxi < tol) %>%
    ggplot() +
    geom_line(aes(x = m, y = n)) +
    scale_x_log10(breaks = nInstances)
    # scale_x_continuous(breaks = nInstances)

In the entropy triangle the plot looks different:

# Entropy-triangle view restricted to the runs with m == 100 instances;
# point colour encodes the number of features n, shape the (factor) m.
edf %>%
    filter(m == 100) %>%
    ggmetern(fancy = FALSE) +
    geom_point(aes(shape = as.factor(m), color = n)) +
    labs(shape = "Num. instances", color = "Num. variables") +
    scale_shape_manual(values = 19:26)

Postscriptum

More information about the evaluation of classifiers with the Channel Binary Entropy Triangle can be found in:

library(bibtex)
# Look up the package's citation entries, then print the one keyed
# "val:pel:14a" as plain text.
entropiesRefs <- citation("entropies")
print(entropiesRefs["val:pel:14a"], style = "text")

Session information

# Report the R version, platform and attached/loaded packages of this build.
sessionInfo()


FJValverde/entropies documentation built on Oct. 12, 2023, 10:17 p.m.