This vignette uses the Source Multivariate Entropy Triangle (SMET) to explore the limits of randomness in generating multivariate distributions.
# Packages needed by this vignette. Two earlier candidates are kept, disabled,
# for reference.
# library(ggtern)   # Ternary diagrams in the gg tradition
# library(entropy)  # To work out the appropriate coordinates
library(dplyr)      # Data-frame verbs and the %>% pipe
library(tidyr)      # Tidying tall & wide data frames
library(infotheo)   # Its functionality is rerouted through entropies
library(entropies)  # This package
library(ggtern)     # Ternary diagrams on ggplot
library(vcd)        # Categorical benchmarks
library(mlbench)    # Machine-learning benchmarks

# Chunk defaults: show the code, print output without a comment prefix and
# draw square 6 x 6 figures.
# knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
knitr::opts_chunk$set(
  echo = TRUE,
  comment = NA,
  fig.width = 6,
  fig.height = 6
)
Some top level switches and options gathered in one place.
# Fix the random number generator so that the simulation is reproducible.
set.seed(29)

# Threshold under which an entropy value is treated as zero.
tol <- 1e-5

# Sample sizes to try: the number of instances (rows) drawn for each
# random dataset.
nInstances <- c(10, 50, 100, 500, 1000, 5000, 10000)
# nInstances <- c(10, 50) # shorter alternative
Next we generate the distributions and analyze their entropies.
# For every sample size m, keep adding independent fair-coin binary features
# to a random dataset and record the aggregate ("ALL") row of sentropies()
# after each addition. The search for a given m stops when VI_Pxi of that row
# falls below tol (presumably the residual entropy -- confirm against the
# entropies documentation) or when the feature cap is reached.

# Upper bound on the number of features tried per sample size, so that the
# search always terminates.
maxFeatures <- 200

# Aggregate entropies of dataset ds, tagged with the sample size m and the
# current number of features n. Assumes sentropies() yields exactly one row
# named "ALL" -- TODO confirm.
jointEntropies <- function(ds, m, n) {
  sentropies(ds) %>%
    filter(name == "ALL") %>%
    mutate(m, n)
}

perSize <- vector("list", length(nInstances)) # one data frame per m
for (i in seq_along(nInstances)) {
  m <- nInstances[i]
  perFeature <- vector("list", maxFeatures) # one row per feature count

  # Start from a single random feature. The column construction (including
  # the cleared names) is kept exactly as in the original analysis.
  n <- 1
  dsRandom <- as.data.frame(x = as.factor(rbinom(m, 1, 0.5)))
  names(dsRandom) <- NULL
  perFeature[[n]] <- jointEntropies(dsRandom, m, n)

  repeat {
    n <- n + 1
    dsRandom <- cbind(dsRandom, as.factor(rbinom(m, 1, 0.5)))
    perFeature[[n]] <- jointEntropies(dsRandom, m, n)
    # Scalar condition, hence the short-circuit `||`.
    if (perFeature[[n]]$VI_Pxi < tol || n >= maxFeatures) break
  }

  # bind_rows() skips the unused (NULL) slots of the preallocated list.
  perSize[[i]] <- bind_rows(perFeature)
}

# Every accumulated row is already the "ALL" row, so only the types of the
# two bookkeeping columns remain to be normalised.
edf <- bind_rows(perSize) %>%
  mutate(m = as.numeric(m), n = as.numeric(n))
The number of features needed to zero the residual entropy almost plateaus.
# Plot, for the rows whose VI_Pxi fell below the tolerance, the number of
# features n against the number of instances m on a log-scaled x axis.
edf %>%
  filter(VI_Pxi < tol) %>%
  ggplot(aes(x = m, y = n)) +
  geom_line() +
  scale_x_log10(breaks = nInstances)
# Linear-axis alternative: scale_x_continuous(breaks = nInstances)
In the entropy triangle the plot looks different:
# Entropy triangle of the runs with m == 100 instances: one point per
# feature count, coloured by the number of variables n.
ggmetern(filter(edf, m == 100), fancy = FALSE) +
  geom_point(aes(shape = as.factor(m), color = n)) +
  labs(shape = "Num. instances", color = "Num. variables") +
  # One shape code per sample size, starting at 19. Deriving the count from
  # nInstances keeps the codes for the seven sizes within 19-25; the former
  # 19:(19+7) also produced 26, which is not a valid plotting symbol.
  scale_shape_manual(values = 19:(19 + length(nInstances) - 1))
More information about the evaluation of classifiers with the Channel Binary Entropy Triangle can be found in:
# Print, as plain text, the entry keyed "val:pel:14a" from the citation
# information of the entropies package.
library(bibtex)
print(
  citation("entropies")["val:pel:14a"],
  style = "text"
)
# Report the R version, platform and packages in use when the vignette was built.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.