```r
knitr::opts_chunk$set(echo = TRUE)
```
```r
library(infotheo)
library(tidyverse)
library(ggplot2)
library(devtools)
# library(ClassifierResult)
# library(tidyinfostats)
devtools::load_all("..")
set.seed(101)
theme_set(standardPrintOutput::defaultFigureLayout())
```
We use the following set of test distributions:
devtools::load_all("~/Git/tidy-info-stats") dists = list( normal = ClassifierResult::NormalDistribution$new(mean = 1, sd = 0.5), logNormal = ClassifierResult::LogNormalDistribution$new(mode = 1, sd = 0.5), uniform = ClassifierResult::UniformDistribution$new(min = 0,max = 2) ) dists$normal$plot(-2,5) dists$normal$theoreticalEntropy()
```r
# devtools::load_all("..")
set.seed(101)
s = dists$normal$sample(600)
# debug(calculateDiscreteEntropy_Grassberger)
# debug(calculateDiscreteEntropy_Histogram)

cutsDf = tibble(cut = seq(-1, 3, length.out = 24))
s = s %>% discretise(x, x_discrete, method = "Manual", cutsDf = cutsDf)

s %>% calculateDiscreteEntropy_Histogram(groupVars = vars(x_discrete), mm = TRUE)
s %>% calculateDiscreteEntropy_InfoTheo(groupVars = vars(x_discrete), infoTheoMethod = "mm")
s %>% calculateDiscreteEntropy_Grassberger(groupVars = vars(x_discrete))
s %>% calculateSelfInformation_Histogram(groupVars = vars(x_discrete), mm = TRUE)
s %>% calculateSelfInformation_Grassberger(groupVars = vars(x_discrete))
```
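The `mm = TRUE` and `infoTheoMethod = "mm"` options request the Miller–Madow bias correction, which adjusts the plugin (maximum-likelihood) entropy estimate upward by a term depending on the bin occupancy:

$$\hat{H}_{\mathrm{MM}} = \hat{H}_{\mathrm{ML}} + \frac{m - 1}{2N}$$

where $m$ is the number of bins containing at least one observation and $N$ is the sample size.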
devtools::load_all("..") # debug(calculateDiscreteEntropy_MontgomerySmith) # debug(calculateContinuousEntropy_Quantile) bins = 24 cutsDf = tibble(cut=seq(-1,3,length.out = bins-1)) out = NULL method = c("Histogram","Compression","Grassberger","MontgomerySmith") continuousMethod = c("Quantile","PDF") for (dist in dists) { thEnt = dist$theoreticalEntropy() for (i in c(1:10)) { #0)) { for (j in c(1:10,20,30,40,60,80,100)) { #,15,20,25,30,35,40,45,50,60,70,80,90,100)) { s = dist$sample(n=10*j) s = s %>% discretise(x, x_discrete, method="Manual", cutsDf=cutsDf) #s = s %>% discretise(x, x_discrete, method="ByRank") for (m in method) { estimate = calculateDiscreteEntropy(s, groupVars=vars(x_discrete), method = m, infoTheoMethod = "emp", mm=FALSE) %>% pull(I) out = out %>% bind_rows(tibble( N = 10*j, method = m, C = length(unique(s$x_discrete)), dist = dist$label(), estimate = estimate, theoretical = thEnt )) } for (m in continuousMethod) { estimate = calculateContinuousEntropy(s, continuousVar=x, method = m, probabilityMethod="SGolay", k_05=10) %>% pull(I) out = out %>% bind_rows(tibble( N = 10*j, method = m, C = length(unique(s$x_discrete)), dist = dist$label(), estimate = estimate, theoretical = thEnt )) } } } } # TODO: the theoretical mutual information is calculated using an integral which assumes equal bin width # There is probably an adjustment that involves log(Number of non empty bins) and log(Number of bins) that # would adjust the estimated values to come into line with the theoretical. # not sure this really works out.summ = out %>% group_by(N,method,dist) %>% summarise( mean = mean(estimate), sd = sd(estimate), #th = max(log(7/theoretical)) #th = max(log(20)-theoretical) #th = max(log(bins*theoretical)) #th = max(log(bins)*theoretical) th = max(theoretical), th2 = max(theoretical)-log((3-(-1))/(bins-1)) # correction is log(range/bins) # http://thirdorderscientist.org/homoclinic-orbit/2013/5/8/bridging-discrete-and-differential-entropy ) ggplot(out.summ, aes(x=N,colour=method,fill=method))+ #geom_line(aes(y = th), colour="black",alpha=0.1)+ #geom_ribbon((aes(ymin=th-1.96*th.sd, ymax=th+1.96*th.sd)),colour=NA,alpha=0.1)+ geom_line(aes(y=mean))+ geom_ribbon((aes(ymin=mean-1.96*sd, ymax=mean+1.96*sd)),colour=NA,alpha=0.2)+ geom_hline(aes(yintercept = th2),colour="grey50")+ geom_hline(aes(yintercept=th),colour="grey50")+ facet_wrap(vars(dist))+scale_x_log10()
Assume a continuous distribution whose parameters depend on a discrete outcome. Here `hb` is a mixture of log-normal distributions, one per outcome class:
```r
hb = ClassifierResult::ConditionalDistribution$new()
hb$withDistribution(LogNormalDistribution$new(mode = 12, sd = 2), "asymptomatic", 0.75)
hb$withDistribution(LogNormalDistribution$new(mode = 8, sd = 3), "tired", 0.25)
hb$withDistribution(LogNormalDistribution$new(mode = 4, sd = 5), "unwell", 0.25)
# NB: these class weights sum to 1.25, not 1; presumably they are renormalised internally.
hb$plot(0, 20)
```
And a second conditional distribution, `k`, built from normal distributions:
```r
k = ClassifierResult::ConditionalDistribution$new()
k$withDistribution(NormalDistribution$new(mean = 1, sd = 0.5), "unwell", 0.125)
k$withDistribution(NormalDistribution$new(mean = 2, sd = 1), "asymptomatic", 0.75)
k$withDistribution(NormalDistribution$new(mean = 8, sd = 3), "tired", 0.125)
```
We can use the properties of the underlying distributions to calculate theoretical values for the mean, variance, and mutual information.
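For a continuous variable $X$ with discrete class $Y$, the theoretical mutual information decomposes as a difference of differential entropies,

$$I(X;Y) = h(X) - \sum_{y} p(y)\, h(X \mid Y = y),$$

where $h(X)$ is the entropy of the mixture density $p(x) = \sum_y p(y)\, p(x \mid y)$; the mixture term generally has no closed form and is presumably evaluated numerically by `theoreticalMI()`.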
```r
tibble(
  measure = c("Mean", "Variance", "Mutual Information"),
  hb = c(hb$theoreticalMean(), hb$theoreticalVariance(), hb$theoreticalMI()),
  k = c(k$theoreticalMean(), k$theoreticalVariance(), k$theoreticalMI())
)
```
We can also generate test data by sampling from the underlying distributions:
devtools::load_all("..") set.seed(101) testData = k$sample(500) %>% mutate(test="k") %>% bind_rows( hb$sample(1000) %>% mutate(test="hb")) %>% rename(outcome=y, value=x) testData = testData[sample(nrow(testData)),] ggplot(testData, aes(fill=outcome, x=value))+geom_histogram(position="dodge", bins=50)+facet_grid(cols=vars(test)) tmp = testData %>% group_by(test,outcome) %>% probabilitiesFromContinuous_Kernel(value) ggplot(tmp, aes(x=value, colour=outcome))+geom_line(aes(y=p_x))+facet_wrap(vars(test)) # ggplot(tmp, aes(x=value, colour=outcome,fill=outcome))+stat_density(alpha=0.2,position="dodge")+facet_wrap(vars(test))
And use this sample to estimate the MI using a range of methods.
TODO: explain the methods in more detail. Briefly (inferred from the method names and arguments), the estimators compared below fall into a few families: the `Discretise` methods bin the continuous variable (by rank or by value) and apply a discrete MI or entropy estimator (Histogram, InfoTheo, Grassberger, MontgomerySmith); `KWindow` and `KNN` are nearest-neighbour based; `Kernel` and `Entropy` estimate the underlying densities directly; and `Compression` estimates entropy via compressibility. A naive version of the `Discretise` approach is sketched below for reference.
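The following minimal, self-contained sketch shows a naive plugin MI estimate in base R: bin the continuous variable into equal-frequency bins, then apply the maximum-likelihood estimator to the resulting contingency table. The function name and bin count are illustrative only, and this version lacks the bias corrections the package estimators apply.

```r
# Naive plugin estimate of MI between a discrete outcome and a continuous value,
# obtained by equal-frequency binning. Illustrative only; no bias correction.
naiveDiscretisedMI = function(outcome, value, bins = 10) {
  breaks = unique(quantile(value, probs = seq(0, 1, length.out = bins + 1)))
  binned = cut(value, breaks = breaks, include.lowest = TRUE)
  joint = table(outcome, binned) / length(value)  # empirical joint distribution
  px = rowSums(joint)                             # marginal over outcomes
  py = colSums(joint)                             # marginal over bins
  terms = joint * log(joint / outer(px, py))      # p * log(p / (px * py))
  sum(terms[joint > 0])                           # MI in nats, skipping empty cells
}

naiveDiscretisedMI(testData$outcome[testData$test == "hb"],
                   testData$value[testData$test == "hb"])
```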
devtools::load_all("..") #debug(calculateDiscreteDiscreteMI) #debug(calculateDiscreteContinuousMI_DiscretiseByRank) #debug(calculateDiscreteDiscreteMI_Entropy) #debug(calculateDiscreteEntropy_Histogram) options(warn=1) results = NULL results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KWindow")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Entropy", entopyMethod="PDF",probabilityMethod = "SGolay")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Kernel")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByRank")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Entropy", entropyMethod="InfoTheo")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Histogram")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="MontgomerySmith")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Entropy", entropyMethod="Grassberger")) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KNN", useKWindow=FALSE)) set.seed(101) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Compression")) set.seed(101) results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByRank", binStrategy = linearBySize(8,4,256), discreteMethod="Entropy", entropyMethod="Compression")) testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Entropy", entropyMethod="Quantile") results = results %>% rbind(tibble(test=c("hb","k"),N=c(1000L,500L),I = c(hb$theoreticalMI(),k$theoreticalMI()), I_sd=as.numeric(NA),method=rep("theoretical",2))) results %>% group_by(test) %>% standardPrintOutput::mergeCells() # left_join(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(outcome, value, method = "Entropy") %>% rename(I.emp=I), by="test") %>%
devtools::load_all("..") testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "KWindow") testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "Kernel") #debug(calculateDiscreteContinuousPointwiseMI_Entropy) testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "Entropy", entropyMethod="Quantile")