knitr::opts_chunk$set(echo = TRUE)
library(infotheo)
library(tidyverse)
library(ggplot2)
library(devtools)
# library(ClassifierResult)
#library(tidyinfostats)
devtools::load_all("..")
set.seed(101)
theme_set(standardPrintOutput::defaultFigureLayout())

Check that the entropy calculations are working, using a set of test distributions:

devtools::load_all("~/Git/tidy-info-stats")
dists = list(
  normal = ClassifierResult::NormalDistribution$new(mean = 1, sd = 0.5),
  logNormal = ClassifierResult::LogNormalDistribution$new(mode = 1, sd = 0.5),
  uniform = ClassifierResult::UniformDistribution$new(min = 0,max = 2)
)

dists$normal$plot(-2,5)

dists$normal$theoreticalEntropy()
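# For reference: the differential entropy of a normal distribution has the closed form
# 0.5*log(2*pi*e*sigma^2). A quick hand check of the value above (my own sketch, assuming
# the package reports entropy in nats):
0.5 * log(2 * pi * exp(1) * 0.5^2)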
#devtools::load_all("..")
set.seed(101)
s = dists$normal$sample(600)
#debug(calculateDiscreteEntropy_Grassberger)
#debug(calculateDiscreteEntropy_Histogram)

cutsDf = tibble(cut=seq(-1,3,length.out = 24))
s = s %>% discretise(x, x_discrete, method="Manual", cutsDf=cutsDf)
s %>% calculateDiscreteEntropy_Histogram(groupVars=vars(x_discrete), mm=TRUE)
s %>% calculateDiscreteEntropy_InfoTheo(groupVars=vars(x_discrete), infoTheoMethod="mm")
s %>% calculateDiscreteEntropy_Grassberger(groupVars=vars(x_discrete))
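# As an independent cross-check (my own sketch, not the package code): the plug-in and
# Miller-Madow entropy estimates computed directly from the bin counts of the discretised
# sample; these should be close to the estimates above (all in nats).
counts = s %>% count(x_discrete) %>% pull(n)
p_hat = counts / sum(counts)
h_plugin = -sum(p_hat * log(p_hat))
h_mm = h_plugin + (sum(counts > 0) - 1) / (2 * sum(counts)) # Miller-Madow bias correction
c(plugin = h_plugin, millerMadow = h_mm)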

s %>% calculateSelfInformation_Histogram(groupVars=vars(x_discrete), mm=TRUE)
s %>% calculateSelfInformation_Grassberger(groupVars=vars(x_discrete))
devtools::load_all("..")
# debug(calculateDiscreteEntropy_MontgomerySmith)
# debug(calculateContinuousEntropy_Quantile)

bins = 24
cutsDf = tibble(cut=seq(-1,3,length.out = bins-1))

out = NULL
method = c("Histogram","Compression","Grassberger","MontgomerySmith")
continuousMethod = c("Quantile","PDF")
for (dist in dists) {
  thEnt = dist$theoreticalEntropy()
  for (i in c(1:10)) { # 10 repeats per configuration
    for (j in c(1:10,20,30,40,60,80,100)) { # sample sizes of 10*j
      s = dist$sample(n=10*j)
      s = s %>% discretise(x, x_discrete, method="Manual", cutsDf=cutsDf)
      #s = s %>% discretise(x, x_discrete, method="ByRank")
      for (m in method) {
        estimate = calculateDiscreteEntropy(s, groupVars=vars(x_discrete), method = m, infoTheoMethod = "emp", mm=FALSE) %>% pull(I)
        out = out %>% bind_rows(tibble(
          N = 10*j,
          method = m,
          C = length(unique(s$x_discrete)),
          dist = dist$label(),
          estimate = estimate,
          theoretical = thEnt
        ))
      }
      for (m in continuousMethod) {
        estimate = calculateContinuousEntropy(s, continuousVar=x, method = m, probabilityMethod="SGolay", k_05=10) %>% pull(I)
        out = out %>% bind_rows(tibble(
          N = 10*j,
          method = m,
          C = length(unique(s$x_discrete)),
          dist = dist$label(),
          estimate = estimate,
          theoretical = thEnt
        ))
      }
    }
  }
}

# TODO: the theoretical value here is the differential entropy, calculated by integration, whereas the
# estimators work on equal-width bins. There is probably an adjustment involving log(number of non-empty
# bins) and log(number of bins), i.e. the log of the bin width, that would bring the estimated values
# into line with the theoretical ones, but it is not clear this really works.

out.summ = out %>% group_by(N,method,dist) %>% summarise(
  mean = mean(estimate),
  sd = sd(estimate),
  #th = max(log(7/theoretical))
  #th = max(log(20)-theoretical)
  #th = max(log(bins*theoretical))
  #th = max(log(bins)*theoretical)
  th = max(theoretical),
  th2 = max(theoretical)-log((3-(-1))/(bins-1)) # subtract log(bin width): H_discrete ~ h_differential - log(delta)
  # http://thirdorderscientist.org/homoclinic-orbit/2013/5/8/bridging-discrete-and-differential-entropy
) 
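# The th2 correction above follows the standard relationship between discrete and differential
# entropy: for small bin width delta, H(X_binned) ~ h(X) - log(delta). A minimal sanity check of
# this, independent of the package, using a Uniform(0,2) sample and equal-width bins (in nats):
u = runif(1e5, 0, 2)
delta = 0.1
p_hat = prop.table(table(cut(u, seq(0, 2, by = delta))))
H_disc = -sum(p_hat * log(p_hat)) # binned (discrete) entropy
c(discrete = H_disc, corrected = H_disc + log(delta), theoretical = log(2)) # log(2) = differential entropy of Uniform(0,2)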

ggplot(out.summ, aes(x=N,colour=method,fill=method))+
  #geom_line(aes(y = th), colour="black",alpha=0.1)+
  #geom_ribbon(aes(ymin=th-1.96*th.sd, ymax=th+1.96*th.sd),colour=NA,alpha=0.1)+
  geom_line(aes(y=mean))+
  geom_ribbon(aes(ymin=mean-1.96*sd, ymax=mean+1.96*sd),colour=NA,alpha=0.2)+
  geom_hline(aes(yintercept=th2),colour="grey50")+
  geom_hline(aes(yintercept=th),colour="grey50")+
  facet_wrap(vars(dist))+scale_x_log10()

Mutual information of continuous versus discrete distributions

Assume a continuous distribution, conditional on a discrete outcome:

hb = ClassifierResult::ConditionalDistribution$new()
hb$withDistribution(ClassifierResult::LogNormalDistribution$new(mode=12,sd=2), "asymptomatic", 0.75)
hb$withDistribution(ClassifierResult::LogNormalDistribution$new(mode=8,sd=3), "tired", 0.25)
hb$withDistribution(ClassifierResult::LogNormalDistribution$new(mode=4,sd=5), "unwell", 0.25)
hb$plot(0,20)

And another:

k = ClassifierResult::ConditionalDistribution$new()
k$withDistribution(ClassifierResult::NormalDistribution$new(mean=1,sd=0.5), "unwell", 0.125)
k$withDistribution(ClassifierResult::NormalDistribution$new(mean=2,sd=1), "asymptomatic", 0.75)
k$withDistribution(ClassifierResult::NormalDistribution$new(mean=8,sd=3), "tired", 0.125)

We can use the properties of the underlying distributions to calculate theoretical values for the Mean, Variance and Mutual Information (a numerical cross-check of the MI for k is sketched after the table below).

tibble(measure = c("Mean","Variance","Mutual Information"),
hb = c(hb$theoreticalMean(),hb$theoreticalVariance(), hb$theoreticalMI()),
k = c(k$theoreticalMean(),k$theoreticalVariance(),k$theoreticalMI()))
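As a numerical cross-check of the mutual information for k (my own sketch, not part of the package API, assuming theoreticalMI() returns I(X;Y) in nats): for a mixture of normals, I(X;Y) = h(X) - sum_y p(y) h(X|Y=y), where h(X|Y=y) is the closed-form entropy of each normal component and h(X) can be obtained by numerically integrating the mixture density.

p_y = c(unwell = 0.125, asymptomatic = 0.75, tired = 0.125) # class priors from the definition of k above
mu = c(1, 2, 8)
sigma = c(0.5, 1, 3)
xs = seq(-10, 30, length.out = 20000)
dx = xs[2] - xs[1]
p_x = rowSums(sapply(1:3, function(i) p_y[i] * dnorm(xs, mu[i], sigma[i]))) # mixture density
h_mix = -sum(p_x * log(p_x) * dx) # h(X) by numerical integration
h_cond = sum(p_y * 0.5 * log(2 * pi * exp(1) * sigma^2)) # sum_y p(y) * h(X|Y=y) for normal components
h_mix - h_cond # should approximate k$theoreticalMI()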

We can also generate test data by sampling from the underlying distributions:

devtools::load_all("..")
set.seed(101)
testData = k$sample(500) %>% mutate(test="k") %>% bind_rows(
  hb$sample(1000) %>% mutate(test="hb")) %>% rename(outcome=y, value=x)

testData = testData[sample(nrow(testData)),]

ggplot(testData, aes(fill=outcome, x=value))+geom_histogram(position="dodge", bins=50)+facet_grid(cols=vars(test))

tmp = testData %>% group_by(test,outcome) %>% probabilitiesFromContinuous_Kernel(value)

ggplot(tmp, aes(x=value, colour=outcome))+geom_line(aes(y=p_x))+facet_wrap(vars(test))

# ggplot(tmp, aes(x=value, colour=outcome,fill=outcome))+stat_density(alpha=0.2,position="dodge")+facet_wrap(vars(test))

And use this sample to estimate the MI using a range of methods: "KWindow", "Entropy" (with PDF or quantile probability estimates), "Kernel", "Discretise" (by rank or by value, with a choice of discrete entropy estimators), "KNN", "Compression" and "DiscretiseByRank".

TODO: Explain the methods in more detail.
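As a rough illustration of the discretisation approach (my own sketch, not the package implementation): bin the continuous value into equal-frequency bins and compute the plug-in MI between the binned value and the outcome from the contingency table, in nats. The names below (tmp2, value_bin) are illustrative only.

tmp2 = testData %>% filter(test == "k") %>% mutate(value_bin = ntile(value, 10))
tab = table(tmp2$value_bin, tmp2$outcome)
p_xy = tab / sum(tab)
p_x = rowSums(p_xy)
p_y = colSums(p_xy)
sum(p_xy * log(p_xy / outer(p_x, p_y)), na.rm = TRUE) # plug-in MI estimate for the k test set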

devtools::load_all("..")
#debug(calculateDiscreteDiscreteMI)
#debug(calculateDiscreteContinuousMI_DiscretiseByRank)
#debug(calculateDiscreteDiscreteMI_Entropy)
#debug(calculateDiscreteEntropy_Histogram)
options(warn=1)
results = NULL
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KWindow"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Entropy", entropyMethod="PDF", probabilityMethod = "SGolay"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Kernel"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByRank"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Entropy", entropyMethod="InfoTheo"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Histogram"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="MontgomerySmith"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Discretise", discretiseMethod="ByValue", mutualInfoMethod="Entropy", entropyMethod="Grassberger"))
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KNN", useKWindow=FALSE))
set.seed(101)
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Compression"))
set.seed(101)
results = results %>% rbind(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByRank", binStrategy = linearBySize(8,4,256), discreteMethod="Entropy", entropyMethod="Compression"))

testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "Entropy", entropyMethod="Quantile")

results = results %>% rbind(tibble(test=c("hb","k"),N=c(1000L,500L),I = c(hb$theoreticalMI(),k$theoreticalMI()), I_sd=as.numeric(NA),method=rep("theoretical",2)))

results %>% group_by(test) %>% standardPrintOutput::mergeCells()

# left_join(testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousMI(outcome, value, method = "Entropy") %>% rename(I.emp=I), by="test") %>%
devtools::load_all("..")
testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "KWindow")
testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "Kernel")
#debug(calculateDiscreteContinuousPointwiseMI_Entropy)
testData %>% group_by(test) %>% tidyinfostats::calculateDiscreteContinuousPointwiseMI(vars(outcome), value, method = "Entropy", entropyMethod="Quantile")
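For orientation (my own illustration, not the package implementation): the pointwise MI of a single observation (value, outcome) is log( p(value | outcome) / p(value) ), which can be approximated directly from kernel density estimates, e.g. for the k test set:

dens_all = density(testData$value[testData$test == "k"])
dens_asym = density(testData$value[testData$test == "k" & testData$outcome == "asymptomatic"])
x0 = 2 # an example value near the asymptomatic mode
log(approx(dens_asym$x, dens_asym$y, x0)$y / approx(dens_all$x, dens_all$y, x0)$y)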

