knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
library(ggplot2)
library(standardPrintOutput)
#library(tidyinfostats)
devtools::load_all("..")
set.seed(101)
theme_set(standardPrintOutput::defaultFigureLayout())

Define test distributions

gaussians

gaussian test params y0 = [ .4 .5 .8 ]; % the center of the gaussian sigma_y = [ .2 .3 .25 ]; % the gaussian decay constant p{2} = [ .2 1 0.5 ]; % the (normalized) amplitude p(x)

gaussians = ConditionalDistribution$new()
gaussians$withDistribution(0.2,NormalDistribution$new(0.4,0.2))
gaussians$withDistribution(1,NormalDistribution$new(0.5,0.3))
gaussians$withDistribution(0.5,NormalDistribution$new(0.8,0.25))

Uniform distributions

square wave test params

a = [ 0 .1 .2 ]; % the left side of each square wave b_a = [ 1 1.1 1.1 ]; % the length in y of each square wave p{1} = [ .2 1 0.5 ]; % the (normalized) amplitude p(x)

squareWaves = ConditionalDistribution$new()
squareWaves$withDistribution(0.2,UniformDistribution$new(min=0,max=1))
squareWaves$withDistribution(1,UniformDistribution$new(min=0.1,max=1.2))
squareWaves$withDistribution(0.5,UniformDistribution$new(min=0.2,max=1.3))

Log normals

lognorm = ConditionalDistribution$new()
lognorm$withDistribution(0.5,LogNormalDistribution$new(mode=0.5,var=0.25))
lognorm$withDistribution(1,LogNormalDistribution$new(mode=1.5,var=1))
lognorm$withDistribution(0.3,LogNormalDistribution$new(mode=2.5,var=0.5))

Experiment 3

Objectives: * Look at sample size versus accuracy MI, Mean and SD with increasing sample size regression to predict SD of MI from est mean versus theoretical mean & est versus theoretical SD

#devtools::load_all("..")

#TODO: Include Discretise by value -> Grassberger in here
experiment3 = function(distribution, reps, meth = c("KWindow","KNN","SGolay","DiscretiseByRank","DiscretiseByValue","Compression")) {

  set.seed(101)
  result = NULL

  thMi = distribution$theoreticalMI()
  thMu = distribution$theoreticalMean()
  thSd = sqrt(distribution$theoreticalVariance())

  j=0
  for (samples in c(8,16,32,64,128,256,512,1024,2048,4096,8192,16384)) {

    for (i in c(1:reps)) {

      j=j+1

      df = distribution$sample(samples)
      minGroupSize = min(df %>% group_by(y) %>% count() %>% pull(n))
      estMi = sapply(meth, function(m) {
        # browser()
        calculateDiscreteContinuousMI(df, vars(y), x, method = m) %>% pull(I)
      })
      estMean = mean(df$x)
      estSd = sd(df$x)

      result = result %>% bind_rows(
        tibble(
          id = j,
          minGroupSize = minGroupSize,
          sample = samples,
          param = c(meth,"Mean","Std deviation"),
          theoretical = c(rep(thMi,length(estMi)),thMu,thSd),
          estimated = c(estMi,estMean,estSd)
        )
      )
    }
  }

  return(result)
}

# setup error

quantifyError = function(df) {
  return(
    df %>% mutate(
      absoluteError = estimated-theoretical,
      relativeError = ifelse(theoretical<0.05,NA,absoluteError/theoretical)
    ) %>% filter(!is.na(relativeError)) %>% group_by(sample,param) %>% summarise(
      minGroupSize =mean(minGroupSize),
      theoretical = mean(theoretical, na.rm=TRUE),
      rmse = sqrt(mean(absoluteError^2, na.rm=TRUE)),
      mae = mean(absoluteError, na.rm=TRUE),
      vae = var(absoluteError, na.rm=TRUE),
      nmae = mean(relativeError, na.rm=TRUE),
      nvae = var(relativeError, na.rm=TRUE),
      median_ae = quantile(absoluteError, probs=c(0.5), names=FALSE, na.rm=TRUE),
      upper_iqr_ae = quantile(absoluteError, probs=c(0.75), names=FALSE, na.rm=TRUE),
      lower_iqr_ae = quantile(absoluteError, probs=c(0.25), names=FALSE, na.rm=TRUE)
    )
  )
}



exp3aData = experiment3(lognorm,100)
exp3bData = experiment3(gaussians,100)
exp3cData = experiment3(squareWaves,100)
# todo summarise experimental data
plotExperiment3 = function(df, components) {
  summary = df %>% group_by(sample,param) %>% quantifyError()
  return(ggplot(summary %>% filter(param %in% components),aes(x=sample))+
        geom_line(aes(y=mae),colour="blue")+
        geom_ribbon(aes(ymin=mae-1.96*vae,ymax=mae+1.96*vae),fill="blue",alpha=0.1)+
        #geom_line(aes(y=estimatedMedian),colour="grey75")+
        #geom_ribbon(aes(ymin=estimatedLower,ymax=estimatedUpper),fill="grey75",alpha=0.1)+
        coord_cartesian(ylim = c(-0.25,0.25))+
      geom_hline(yintercept=0, colour="red")+
        ylab("absolute error")+
        xlab("sample size")+
        facet_wrap(vars(param))+scale_x_log10())
}

Log Normal

# devtools::load_all("..")

plotExperiment3(exp3aData,c("Mean","Std deviation"))
plotExperiment3(exp3aData,c("KWindow","KNN","SGolay","DiscretiseByRank","DiscretiseByValue","Compression"))
standardPrintOutput::saveThirdPageFigure(filename="~/Dropbox/featureSelection/mutinfo/bootstrappingLogNorm")

Gaussians

#devtools::load_all("..")

plotExperiment3(exp3bData,c("Mean","Std deviation"))
plotExperiment3(exp3bData,c("KWindow","KNN","SGolay","DiscretiseByRank","DiscretiseByValue","Compression"))
standardPrintOutput::saveThirdPageFigure(filename="~/Dropbox/featureSelection/mutinfo/bootstrappingGaussians")

Uniform

# devtools::load_all("..")

plotExperiment3(exp3cData,c("Mean","Std deviation"))
plotExperiment3(exp3cData,c("KWindow","KNN","SGolay","DiscretiseByRank","DiscretiseByValue","Compression"))
standardPrintOutput::saveThirdPageFigure(filename="~/Dropbox/featureSelection/mutinfo/bootstrappingUniform")


terminological/tidy-info-stats documentation built on Nov. 19, 2022, 11:23 p.m.