In terminological/classifier-result:

# Echo the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(ggplot2)
# Load the package under development from a local checkout.
# NOTE(review): this path is machine-specific, while later chunks call
# devtools::load_all("..") instead - confirm which location is intended.
devtools::load_all("~/Git/classifier-result/")

Distribution Functions

Don't use this unless you have a very specific use case.

The most sensible option is to use distr6 (https://github.com/alan-turing-institute/distr6) for anything other than testing various combined functions.

# Normal distribution with mean 0 and standard deviation 1.
n <- NormalDistribution$new(mean = 0, sd = 1)
# Plot it; the two positional arguments are presumably the x-axis limits
# (other plots in this file pass them by name as xmin/xmax).
n$plot(-5, 5)

# Histogram (500 bins) of 100,000 values sampled from `n`.
ggplot(n$sample(100000), aes(x = x)) +
  geom_histogram(bins = 500)

Uniform density

# Uniform distribution between 2 and 4, plotted over the range 0 to 5.
n2 <- UniformDistribution$new(min = 2, max = 4)
n2$plot(xmin = 0, xmax = 5)

Log Normal distribution

Log-normal distributions are specified here using a mode and a measure of spread (the standard deviation, from which the variance follows).

$$\mathit{mode}={{e}^{\mu-{{\sigma}^{2}}}} \ \mathit{var}={{e}^{2\cdot \mu-{{\sigma}^{2}}}}\cdot \left( {{e}^{{{\sigma}^{2}}}}-1\right) $$

This can be rearranged to the following:

$$\mathit{var}+{{e}^{\mu}}\cdot \mathit{mode}-{{e}^{2\cdot \mu}}=0$$

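$$ e^{\mu} = \frac{\mathit{mode} \pm \sqrt{\mathit{mode}^{2} + 4\cdot \mathit{var}}}{2} $$

Since $e^{\mu}$ must be positive, only the positive root is valid:

$$ \mu = \log \left( \frac{\mathit{mode} + \sqrt{\mathit{mode}^{2} + 4\cdot \mathit{var}}}{2} \right) $$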

This can be solved for $\mu$ given the mode and variance, after which $\sigma$ follows by substitution into the expression for the mode:

$$\sigma=\sqrt{\mu-\mathrm{log}\left( \mathit{mode}\right) }$$

Which allows us to specify a log normal distribution with meaningful shape parameters.

# Reload the package from the parent directory before using it.
devtools::load_all("..")
# Log-normal distribution parameterised by its mode and sd.
n3 <- LogNormalDistribution$new(mode = 6, sd = 2)
n3$plot(0, 20)
# Evaluate q() at the probabilities 0, 0.25, 0.5, 0.75 and 1
# (q is presumably the quantile function - confirm against the class).
n3$q(seq(0, 1, length.out = 5))

# Reload the package from the parent directory before using it.
devtools::load_all("..")
# Log-normal distribution parameterised by mean instead of mode.
# This reassigns `n3`, so later uses of `n3` refer to this version.
n3 <- LogNormalDistribution$new(mean = 6, sd = 2)
n3$plot(0, 20)

# Kumaraswamy distribution specified by mode and interquartile range,
# plotted on the unit interval.
n4 <- KumaraswamyDistribution$new(mode = 0.4, iqr = 0.3)
n4$plot(0, 1)

# Mirrored variant with the mode at 0.6 and the same iqr.
n5 <- MirroredKumaraswamyDistribution$new(mode = 0.6, iqr = 0.3)
n5$plot(0, 1)

The theoretical entropy of the log-normal distribution `n3` defined above is:

# Report the theoretical entropy of `n3` as a labelled string.
paste0("Entropy: ", n3$theoreticalEntropy())

Combining distributions

Combining two distributions is also possible, and the default labelling can be overridden

# Start an empty conditional distribution and add two components to it.
cd <- ConditionalDistribution$new()
# First component: `n`, using the default labelling.
cd$withDistribution(n)
# Second component: `n2`, labelled "class 2". The meaning of the third
# argument (1) is not visible here - presumably a class weight.
cd$withDistribution(n2, "class 2", 1)
cd$plot(xmin = -5, xmax = 5)

We can also sample from this combined distribution; the sample is returned as a data frame with the class in the column "y".

# Histogram (40 bins) of 10,000 sampled values, with one dodged bar
# series per class in `y`.
ggplot(cd$sample(10000), aes(x = x, fill = y)) +
  geom_histogram(position = "dodge", bins = 40)

A limited number of theoretical values are available for the combined distribution (strictly speaking, these are calculated numerically).

# Report the summary values of the combined distribution `cd` as
# labelled strings.
paste0("Mean: ", cd$theoreticalMean())
paste0("Variance: ", cd$theoreticalVariance())
paste0("Mutual information between x and y: ", cd$theoreticalMI())

Classifier simulation

A conditional probability distribution based on two Kumaraswamy distributions.

# Two-class conditional distribution built from the Kumaraswamy
# distributions `n4` and `n5`.
cd2 <- ConditionalDistribution$new()
# The third argument differs per class (1 vs 2); its meaning is not
# visible here - presumably a class weight.
cd2$withDistribution(n4, "class 1", 1)
cd2$withDistribution(n5, "class 2", 2)
cd2$plot(xmin = 0, xmax = 1)

Random distribution testing

For testing purposes it is also useful to be able to generate a sample from a random collection of distributions with approximately sensible defaults. This is possible with a random distribution selector which will randomly pick from the known distributions

# Conditional distribution populated via withRandomDistributions(3).
cd3 <- ConditionalDistribution$new()
cd3$withRandomDistributions(3)
cd3$plot(-5, 5)

Multiple independent distribution combinations

# Conditional distribution `hb`: one log-normal component per class label.
hb <- ConditionalDistribution$new()
hb$withDistribution(LogNormalDistribution$new(mode = 12, sd = 1.3), "asymptomatic")
hb$withDistribution(LogNormalDistribution$new(mode = 8, sd = 1.5), "tired")
hb$withDistribution(LogNormalDistribution$new(mode = 4, sd = 2), "unwell")

# Conditional distribution `k`: one normal component per class label.
k <- ConditionalDistribution$new()
k$withDistribution(NormalDistribution$new(mean = 1, sd = 0.5), "unwell")
k$withDistribution(NormalDistribution$new(mean = 2, sd = 1), "asymptomatic")
k$withDistribution(NormalDistribution$new(mean = 8, sd = 3), "tired")

# Combine the two conditional distributions into one multivariable
# distribution, registering each under a variable name.
mvd <- MultivariableDistribution$new()
mvd$withConditionalDistribution(hb, "haemoglobin")
mvd$withConditionalDistribution(k, "serum k")

# Assign a weight to each class label.
# NOTE(review): these weights sum to 1.1 rather than 1 - confirm that
# withClassWeights() normalises them, or adjust the values.
mvd$withClassWeights(
  list(unwell = 0.2, asymptomatic = 0.6, tired = 0.3)
)

# Draw a sample of size 100 from the multivariable distribution, then
# plot it.
mvd$sample(100)
mvd$plot()