# Document-wide knitr option: echo the source of every chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
# The installed package is not attached here; the package source in the parent
# directory is loaded instead (presumably the tidyinfostats development tree --
# confirm).
#library(tidyinfostats)
devtools::load_all("..")
# Fix the RNG seed so that random draws are repeatable between runs.
set.seed(101)

Using dbplyr tables in tidyinfostats

Suppose you have a database connection with some data already loaded into it. Here we will use a single test table created from a test distribution.

# In-memory SQLite database used as the database backend for this document.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")

# Table of theoretical values, one row per measure.
# NOTE(review): `hb` and `k` are not defined anywhere in this file; presumably
# they are supplied by the package loaded above -- confirm before running.
theoretical <- tibble(
  measure = c("Mean", "Variance", "Mutual Information"),
  hb = c(
    hb$theoreticalMean(),
    hb$theoreticalVariance(),
    hb$theoreticalMI()
  ),
  k = c(
    k$theoreticalMean(),
    k$theoreticalVariance(),
    k$theoreticalMI()
  )
)

# Local test data: the `data` element of the simulation result.
testData <- bloodResultsSimulation(10000)$data

# Copy the local data into the database and keep the returned table reference.
testDataLazy <- copy_to(con, testData)
testDataLazy

Executing a KWindow MI estimator in SQL

For reference, here are the results as executed in R:

# devtools::load_all("..")

# KWindow estimate computed on the local data frame, one result per feature.
result <- testData %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "KWindow"
  )
result

Now executed on the dbplyr database:

# Same estimator call as the local version, applied to the database table.
lazyResult <- testDataLazy %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "KWindow"
  )
# Print the SQL generated for the query.
lazyResult %>%
  show_query()

# Run the query and bring the result into R.
lazyResult %>%
  collect()

# Do not run this: the KNN method is not efficient in SQL.
# lazyResult2 = testDataLazy %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KNN", useKWindow=FALSE)
# lazyResult2 %>% show_query()
# lazyResult2

Discretisation in database

Discretisation by rank with a fixed number of bins is equivalent to the `NTILE` window function in SQL.

# devtools::load_all("..")
# Discretise `value` into 3 rank-based bins per feature in the database, then
# bring the result into R.
discretised <- testDataLazy %>%
  group_by(feature) %>%
  tidyinfostats::discretise_ByRank(value, value_discrete, bins = 3) %>%
  collect()
# Row count for each feature / bin combination.
discretised %>%
  group_by(feature, value_discrete) %>%
  summarise(count = n())

# Non dbplyr for comparison:
# discretised = testData %>% group_by(feature) %>% tidyinfostats::discretise_ByRank(value, value_discrete, bins=3)
# discretised %>% group_by(feature,value_discrete) %>% summarise(count = n())
devtools::load_all("..")

# Discretise / ByRank estimate with 100 bins on the local data frame.
testData %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByRank",
    bins = 100
  )


# The same call on the database table.
lazyResult3 <- testDataLazy %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByRank",
    bins = 100
  )
#lazyResult3 %>% show_query()
lazyResult3
# devtools::load_all("..")

# Discretise / ByValue estimate using the "Grassberger" mutual information
# method, on the local data frame. The commented-out line is an earlier form of
# the same call, kept for reference.
#testData %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByValue", discreteMethod = "MontgomerySmith")
testData %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByValue",
    mutualInfoMethod = "Grassberger"
  )

# The same call on the database table.
#lazyResult4 = testDataLazy %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByValue", discreteMethod = "MontgomerySmith")
lazyResult4 <- testDataLazy %>%
  group_by(feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByValue",
    mutualInfoMethod = "Grassberger"
  )


lazyResult4

Continuous probability estimation

# devtools::load_all("..")

# Local data: plot p_x from probabilitiesFromContinuous (method "SGolay")
# against value, coloured by outcome, with one panel per feature.
ggplot(
  testData %>%
    group_by(feature, outcome) %>%
    probabilitiesFromContinuous(value, method = "SGolay"),
  aes(x = value, y = p_x, colour = outcome)
) +
  geom_point() +
  facet_wrap(vars(feature))

# debug(probabilitiesFromContinuous_SGolay)
# debug(applySGolayFilter)

# The same plot built from the database table.
ggplot(
  testDataLazy %>%
    group_by(feature, outcome) %>%
    probabilitiesFromContinuous(value, method = "SGolay"),
  aes(x = value, y = p_x, colour = outcome)
) +
  geom_point() +
  facet_wrap(vars(feature))

# Close the connection opened at the top of the document; nothing after this
# point uses it, and leaving it open leaks the connection.
DBI::dbDisconnect(con)


terminological/tidy-info-stats documentation built on Nov. 19, 2022, 11:23 p.m.