# Vignette setup: show each chunk's source alongside its output.
knitr::opts_chunk$set(echo = TRUE)

# Attach the data-manipulation and plotting packages used in the chunks below.
library(tidyverse)
library(ggplot2)

# The installed package is not attached; the development sources in the
# parent directory are loaded instead.
# library(tidyinfostats)
devtools::load_all("..")

# Fix the random number generator seed so reruns draw the same numbers.
set.seed(101)
Suppose you have a database connection with some data already loaded into it. Here we will use a single test table created from a test distribution.
# Open an in-memory SQLite database to act as the remote data source.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")

# Table of theoretical reference values, one row per measure.
# NOTE(review): `hb` and `k` are not defined anywhere in this chunk --
# presumably distribution objects created elsewhere; confirm before running.
theoretical <- tibble(
  measure = c("Mean", "Variance", "Mutual Information"),
  hb = c(
    hb$theoreticalMean(),
    hb$theoreticalVariance(),
    hb$theoreticalMI()
  ),
  k = c(
    k$theoreticalMean(),
    k$theoreticalVariance(),
    k$theoreticalMI()
  )
)

# Keep the `data` element of the simulation result as the local test table.
testData <- bloodResultsSimulation(10000)$data

# Copy the local table into the database and keep the returned table
# reference for the database-backed examples.
testDataLazy <- copy_to(con, testData)
testDataLazy
For reference, here are the results as executed in R on the local data:
# devtools::load_all("..")

# Reference calculation on the local data: one call per `feature` group,
# using the "KWindow" method with `outcome` as the discrete variable and
# `value` as the continuous one.
result <- group_by(testData, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "KWindow"
  )
result
Now the same calculation, executed on the database through dbplyr:
# Same "KWindow" calculation as for the local data, but starting from the
# database-backed table.
lazyResult <- group_by(testDataLazy, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "KWindow"
  )

# Show the generated SQL, then pull the result into R.
show_query(lazyResult)
collect(lazyResult)

# Don't run this: the KNN method is not efficient in SQL.
# lazyResult2 = testDataLazy %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "KNN", useKWindow=FALSE)
# lazyResult2 %>% show_query()
# lazyResult2
Discretisation by rank with a fixed number of bins is equivalent to an `NTILE` window function in SQL.
# devtools::load_all("..")

# Discretise `value` into 3 rank-based bins per `feature` on the database
# table, then bring the result into R.
discretised <- group_by(testDataLazy, feature) %>%
  tidyinfostats::discretise_ByRank(value, value_discrete, bins = 3) %>%
  collect()

# Count the rows that landed in each bin of each feature.
discretised %>%
  group_by(feature, value_discrete) %>%
  summarise(count = n())

# Non dbplyr for comparison:
# NOTE(review): the second line groups by `test`, while the live code above
# groups by `feature` -- looks stale; confirm the column name before using.
# discretised = testData %>% group_by(feature) %>% tidyinfostats::discretise_ByRank(value, value_discrete, bins=3)
# discretised %>% group_by(test,value_discrete) %>% summarise(count = n())
# Reload the development version of the package before this comparison.
devtools::load_all("..")

# Local data: "Discretise" method, binning by rank into 100 bins.
group_by(testData, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByRank",
    bins = 100
  )

# Database table: identical call, starting from the database-backed table.
lazyResult3 <- group_by(testDataLazy, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByRank",
    bins = 100
  )
# lazyResult3 %>% show_query()
lazyResult3
# devtools::load_all("..")

# Earlier form of the local call, kept for reference:
# testData %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByValue", discreteMethod = "MontgomerySmith")

# Local data: "Discretise" method, binning by value, with the "Grassberger"
# mutual information method.
group_by(testData, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByValue",
    mutualInfoMethod = "Grassberger"
  )

# Earlier form of the database call, kept for reference:
# lazyResult4 = testDataLazy %>% group_by(feature) %>% tidyinfostats::calculateDiscreteContinuousMI(vars(outcome), value, method = "DiscretiseByValue", discreteMethod = "MontgomerySmith")

# Database table: identical call, starting from the database-backed table.
lazyResult4 <- group_by(testDataLazy, feature) %>%
  tidyinfostats::calculateDiscreteContinuousMI(
    vars(outcome),
    value,
    method = "Discretise",
    discretiseMethod = "ByValue",
    mutualInfoMethod = "Grassberger"
  )
lazyResult4
# devtools::load_all("..")

# Local data: call probabilitiesFromContinuous() with method "SGolay" per
# feature/outcome group, then plot `p_x` against `value`, coloured by outcome
# and with one panel per feature.
testData %>%
  group_by(feature, outcome) %>%
  probabilitiesFromContinuous(value, method = "SGolay") %>%
  ggplot(aes(x = value, y = p_x, colour = outcome)) +
  geom_point() +
  facet_wrap(vars(feature))

# debug(probabilitiesFromContinuous_SGolay)
# debug(applySGolayFilter)

# Database table: the same plot, starting from the database-backed table.
testDataLazy %>%
  group_by(feature, outcome) %>%
  probabilitiesFromContinuous(value, method = "SGolay") %>%
  ggplot(aes(x = value, y = p_x, colour = outcome)) +
  geom_point() +
  facet_wrap(vars(feature))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.