Win-Vector LLC 12/11/2017
dplyr SQL for the rquery example. Notice the irrelevant columns live a few steps into the query sequence. Also notice the dplyr SQL does have less nesting than the rquery SQL.
suppressPackageStartupMessages(library("dplyr"))
packageVersion("dplyr")
## [1] '0.7.4'
my_db <- sparklyr::spark_connect(version='2.2.0',
master = "local")
d <- dplyr::copy_to(my_db,
data.frame(
subjectID = c(1,
1,
2,
2),
surveyCategory = c(
'withdrawal behavior',
'positive re-framing',
'withdrawal behavior',
'positive re-framing'
),
assessmentTotal = c(5,
2,
3,
4),
irrelevantCol1 = "irrel1",
irrelevantCol2 = "irrel2",
stringsAsFactors = FALSE),
name = 'd',
temporary = TRUE,
overwrite = FALSE)
scale <- 0.237
dq <- d %>%
group_by(subjectID) %>%
mutate(probability =
exp(assessmentTotal * scale)/
sum(exp(assessmentTotal * scale))) %>%
arrange(probability, surveyCategory) %>%
filter(row_number() == n()) %>%
ungroup() %>%
rename(diagnosis = surveyCategory) %>%
select(subjectID, diagnosis, probability) %>%
arrange(subjectID)
# directly prints, can not easilly and reliable capture SQL
show_query(dq)
## <SQL>
## SELECT `subjectID` AS `subjectID`, `diagnosis` AS `diagnosis`, `probability` AS `probability`
## FROM (SELECT `subjectID` AS `subjectID`, `surveyCategory` AS `diagnosis`, `assessmentTotal` AS `assessmentTotal`, `irrelevantCol1` AS `irrelevantCol1`, `irrelevantCol2` AS `irrelevantCol2`, `probability` AS `probability`
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, `probability`
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, `probability`, row_number() OVER (PARTITION BY `subjectID` ORDER BY `probability`, `surveyCategory`) AS `zzz2`, COUNT(*) OVER (PARTITION BY `subjectID`) AS `zzz3`
## FROM (SELECT *
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, EXP(`assessmentTotal` * 0.237) / sum(EXP(`assessmentTotal` * 0.237)) OVER (PARTITION BY `subjectID`) AS `probability`
## FROM `d`) `uxpqmutlwi`
## ORDER BY `probability`, `surveyCategory`) `lcjklwhqji`) `tsvwikxgrf`
## WHERE (`zzz2` = `zzz3`)) `eoatesqffh`) `daniczjcfg`
## ORDER BY `subjectID`
# directly prints, can not easilly and reliable capture SQL
explain(dq)
## <SQL>
## SELECT `subjectID` AS `subjectID`, `diagnosis` AS `diagnosis`, `probability` AS `probability`
## FROM (SELECT `subjectID` AS `subjectID`, `surveyCategory` AS `diagnosis`, `assessmentTotal` AS `assessmentTotal`, `irrelevantCol1` AS `irrelevantCol1`, `irrelevantCol2` AS `irrelevantCol2`, `probability` AS `probability`
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, `probability`
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, `probability`, row_number() OVER (PARTITION BY `subjectID` ORDER BY `probability`, `surveyCategory`) AS `zzz4`, COUNT(*) OVER (PARTITION BY `subjectID`) AS `zzz5`
## FROM (SELECT *
## FROM (SELECT `subjectID`, `surveyCategory`, `assessmentTotal`, `irrelevantCol1`, `irrelevantCol2`, EXP(`assessmentTotal` * 0.237) / sum(EXP(`assessmentTotal` * 0.237)) OVER (PARTITION BY `subjectID`) AS `probability`
## FROM `d`) `kclmmhlpub`
## ORDER BY `probability`, `surveyCategory`) `wfudcfxndv`) `gmzibckwdr`
## WHERE (`zzz4` = `zzz5`)) `blwkzzupqr`) `qthsnydkse`
## ORDER BY `subjectID`
##
## <PLAN>
dq
## # Source: lazy query [?? x 3]
## # Database: spark_connection
## # Ordered by: probability, surveyCategory, subjectID
## subjectID diagnosis probability
## <dbl> <chr> <dbl>
## 1 1 withdrawal behavior 0.6706221
## 2 2 positive re-framing 0.5589742
sparklyr::spark_disconnect(my_db)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.