In YTLogos/rquery: Relational Query Algebra for Data Manipulation

dplyr SQL for the rquery example. Notice the irrelevant columns live a few steps into the query sequence. Also notice the dplyr SQL does have less nesting than the rquery SQL.

suppressPackageStartupMessages(library("dplyr"))
packageVersion("dplyr")

my_db <- sparklyr::spark_connect(version='2.2.0', 
                                 master = "local")

d <- dplyr::copy_to(my_db,
                    data.frame(
                      subjectID = c(1,
                                    1,
                                    2,
                                    2),
                      surveyCategory = c(
                        'withdrawal behavior',
                        'positive re-framing',
                        'withdrawal behavior',
                        'positive re-framing'
                      ),
                      assessmentTotal = c(5,
                                          2,
                                          3,
                                          4),
                      irrelevantCol1 = "irrel1",
                      irrelevantCol2 = "irrel2",
                      stringsAsFactors = FALSE),
                    name =  'd',
                    temporary = TRUE,
                    overwrite = FALSE)

scale <- 0.237

dq <- d %>%
  group_by(subjectID) %>%
  mutate(probability =
           exp(assessmentTotal * scale)/
           sum(exp(assessmentTotal * scale))) %>%
  arrange(probability, surveyCategory) %>%
  filter(row_number() == n()) %>%
  ungroup() %>%
  rename(diagnosis = surveyCategory) %>%
  select(subjectID, diagnosis, probability) %>%
  arrange(subjectID)

# directly prints, can not easilly and reliable capture SQL
show_query(dq)

# directly prints, can not easilly and reliable capture SQL
explain(dq)

dq

sparklyr::spark_disconnect(my_db)