Let's time rquery, dplyr, and data.table on a non-trivial example.

These timings were taken on a late 2014 Mac Mini with 8 GB of RAM running macOS, with everything current as of the run date.

First let's load our packages, establish a database connection, and declare an rquery database handle (db_hdl) to serve as our execution service.

library("data.table")
library("rquery")
library("rqdatatable")
library("dplyr")
library("microbenchmark")
library("ggplot2")

db <- DBI::dbConnect(RPostgres::Postgres(),
                     host = 'localhost',
                     port = 5432,
                     user = 'johnmount',
                     password = '')
# db <- DBI::dbConnect(MonetDBLite::MonetDBLite())
dbopts <- rq_connection_tests(db)
db_hdl <- rquery_db_info(connection = db,
                         is_dbi = TRUE,
                         connection_options = dbopts)
print(db_hdl)


packageVersion("rquery")
packageVersion("dplyr")
packageVersion("dbplyr")
packageVersion("DBI")
packageVersion("data.table")
packageVersion("RPostgres")


R.Version()

We now build an extended version of the example from "Let's Have Some Sympathy For The Part-time R User".

nrep <- 10000

dLocal <- data.frame(
  subjectID = c(1,                   
                1,
                2,                   
                2),
  surveyCategory = c(
    'withdrawal behavior',
    'positive re-framing',
    'withdrawal behavior',
    'positive re-framing'
  ),
  assessmentTotal = c(5,                 
                      2,
                      3,                  
                      4),
  stringsAsFactors = FALSE)
norig <- nrow(dLocal)
# replicate the four example rows nrep times
dLocal <- dLocal[rep(seq_len(norig), nrep), , drop = FALSE]
# give each replicate block its own distinct subject IDs
dLocal$subjectID <- paste((seq_len(nrow(dLocal)) - 1) %/% norig,
                          dLocal$subjectID, 
                          sep = "_")
rownames(dLocal) <- NULL
head(dLocal)

dR <- rquery::rq_copy_to(db, 'dR',
                          dLocal,
                          temporary = TRUE, 
                          overwrite = TRUE)
cdata::qlook(db, dR$table_name)

dTbl <- dplyr::tbl(db, dR$table_name)
dplyr::glimpse(dTbl)

Now we declare our operation pipelines, both for local data (an in-memory data.frame) and for remote data (already in the database).

scale <- 0.237
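
The probability column computed below is a per-subject softmax of assessmentTotal: exp(assessmentTotal * scale), normalized to sum to one within each subjectID. As a quick sanity check (not part of the timings), the first example subject's "withdrawal behavior" row (assessmentTotal 5, versus 2 for "positive re-framing") should score about 0.67:

exp(5 * scale) / (exp(5 * scale) + exp(2 * scale))
# approximately 0.67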

# rquery_pipeline is an rquery operator tree;
# the operations are not evaluated until the
# pipeline is applied to data
rquery_pipeline <- dR %.>%
  extend_nse(.,
             probability %:=%
               exp(assessmentTotal * scale))  %.>% 
  normalize_cols(.,
                 "probability",
                 partitionby = 'subjectID') %.>%
  pick_top_k(.,
             partitionby = 'subjectID',
             orderby = c('probability', 'surveyCategory'),
             reverse = c('probability')) %.>% 
  rename_columns(., 'diagnosis' %:=% 'surveyCategory') %.>%
  select_columns(., c('subjectID', 
                      'diagnosis', 
                      'probability')) %.>%
  orderby(., cols = 'subjectID')
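
As a side check (not part of the timings), we can look at the SQL rquery plans to send to the database; a minimal sketch using rquery's to_sql():

cat(to_sql(rquery_pipeline, db_hdl))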

rqdatatable <- function() {
  dLocal %.>% rquery_pipeline
}

rquery_database_roundtrip <- function() {
  # copy the local data into the database,
  # re-creating the table the pipeline refers to
  dRT <- rquery::rq_copy_to(db, 'dR',
                            dLocal,
                            temporary = TRUE, 
                            overwrite = TRUE)
  rquery::execute(db_hdl, rquery_pipeline)
}


rquery_database_pull <- function() {
  rquery::execute(db_hdl, rquery_pipeline)
}

rquery_database_land <- function() {
  tabName <- "rquery_tmpx"
  rquery::materialize(db_hdl, rquery_pipeline, table_name = tabName,
                      overwrite = TRUE, temporary = TRUE)
  NULL
}


# this is a function, 
# so body not evaluated until used
dplyr_pipeline <- . %>%
  group_by(subjectID) %>%
  mutate(probability =
           exp(assessmentTotal * scale)/
           sum(exp(assessmentTotal * scale), na.rm = TRUE)) %>%
  arrange(probability, surveyCategory) %>%
  filter(row_number() == n()) %>%
  ungroup() %>%
  rename(diagnosis = surveyCategory) %>%
  select(subjectID, diagnosis, probability) %>%
  arrange(subjectID)

# this is a function, 
# so body not evaluated until used.
# pipeline refactored to move the filter
# outside the grouped mutate, to work around:
# https://github.com/tidyverse/dplyr/issues/3294
dplyr_pipeline2 <- . %>%
  group_by(subjectID) %>%
  mutate(probability =
           exp(assessmentTotal * scale)/
           sum(exp(assessmentTotal * scale), na.rm = TRUE)) %>%
  arrange(probability, surveyCategory) %>%
  mutate(count = n(), rank = row_number()) %>%
  ungroup() %>%
  filter(count == rank) %>%
  rename(diagnosis = surveyCategory) %>%
  select(subjectID, diagnosis, probability) %>%
  arrange(subjectID)
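
For the remote case we can similarly ask dplyr/dbplyr to show the SQL it generates for the lazy tbl (again just a check, not part of the timings):

dTbl %>%
  dplyr_pipeline %>%
  show_query()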


dplyr_local <- function() {
  dLocal %>% 
    dplyr_pipeline
}

dplyr_local_no_grouped_filter <- function() {
  dLocal %>% 
    dplyr_pipeline2
}

dplyr_tbl <- function() {
  dLocal %>%
    as_tibble() %>%
    dplyr_pipeline
}

dplyr_round_trip <- function() {
  dTmp <- dplyr::copy_to(db, dLocal, "dplyr_tmp",
                         overwrite = TRUE,
                         temporary = TRUE
  )
  res <- dTmp %>% 
    dplyr_pipeline %>%
    collect()
  dplyr::db_drop_table(db, "dplyr_tmp")
  res
}

dplyr_database_pull <- function() {
  dTbl %>% 
    dplyr_pipeline %>%
    collect()
}

dplyr_database_land <- function() {
  tabName <- "dplyr_ctmpx"
  dTbl %>% 
    dplyr_pipeline %>%
    compute(name = tabName)
  dplyr::db_drop_table(db, table = tabName)
  NULL
}

# signal that this code intentionally uses data.table semantics
.datatable.aware <- TRUE

# improved code from:
# http://www.win-vector.com/blog/2018/01/base-r-can-be-fast/#comment-66746
data.table_local <- function() {
  dDT <- data.table::data.table(dLocal)
  # per-subject softmax scoring
  dDT <- dDT[, list(diagnosis = surveyCategory,
                    probability = exp(assessmentTotal * scale) /
                      sum(exp(assessmentTotal * scale))),
             by = subjectID]
  # order so each subject's highest-probability row is last in its group
  setorder(dDT, subjectID, probability, -diagnosis)
  # keep the last row per subject
  dDT <- dDT[, .SD[.N], by = subjectID]
  setorder(dDT, subjectID)
}

Let's inspect the results of each function.

head(rqdatatable())

head(rquery_database_roundtrip())

rquery_database_land()

head(rquery_database_pull())

head(dplyr_local())

head(dplyr_tbl())

head(dplyr_local_no_grouped_filter())

dplyr_database_land()

head(dplyr_database_pull())

head(dplyr_round_trip())

head(data.table_local())

Now let's measure the speeds with microbenchmark.

tm <- microbenchmark(
  "rqdatatable" = nrow(rqdatatable()),
  "rquery database roundtrip" = nrow(rquery_database_roundtrip()),
  "rquery from db to memory" = nrow(rquery_database_pull()),
  "rquery database land" = rquery_database_land(),
  "dplyr in memory" = nrow(dplyr_local()),
  "dplyr tbl in memory" = nrow(dplyr_tbl()),
  "dplyr in memory no grouped filter" = nrow(dplyr_local_no_grouped_filter()),
  "dplyr from memory to db and back" = nrow(dplyr_round_trip()),
  "dplyr from db to memory" = nrow(dplyr_database_pull()),
  "dplyr database land" = dplyr_database_land(),
  "data.table in memory" = nrow(data.table_local())
)
saveRDS(tm, "qtimings.RDS")
print(tm)
autoplot(tm)

rquery appears to be fast. The extra time for the "rquery database roundtrip" case is the cost of copying the data to the database and back. I currently guess rquery and dplyr are both picking up parallelism in the database.

sessionInfo()
DBI::dbDisconnect(db_hdl$connection)

