Let's time rquery
, dplyr
, and data.table
on a non-trivial example.
These timings are on an late 2014 Mac Mini with 8GB of RAM running OSX everything current as of run-date.
First let's load our
packages, establish a database connection, and declare an rquery
ad hoc execution service (the "winvector_temp_db_handle
").
library("data.table") library("rquery") library("rqdatatable") library("dplyr") library("microbenchmark") library("ggplot2") db <- DBI::dbConnect(RPostgres::Postgres(), host = 'localhost', port = 5432, user = 'johnmount', password = '') # db <- DBI::dbConnect(MonetDBLite::MonetDBLite()) dbopts <- rq_connection_tests(db) db_hdl <- rquery_db_info(connection = db, is_dbi = TRUE, connection_options = dbopts) print(db_hdl) packageVersion("rquery") packageVersion("dplyr") packageVersion("dbplyr") packageVersion("DBI") packageVersion("data.table") packageVersion("RPostgres") R.Version()
We now build and extended version of the example from Let’s Have Some Sympathy For The Part-time R User.
nrep <- 10000 dLocal <- data.frame( subjectID = c(1, 1, 2, 2), surveyCategory = c( 'withdrawal behavior', 'positive re-framing', 'withdrawal behavior', 'positive re-framing' ), assessmentTotal = c(5, 2, 3, 4), stringsAsFactors = FALSE) norig <- nrow(dLocal) dLocal <- dLocal[rep(seq_len(norig), nrep), , drop=FALSE] dLocal$subjectID <- paste((seq_len(nrow(dLocal)) -1)%/% norig, dLocal$subjectID, sep = "_") rownames(dLocal) <- NULL head(dLocal) dR <- rquery::rq_copy_to(db, 'dR', dLocal, temporary = TRUE, overwrite = TRUE) cdata::qlook(db, dR$table_name) dTbl <- dplyr::tbl(db, dR$table_name) dplyr::glimpse(dTbl)
Now we declare our operation pipelines, both on local (in-memory data.frame
) and
remote (already in a database) data.
scale <- 0.237 # this is a function, # so body not evaluated until used rquery_pipeline <- dR %.>% extend_nse(., probability %:=% exp(assessmentTotal * scale)) %.>% normalize_cols(., "probability", partitionby = 'subjectID') %.>% pick_top_k(., partitionby = 'subjectID', orderby = c('probability', 'surveyCategory'), reverse = c('probability')) %.>% rename_columns(., 'diagnosis' %:=% 'surveyCategory') %.>% select_columns(., c('subjectID', 'diagnosis', 'probability')) %.>% orderby(., cols = 'subjectID') rqdatatable <- function() { dLocal %.>% rquery_pipeline } rquery_database_roundtrip <- function() { dRT <- rquery::rq_copy_to(db, 'dR', dLocal, temporary = TRUE, overwrite = TRUE) rquery::execute(db_hdl, rquery_pipeline) } rquery_database_pull <- function() { rquery::execute(db_hdl, rquery_pipeline) } rquery_database_land <- function() { tabName <- "rquery_tmpx" rquery::materialize(db_hdl, rquery_pipeline, table_name = tabName, overwrite = TRUE, temporary = TRUE) NULL } # this is a function, # so body not evaluated until used dplyr_pipeline <- . %>% group_by(subjectID) %>% mutate(probability = exp(assessmentTotal * scale)/ sum(exp(assessmentTotal * scale), na.rm = TRUE)) %>% arrange(probability, surveyCategory) %>% filter(row_number() == n()) %>% ungroup() %>% rename(diagnosis = surveyCategory) %>% select(subjectID, diagnosis, probability) %>% arrange(subjectID) # this is a function, # so body not evaluated until used # pipeline re-factored to have filter outside # mutate # work around: https://github.com/tidyverse/dplyr/issues/3294 dplyr_pipeline2 <- . %>% group_by(subjectID) %>% mutate(probability = exp(assessmentTotal * scale)/ sum(exp(assessmentTotal * scale), na.rm = TRUE)) %>% arrange(probability, surveyCategory) %>% mutate(count = n(), rank = row_number()) %>% ungroup() %>% filter(count == rank) %>% rename(diagnosis = surveyCategory) %>% select(subjectID, diagnosis, probability) %>% arrange(subjectID) dplyr_local <- function() { dLocal %>% dplyr_pipeline } dplyr_local_no_grouped_filter <- function() { dLocal %>% dplyr_pipeline2 } dplyr_tbl <- function() { dLocal %>% as_tibble %>% dplyr_pipeline } dplyr_round_trip <- function() { dTmp <- dplyr::copy_to(db, dLocal, "dplyr_tmp", overwrite = TRUE, temporary = TRUE ) res <- dTmp %>% dplyr_pipeline %>% collect() dplyr::db_drop_table(db, "dplyr_tmp") res } dplyr_database_pull <- function() { dTbl %>% dplyr_pipeline %>% collect() } dplyr_database_land <- function() { tabName = "dplyr_ctmpx" dTbl %>% dplyr_pipeline %>% compute(name = tabName) dplyr::db_drop_table(db, table = tabName) NULL } .datatable.aware <- TRUE # improved code from: # http://www.win-vector.com/blog/2018/01/base-r-can-be-fast/#comment-66746 data.table_local <- function() { dDT <- data.table::data.table(dLocal) dDT <- dDT[,list(diagnosis = surveyCategory, probability = exp (assessmentTotal * scale ) / sum ( exp ( assessmentTotal * scale ) )) ,subjectID ] setorder(dDT, subjectID, probability, -diagnosis) dDT <- dDT[,.SD[.N],subjectID] setorder(dDT, subjectID) }
Let's inspect the functions.
head(rqdatatable()) head(rquery_database_roundtrip()) rquery_database_land() head(rquery_database_pull()) head(dplyr_local()) head(dplyr_tbl()) head(dplyr_local_no_grouped_filter()) dplyr_database_land() head(dplyr_database_pull()) head(dplyr_round_trip()) head(data.table_local())
Now let's measure the speeds with microbenchmark
.
tm <- microbenchmark( "rqdatatable" = nrow(rqdatatable()), "rquery database roundtrip" = nrow(rquery_database_roundtrip()), "rquery from db to memory" = nrow(rquery_database_pull()), "rquery database land" = rquery_database_land(), "dplyr in memory" = nrow(dplyr_local()), "dplyr tbl in memory" = nrow(dplyr_tbl()), "dplyr in memory no grouped filter" = nrow(dplyr_local_no_grouped_filter()), "dplyr from memory to db and back" = nrow(dplyr_round_trip()), "dplyr from db to memory" = nrow(dplyr_database_pull()), "dplyr database land" = dplyr_database_land(), "data.table in memory" = nrow(data.table_local()) ) saveRDS(tm, "qtimings.RDS") print(tm) autoplot(tm)
rquery
appears to be fast. The extra time for "rquery
local" is because rquery
doesn't really have a local mode, it has to copy the data to the database and back
in that case. I currently guess rquery
and dplyr
are both picking up parallelism
in the database.
sessionInfo()
DBI::dbDisconnect(db_hdl$connection)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.