Win-Vector LLC 1/15/2018
Let's time rquery, dplyr, and data.table on a non-trivial example.
These timings are on a late 2014 Mac Mini with 8GB of RAM running OSX 10.12.6, R version 3.4.3 (2017-11-30) -- "Kite-Eating Tree", and the current (2018-01-07) CRAN versions of all packages (except rquery, which is not yet up on CRAN). We are getting database services from PostgreSQL version 9.6.1 running in a Docker container.
First let's load our packages, establish a database connection, and declare an rquery ad hoc execution service (the "winvector_temp_db_handle").
# Load data.table before rquery/wrapr so that wrapr's `:=` masks
# data.table's version (the masking messages below confirm the order).
library("data.table") # load first so we can overwrite := with rquery
library("rquery")
## Loading required package: wrapr
##
## Attaching package: 'wrapr'
## The following object is masked from 'package:data.table':
##
## :=
## Loading required package: cdata
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("microbenchmark")
library("ggplot2")
# Helper scripts (not shown here): they supply mkData(), equiv_res(),
# the base_r_* implementations, data.table_local(), and the
# rquery/dplyr pipeline builders used throughout this document.
source("cscan.R")
source("fns.R")
# Optional database connection: leave db NULL to run only the in-memory
# timings; uncomment the dbConnect() call to also benchmark the
# database-backed pipelines.
db <- NULL
# db <- DBI::dbConnect(RPostgres::Postgres(),
# host = 'localhost',
# port = 5432,
# user = 'postgres',
# password = 'pg')
if(!is.null(db)) {
winvector_temp_db_handle <- list(db = db)
print(db)
# print() is required here: inside an if-block the query result does
# not auto-print, so the original version string was silently dropped.
print(DBI::dbGetQuery(db, "SELECT version()", stringsAsFactors = FALSE))
}
# Record the package and R versions used for these timings.
packageVersion("rquery")
## [1] '0.2.0'
packageVersion("dplyr")
## [1] '0.7.4'
packageVersion("dbplyr")
## [1] '1.2.0'
packageVersion("DBI")
## [1] '0.7'
packageVersion("data.table")
## [1] '1.10.4.3'
packageVersion("RPostgres")
## [1] '1.0.4'
R.Version()
## $platform
## [1] "x86_64-apple-darwin15.6.0"
##
## $arch
## [1] "x86_64"
##
## $os
## [1] "darwin15.6.0"
##
## $system
## [1] "x86_64, darwin15.6.0"
##
## $status
## [1] ""
##
## $major
## [1] "3"
##
## $minor
## [1] "4.3"
##
## $year
## [1] "2017"
##
## $month
## [1] "11"
##
## $day
## [1] "30"
##
## $`svn rev`
## [1] "73796"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R version 3.4.3 (2017-11-30)"
##
## $nickname
## [1] "Kite-Eating Tree"
We now build an extended version of the example from Let’s Have Some Sympathy For The Part-time R User.
# Size parameter: mkData(nrep) yields 2*nrep rows (two survey
# categories per subject).
nrep <- 10
dLocal <- mkData(nrep)
head(dLocal)
## subjectID surveyCategory assessmentTotal
## 1 s1 withdrawal behavior 3
## 2 s1 positive re-framing 8
## 3 s10 withdrawal behavior 9
## 4 s10 positive re-framing 1
## 5 s2 withdrawal behavior 3
## 6 s2 positive re-framing 5
# Database copies of the data (only when a connection is configured):
# dR is the rquery table handle, dTbl the dplyr/dbplyr handle.
dR <- NULL
dTbl <- NULL
if(!is.null(db)) {
dR <- rquery::dbi_copy_to(db, 'dR',
dLocal,
temporary = TRUE,
overwrite = TRUE)
dTbl <- dplyr::tbl(db, dR$table_name)
cdata::qlook(db, dR$table_name)
dplyr::glimpse(dTbl)
}
Now we declare our operation pipelines, both on local (in-memory data.frame) and remote (already in a database) data.
# Odds-scaling constant shared by every implementation of the calculation.
# NOTE: this binding shadows base::scale() for the rest of the script.
scale <- 0.237

# Thin zero-argument wrappers around the base-R implementations sourced
# from fns.R; each closes over the global dLocal so microbenchmark can
# call them uniformly.
base_R_row_calculation <- function() base_r_calculate_rows(dLocal)

base_R_sequential_calculation <- function() base_r_calculate_sequenced(dLocal)

base_R_cframe_calculation <- function() base_r_calculate_cframe(dLocal)

base_R_tabular_calculation <- function() base_r_calculate_tabular(dLocal)
# Run the rquery pipeline against the in-memory data.frame via the
# ad hoc execution service; as.data.frame() forces materialization.
rquery_local <- function() {
  pipeline_result <- dLocal %.>% rquery_pipeline(.)
  as.data.frame(pipeline_result, stringsAsFactors = FALSE) # force execution
}
# Translate the rquery pipeline to SQL, execute it on the database, and
# pull the whole result set back into R.
rquery_database_pull <- function() {
  query <- to_sql(rquery_pipeline(dR), db)
  pulled <- DBI::dbGetQuery(db, query, stringsAsFactors = FALSE)
  as.data.frame(pulled, stringsAsFactors = FALSE) # shouldn't be needed
}
# Execute the pipeline entirely on the database, landing the result in a
# scratch table that is immediately dropped; nothing is transferred back.
rquery_database_land <- function() {
  landing_table <- "rquery_tmpx"
  query <- to_sql(rquery_pipeline(dR), db)
  DBI::dbExecute(db, paste("CREATE TABLE", landing_table, "AS", query))
  DBI::dbExecute(db, paste("DROP TABLE", landing_table))
  NULL
}
# Run the pipeline on the database but return only a row count, so the
# result transfer cost is negligible.
rquery_database_count <- function() {
  count_sql <- dR %.>%
    rquery_pipeline(.) %.>%
    sql_node(., "n" := "COUNT(1)") %.>%
    to_sql(., db)
  counted <- DBI::dbGetQuery(db, count_sql, stringsAsFactors = FALSE)
  as.data.frame(counted, stringsAsFactors = FALSE) # shouldn't be needed
}
# dplyr variants of the calculation over the in-memory dLocal.
dplyr_local <- function() {
  dplyr_pipeline(dLocal)
}

# Same computation via the pipeline variation that avoids a grouped filter.
dplyr_local_no_grouped_filter <- function() {
  dplyr_pipeline2(dLocal)
}

# Convert to a tibble first, then run the standard dplyr pipeline.
dplyr_tbl <- function() {
  as_tibble(dLocal) %>% dplyr_pipeline
}
# Copy the local data to the database, run the dplyr pipeline there, and
# collect the result back — the full "round trip" cost.
dplyr_round_trip <- function() {
  dTmp <- dplyr::copy_to(db, dLocal, "dplyr_tmp",
                         # overwrite = TRUE,
                         temporary = TRUE
                         )
  # Guarantee the scratch table is removed even if the pipeline errors,
  # so repeated benchmark runs cannot collide on the fixed table name.
  on.exit(dplyr::db_drop_table(db, "dplyr_tmp"), add = TRUE)
  dTmp %>%
    dplyr_pipeline %>%
    collect()
}
# Run the dplyr pipeline on the remote table and bring the rows local.
dplyr_database_pull <- function() {
  collect(dplyr_pipeline(dTbl))
}
# Run the dplyr pipeline on the database and land the result in a
# scratch table (via compute()) that is dropped immediately; nothing is
# transferred back to R.
dplyr_database_land <- function() {
  # Use <- for assignment (was bound with `=`, which is non-idiomatic
  # at statement level).
  tabName <- "dplyr_ctmpx"
  dTbl %>%
    dplyr_pipeline %>%
    compute(name = tabName)
  dplyr::db_drop_table(db, table = tabName)
  NULL
}
# Run the pipeline on the database but return only a row count, so the
# transfer cost is negligible.
dplyr_database_count <- function() {
  counted <- dTbl %>% dplyr_pipeline %>% tally()
  collect(counted)
}
Let's inspect the functions.
# Sanity checks: every implementation must produce results equivalent to
# the base-R sequential calculation (equiv_res() comes from fns.R).
check <- base_R_sequential_calculation()
head(check)
## subjectID diagnosis probability
## 1 s1 positive re-framing 0.7658456
## 2 s10 withdrawal behavior 0.8694381
## 3 s2 positive re-framing 0.6163301
## 4 s3 positive re-framing 0.6706221
## 5 s4 positive re-framing 0.5000000
## 6 s5 positive re-framing 0.6163301
if(!equiv_res(check, base_R_cframe_calculation())) {
stop("mismatch")
}
if(!equiv_res(check, base_R_row_calculation())) {
stop("mismatch")
}
if(!equiv_res(check, base_R_tabular_calculation())) {
stop("mismatch")
}
if(!equiv_res(check, dplyr_local())) {
stop("mismatch")
}
if(!equiv_res(check, dplyr_tbl())) {
stop("mismatch")
}
if(!equiv_res(check, dplyr_local_no_grouped_filter())) {
stop("mismatch")
}
# data.table_local() is defined in fns.R.
if(!equiv_res(check, data.table_local())) {
stop("mismatch")
}
# From: http://www.win-vector.com/blog/2018/01/base-r-can-be-fast/#comment-66751
# data.table implementation: per subject, select the category with the
# largest softmax probability, using in-place (by-reference) column ops.
# NOTE: the parameter name shadows the global dLocal used elsewhere.
data.table_local4 <- function(dLocal) {
# data.table() makes a copy, so the caller's dLocal is not modified
dDT <- data.table::data.table(dLocal)
setnames(dDT, "surveyCategory", "diagnosis")
# `scale` here is the global odds-scaling constant (0.237)
dDT[,expaTs:=exp(assessmentTotal*scale)]
dDT[,sum_expaTs:=sum(expaTs),subjectID] # precalculate -> this uses gsum internally
dDT[,probability := expaTs / sum_expaTs]
# drop the working columns by reference
dDT[,c("assessmentTotal","expaTs","sum_expaTs"):=NULL]
# order so the max-probability row (ties broken by diagnosis) comes
# first within each subject, then keep that first row per group
setorder(dDT, subjectID, -probability, diagnosis)
dDT[,.SD[1],subjectID]
}
# Also check the comment-contributed data.table variant above.
if(!equiv_res(check, data.table_local4(dLocal))) {
stop("mismatch")
}
# Smoke-test the database-backed variants (only when db is configured).
if(!is.null(db)) {
head(rquery_local())
rquery_database_land()
head(rquery_database_pull())
rquery_database_count()
dplyr_database_land()
head(dplyr_database_pull())
dplyr_database_count()
head(dplyr_round_trip())
}
Now let's measure the speeds with microbenchmark.
# Accumulator for raw timing rows across all data sizes.
timings <- NULL
# improved code from:
# http://www.win-vector.com/blog/2018/01/base-r-can-be-fast/#comment-66746
# Alternative data.table variant: computes the per-subject softmax inside
# a single grouped j-expression, orders ascending by probability (ties
# broken by descending diagnosis), and keeps the last — i.e. the
# max-probability — row of each group. Reads the global dLocal and scale.
data.table_local3 <- function() {
dDT <- data.table::data.table(dLocal)
dDT <- dDT[,list(diagnosis = surveyCategory,
probability = exp (assessmentTotal * scale ) /
sum ( exp ( assessmentTotal * scale ) ))
,subjectID ]
setorder(dDT, subjectID, probability, -diagnosis)
dDT <- dDT[,.SD[.N],subjectID]
setorder(dDT, subjectID)
}
# Benchmark expressions keyed by display name; only the in-memory
# variants are active here. The database-backed entries (commented out)
# are appended programmatically below when a connection is available.
expressions <- list(
# "rquery in memory" = bquote({ nrow(rquery_local())}),
# "rquery from db to memory" = bquote({nrow(rquery_database_pull())}),
# "rquery database count" = bquote({rquery_database_count()}),
# "rquery database land" = bquote({rquery_database_land()}),
# "dplyr in memory" = bquote({nrow(dplyr_local())}),
# "dplyr tbl in memory" = bquote({nrow(dplyr_tbl())}),
"dplyr in memory no grouped filter" = bquote({nrow(dplyr_local_no_grouped_filter())}),
# "dplyr from memory to db and back" = bquote({nrow(dplyr_round_trip())}),
# "dplyr from db to memory" = bquote({nrow(dplyr_database_pull())}),
# "dplyr database count" = bquote({dplyr_database_count()}),
# "dplyr database land" = bquote({dplyr_database_land()}),
"data.table in memory" = bquote({nrow(data.table_local4(dLocal))}),
# "base R row calculation" = bquote({nrow(base_R_row_calculation())}),
"base R tabular calculation" = bquote({nrow(base_R_tabular_calculation())}),
# "base R sequential calculation" = bquote({nrow(base_R_sequential_calculation())})
"base R cframe calculation" = bquote({nrow(base_R_cframe_calculation())})
)
# When a database is configured, extend the benchmark set with the
# database-backed variants.
if(!is.null(db)) {
expressions <-
c(expressions,
list(
"rquery from memory to db and back" = bquote({ nrow(rquery_local())}),
# "rquery from db to memory" = bquote({nrow(rquery_database_pull())}),
"rquery database count" = bquote({rquery_database_count()}),
"rquery database land" = bquote({rquery_database_land()}),
# "dplyr in memory" = bquote({nrow(dplyr_local())}),
# "dplyr tbl in memory" = bquote({nrow(dplyr_tbl())}),
"dplyr from memory to db and back" = bquote({nrow(dplyr_round_trip())}),
# "dplyr from db to memory" = bquote({nrow(dplyr_database_pull())}),
"dplyr database count" = bquote({dplyr_database_count()}),
"dplyr database land" = bquote({dplyr_database_land()})
))
}
# prune: when TRUE, drop any expression whose single run exceeded 10
# seconds (10*1e+9 ns) from the subsequent, larger-size rounds.
prune <- FALSE
# Main benchmark loop: rebind the globals (dLocal, dR, dTbl) that the
# wrapper functions close over, then time each expression 5 times.
for(nrep in c(1,
10,
100,
1000,
10000,
100000,
1000000)) {
print(paste("nrep:", nrep))
dLocal <- mkData(nrep)
print(paste("rows:", nrow(dLocal)))
# Re-copy the data to the database at each size (when configured).
dR <- NULL
dTbl <- NULL
if(!is.null(db)) {
dR <- rquery::dbi_copy_to(db, 'dR',
dLocal,
temporary = TRUE,
overwrite = TRUE)
dTbl <- dplyr::tbl(db, dR$table_name)
}
tm <- microbenchmark(
list = expressions,
times = 5L
)
print(tm)
plt <- autoplot(tm) + ggtitle(paste0("rows: ", nrow(dLocal)))
print(plt)
# Accumulate raw per-run timings (nanoseconds) tagged with data size.
tmi <- as.data.frame(tm, stringsAsFactors = FALSE)
tmi$data_size <- nrow(dLocal)
timings <- rbind(timings, tmi)
if(prune) {
baddies <- unique(tmi$expr[tmi$time > 10*1e+9])
for(bi in baddies) {
expressions[[bi]] <- NULL
}
if(length(expressions)<=0) {
break
}
}
}
## [1] "nrep: 1"
## [1] "rows: 2"
## Unit: microseconds
## expr min lq mean median
## dplyr in memory no grouped filter 17093.711 18502.633 20873.779 18885.434
## data.table in memory 2398.241 2461.082 2838.094 2461.398
## base R tabular calculation 2174.808 2181.188 2578.358 2303.722
## base R cframe calculation 677.162 712.871 1095.345 1058.710
## uq max neval
## 22898.726 26988.391 5
## 3063.820 3805.928 5
## 2330.217 3901.855 5
## 1281.909 1746.075 5
## [1] "nrep: 10"
## [1] "rows: 20"
## Unit: microseconds
## expr min lq mean median
## dplyr in memory no grouped filter 18103.938 23419.238 23386.661 24279.069
## data.table in memory 2642.359 3085.040 3201.963 3266.684
## base R tabular calculation 2692.558 3144.201 3407.787 3374.973
## base R cframe calculation 726.014 1072.098 1075.301 1111.188
## uq max neval
## 25208.984 25922.075 5
## 3316.276 3699.456 5
## 3477.826 4349.377 5
## 1157.387 1309.818 5
## [1] "nrep: 100"
## [1] "rows: 200"
## Unit: microseconds
## expr min lq mean median
## dplyr in memory no grouped filter 22284.477 22516.579 25150.174 24497.526
## data.table in memory 3221.646 3254.257 3428.856 3430.565
## base R tabular calculation 3888.089 4323.419 5325.524 5992.453
## base R cframe calculation 936.452 994.121 1142.918 1181.963
## uq max neval
## 28092.619 28359.668 5
## 3556.454 3681.357 5
## 6095.100 6328.557 5
## 1263.678 1338.374 5
## [1] "nrep: 1000"
## [1] "rows: 2000"
## Unit: milliseconds
## expr min lq mean median
## dplyr in memory no grouped filter 51.674285 77.150215 75.517975 81.633480
## data.table in memory 3.339440 3.474448 4.040997 4.318096
## base R tabular calculation 18.842085 19.221082 23.318909 21.780901
## base R cframe calculation 1.193218 1.368413 1.443241 1.380113
## uq max neval
## 82.51997 84.611928 5
## 4.35286 4.720141 5
## 27.78488 28.965598 5
## 1.40330 1.871162 5
## [1] "nrep: 10000"
## [1] "rows: 20000"
## Unit: milliseconds
## expr min lq mean
## dplyr in memory no grouped filter 401.330091 434.587584 480.510831
## data.table in memory 9.889524 12.812629 15.485638
## base R tabular calculation 223.716139 257.465046 292.931891
## base R cframe calculation 6.575244 6.707435 9.369459
## median uq max neval
## 479.98477 483.86224 602.78948 5
## 13.94164 20.20621 20.57819 5
## 300.67289 338.20146 344.60392 5
## 9.78236 11.01194 12.77032 5
## [1] "nrep: 1e+05"
## [1] "rows: 200000"
## Unit: milliseconds
## expr min lq mean
## dplyr in memory no grouped filter 4931.53225 4963.63557 5651.3629
## data.table in memory 70.09192 73.99669 104.3876
## base R tabular calculation 3087.54650 3175.19333 3249.3754
## base R cframe calculation 95.02364 101.13144 179.5916
## median uq max neval
## 5178.66185 5911.1958 7271.7890 5
## 86.96318 101.8792 189.0069 5
## 3228.73675 3230.2300 3525.1706 5
## 190.81691 250.2387 260.7472 5
## [1] "nrep: 1e+06"
## [1] "rows: 2000000"
## Unit: milliseconds
## expr min lq mean
## dplyr in memory no grouped filter 58101.8360 58388.8882 61621.171
## data.table in memory 873.8924 957.7749 1041.705
## base R tabular calculation 34946.6670 36124.8858 36788.736
## base R cframe calculation 1399.6995 1657.8473 1733.163
## median uq max neval
## 58509.173 66193.804 66912.155 5
## 1054.780 1128.150 1193.927 5
## 36151.583 36491.518 40229.027 5
## 1744.881 1750.348 2113.041 5
# Persist the accumulated raw timings for downstream analysis/plots.
saveRDS(timings, "qtimings4.RDS")
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 ggplot2_2.2.1 microbenchmark_1.4-3
## [4] dplyr_0.7.4 rquery_0.2.0 cdata_0.5.1
## [7] wrapr_1.1.1 data.table_1.10.4-3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14.2 knitr_1.18 bindr_0.1 magrittr_1.5
## [5] munsell_0.4.3 colorspace_1.3-2 R6_2.2.2 rlang_0.1.6
## [9] plyr_1.8.4 stringr_1.2.0 tools_3.4.3 grid_3.4.3
## [13] gtable_0.2.0 htmltools_0.3.6 lazyeval_0.2.1 yaml_2.1.16
## [17] rprojroot_1.3-2 digest_0.6.13 assertthat_0.2.0 tibble_1.4.1
## [21] glue_1.2.0 evaluate_0.10.1 rmarkdown_1.8 stringi_1.1.6
## [25] compiler_3.4.3 pillar_1.0.1 scales_0.5.0 backports_1.1.2
## [29] pkgconfig_2.0.1
# Release the rquery ad hoc execution handle and close the connection.
winvector_temp_db_handle <- NULL
if(!is.null(db)) {
DBI::dbDisconnect(db)
db <- NULL
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.