(In-memory variation of the Collecting Expressions example.)
For in-memory operations, sequential operations appear not to be a problem, as they do not contribute to query complexity the way they did in our earlier database examples. This emphasizes that in-memory intuition must be confirmed when working with remote systems.
First set up our packages, database connection, and remote table.
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("rquery")
library("microbenchmark")
library("ggplot2")
library("WVPlots")
library("rqdatatable")
library("cdata")
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Problem size for the torture tests: a 1,000,000-row, single-column table
# to which we will add up to 100 derived columns.
# NOTE(review): these assignments shadow base::nrow() and base::ncol().
# Later code only uses them as plain integers, so behavior is unaffected,
# but names such as n_row/n_col would be safer.
nrow <- 1000000
d <- data.frame(x = seq_len(nrow))
# rquery/rqdatatable table description of d.
# NOTE(review): td appears unused below — each function builds its own
# local_td(); confirm before removing.
td <- local_td(d)
# dplyr handle to the same data. NOTE(review): as.tbl() is deprecated in
# modern dplyr; tibble::as_tibble() is the current equivalent.
tbl <- as.tbl(d)
ncol <- 100
The rqdatatable/rquery torture function: add 100 columns to a 1,000,000-row table. This is implemented using data.table in a batch mode.
# Batch rqdatatable torture function: adds ncol derived columns
# (x_1 = x + 1, ..., x_ncol = x + ncol) in a single extend_se() step,
# then filters to the single row where x == 3 before materializing.
# Returns the executed result (a data.table via rqdatatable).
rquery_fn_batch <- function(d, ncol) {
  idx <- seq_len(ncol)
  # named character vector: names are new columns, values are expressions
  expressions <- setNames(paste0("x + ", idx), paste0("x_", idx))
  ops <- local_td(d) %.>%
    extend_se(., expressions) %.>%
    select_rows_nse(., x == 3)
  d %.>% ops
}
rquery_fn_batch(d, 5)[]
## x x_1 x_2 x_3 x_4 x_5
## 1: 3 4 5 6 7 8
The row-selection step is there to cut down on the in-memory cost of bringing the result back to R. Obviously we could optimize the example away by pivoting the filter to earlier in the example pipeline. We ask the reader to take this example as a stand-in for a more complicated (and nastier) real-world example where such optimizations are not available.
To break out how much time we are losing in planning/parsing (something we intend to improve) we can measure a pre-compiled version of the rquery
batch query.
# Plan-only variant of the batch torture function: builds and returns the
# rquery operator tree (extend + filter) without executing it, so the
# planning/parsing cost can be paid once and the plan reused.
rquery_fn_batch_compiled <- function(d, ncol) {
  idx <- seq_len(ncol)
  # named expressions: x_i = x + i for i in 1..ncol
  expressions <- setNames(paste0("x + ", idx), paste0("x_", idx))
  local_td(d) %.>%
    extend_se(., expressions) %.>%
    select_rows_nse(., x == 3)
}
ops <- rquery_fn_batch_compiled(d, 5)
(d %.>% ops)[]
## x x_1 x_2 x_3 x_4 x_5
## 1: 3 4 5 6 7 8
We can also try a sequential version.
# Sequential rqdatatable torture function: grows the operator tree one
# extend_se() node per new column (instead of one batch node), then
# filters and executes. Used to measure the cost of many small steps.
rquery_fn_seq <- function(d, ncol) {
  pipeline <- local_td(d)
  for(step in seq_len(ncol)) {
    pipeline <- extend_se(pipeline,
                          paste0("x_", step) %:=% paste0("x + ", step))
  }
  pipeline <- select_rows_nse(pipeline, x == 3)
  d %.>% pipeline
}
rquery_fn_seq(d, 5)[]
## x x_1 x_2 x_3 x_4 x_5
## 1: 3 4 5 6 7 8
And a pre-compiled rquery
sequential pipeline.
# Plan-only variant of the sequential torture function: returns the
# one-extend-per-column operator tree without executing it, so plan
# construction cost can be separated from execution cost.
rquery_fn_seq_comp <- function(d, ncol) {
  pipeline <- local_td(d)
  for(step in seq_len(ncol)) {
    pipeline <- extend_se(pipeline,
                          paste0("x_", step) %:=% paste0("x + ", step))
  }
  select_rows_nse(pipeline, x == 3)
}
ops <- rquery_fn_seq_comp(d, 5)
(d %.>% ops)[]
## x x_1 x_2 x_3 x_4 x_5
## 1: 3 4 5 6 7 8
Same torture for dplyr
.
# dplyr torture function: adds ncol derived columns one mutate() at a
# time using rlang quasiquotation to build the column names, then
# filters to x == 3 and forces computation with compute().
dplyr_fn <- function(tbl, ncol) {
  result <- tbl
  x_sym <- rlang::sym("x")
  for(step in seq_len(ncol)) {
    # new column x_<step> = x + step
    new_col <- rlang::sym(paste0("x_", step))
    result <- result %>%
      mutate(., !!new_col := !!x_sym + step)
  }
  result <- result %>%
    filter(., x == 3)
  result %>% compute(.)
}
dplyr_fn(tbl, 5)
## # A tibble: 1 x 6
## x x_1 x_2 x_3 x_4 x_5
## <int> <int> <int> <int> <int> <int>
## 1 3 4 5 6 7 8
We can also collect expressions efficiently using seplyr
(seplyr
is a thin wrapper over dplyr
, so seplyr
's method mutate_se()
is essentially instructions how to do the same thing in batch using dplyr
/rlang
).
# seplyr torture function: the same batch of x_i = x + i expressions,
# applied in one mutate_se() call (seplyr translates this into batched
# dplyr/rlang operations), then filtered and computed.
seplyr_fn <- function(tbl, ncol) {
  idx <- seq_len(ncol)
  expressions <- setNames(paste0("x + ", idx), paste0("x_", idx))
  result <- tbl %>%
    seplyr::mutate_se(., expressions) %>%
    filter(., x == 3)
  result %>% compute(.)
}
seplyr_fn(tbl, 5)
## # A tibble: 1 x 6
## x x_1 x_2 x_3 x_4 x_5
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 4 5 6 7 8
And we can also run with data.table
either sequentially (as below) or in batch (which was the rqdatatable
result).
# data.table sequential torture function: adds ncol derived columns one
# assignment at a time, then filters to the row where x == 3.
# d may be a data.frame or tibble; it is copied, never modified in place.
data_table_sequential_fn <- function(d, ncol) {
  # make sure we have a clean copy (:= would otherwise mutate a shared table)
  dt <- data.table::copy(as.data.table(d))
  for(i in seq_len(ncol)) {
    # data.table::set() assigns the column x_<i> = x + i directly.
    # This replaces the original eval(parse(text = ...)) construction,
    # which built and parsed code from strings on every iteration —
    # slower and fragile — with an equivalent direct computation.
    data.table::set(dt, j = paste0("x_", i), value = dt$x + i)
  }
  dt[x == 3, ]
}
data_table_sequential_fn(tbl, 5)
## x x_1 x_2 x_3 x_4 x_5
## 1: 3 4 5 6 7 8
Time the functions.
# Pre-build the compiled (plan-only) operator trees once, outside the
# benchmark loop, so the *_compiled timings measure execution only.
opsbc <- rquery_fn_batch_compiled(d, ncol)
opssc <- rquery_fn_seq_comp(d, ncol)
# Benchmark each implementation 100 times on the full ncol-column task.
timings <- microbenchmark(
  rqdatatable_batch = rquery_fn_batch(d, ncol),
  rqdatatable_batch_compiled = { d %.>% opsbc },
  rqdatatable_sequential = rquery_fn_seq(d, ncol),
  rqdatatable_sequential_compiled = { d %.>% opssc },
  dplyr = dplyr_fn(tbl, ncol),
  seplyr = seplyr_fn(tbl, ncol),
  data_table_sequential = data_table_sequential_fn(d, ncol),
  times = 100L)
# Persist raw timings so the results can be re-analyzed without re-running.
saveRDS(timings, "CollectExprs_memory_timings.RDS")
print(timings)
## Unit: milliseconds
## expr min lq mean median
## rqdatatable_batch 1248.6449 1412.5535 1561.0661 1498.3010
## rqdatatable_batch_compiled 1195.2919 1364.2783 1508.9881 1459.5200
## rqdatatable_sequential 2674.3209 2822.3152 2999.8298 2961.0056
## rqdatatable_sequential_compiled 1476.3741 1627.9700 1760.0313 1691.8208
## dplyr 723.5657 766.8512 922.2574 843.1273
## seplyr 583.6540 803.2014 863.6107 846.1414
## data_table_sequential 706.9933 925.6701 971.3457 969.2892
## uq max neval
## 1591.6803 3435.489 100
## 1607.2087 2293.913 100
## 3065.6322 4714.208 100
## 1820.1758 2779.317 100
## 1063.2391 1822.095 100
## 913.3188 1405.777 100
## 1028.2709 1393.019 100
#autoplot(timings)
Present the results.
# Convert the microbenchmark result to a plain data.frame for plotting.
timings <- as.data.frame(timings)
# microbenchmark reports nanoseconds; convert to seconds.
timings$seconds <- timings$time/10^9
# Order methods by mean run time so the plot reads fastest-to-slowest.
timings$method <- factor(timings$expr)
timings$method <- reorder(timings$method, timings$seconds)
WVPlots::ScatterBoxPlotH(timings, "seconds", "method", "task time by method")
# Compute mean seconds per method, pivot to one wide row (one column per
# method), and add the dplyr / rqdatatable_batch ratio.
tratio <- timings %.>%
  # mean run time per method (rquery aggregation)
  project_nse(.,
              groupby = "method",
              mean_seconds = mean(seconds)) %.>%
  # cdata pivot: methods become columns, giving a single summary row
  pivot_to_rowrecs(.,
                   columnToTakeKeysFrom = "method",
                   columnToTakeValuesFrom = "mean_seconds",
                   rowKeyColumns = NULL) %.>%
  # ratio < 1 means dplyr's mean time is smaller (dplyr is faster here)
  extend_nse(.,
             ratio = dplyr/rqdatatable_batch)
tratio[]
## data_table_sequential dplyr rqdatatable_batch
## 1: 0.9713457 0.9222574 1.561066
## rqdatatable_batch_compiled rqdatatable_sequential
## 1: 1.508988 2.99983
## rqdatatable_sequential_compiled seplyr ratio
## 1: 1.760031 0.8636107 0.5907869
# Inverse ratio (rqdatatable_batch / dplyr), formatted to 2 significant
# digits for the "about N times slower" sentence below.
ratio_str <- sprintf("%.2g", 1/tratio$ratio)
rqdatatable
in batch mode is about 1.7 times slower than dplyr
(and the other sequential implementations and even batch implementations) for this task at this scale for this data implementation and configuration. Likely this is due to copying and re-parsing overhead from rqdatatable
itself (unlikely to be a data.table
issue).
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.