This is running dplyr
and
dtplyr
on the “rquery
Modes
Example”.
First we define the functions and data we used in the rquery
modes
example
example.
library(rquery)
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.2
library(microbenchmark)
## Warning: package 'microbenchmark' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
packageVersion('dplyr')
## [1] '0.8.3'
library(dtplyr)
## Warning: package 'dtplyr' was built under R version 3.5.2
packageVersion('dtplyr')
## [1] '0.0.3'
set.seed(2019)
n_rows <- 1000000
d_large <- data.frame(
x = rnorm(n = n_rows),
y = rnorm(n = n_rows),
g = sample(paste0('v_', seq_len(n_rows/10)),
size = n_rows,
replace = TRUE),
stringsAsFactors = FALSE
)
ops <- local_td(d_large) %.>% # Describe table for later operations
extend(., # add a new column
ratio := y / x) %.>%
extend(., # rank the rows by group and order
simple_rank := row_number(),
partitionby = 'g',
orderby = 'ratio',
reverse = 'ratio') %.>%
extend(., # mark the rows we want
choice := simple_rank == 1)
d_large %.>%
ops %.>%
order_rows(., 'g') %.>%
select_rows(., choice) %.>%
head(.) %.>%
knitr::kable(.)
| x | y | g | ratio | simple_rank | choice | | ----------: | ----------: | :-------- | --------: | -----------: | :----- | | 0.2773596 | 3.1546090 | v_1 | 11.373713 | 1 | TRUE | | 0.3373741 | 0.3947169 | v_10 | 1.169968 | 1 | TRUE | | -0.4033178 | -1.2618623 | v_100 | 3.128705 | 1 | TRUE | | 0.1218793 | 2.6972358 | v_1000 | 22.130387 | 1 | TRUE | | 0.0029194 | 0.1780082 | v_10000 | 60.974286 | 1 | TRUE | | -0.2867625 | -1.6812063 | v_100000 | 5.862713 | 1 | TRUE |
f_compiled <- function(dat) {
dat %.>% ops # use pre-compiled pipeline
}
f_immediate <- function(dat) {
dat %.>%
extend(., # add a new column
ratio := y / x) %.>%
extend(., # rank the rows by group and order
simple_rank := row_number(),
partitionby = 'g',
orderby = 'ratio',
reverse = 'ratio') %.>%
extend(., # mark the rows we want
choice := simple_rank == 1)
}
f_wrapped <- function(dat) {
dat %.>%
wrap %.>% # wrap data in a description
extend(., # add a new column
ratio := y / x) %.>%
extend(., # rank the rows by group and order
simple_rank := row_number(),
partitionby = 'g',
orderby = 'ratio',
reverse = 'ratio') %.>%
extend(., # mark the rows we want
choice := simple_rank == 1) %.>%
ex # signal construction done, and execute
}
The dplyr
version of the pipeline is similar, except the window
functions are not a single step- but a 4 stage block.
f_dplyr <- function(dat) {
dat %>%
mutate( # add a new column
ratio := y / x) %>%
group_by( # rank the rows by group and order
g) %>%
arrange( -ratio) %>%
mutate(
simple_rank := row_number()) %>%
ungroup() %>% # end of rank block
mutate( # mark the rows we want
choice := simple_rank == 1)
}
We are using the most current CRAN
versions of each (dtplyr
is
currently being re-engineered to try to also cut down the number
conversions).
Above we see a key difference between rquery
and dplyr
: rquery
grouped and window functions are single operators in rquery
, but are
driven by annotations between steps in dplyr
.
dtplyr
seems to error-out on this problem, meaning the automatic
translations from dplyr
to data.table
are not sufficient to our
task.
f_dplyr(data.table(d_large))
## Error in `:=`(ratio, y/x): Check that is.data.table(DT) == TRUE. Otherwise, := and `:=`(...) are defined for use in j, once only and in particular ways. See help(":=").
We can try to re-write the dtplyr
pipeline as follows. It appears
switching from :=
to =
and replacing row_number()
with .I
helps.
f_dtplyr <- function(dat) {
data.table(dat) %>%
mutate( # add a new column
ratio = y / x) %>%
group_by( # rank the rows by group and order
g) %>%
arrange( -ratio) %>%
mutate(
simple_rank = 1:.N) %>%
ungroup() %>% # end of rank block
mutate( # mark the rows we want
choice = simple_rank == 1)
}
res_dtplyr <- f_dtplyr(d_large)
res_dtplyr %.>%
order_rows(., 'g') %.>%
select_rows(., choice) %.>%
head(.) %.>%
knitr::kable(.)
| x | y | g | ratio | simple_rank | choice | | ----------: | ----------: | :-------- | --------: | -----------: | :----- | | 0.2773596 | 3.1546090 | v_1 | 11.373713 | 1 | TRUE | | 0.3373741 | 0.3947169 | v_10 | 1.169968 | 1 | TRUE | | -0.4033178 | -1.2618623 | v_100 | 3.128705 | 1 | TRUE | | 0.1218793 | 2.6972358 | v_1000 | 22.130387 | 1 | TRUE | | 0.0029194 | 0.1780082 | v_10000 | 60.974286 | 1 | TRUE | | -0.2867625 | -1.6812063 | v_100000 | 5.862713 | 1 | TRUE |
And we can also time data.table
itself (without the translation
overhead, though we are adding in the time to convert the data.frame
).
f_data_table = function(dat) {
dat <- data.table(dat)
dat[ , ratio := y / x
][order(-ratio) , simple_rank := 1:.N, by = list(g)
][ , choice := simple_rank == 1]
}
res_dt <- f_data_table(d_large)
res_dt %.>%
order_rows(., 'g') %.>%
select_rows(., choice) %.>%
head(.) %.>%
knitr::kable(.)
| x | y | g | ratio | simple_rank | choice | | ----------: | ----------: | :-------- | --------: | -----------: | :----- | | 0.2773596 | 3.1546090 | v_1 | 11.373713 | 1 | TRUE | | 0.3373741 | 0.3947169 | v_10 | 1.169968 | 1 | TRUE | | -0.4033178 | -1.2618623 | v_100 | 3.128705 | 1 | TRUE | | 0.1218793 | 2.6972358 | v_1000 | 22.130387 | 1 | TRUE | | 0.0029194 | 0.1780082 | v_10000 | 60.974286 | 1 | TRUE | | -0.2867625 | -1.6812063 | v_100000 | 5.862713 | 1 | TRUE |
timings <- microbenchmark(
rquery_compiled = f_compiled(d_large),
rquery_immediate = f_immediate(d_large),
rquery_wrapped = f_wrapped(d_large),
dplyr = f_dplyr(d_large),
dtplyr = f_dtplyr(d_large),
data.table = f_data_table(d_large),
times = 10L
)
print(timings)
## Unit: milliseconds
## expr min lq mean median uq
## rquery_compiled 610.2227 632.1930 693.5449 660.4172 776.1184
## rquery_immediate 766.6798 846.5643 906.3245 913.7986 962.4911
## rquery_wrapped 655.6050 695.1646 764.8073 726.0706 857.4486
## dplyr 7604.4869 7685.4948 8110.5947 7935.5223 8597.2715
## dtplyr 777.2004 856.2540 1003.5907 942.0050 1133.9786
## data.table 481.5167 495.3322 607.2953 581.8229 692.1075
## max neval
## 831.3586 10
## 1032.4353 10
## 903.7452 10
## 8715.6032 10
## 1456.3865 10
## 858.8021 10
For these short pipelines the extra copying in rquery
immediate mode
and dtplyr
are not causing big problems compared to the overall
translation overhead.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.