In WinVector/rquery: Relational Query Generator for Data Manipulation at Scale

This is running dplyr and dtplyr on the "rquery Modes Example".

First we define the functions and data we used in the rquery modes example example.

library(rquery)
library(data.table)
library(microbenchmark)

library(dplyr)
packageVersion('dplyr')
library(dtplyr)
packageVersion('dtplyr')

set.seed(2019)

n_rows <- 1000000

d_large <- data.frame(
  x = rnorm(n = n_rows),
  y = rnorm(n = n_rows),
  g = sample(paste0('v_', seq_len(n_rows/10)), 
             size = n_rows, 
             replace = TRUE),
  stringsAsFactors = FALSE
)

ops <- local_td(d_large) %.>%  # Describe table for later operations
  extend(.,       # add a new column
         ratio := y / x) %.>%
  extend(.,       # rank the rows by group and order
         simple_rank := row_number(),
         partitionby = 'g',
         orderby = 'ratio',
         reverse = 'ratio') %.>%
  extend(.,       # mark the rows we want
         choice := simple_rank == 1)

d_large %.>%
  ops %.>%
  order_rows(., 'g') %.>%
  select_rows(., choice) %.>% 
  head(.) %.>%
  knitr::kable(.)

f_compiled <- function(dat) {
  dat %.>% ops  # use pre-compiled pipeline
}

f_immediate <- function(dat) {
  dat %.>%
    extend(.,       # add a new column
           ratio := y / x) %.>%
    extend(.,       # rank the rows by group and order
           simple_rank := row_number(),
           partitionby = 'g',
           orderby = 'ratio',
           reverse = 'ratio') %.>%
    extend(.,       # mark the rows we want
           choice := simple_rank == 1)
}

f_wrapped <- function(dat) {
  dat %.>%
    wrap %.>%       # wrap data in a description
    extend(.,       # add a new column
           ratio := y / x) %.>%
    extend(.,       # rank the rows by group and order
           simple_rank := row_number(),
           partitionby = 'g',
           orderby = 'ratio',
           reverse = 'ratio') %.>%
    extend(.,       # mark the rows we want
           choice := simple_rank == 1) %.>%
    ex              # signal construction done, and execute
}

The dplyr version of the pipeline is similar, except the window functions are not a single step- but a 4 stage block.

f_dplyr <- function(dat) {
  dat %>%
    mutate(        # add a new column
           ratio := y / x) %>%
    group_by(      # rank the rows by group and order
             g) %>%
    arrange( -ratio) %>%
    mutate(     
           simple_rank := row_number()) %>%
    ungroup() %>%  # end of rank block
    mutate(        # mark the rows we want
           choice := simple_rank == 1)
}

We are using the most current CRAN versions of each (dtplyr is currently being re-engineered to try to also cut down the number conversions).

Above we see a key difference between rquery and dplyr: rquery grouped and window functions are single operators in rquery, but are driven by annotations between steps in dplyr.

dtplyr seems to error-out on this problem, meaning the automatic translations from dplyr to data.table are not sufficient to our task.

f_dplyr(data.table(d_large))

We can try to re-write the dtplyr pipeline as follows. It appears switching from := to = and replacing row_number() with .I helps.

f_dtplyr <- function(dat) {
  data.table(dat) %>%
    mutate(        # add a new column
           ratio = y / x) %>%
    group_by(      # rank the rows by group and order
             g) %>%
    arrange( -ratio) %>%
    mutate(     
           simple_rank = 1:.N) %>%
    ungroup() %>%  # end of rank block
    mutate(        # mark the rows we want
           choice = simple_rank == 1)
}

res_dtplyr <- f_dtplyr(d_large)

res_dtplyr %.>%
  order_rows(., 'g') %.>%
  select_rows(., choice) %.>% 
  head(.) %.>%
  knitr::kable(.)

And we can also time data.table itself (without the translation overhead, though we are adding in the time to convert the data.frame).

f_data_table = function(dat) {
  dat <- data.table(dat)
  dat[ , ratio := y / x
       ][order(-ratio) , simple_rank := 1:.N, by = list(g)
          ][ , choice := simple_rank == 1]
}

res_dt <- f_data_table(d_large)

res_dt %.>%
  order_rows(., 'g') %.>%
  select_rows(., choice) %.>% 
  head(.) %.>%
  knitr::kable(.)

timings <- microbenchmark(
  rquery_compiled = f_compiled(d_large),
  rquery_immediate = f_immediate(d_large),
  rquery_wrapped = f_wrapped(d_large),
  dplyr = f_dplyr(d_large),
  dtplyr = f_dtplyr(d_large),
  data.table = f_data_table(d_large),
  times = 10L
)

print(timings)

For these short pipelines the extra copying in rquery immediate mode and dtplyr are not causing big problems compared to the overall translation overhead.

WinVector/rquery documentation built on Aug. 24, 2023, 11:12 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com