Make example data.

# Fix the RNG seed so the generated example data is reproducible.
set.seed(2020)
# Number of rows in the example data set.
n <- 1e6

# Build a synthetic data frame for the grouped-calculation examples.
#
# Args:
#   n: number of rows to generate.
#   n_levels: size of the pool of group ids to sample from. Defaults to n,
#     which matches the original single-argument behaviour.
#
# Returns:
#   A data.frame with n rows and two columns:
#     x - standard-normal draws (rnorm),
#     g - group labels of the form "level_" followed by a zero-padded,
#         9-digit id sampled with replacement from 1..n_levels.
mk_data <- function(n, n_levels = n) {
  # x is drawn before the ids so the random-number stream is consumed in the
  # same order as before.
  d <- data.frame(x = rnorm(n))
  ids <- sample.int(n_levels, size = n, replace = TRUE)
  # "%09d", not "%09g": "%g" keeps only 6 significant digits, so ids >= 1e6
  # were rendered in scientific notation (1000000 -> "00001e+06") and
  # distinct ids could collapse onto the same label.
  d$g <- sprintf("level_%09d", ids)
  d
}

# Generate the example data and save a gzip-compressed CSV copy of it.
d <- mk_data(n = n)
write.csv(
  x = d,
  file = gzfile("d.csv.gz"),
  row.names = FALSE,
  quote = FALSE
)

Example processing, rqdatatable.

# Attach rqdatatable, then report the installed rquery and rqdatatable versions.
library(rqdatatable)
packageVersion("rquery")
packageVersion("rqdatatable")

# Operator pipeline built from local_td(d): partitioned by g and ordered by x,
# add a row number (rn) and a running sum of x (cs), then order the rows by
# g and x.
ops_rqdatatable <- local_td(d, name = "d") %.>%
  extend(
    .,
    rn := row_number(),
    cs := cumsum(x),
    partitionby = "g",
    orderby = "x"
  ) %.>%
  order_rows(., c("g", "x"))

# Apply the operator pipeline to d.
res_rqdatatable <- d %.>% ops_rqdatatable

knitr::kable(head(res_rqdatatable, n = 6L))

# Printed checks: whether the row count still equals n, and the largest
# per-group row number.
print(nrow(res_rqdatatable) == n)
print(max(res_rqdatatable[["rn"]]))
# Save the result as a gzip-compressed CSV.
write.csv(
  x = res_rqdatatable,
  file = gzfile("res.csv.gz"),
  row.names = FALSE,
  quote = FALSE
)

(Note: we could use := for assignment if we imported rquery or wrapr, but we refrain from doing so to prevent collisions with data.table's or dplyr's use of the symbol.)

Example processing, data.table.

# Attach data.table and report its installed version.
library(data.table)
packageVersion("data.table")

# data.table version of the grouped calculation.
#
# Args:
#   d: data frame with columns g and x.
#
# Returns:
#   A data.table built from d, sorted by g then x, with two added columns:
#   rn (row number within each g) and cs (running sum of x within each g).
f_data.table <- function(d) {
  # data.table(d) builds a new table, so the in-place steps below act on dt.
  dt <- data.table(d)
  setorderv(dt, c("g", "x"))
  dt[, c("rn", "cs") := list(seq_len(.N), cumsum(x)), by = "g"]
  dt
}

# Run the data.table version and show the first rows.
res_data.table <- f_data.table(d)
knitr::kable(head(res_data.table, n = 6L))

# Stop unless the data.table result agrees with the rqdatatable result.
stopifnot(all.equal(res_rqdatatable, data.frame(res_data.table)))

Example processing, dplyr.

# Attach dplyr and report its installed version.
library(dplyr)
packageVersion("dplyr")

# Reusable dplyr pipeline: sort by g and x, then within each g add a row
# number (rn) and a running sum of x (cs), and drop the grouping again.
ops_dplyr <- . %>%
  arrange(g, x) %>%
  group_by(g) %>%
  mutate(rn = row_number(), cs = cumsum(x)) %>%
  ungroup()

# Run the dplyr pipeline on d and show the first rows.
res_dplyr <- ops_dplyr(d)

knitr::kable(head(res_dplyr, n = 6L))
# Stop unless the dplyr result agrees with the rqdatatable result.
stopifnot(all.equal(res_rqdatatable, data.frame(res_dplyr)))

Example processing, dtplyr.

# Attach dtplyr and report its installed version.
library(dtplyr)
packageVersion("dtplyr")

# dtplyr version of the grouped calculation.
#
# Args:
#   d: data frame with columns g and x.
#
# Returns:
#   The as_tibble() result of the pipeline: rows sorted by g then x, with
#   rn (row number within each g) and cs (running sum of x within each g).
f_dtplyr <- function(d) {
  lazy_dt(d) %>%
    arrange(g, x) %>%
    group_by(g) %>%
    mutate(rn = row_number(), cs = cumsum(x)) %>%
    ungroup() %>%
    as_tibble()
}

# Run the dtplyr version and stop unless it agrees with the rqdatatable result.
res_dtplyr <- f_dtplyr(d)
stopifnot(all.equal(res_rqdatatable, data.frame(res_dtplyr)))
library(microbenchmark)

# Time the four implementations on the same input d; each labelled expression
# is evaluated 5 times (times = 5L).
microbenchmark(
  data.table = f_data.table(d),
  dplyr = d %>% ops_dplyr,
  dtplyr = f_dtplyr(d),
  rqdatatable = d %.>% ops_rqdatatable,
  times = 5L)


WinVector/rqdatatable documentation built on Aug. 22, 2023, 3:25 p.m.