This note is a comment on some of the timings shared in the dplyr-0.8.0 pre-release announcement.

The original published timings were as follows:

With performance metrics: measurements are marketing. So let's dig into the above a bit.

These timings are of the small-task, large-number-of-repetitions breed that Matt Dowle writes against, so at first they wouldn't seem that decisive. Except, look at the following:

Let's try to reproduce these timings on a 2018 Dell XPS 13 (Intel Core i5, 16GB RAM) running Ubuntu 18.04, and also compare to some other packages: data.table and rqdatatable.

In this reproduction attempt we see:

All code for this benchmark is available here and here.

library("dplyr")
library("rqdatatable")
library("data.table")
library("microbenchmark")
library("WVPlots")
library("ggplot2")
# Build the benchmark data: 10,000 distinct group labels, each repeated
# 10 times, for 100,000 rows in total.
# "%06d" is the integer conversion. The earlier "%06g" is a floating-point
# conversion that falls back to scientific notation once a value needs more
# than 6 significant digits, so it only produced correct zero-padded labels
# by virtue of the count staying below 1e6.
levels <- sprintf("l_%06d", seq_len(10000))
d <- data.frame(
  g = rep(levels, 10),
  stringsAsFactors = FALSE
)
# One uniform random value per row; no seed is set, so the values differ
# from run to run.
d$x <- runif(nrow(d))

# The same data converted with as_tibble() and as.data.table(), so each
# benchmarked function can be handed the container it is timed on.
db <- as_tibble(d)
dt <- as.data.table(d)

# Report the R and package versions in use for this run.
R.version.string
packageVersion("dplyr")
packageVersion("tibble")
packageVersion("rqdatatable")
packageVersion("data.table")
#' Per-group mean of `x` with dplyr, using `mean()` directly.
#'
#' @param d A data frame or tibble with a grouping column `g` and a value
#'   column `x`.
#' @return The result of `summarize()` on `d` grouped by `g`, with the group
#'   mean stored in column `x`.
f_dplyr_mean <- function(d) {
  by_group <- group_by(d, g)
  summarize(by_group, x = mean(x))
}

#' Per-group mean of `x` with dplyr, spelled as `sum(x) / n()`.
#'
#' Computes the same quantity as a direct `mean()` summary, but through the
#' sum-divided-by-count formulation.
#'
#' @param d A data frame or tibble with a grouping column `g` and a value
#'   column `x`.
#' @return The result of `summarize()` on `d` grouped by `g`, with the group
#'   sum divided by the group size stored in column `x`.
f_dplyr_sum_n <- function(d) {
  by_group <- group_by(d, g)
  summarize(by_group, x = sum(x) / n())
}

# Grouped summary through rqdatatable: `d` is piped with `%.>%` into
# project_nse(), which is asked to group by column "g" and to produce
# x = mean(x).
# NOTE(review): the `.` argument is presumably the dot-pipe placeholder that
# stands for `d` -- confirm against the wrapr/rquery documentation.
f_rqdatatable <- function(d) {
  d %.>%
    project_nse(., 
                groupby = "g", 
                x = mean(x))
}

#' Per-group mean of `x` with data.table.
#'
#' @param dt A data.table with a grouping column `g` and a value column `x`.
#' @return The result of the grouped `[` call: the mean of `x` (named `x`)
#'   for each value of `g`.
f_data.table <- function(dt) {
  dt[, list(x = mean(x)), by = "g"]
}

#' Per-group mean of `x` with base R only.
#'
#' @param d A data frame with a grouping column `g` and a numeric column `x`.
#' @return A data frame with the group labels in `g` (taken from the names of
#'   the `tapply()` result) and the group means in `x`, as a plain vector.
f_base_tapply <- function(d) {
  group_means <- tapply(d$x, d$g, mean)
  data.frame(
    g = names(group_means),
    # as.vector() drops the array attributes tapply() attaches to its result.
    x = as.vector(group_means),
    stringsAsFactors = FALSE
  )
}
# Time each implementation, 5 evaluations per entry. The dplyr functions are
# timed on both the plain data frame `d` and the tibble `db`; data.table is
# timed on `dt`.
timings <- microbenchmark(
  dplyr_mean         = f_dplyr_mean(d),
  dplyr_sum_n        = f_dplyr_sum_n(d),
  dplyr_mean_tibble  = f_dplyr_mean(db),
  dplyr_sum_n_tibble = f_dplyr_sum_n(db),
  base_tapply        = f_base_tapply(d),
  rqdatatable        = f_rqdatatable(d),
  data.table         = f_data.table(dt),
  times              = 5L
)
print(timings)

# Flatten the benchmark result into a data frame, then add `seconds`
# (`time` divided by 1e9) and `method` (a copy of `expr`).
res <- as.data.frame(timings)
res[["seconds"]] <- res[["time"]] / 1e9
res[["method"]] <- res[["expr"]]

# Mean run time in seconds per method, rendered as a table.
res %.>%
  project_nse(., groupby = "method", mean_seconds = mean(seconds)) %.>%
  knitr::kable(.)

# Distribution of run times (seconds) for each method.
WVPlots::ScatterBoxPlotH(
  frm = res,
  xvar = "seconds",
  yvar = "method",
  title = "task run time by method"
)

# The same plot with scale_y_log10() added.
WVPlots::ScatterBoxPlotH(
  frm = res,
  xvar = "seconds",
  yvar = "method",
  title = "task run time by method"
) +
  scale_y_log10()

Now let's try again at a larger data size.

# Rebuild the benchmark data at a larger size: 1,000,000 distinct group
# labels, each repeated 10 times, for 10,000,000 rows in total.
# "%06d" is the integer conversion. The earlier "%06g" is a floating-point
# conversion limited to 6 significant digits, so it rendered the final label
# (1000000) in scientific notation as "l_01e+06", and would have mapped
# distinct larger integers onto the same label.
levels <- sprintf("l_%06d", seq_len(1000000))
d <- data.frame(
  g = rep(levels, 10),
  stringsAsFactors = FALSE
)
# One uniform random value per row; no seed is set, so the values differ
# from run to run.
d$x <- runif(nrow(d))

# The same data converted with as_tibble() and as.data.table(), so each
# benchmarked function can be handed the container it is timed on.
db <- as_tibble(d)
dt <- as.data.table(d)
# Repeat the timing run on the larger data, again 5 evaluations per entry.
# The dplyr functions are timed on both the plain data frame `d` and the
# tibble `db`; data.table is timed on `dt`.
timings2 <- microbenchmark(
  dplyr_mean         = f_dplyr_mean(d),
  dplyr_sum_n        = f_dplyr_sum_n(d),
  dplyr_mean_tibble  = f_dplyr_mean(db),
  dplyr_sum_n_tibble = f_dplyr_sum_n(db),
  base_tapply        = f_base_tapply(d),
  rqdatatable        = f_rqdatatable(d),
  data.table         = f_data.table(dt),
  times              = 5L
)
print(timings2)

# Flatten the larger-data benchmark result into a data frame, then add
# `seconds` (`time` divided by 1e9) and `method` (a copy of `expr`).
res2 <- as.data.frame(timings2)
res2[["seconds"]] <- res2[["time"]] / 1e9
res2[["method"]] <- res2[["expr"]]

# Mean run time in seconds per method, rendered as a table.
res2 %.>%
  project_nse(., groupby = "method", mean_seconds = mean(seconds)) %.>%
  knitr::kable(.)

# Distribution of run times (seconds) for each method on the larger data.
WVPlots::ScatterBoxPlotH(
  frm = res2,
  xvar = "seconds",
  yvar = "method",
  title = "task run time by method (larger example)"
)

# The same plot with scale_y_log10() added.
WVPlots::ScatterBoxPlotH(
  frm = res2,
  xvar = "seconds",
  yvar = "method",
  title = "task run time by method (larger example)"
) +
  scale_y_log10()


WinVector/rqdatatable documentation built on Aug. 22, 2023, 3:25 p.m.