Amazon EC2 r4.8xlarge
(244 GiB RAM) run (8-12-2018, 64-bit Ubuntu Server 16.04 LTS (HVM), SSD Volume Type - ami-ba602bc2, R 3.4.4 all packages current).
library("rqdatatable")
## Loading required package: rquery
library("microbenchmark")
library("ggplot2")
library("WVPlots")
library("cdata")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
set.seed(32523)
mk_data <- function(nrow) {
data.frame(col_a = sample(letters, nrow, replace=TRUE),
col_b = sample(letters, nrow, replace=TRUE),
col_c = sample(letters, nrow, replace=TRUE),
col_x = runif(nrow),
stringsAsFactors = FALSE)
}
ops <- mk_td("d", c("col_a", "col_b", "col_c", "col_x")) %.>%
orderby(., cols = c("col_a", "col_b", "col_c", "col_x"))
# from help(microbenchmark)
my_check <- function(values) {
all(sapply(values[-1], function(x) identical(values[[1]], x)))
}
rds_file <- "Sorting_runs.RDS"
if(!file.exists(rds_file)) {
pow <- 8
szs <- expand.grid(a = c(1,2,5), b = 10^{0:pow})
szs <- sort(unique(szs$a * szs$b))
szs <- szs[szs<=10^pow]
runs <- lapply(
szs,
function(sz) {
d <- mk_data(sz)
ti <- microbenchmark(
data.table = {
d %.>%
as.data.table(.) %.>%
setorder(., col_a, col_b, col_c, col_x) %.>%
setDF(.)[]
},
rqdatatable = {
d %.>%
ops %.>%
as.data.frame(.)
},
dplyr = {
dplyr::arrange(d, col_a, col_b, col_c, col_x)
},
times = 3L,
check = my_check)
ti <- as.data.frame(ti)
ti$rows <- sz
ti
})
saveRDS(runs, rds_file)
} else {
runs <- readRDS(rds_file)
}
timings <- do.call(rbind, runs)
timings$seconds <- timings$time/1e+9
timings$method <- factor(timings$expr)
timings$method <- reorder(timings$method, -timings$seconds)
ggplot(data = timings,
aes(x = rows, y = seconds, color = method)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_log10() + scale_y_log10() +
ggtitle("sorting task time by rows and method",
subtitle = "log-log trend shown")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = timings[timings$method!="rqdatatable", , drop = FALSE],
aes(x = rows, y = seconds, color = method)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_log10() + scale_y_log10() +
ggtitle("sorting task time by rows and method",
subtitle = "log-log trend shown")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
means <- timings %.>%
project_nse(.,
groupby = c("method", "rows"),
seconds = mean(seconds)) %.>%
pivot_to_rowrecs(.,
columnToTakeKeysFrom = "method",
columnToTakeValuesFrom = "seconds",
rowKeyColumns = "rows") %.>%
extend_nse(.,
ratio = dplyr/data.table,
ratio_by_log_rows = ratio/log(rows)) %.>%
orderby(., "rows") %.>%
as.data.frame(.)
knitr::kable(means)
| rows| data.table| dplyr| rqdatatable| ratio| ratio_by_log_rows| |------:|-----------:|------------:|------------:|----------:|---------------------:| | 1e+00| 0.0004077| 0.0076316| 0.0008804| 18.719959| Inf| | 2e+00| 0.0004035| 0.0014585| 0.0008818| 3.614217| 5.2142136| | 5e+00| 0.0004741| 0.0015127| 0.0008543| 3.190409| 1.9823126| | 1e+01| 0.0004051| 0.0014841| 0.0008885| 3.663244| 1.5909266| | 2e+01| 0.0004001| 0.0014963| 0.0008898| 3.739903| 1.2484104| | 5e+01| 0.0004791| 0.0014716| 0.0008554| 3.071596| 0.7851683| | 1e+02| 0.0004747| 0.0015271| 0.0008471| 3.216743| 0.6985069| | 2e+02| 0.0004983| 0.0015067| 0.0008857| 3.023557| 0.5706636| | 5e+02| 0.0005444| 0.0015958| 0.0009128| 2.931345| 0.4716862| | 1e+03| 0.0005576| 0.0017957| 0.0009517| 3.220292| 0.4661850| | 2e+03| 0.0006078| 0.0021119| 0.0011654| 3.474785| 0.4571543| | 5e+03| 0.0009465| 0.0032869| 0.0014573| 3.472520| 0.4077071| | 1e+04| 0.0012682| 0.0052561| 0.0016782| 4.144441| 0.4499770| | 2e+04| 0.0019908| 0.0104596| 0.0025562| 5.254009| 0.5305210| | 5e+04| 0.0057635| 0.0594585| 0.0068824| 10.316401| 0.9534761| | 1e+05| 0.0128816| 0.0584981| 0.0135057| 4.541204| 0.3944440| | 2e+05| 0.0155656| 0.1291835| 0.0558137| 8.299297| 0.6799318| | 5e+05| 0.0740364| 0.3483736| 0.0521505| 4.705438| 0.3585816| | 1e+06| 0.1285398| 0.7447626| 0.1051548| 5.794021| 0.4193852| | 2e+06| 0.2023114| 1.6917564| 0.2341636| 8.362141| 0.5763552| | 5e+06| 0.5872843| 5.3194821| 0.7157426| 9.057763| 0.5872151| | 1e+07| 1.2644908| 11.9558664| 1.6543575| 9.455084| 0.5866129| | 2e+07| 2.6836465| 26.7849712| 3.2131316| 9.980812| 0.5936986| | 5e+07| 7.9980163| 73.4681060| 8.4911241| 9.185791| 0.5181652| | 1e+08| 20.2643089| 156.8686366| 21.8431452| 7.741129| 0.4202412|
ggplot(data = means,
aes(x = rows, y = ratio)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_log10() +
ggtitle("ratio of dplyr runtime to data.table runtime")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(data = means,
aes(x = rows, y = ratio_by_log_rows)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_log10() +
ggtitle("ratio of dplyr runtime to data.table runtime dived by log(rows)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.