Amazon EC2 r4.8xlarge (244 GiB RAM) run, 8-12-2018: 64-bit Ubuntu Server 16.04 LTS (HVM), SSD Volume Type (ami-ba602bc2); R 3.4.4, all packages current.
# Packages used by the benchmark, the plots and the summary table below.
library("rqdatatable")
library("microbenchmark")
library("ggplot2")
library("WVPlots")
library("cdata")
library("dplyr")
library("data.table")

# Fixed seed so the randomly generated benchmark data is reproducible.
set.seed(32523)

#' Build a random data frame used as sorting input.
#'
#' @param nrow Number of rows to generate.
#' @return A data.frame with three character columns (`col_a`, `col_b`,
#'   `col_c`), each sampled with replacement from `letters`, and one numeric
#'   column `col_x` drawn with `runif()`. Strings are kept as character
#'   (not converted to factors).
mk_data <- function(nrow) {
  data.frame(
    col_a = sample(letters, nrow, replace = TRUE),
    col_b = sample(letters, nrow, replace = TRUE),
    col_c = sample(letters, nrow, replace = TRUE),
    col_x = runif(nrow),
    stringsAsFactors = FALSE
  )
}
# Operator pipeline applied later to the benchmark data: a table description
# named "d" with the four benchmark columns, piped (via `%.>%`) into an
# orderby() over all four columns.
ops <- mk_td("d", c("col_a", "col_b", "col_c", "col_x")) %.>%
  orderby(., cols = c("col_a", "col_b", "col_c", "col_x"))

# from help(microbenchmark)
#' Check that all benchmarked expressions produced the same result.
#'
#' @param values List of results, one per benchmarked expression.
#' @return `TRUE` when every element after the first is `identical()` to the
#'   first element (trivially `TRUE` for a single result), else `FALSE`.
my_check <- function(values) {
  # vapply() keeps the result a logical vector even when there is nothing
  # to compare against.
  all(vapply(values[-1], function(x) identical(values[[1]], x), logical(1)))
}
# Benchmark results are cached on disk: timings are only re-measured when the
# RDS file does not exist; otherwise the saved runs are loaded as-is.
rds_file <- "Sorting_runs.RDS"
if (!file.exists(rds_file)) {
  pow <- 8
  # Row counts on a 1-2-5 grid per decade (1, 2, 5, 10, 20, 50, ...),
  # capped at 10^pow.
  szs <- expand.grid(a = c(1, 2, 5), b = 10^(0:pow))
  szs <- sort(unique(szs$a * szs$b))
  szs <- szs[szs <= 10^pow]
  runs <- lapply(
    szs,
    function(sz) {
      d <- mk_data(sz)
      # Time the same four-column sort with each method; `check = my_check`
      # makes microbenchmark verify the methods' results against each other.
      ti <- microbenchmark(
        data.table = {
          d %.>%
            as.data.table(.) %.>%
            setorder(., col_a, col_b, col_c, col_x) %.>%
            setDF(.)[]
        },
        rqdatatable = {
          d %.>%
            ops %.>%
            as.data.frame(.)
        },
        dplyr = {
          dplyr::arrange(d, col_a, col_b, col_c, col_x)
        },
        times = 3L,
        check = my_check
      )
      # Keep the raw timings as a plain data.frame tagged with the row count.
      ti <- as.data.frame(ti)
      ti$rows <- sz
      ti
    }
  )
  saveRDS(runs, rds_file)
} else {
  runs <- readRDS(rds_file)
}
# Stack the per-size timing frames into one data.frame.
timings <- do.call(rbind, runs)
# The `time` column is divided by 1e+9 to express it in seconds.
timings$seconds <- timings$time / 1e+9
timings$method <- factor(timings$expr)
# Order the factor levels by decreasing run time.
timings$method <- reorder(timings$method, -timings$seconds)

# All three methods, log-log axes.
ggplot(
  data = timings,
  aes(x = rows, y = seconds, color = method)
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  scale_x_log10() +
  scale_y_log10() +
  ggtitle(
    "sorting task time by rows and method",
    subtitle = "log-log trend shown"
  )

# Same plot with the rqdatatable timings dropped.
ggplot(
  data = timings[timings$method != "rqdatatable", , drop = FALSE],
  aes(x = rows, y = seconds, color = method)
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  scale_x_log10() +
  scale_y_log10() +
  ggtitle(
    "sorting task time by rows and method",
    subtitle = "log-log trend shown"
  )

# Mean seconds per (method, rows), pivoted to one row per `rows` with one
# column per method, then the dplyr / data.table ratio and that ratio scaled
# by log(rows).
# NOTE(review): log(rows) is 0 when rows == 1, so such a row gets a non-finite
# ratio_by_log_rows -- confirm that is acceptable for the last plot.
means <- timings %.>%
  project_nse(.,
              groupby = c("method", "rows"),
              seconds = mean(seconds)) %.>%
  pivot_to_rowrecs(.,
                   columnToTakeKeysFrom = "method",
                   columnToTakeValuesFrom = "seconds",
                   rowKeyColumns = "rows") %.>%
  extend_nse(.,
             ratio = dplyr/data.table,
             ratio_by_log_rows = ratio/log(rows)) %.>%
  orderby(., "rows") %.>%
  as.data.frame(.)

knitr::kable(means)

ggplot(data = means, aes(x = rows, y = ratio)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  scale_x_log10() +
  ggtitle("ratio of dplyr runtime to data.table runtime")

ggplot(data = means, aes(x = rows, y = ratio_by_log_rows)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  scale_x_log10() +
  ggtitle("ratio of dplyr runtime to data.table runtime divided by log(rows)")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.