Re-plot some of the measurements from here.
library("ggplot2") # https://github.com/WinVector/rqdatatable library("rqdatatable") # devtools::install.packages("WinVector/rqdatatable")
all_timings <- readRDS("all_timings.RDS") all_timings$seconds <- all_timings$time/1e9 summary_pipeline <- local_td(all_timings) %.>% project_nse(., groupby = c("nrows", "expr"), mean_time_seconds = mean(seconds)) %.>% orderby(., c("nrows", "expr")) all_means <- as.data.frame(ex_data_table(summary_pipeline)) knitr::kable(all_means) # get a shared lvel ordering means <- all_means[all_means$nrows == max(all_means$nrows), , drop = FALSE] means <- means[order(means$mean_time_seconds), , drop = FALSE] levels <- means$expr # plot scatter plots for each experiment for(target_size in sort(unique(all_timings$nrows))) { timings <- all_timings[all_timings$nrows == target_size, , drop = FALSE] timings$implementation <- factor(timings$expr, levels) cutpt <- median(timings$seconds[timings$expr=="base_r_stats_aggregate"]) plt <- WVPlots::ScatterBoxPlotH(timings, "seconds", "implementation", paste0("task time in seconds by implementation\n(", timings$nrows[[1]], " row by ", timings$ncols[[1]], " column task)")) + geom_hline(yintercept = cutpt, linetype=2, alpha = 0.5) print(plt) } # plot as a function of problem size cmap <- c(dplyr_data_frame = "#a63603", dplyr_tbl = "#e6550d", dplyr_database_round_trip = "#fd8d3c", data.table = "#7851a9", rquery_database_round_trip = "#31a354", rquery_data.table = "#006d2c") lmap <- c(dplyr_data_frame = 1, dplyr_tbl = 3, dplyr_database_round_trip = 4, data.table = 1, rquery_database_round_trip = 4, rquery_data.table = 2) all_timings$implementation <- factor(all_timings$expr, rev(levels)) ex1 <- all_timings[all_timings$expr != 'base_r_stats_aggregate', , drop = FALSE] exb <- all_timings[all_timings$expr == 'base_r_stats_aggregate', , drop = FALSE] sm <- loess(seconds ~ nrows, data= exb) smf <- data.frame(nrows = exp(seq(log(min(ex1$nrows)), log(max(ex1$nrows)), length.out = 100))) smf$seconds <- predict(sm, newdata=smf) ymin = min(all_timings$seconds) ggplot(mapping = aes(x = nrows, y = seconds, ymax = seconds, ymin = ymin)) + geom_ribbon(data = smf, alpha = 0.3) + geom_line(data = ex1, se = FALSE, aes(color = implementation, linetype = implementation), stat = "smooth", method = "loess", alpha = 0.7, size = 1) + scale_x_log10() + scale_y_log10() + scale_color_manual(values = cmap) + scale_linetype_manual(values = lmap) + ggtitle("task time in seconds by nrows and implementation", subtitle = "shading boundary time taken by base R stats::aggregate() solution") # look at slopes summaries <- split(all_timings, all_timings$expr) %.>% lapply(., function(gi) { model <- lm(seconds ~ nrows, data= gi) si <- as.data.frame(summary(model)$coefficients) si$coef <- rownames(si) si$impementation <- as.character(gi$expr[[1]]) si }) %.>% data.table::rbindlist(.) colnames(summaries) <- gsub("Pr(>|t|)", "P[g.t. abs(t)]", colnames(summaries), fixed = TRUE) knitr::kable(summaries)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.