Some timings for %.>%
("dot arrow").
Keep in mind for any serious application the calculation time on data will far dominate any piping overhead, but it is fun to look.
So we will compare:
magrittr*
magrittr::%>%
piping.DotArrow*
wrapr::%.>%
piping.BizarroPipe*
->.;
piping.TidyPipe*
%>%
piping based on rlang
/tidyeval
(renamed to "%t>%
" in this run to avoid name collisions).# colors from http://colorbrewer2.org/#type=qualitative&scheme=Dark2&n=4 defs <- tibble::tribble( ~name, ~operator, ~color, 'magrittr', '%>%' , '#1b9e77', 'DotArrow', '%.>%', '#d95f02', 'BizarroPipe', '->.;', '#7570b3', 'TidyPipe', '%t>%', '#e7298a' ) sizes <- c(1, 2, 5, 10, 15, 20, 25, 50, 100, 200, 500, 1000) colorAssignment <- defs$color names(colorAssignment) <- defs$name fMap <- defs$operator names(fMap) <- defs$name # build source code for fns of size k buildFnsK <- function(k) { fs <- c("5", rep("sin(.)", k)) fns <- lapply(names(fMap), function(fn) { op <- fMap[[fn]] paste0(fn, '_', k, ' <- function() {\n ', paste(fs, collapse = paste0(' ', op, '\n ')), '\n}\n\n') }) paste(fns, collapse= '\n\n') } ftext <- paste( vapply(sizes, buildFnsK, character(1)), collapse = "\n\n") fileConn<-file("pGenFns.R") writeLines(ftext, fileConn) close(fileConn)
library("microbenchmark") library("wrapr") suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("dplyr")) library("glmnet") source('rlangPipe.R') # load generated examples prevNames <- ls() source("pGenFns.R") genFns <- setdiff(ls(), c(prevNames, 'prevNames', 'genFns')) # parser translates BizarroPipe to different code! cat(buildFnsK(5), sep = '\n') print(BizarroPipe_5) BizarroPipe_10() DotArrow_10() magrittr_10()
# get expressions into a nice presentation order fList <- data.frame(expr= genFns, stringsAsFactors = FALSE) fList$size <- as.numeric(gsub("[^0-9]+", "", fList$expr)) fList$fn <- gsub("[_0-9].*$", "", fList$expr) fList <- fList[order(fList$size, fList$fn), , drop=FALSE] cmd <- parse(text=paste0( "microbenchmark(\n ", paste(paste0(fList$expr,'()'), collapse=',\n '), ", times=1000L )\n" )) print(cmd) gc() datFile <- 'timings.RDS' if(!file.exists(datFile)) { print("running") bm <- eval(cmd) saveRDS(bm, file=datFile) } else { print("using cached results") bm <- readRDS(file=datFile) } print(bm) autoplot(bm)
d <- as.data.frame(bm) d$size <- as.numeric(gsub("[^0-9]+", "", d$expr)) d$fn <- gsub("[_0-9].*$", "", d$expr) d$fn <- reorder(d$fn, d$time) ggplot(d, aes(x=fn, y=time, color=fn)) + geom_violin() + scale_y_log10() + facet_wrap(~size, labeller="label_both") + coord_flip() + xlab("method") + ylab("time NS") + theme(legend.position = 'none') + scale_color_manual(values = colorAssignment) + ggtitle("distribution of runtime as function of method and problem size", subtitle = "log-time scale") # ggplot 2 legend in reverse order, so re-order to get that d$fn <- reorder(d$fn, -d$time) ggplot(d, aes(x=size, y=time, color=fn)) + geom_smooth() + scale_y_log10() + scale_x_log10() + scale_color_manual(values = colorAssignment) + xlab('size (number of pipe stages)') + ylab("time NS") + ggtitle("complexity of runtime as function of method and problem size", subtitle = "log/log scale")
# fit a linear function for runtime as a function of size # per group. dfits <- d %.>% split(., .$fn) %.>% lapply(., function(di) { mi <- lm(time ~ size + I(size*size), data=di) ctab <- as.data.frame(summary(mi)$coefficients) ctab$coef <- rownames(ctab) ctab }) %.>% add_name_column(., 'method') %.>% bind_rows(.) %.>% arrange(., method, coef) %.>% select(., method, coef, Estimate, `Std. Error`, `Pr(>|t|)`) # "Intercept" is roughly start-up cost # "size" is roughly the slope or growth rate of execution time # as a function of number of pipe stages. # "I(size * size)" is where we try to detect super-linear cost, # check that it is both statistically significant and that # it has a size that would affect predictions (is it times # the typical variation in size*size large?). print(dfits) # re-run with non-negative least squares dfitsnn <- d %.>% split(., .$fn) %.>% lapply(., function(di) { di$sizesq <- (di$size)^2 # always call glmnet with a non-trivial lambda series # some notes: https://github.com/WinVector/wrapr/blob/master/extras/glmnetNotes.md mi <- glmnet(as.matrix(di[, c('size', 'sizesq')]), di$time, lower.limits = 0, alpha=0.0, lambda=c(0, 1.0e-5, 1.0e-3, 0.1, 1, 10), intercept = TRUE, family = 'gaussian') ctab <- as.data.frame(as.matrix(coef(mi, s=0))) # lower.limites does not apply to intercept, # but intercept is always reported even if # turned off. if(ctab['(Intercept)',1]<0) { mi <- glmnet(as.matrix(di[, c('size', 'sizesq')]), di$time, lower.limits = 0, alpha=0.0, lambda=c(0, 1.0e-5, 1.0e-3, 0.1, 1, 10), intercept = FALSE, family = 'gaussian') ctab <- as.data.frame(as.matrix(coef(mi, s=0))) } names(ctab) <- "Estimate" ctab$coef <- rownames(ctab) ctab }) %.>% add_name_column(., 'method') %.>% bind_rows(.) %.>% arrange(., method, coef) %.>% select(., method, coef, Estimate) print(dfitsnn)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.