Some timings for %.>% ("dot arrow").

Keep in mind for any serious application the calculation time on data will far dominate any piping overhead, but it is fun to look.

So we will compare:

# colors from http://colorbrewer2.org/#type=qualitative&scheme=Dark2&n=4
defs <- tibble::tribble(
  ~name, ~operator, ~color,
  'magrittr', '%>%' , '#1b9e77',
  'DotArrow', '%.>%', '#d95f02',
  'BizarroPipe', '->.;', '#7570b3',
  'TidyPipe', '%t>%', '#e7298a'
)

sizes <- c(1, 2, 5, 10, 15, 20, 25, 50, 100, 200, 500, 1000)

colorAssignment <- defs$color
names(colorAssignment) <- defs$name
fMap <- defs$operator
names(fMap) <- defs$name


# build source code for fns of size k
buildFnsK <- function(k) {
  fs <- c("5", rep("sin(.)", k))
  fns  <- lapply(names(fMap),
                 function(fn) {
                   op <- fMap[[fn]]
                   paste0(fn, '_', k, ' <- function() {\n ',
                          paste(fs, collapse = paste0(' ', op, '\n   ')),
                          '\n}\n\n')

                 })
  paste(fns, collapse= '\n\n')
}


ftext <- paste(
  vapply(sizes,
         buildFnsK, character(1)),
  collapse = "\n\n")

fileConn<-file("pGenFns.R")
writeLines(ftext, fileConn)
close(fileConn)
library("microbenchmark")
library("wrapr")
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("dplyr"))
library("glmnet")
source('rlangPipe.R')

# load generated examples
prevNames <- ls()
source("pGenFns.R")
genFns <- setdiff(ls(), c(prevNames, 'prevNames', 'genFns'))

# parser translates BizarroPipe to different code!
cat(buildFnsK(5), sep = '\n')

print(BizarroPipe_5)

BizarroPipe_10()

DotArrow_10()

magrittr_10()
# get expressions into a nice presentation order
fList <- data.frame(expr= genFns, 
                    stringsAsFactors = FALSE)
fList$size <- as.numeric(gsub("[^0-9]+", "", fList$expr))
fList$fn <- gsub("[_0-9].*$", "", fList$expr)
fList <- fList[order(fList$size, fList$fn), , drop=FALSE]

cmd <- parse(text=paste0(
  "microbenchmark(\n ",
  paste(paste0(fList$expr,'()'), collapse=',\n '),
  ", 
  times=1000L
  )\n"
))

print(cmd)

gc()
datFile <- 'timings.RDS'
if(!file.exists(datFile)) {
  print("running")
  bm <- eval(cmd)
  saveRDS(bm, file=datFile)
} else {
  print("using cached results")
  bm <- readRDS(file=datFile)
}

print(bm)
autoplot(bm)
d <- as.data.frame(bm)
d$size <- as.numeric(gsub("[^0-9]+", "", d$expr))
d$fn <- gsub("[_0-9].*$", "", d$expr)

d$fn <- reorder(d$fn, d$time)
ggplot(d, aes(x=fn, y=time, color=fn)) + 
  geom_violin() + 
  scale_y_log10() + 
  facet_wrap(~size, labeller="label_both") + 
  coord_flip() + 
  xlab("method") +
  ylab("time NS") +
  theme(legend.position = 'none') +
  scale_color_manual(values = colorAssignment) +
  ggtitle("distribution of runtime as function of method and problem size",
          subtitle = "log-time scale")

# ggplot 2 legend in reverse order, so re-order to get that
d$fn <- reorder(d$fn, -d$time)
ggplot(d, aes(x=size, y=time, color=fn)) +
  geom_smooth() +
  scale_y_log10() +
  scale_x_log10() +
  scale_color_manual(values = colorAssignment) +
  xlab('size (number of pipe stages)') +
  ylab("time NS") +
  ggtitle("complexity of runtime as function of method and problem size",
          subtitle = "log/log scale")
# fit a linear function for runtime as a function of size
# per group.
dfits <- d %.>%
  split(., .$fn) %.>%
  lapply(., 
         function(di) { 
           mi <- lm(time ~ size + I(size*size), data=di) 
           ctab <- as.data.frame(summary(mi)$coefficients)
           ctab$coef <- rownames(ctab)
           ctab
         }) %.>%
  add_name_column(., 'method') %.>%
  bind_rows(.) %.>%
  arrange(., method, coef) %.>%
  select(.,  method, coef, Estimate, `Std. Error`, `Pr(>|t|)`)

# "Intercept" is roughly start-up cost 
# "size" is roughly the slope or growth rate of execution time
# as a function of number of pipe stages.
# "I(size * size)" is where we try to detect super-linear cost,
# check that it is both statistically significant and that 
# it has a size that would affect predictions (is it times
# the typical variation in size*size large?).
print(dfits)

# re-run with non-negative least squares

dfitsnn <- d %.>%
  split(., .$fn) %.>%
  lapply(., 
         function(di) { 
           di$sizesq <- (di$size)^2
           # always call glmnet with a non-trivial lambda series
           # some notes: https://github.com/WinVector/wrapr/blob/master/extras/glmnetNotes.md
           mi <- glmnet(as.matrix(di[, c('size', 'sizesq')]), 
                        di$time, 
                        lower.limits = 0, 
                        alpha=0.0, 
                        lambda=c(0, 1.0e-5, 1.0e-3, 0.1, 1, 10),
                        intercept = TRUE, 
                        family = 'gaussian')
           ctab <- as.data.frame(as.matrix(coef(mi, s=0)))
           # lower.limites does not apply to intercept,
           # but intercept is always reported even if
           # turned off.
           if(ctab['(Intercept)',1]<0) {
             mi <- glmnet(as.matrix(di[, c('size', 'sizesq')]), 
                        di$time, 
                        lower.limits = 0, 
                        alpha=0.0, 
                        lambda=c(0, 1.0e-5, 1.0e-3, 0.1, 1, 10),
                        intercept = FALSE, 
                        family = 'gaussian')
             ctab <- as.data.frame(as.matrix(coef(mi, s=0)))
           }
           names(ctab) <- "Estimate"
           ctab$coef <- rownames(ctab)
           ctab
         }) %.>%
  add_name_column(., 'method') %.>%
  bind_rows(.) %.>%
  arrange(., method, coef) %.>%
  select(.,  method, coef, Estimate)

print(dfitsnn)


WinVector/wrapr documentation built on Aug. 29, 2023, 4:51 a.m.