knitr::opts_chunk$set(
  collapse = TRUE,
  comment = " # "
)
options(width =100)
# small timings
library("data.table")
library("dplyr")
library("microbenchmark")
library("replyr")
library("ggplot2")

set.seed(32535)

data("iris",package = "datasets")
iris.dt <- data.table(iris)

iris.dt[, mean(Sepal.Length), by=Species]

let(list(GROUPCOL='Species', DATACOL='Sepal.Length'),
    iris.dt[, mean(DATACOL), by=GROUPCOL])

iris %>% group_by(Species) %>% summarize(mean(Sepal.Length))

let(list(GROUPCOL='Species', DATACOL='Sepal.Length'),
    iris %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL)))

# gapply is very bad at this task both notationally, and in terms of speed
gapply(iris, 'Species',
       function(di) {
         mean(di[['Sepal.Length']])
       },
       partitionMethod = 'split',
       bindrows=FALSE)

ks <- sort(unique(floor(10^(0.25*(0:16)))))

meass <- lapply(ks,
    function(k) {
      irisk <- iris[rep(seq_len(nrow(iris)),k),,drop=FALSE]
      irisk.dt <- data.table(irisk)
      meas <- microbenchmark(data.table= irisk.dt[, mean(Sepal.Length), by=Species],
                     dplyr= irisk %>% group_by(Species) %>% summarize(mean(Sepal.Length)),
                     let.data.table= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'),
                                         irisk.dt[, mean(DATACOL), by=GROUPCOL]),
                     let.dplyr= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'),
                                    irisk %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL))))
      meas %>% as.data.frame() %>% mutate(k=k)
    })


res <- lapply(meass,
  function(meas) {
    meas %>%
      as.data.frame() %>%
      group_by(expr) %>% 
      summarize(mean= mean(time),
                median= median(time),
                k= mean(k), # pseudo-aggregator
                q1= as.numeric(quantile(time,0.25)),
                q3= as.numeric(quantile(time,0.75)))
  })


res <- dplyr::bind_rows(res)
res$expr <- reorder(res$expr, -res$median)

ggplot(data=res, mapping=aes(x=k,y=median,color=expr)) +
  geom_ribbon(mapping=aes(ymin=q1,ymax=q3,fill=expr),alpha=0.2,color=NA) +
  geom_point() + geom_line() +
  scale_x_log10() + scale_y_log10() + 
  ylab('time') +
  ggtitle("median run times (nanosecond)",
    subtitle = "as a function of method and replications")


WinVector/replyr documentation built on Oct. 22, 2020, 8:07 p.m.