benchmark/benchmark_results.R

library(ggplot2)
library(RColorBrewer)
library(grid)


# -----------------------------------------------------------------------


model_color <- rep(brewer.pal(4, "Set2"), each = 2) %>%
  set_names(
    "lm"
    , "glm"
    , "oomlm"
    , "oomglm"
    , "biglm"
    , "bigglm"
    , "speedlm"
    , "speedglm"
  )


#' return title, subtitle text based on title type
#' 
#' @param title_type character backed or model type
#' @keywords internal
title_subtitle <- function(title_type) {

  y <- tibble(
    backend  = c("tbl", "psql"),
    model    = c("glm", "lm"),
    oom      = c(FALSE, TRUE),
    title    = c("Generalized Linear Model", "Linear Model"),
    prefix   = c("In-Memory", "Out-of-Memory"),
    subtitle = str_c("Data stored in ", c("tibble", "PostgreSQL table"), " ")
  )
  
  x   <- str_split(title_type, "_", simplify = TRUE)
  
  title <- y %>%
    filter(model == x[1]) %>%
    pull(title) %>%
    head(1)
  
  subtitle <- y %>%
    filter(backend == x[2]) %>%
    pull(subtitle) %>%
    head(1)
  
  prefix <- y %>%
    filter(backend == x[2]) %>%
    pull(prefix) %>%
    head(1)
  
  list(
    title    = str_c(prefix, " ", title),
    subtitle = subtitle
  )
    
}


#' make benchmark plots
#' 
#' @param df benchmark data
#' @param title_type indicator for title text
#' @param yval value to plot on y axis
#' @param ylab y axis label
#' 
#' @returns gtable object
#' @keywords internal
plot_benchmark <- function(df, title_type, yval, ylab) {
  
  lbls <- title_subtitle(title_type)
  
  df <- df %>%
    mutate(
      label = if_else(
        num_obs == max(num_obs),
        as.character(expression), NA_character_)
    ) 
  
  gt <- df %>%
    ggplot(aes_string(y = yval, x = "num_obs", col = "expression")) +
    geom_line(size = 1)  +
    geom_point(size = 2) +
    theme_bw()   +
    theme(
      plot.title    = element_text(size = 18),
      plot.subtitle = element_text(size = 14),
      plot.margin   = margin(t = 10, r = 45, b = 20, l = 25),
      panel.border  = element_blank(),
      axis.title    = element_text(size = 12),
      axis.title.y  = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)),
      axis.title.x  = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)),
      axis.text     = element_text(size = 10),
      axis.ticks.y  = element_blank(),
      legend.position = "none"
    ) +
    scale_y_continuous(
      breaks = scales::pretty_breaks(n = 5),
      labels = scales::percent
    ) +
    scale_colour_manual(
      name   = "expression",
      values = model_color
    ) +
    geom_text(
      data = df %>%
        filter(!is.na(label)) %>%
        mutate(num_obs = num_obs*1.05),
      aes_string(
        label = "label",
        x     = Inf,
        y     = yval,
        color = "expression"
      ),
      size = 5
    ) +
    labs(
      title    = lbls$title,
      subtitle = lbls$subtitle,
      x        = "Number of Observations",
      y        = ylab
    ) 
  
  gt <- ggplot_build(gt)
  gt <- ggplot_gtable(gt)
  gt$layout$clip[gt$layout$name=="panel"] <- "off"
  
  gt
  
}

#' write gtable to svg
#' 
#' @param file_name name of svg files
#' @param gt gtable
#' @keywords internal
save_plot <- function(file_name, gt) {
  svg(
    filename = glue("benchmark/results/{file_name}.svg"),
    width = 11, height = 8.5
  )
  grid.draw(gt)
  dev.off()
}


normalize_metrics <- function(df) {
  df %>%
    mutate(ploom = expression %in% c("oomlm", "oomglm")) %>%
    group_by(num_obs) %>%
    mutate(
      mem_alloc = mem_alloc / sum(mem_alloc * ploom),
      mean      = mean / sum(mean * ploom)
    ) %>%
    ungroup()
}



# build and save plots --------------------------------------------------
#

files   <- list.files("benchmark/results", full.names = TRUE, pattern = "*.Rds")
results <- files %>%
  map(readRDS)   %>%
  set_names(str_remove(basename(files), ".Rds")) %>%
  map(normalize_metrics)

time_plots <- map2(
  .x = results,
  .y = names(results),
  .f = plot_benchmark, yval = "mean", ylab = "Relative Seconds"
)

walk2(
  .x = str_c(names(time_plots), "_time"),
  .y = time_plots,
  .f = save_plot
)

memr_plots <- map2(
  .x = results,
  .y = names(results),
  .f = plot_benchmark, yval = "mem_alloc", ylab = "Relative Memory Allocated"
)

walk2(
  .x = str_c(names(memr_plots), "_memory"),
  .y = memr_plots,
  .f = save_plot
)
blakeboswell/yotta documentation built on May 30, 2019, 11:43 a.m.