# Default knitr chunk options used throughout this vignette
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)

# Rendering context flag:
#   TRUE  -> simple context (regular package vignette)
#   FALSE -> extended context (pkgdown site)
simple_rendering <- !pkgdown::in_pkgdown()

# NOTE:
# To re-run the benchmarks, run the "benchmark" workflow on GitHub
library(magrittr)
To showcase the performance of SCDB
on different database backends, we include this vignette that summarises a simple
benchmark:
A sample dataset is created based on the datasets::iris
dataset. This data is repeated 10 times and given a unique ID
(the row number of the data).
This data forms the basis for three "snapshots" used in the benchmarks:
1) The data as described above.
2) As 1. but where any Sepal.Length
below the median is halved.
3) As 2. but where any Sepal.Width
below the median is halved.
The benchmark function uses three consecutive calls to update_snapshot()
to create the table with the first snapshot and
then update it to the second and third snapshot. Finally, the table is deleted.
The performance of this benchmark function is timed with the {microbenchmark}
package using 10 replicates.
All benchmarks are run on the same machine.
# Emit the sentence that introduces the benchmark figure. The wording depends
# on the rendering context: the simple rendering only reports the performance
# of `SCDB`, while the extended rendering compares the development version
# with the CRAN version.
if (simple_rendering) {
  cat(
    "The results of the benchmark are shown graphically below",
    "(mean and standard deviation), where we measure the",
    "performance of `SCDB`."
  )
} else {
  cat(
    "The results of the benchmark are shown graphically below",
    "(mean and standard deviation), where we compare the",
    "current development version of `SCDB` with the current CRAN version."
  )
}
# Locate the stored benchmark results. The installed package is tried first
# and the source tree second; system.file() returns "" when the file is not
# found, so empty paths are dropped before the first candidate is picked.
benchmark_location <- c(
  system.file("extdata", "benchmarks.rds", package = "SCDB"),
  here::here("inst", "extdata", "benchmarks.rds")
) %>%
  purrr::discard(function(path) identical(path, "")) %>%
  purrr::pluck(1)

# Read the benchmarks and make sure "version" is a character column
benchmarks <- readRDS(benchmark_location) %>%
  dplyr::mutate("version" = as.character(.data$version))

# Determine if the SHA is on main.
# The SHA is the version label that is neither an "SCDB..." label nor "main".
sha <- benchmarks %>%
  dplyr::distinct(.data$version) %>%
  dplyr::filter(!startsWith(.data$version, "SCDB"), .data$version != "main") %>%
  dplyr::pull("version")

# Fallback for when the local git history cannot be queried.
# If on GitHub, git is not installed and we assume TRUE.
# This will render the vignette as it will look once merged onto main.
assume_on_main <- function(cnd) {
  identical(Sys.getenv("CI"), "true")
}

# Check local git history.
# system(intern = TRUE) signals a warning when the command exits with a
# non-zero status, but an error when the command cannot be run at all, so
# both conditions are routed to the same fallback.
on_main <- tryCatch(
  {
    system(glue::glue("git branch main --contains {sha}"), intern = TRUE) %>%
      stringr::str_detect(stringr::fixed("main")) %>%
      isTRUE()
  },
  warning = assume_on_main,
  error = assume_on_main
)

# In the simple context we use the newest benchmark (version = sha)
# This benchmark is then labelled with the newest version number of SCDB
if (simple_rendering) {
  benchmarks <- benchmarks %>%
    dplyr::filter(.data$version == !!sha) %>%
    dplyr::mutate("version" = paste0("SCDB v", packageVersion("SCDB")))
} else if (on_main) {
  # If the SHA has been merged, use as the "main" version and remove the other,
  # older, main version
  benchmarks <- benchmarks %>%
    dplyr::filter(.data$version != "main") %>%
    dplyr::mutate(
      # `!!sha` injects the value so a data column can never mask it
      "version" = dplyr::if_else(
        .data$version == !!sha,
        "development",
        .data$version
      )
    )
}

# Mean and standard deviation (see ggplot2::mean_se()).
# Takes a numeric vector `x` and returns a one-row data.frame with columns
# `y` (mean), `ymin` (mean - sd) and `ymax` (mean + sd).
mean_sd <- function(x) {
  mu <- mean(x)
  sigma <- sd(x) # named `sigma` so the local does not shadow stats::sd()
  data.frame(y = mu, ymin = mu - sigma, ymax = mu + sigma)
}
# Benchmark 1: every benchmark whose function name does not end in "complexity"
benchmark_1 <- dplyr::filter(
  benchmarks,
  !stringr::str_ends(.data$benchmark_function, stringr::fixed("complexity"))
)

# Facet labeller that puts a line break before the " v" part of a label
# to improve rendering of figures
labeller <- ggplot2::as_labeller(
  function(l) {
    stringr::str_replace_all(l, stringr::fixed(" v"), "\nv")
  }
)

# Horizontal offset between sub-groups so they can be told apart
dodge <- ggplot2::position_dodge(width = 0.6)

# Base plot: one point range (mean +/- sd) per version and database.
# `time` is divided by 1e9 to match the "Time (s)" axis label
# (presumably nanoseconds as reported by microbenchmark -- TODO confirm).
g <- ggplot2::ggplot(
  benchmark_1,
  ggplot2::aes(x = version, y = time / 1e9, color = database)
)
g <- g + ggplot2::stat_summary(
  fun.data = mean_sd,
  geom = "pointrange",
  size = 0.5,
  linewidth = 1,
  position = dodge
)
g <- g + ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(n.dodge = 2))
g <- g + ggplot2::labs(x = "Codebase version", y = "Time (s)")
g <- g + ggplot2::theme(legend.position = "bottom")

if (simple_rendering) {
  # Simple version: smaller font and an untitled legend laid out in two rows
  g <- g +
    ggplot2::theme(text = ggplot2::element_text(size = 8)) +
    ggplot2::guides(
      color = ggplot2::guide_legend(title = "", nrow = 2, byrow = TRUE)
    )
} else {
  # Extended version: facet by benchmark function (rows) and database (columns)
  g <- g +
    ggplot2::facet_grid(
      rows = ggplot2::vars(benchmark_function),
      cols = ggplot2::vars(database),
      labeller = labeller
    )
}

g
We include another benchmark to highlight the complexity scaling of update_snapshot()
with the size of the input
data. The datasets are similar to the first benchmark, but the number of repeats is varied to see the impact of
increasing data size. The benchmarks are run from a "clean" state, where the target_table does not exist. The benchmark
measures both the time to create the table and to remove it again afterwards (to restore the clean state).
The performance of this benchmark function is timed with the {microbenchmark}
package using 5 replicates.
All benchmarks are run on the same machine.
The results of the benchmark are shown graphically below (mean and standard deviation) and with linear scaling (dotted
line), where we compare the current development version of SCDB
with the current CRAN version.
NOTE: There are reports of a superlinear complexity for very large data sets. If you experience such problems, consider
batching the updates via the filters
argument.
# Benchmark 2: only the benchmarks whose function name ends in "complexity"
benchmark_2 <- dplyr::filter(
  benchmarks,
  stringr::str_ends(.data$benchmark_function, stringr::fixed("complexity"))
)

# Strip the "- complexity" marker from the function name used in facet labels
benchmark_2 <- dplyr::mutate(
  benchmark_2,
  "benchmark_function" = stringr::str_remove_all(
    .data$benchmark_function,
    stringr::fixed("- complexity")
  )
)

# Horizontal offset between sub-groups so they can be told apart
dodge <- ggplot2::position_dodge(width = 0.6)

# Aesthetics: the x axis is the data size in thousands of rows
# (n * nrow(iris) / 1e3). Colour encodes the database in the simple version
# and the codebase version in the extended version.
aes <- if (simple_rendering) {
  ggplot2::aes(x = n * nrow(iris) / 1e3, y = time / 1e9, color = database)
} else {
  ggplot2::aes(x = n * nrow(iris) / 1e3, y = time / 1e9, color = version)
}

# Base plot: point ranges (mean +/- sd) plus a dotted linear fit per group
g <- ggplot2::ggplot(benchmark_2, aes)
g <- g + ggplot2::stat_summary(
  fun.data = mean_sd,
  geom = "pointrange",
  size = 0.5,
  linewidth = 1,
  position = dodge
)
g <- g + ggplot2::geom_smooth(
  method = "lm",
  formula = y ~ x,
  se = FALSE,
  linetype = 3
)
g <- g + ggplot2::labs(
  x = "Data size (1,000 rows)",
  y = "Time (s)",
  color = "Codebase version"
)
g <- g + ggplot2::theme(
  panel.spacing = grid::unit(1, "lines"),
  legend.position = "bottom"
)

if (simple_rendering) {
  # Simple version: smaller font and an untitled legend laid out in two rows
  g <- g +
    ggplot2::theme(text = ggplot2::element_text(size = 8)) +
    ggplot2::guides(
      color = ggplot2::guide_legend(title = "", nrow = 2, byrow = TRUE)
    )
} else {
  # Extended version: facet by benchmark function (rows) and database (columns)
  g <- g +
    ggplot2::facet_grid(
      rows = ggplot2::vars(benchmark_function),
      cols = ggplot2::vars(database),
      labeller = labeller
    )
}

g
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.