Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
# Set a flag to determine if the vignette should be run in simple context (vignettes)
# or in an extended context (pkgdown)
simple_rendering <- rlang::is_installed("pkgdown") && !pkgdown::in_pkgdown()
## ----setup--------------------------------------------------------------------
library(diseasystore)
## ----benchmark data, include = FALSE------------------------------------------
# Our benchmark data is the mtcars data set but repeated to increase the data size
# If updated, please also update the version in the data-raw/benchmark.R
data_generator <- function(repeats) {
purrr::map(
seq(repeats),
\(i) {
dplyr::mutate(
mtcars,
"row_id" = dplyr::row_number() + (i - 1) * nrow(mtcars),
"car" = paste(rownames(mtcars), .data$row_id)
)
}
) |>
purrr::reduce(rbind) |>
dplyr::rename_with(~ tolower(gsub(".", "_", .x, fixed = TRUE)))
}
benchmark_data <- tibble::as_tibble(data_generator(1000)) |>
dplyr::select("row_id", "car", dplyr::everything())
## ----benchmark data print-----------------------------------------------------
benchmark_data
## ----eval = FALSE-------------------------------------------------------------
# DiseasystoreMtcars <- R6::R6Class(
# classname = "DiseasystoreBase",
# inherit = DiseasystoreBase,
# private = list(
# .ds_map = list("n_cyl" = "mtcars_cyl", "vs" = "mtcars_vs"),
# mtcars_cyl = FeatureHandler$new(
# compute = function(start_date, end_date, slice_ts, source_conn) {
# out <- benchmark_data |>
# dplyr::transmute(
# "key_car" = .data$car, "n_cyl" = .data$cyl,
# "valid_from" = Sys.Date() - lubridate::days(2 * .data$row_id - 1),
# "valid_until" = .data$valid_from + lubridate::days(2)
# )
# return(out)
# },
# key_join = key_join_sum
# ),
# mtcars_vs = FeatureHandler$new(
# compute = function(start_date, end_date, slice_ts, source_conn) {
# out <- benchmark_data |>
# dplyr::transmute(
# "key_car" = .data$car, .data$vs,
# "valid_from" = Sys.Date() - lubridate::days(2 * .data$row_id),
# "valid_until" = .data$valid_from + lubridate::days(2)
# )
# return(out)
# },
# key_join = key_join_sum
# )
# )
# )
## ----benchmark context, results = "asis", echo = FALSE------------------------
if (simple_rendering) {
cat(
"The results of the benchmark are shown graphically below",
"(mean and standard deviation), where measure the",
"performance of `diseasystore`."
)
} else {
cat(
"The results of the benchmark are shown graphically below",
"(mean and standard deviation), where we compare the",
"current development version of `diseasystore` with the current CRAN version.",
"In addition, we run the benchmarks with the current development version of",
"`{SCDB}` versus the current CRAN version since `{SCDB}` is used",
"internally in `diseasystore`."
)
}
## ----echo = FALSE, eval = rlang::is_installed("here")-------------------------
benchmark_location <- c(
devtools::package_file("inst", "extdata", "benchmarks.rds"),
system.file("extdata", "benchmarks.rds", package = "diseasystore"),
here::here("inst", "extdata", "benchmarks.rds")
) |>
purrr::discard(~ identical(., "")) |>
purrr::pluck(1)
benchmarks <- readRDS(benchmark_location) |>
dplyr::mutate("version" = as.character(.data$version))
# The benchmark contains data for the CRAN version, the main branch and whatever
# branch the workflow was run on.
# We need to render the vignette differently dependent on where the vignette is
# rendered.
# If we render the vignette on CRAN, we need to only show benchmarks for the current
# CRAN version SCDB and the newest development benchmarks
# -- but mark these benchmarks as having the new version of diseasystore.
# (since we are releasing a new version of diseasystore, these benchmarks will belong
# to the new CRAN version)
# If we render the vignette on main, we need to render the benchmarks from the
# non-main branch as the main results.
# That is because these benchmarks will be newer than those on main, and the code
# that generate the results will have been merged onto main
#-- causing the vignette to be rendered.
# If we render the vignette on the non-main branch, we need to render all of
# the benchmarks
# Determine if the SHA is on main
sha <- benchmarks |>
dplyr::distinct(.data$version) |>
dplyr::filter(
!startsWith(.data$version, "diseasystore"),
.data$version != "main"
) |>
dplyr::pull("version")
# Check local git history
on_main <- tryCatch({
system(glue::glue("git branch main --contains {sha}"), intern = TRUE) |>
stringr::str_detect(stringr::fixed("main")) |>
isTRUE()
}, warning = function(w) {
# If on GitHub, git is not installed and we assume TRUE.
# This will render the vignette as it will look once merged onto main.
return(identical(Sys.getenv("CI"), "true"))
})
# In the simple context we use the newest benchmark (version = sha)
# This benchmark is then labelled with the newest version number of SCDB
if (simple_rendering) {
benchmarks <- benchmarks |>
dplyr::filter(.data$version == !!sha, .data$SCDB != "main") |>
dplyr::mutate(
"version" = paste0("diseasystore v", packageVersion("diseasystore"))
)
} else if (on_main) {
# If the SHA has been merged, use as the "main" version and remove the other,
# older, main version
benchmarks <- benchmarks |>
dplyr::filter(.data$version != "main") |>
dplyr::mutate(
"version" = dplyr::if_else(.data$version == sha, "development", .data$version)
)
}
## ----Generate alt text for benchmarks, echo = FALSE, results = "hide", eval = rlang::is_installed("here")----
benchmark_alt_text <- benchmarks |>
dplyr::filter(startsWith(.data$version, "diseasystore v"), .data$SCDB != "main") |>
dplyr::transmute(
.data$benchmark_function,
.data$database,
"time_seconds" = .data$time / 1e9
) |>
dplyr::mutate(
"backend" = stringr::str_remove(.data$database, r"{\sv[\w+\.]+$}")
) |>
dplyr::summarise(
"mean_time_seconds" = round(mean(.data$time_seconds)),
.by = c("benchmark_function", "backend")
) |>
dplyr::summarise(
"backends" = toString(.data$backend),
.by = c("benchmark_function", "mean_time_seconds")
) |>
dplyr::arrange(.data$benchmark_function, .data$mean_time_seconds) |>
purrr::pmap_chr(
\(benchmark_function, mean_time_seconds, backends) {
glue::glue(
"The {benchmark_function} benchmark takes {mean_time_seconds} seconds",
" to run on the {backends} backends."
)
}
) |>
purrr::reduce(
paste,
.init = "The results for the benchmarks of the CRAN versions are as follows:"
)
## ----echo = FALSE, fig.alt = benchmark_alt_text, eval = rlang::is_installed("here")----
# Mean and standard deviation (see ggplot2::mean_se())
mean_sd <- function(x) {
mu <- mean(x)
sd <- sd(x)
data.frame(y = mu, ymin = mu - sd, ymax = mu + sd)
}
# Determine subgroups
groups <- setdiff(
colnames(benchmarks),
c("expr", "time", "benchmark_function", "database", "version", "n")
)
# Apply "dodging" to sub-groups to show graphically
dodge <- ggplot2::position_dodge(width = 0.6)
# Insert newline into database name to improve rendering of figures
labeller <- ggplot2::as_labeller(
\(l) stringr::str_replace_all(l, stringr::fixed(" v"), "\nv")
)
# Set aesthetics for CRAN and non-CRAN versions
if (simple_rendering) {
aes <- ggplot2::aes(
x = version,
y = time / 1e9,
color = database
)
} else {
aes <- ggplot2::aes(
x = version,
y = time / 1e9,
group = !!switch(length(groups) > 0, as.symbol(groups)),
color = !!switch(length(groups) > 0, as.symbol(groups))
)
}
g <- ggplot2::ggplot(
benchmarks,
aes
) +
ggplot2::stat_summary(
fun.data = mean_sd,
geom = "pointrange",
size = 0.5,
linewidth = 1,
position = dodge
) +
ggplot2::facet_grid(
rows = ggplot2::vars(benchmark_function),
cols = ggplot2::vars(database),
scales = "free_y",
labeller = labeller
) +
ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(n.dodge = 2)) +
ggplot2::labs(x = "Codebase version", y = "Time (s)") +
ggplot2::theme(
legend.position = "bottom",
legend.justification = "left"
) +
ggplot2::ylim(0, NA)
if (simple_rendering) {
# Reduce font size for CRAN version
g <- g + ggplot2::theme(text = ggplot2::element_text(size = 8))
# Make the legend two rows
g <- g + ggplot2::guides(color = ggplot2::guide_legend(nrow = 2, byrow = TRUE))
} else {
# Add facets to non-CRAN rendering
g <- g +
ggplot2::facet_grid(
rows = ggplot2::vars(benchmark_function),
cols = ggplot2::vars(database),
scales = "free_y",
labeller = labeller
)
}
g
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.