# ARROWBENCH_LOCAL_DIR="path/to/arrowbench/storage" Rscript inst/tpch-answer-gen.R
library(arrowbench)
library(duckdb)
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
sf <- 1
tpch_files <- ensure_source("tpch", scale_factor = sf)
input_functions <- list()
input_functions[["dplyr"]] <- function(name) {
file <- tpch_files[[name]]
return(arrow::read_parquet(file, as_data_frame = TRUE))
}
input_functions[["arrow"]] <- function(name) {
file <- tpch_files[[name]]
return(arrow::open_dataset(file, format = "parquet"))
}
con <- dbConnect(duckdb::duckdb("answer_gen_db"))
dbExecute(con, paste0("PRAGMA threads=10"))
# DuckDB tables
for (name in tpch_tables) {
file <- path.expand(tpch_files[[name]])
sql_query <- paste0("CREATE TABLE ", name, " AS SELECT * FROM parquet_scan('", file, "');")
file <- tpch_files[[name]]
dbExecute(con, sql_query)
}
input_functions[["duckdb"]] <- function(name) {
return(dplyr::tbl(con, name))
}
# create directory to save the answers to
dir.create(glue::glue("./answers/scale-factor-{sf}/"), recursive = TRUE)
for (q in c(1:22)) {
message("==================================================")
message(glue::glue("Query: {q}"))
message("==================================================")
query <- q
# grab the sql queries from github (this URL might need to be updated if their location in the repo changes.)
sql <- paste0(httr::GET(
glue::glue("https://raw.githubusercontent.com/duckdb/duckdb/master/extension/tpch/dbgen/queries/q{stringr::str_pad(query, 2, pad = '0')}.sql")
), collapse = "\n")
# dplyr with scale factor 10 requires a lot of memory, if hitting `vector memory exhausted (limit reached?)` comment it out
# at scale factor 0.01 there are small differences between duckdb and arrow for some queries. This is likely due to decimal precision / rounding differences, but I haven't dug into it too deeply.
result_dplyr <- tpc_h_queries[[query]](input_functions[["dplyr"]])
result_arrow <- tpc_h_queries[[query]](input_functions[["arrow"]], collect_func = compute)
result_duckdb <- as_tibble(dbGetQuery(con, sql))
# compare the arrow results with both dplyr and duckdb versions
print(waldo::compare(as.data.frame(result_arrow), result_dplyr, tolerance = 0.01, x_arg = "arrow", y_arg = "dplyr"))
print(waldo::compare(as.data.frame(result_arrow), result_duckdb, tolerance = 0.01, x_arg = "arrow", y_arg = "duckdb"))
write_parquet(result_arrow, glue::glue("./answers/scale-factor-{sf}/tpch-q{stringr::str_pad(query, 2, pad = '0')}-sf{sf}.parquet"))
}
# clean up duckdb database file
DBI::dbDisconnect(con, shutdown = TRUE)
unlink("answer_gen_db")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.