Benchmark some possible backends for the registry.
library(bench)
library(contentid) # remotes::install_github("cboettig/contentid", upgrade = TRUE)
knitr::opts_chunk$set(error=TRUE)
# Resolve the registry snapshot by its content hash and read it in as TSV.
ref <- contentid::resolve("hash://sha256/598032f108d602a8ad9d1031a2bdc4bca1d5dca468981fa29592e1660c8f4883")
df <- read.delim(ref, stringsAsFactors = FALSE)
# Keep only the two columns exercised below, renamed to url / id.
df <- dplyr::select(df, url = contentURL, id = checksum)
# Query workload: one million URLs sampled from the table itself,
# so every lookup should hit.
ex <- sample(df$url, 1e6)
# Baseline: base-R logical subsetting of the in-memory data.frame.
# id0 is the reference answer the other backends are compared against.
bench_time({
id0 <- df[df$url %in% ex,]$id
})
## process real
## 651ms 652ms
dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Same lookup expressed with dplyr verbs on the in-memory data.frame.
bench_time({
id1 <- df %>% filter(url %in% ex) %>% pull(id)
})
## process real
## 662ms 663ms
# Sanity check: filter/pull reproduces the base-R baseline exactly.
identical(id0, id1)
## [1] TRUE
## inner join is slower, and not literally the same thing
# NOTE(review): no `by =` is supplied, so the key is inferred (see the
# "Joining, by = \"url\"" message below). A join can also reorder rows
# and duplicate matches, which is presumably why identical() is FALSE
# below even though the same urls are matched — confirm.
bench_time({
id2 <- df %>% inner_join(tibble(url = ex)) %>% pull(id)
})
## Joining, by = "url"
## process real
## 2.36s 2.37s
identical(id0, id2)
## [1] FALSE
disk.frame
An fst-file backed on-disk storage format with lightweight dplyr semantics.
# disk.frame: chunked, fst-backed on-disk storage queried with dplyr verbs.
library(disk.frame, warn.conflicts = FALSE, quietly = TRUE)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
##
## ## Message from disk.frame:
## We have 1 workers to use with disk.frame.
## To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.
##
##
## It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:
##
##
## ```r
## # this will set up disk.frame with multiple workers
## setup_disk.frame()
## # this will allow unlimited amount of data to be passed from worker to worker
## options(future.globals.maxSize = Inf)
## ```
# Use the default worker setup (12 workers on this machine, per the output).
setup_disk.frame()
## The number of workers available for disk.frame is 12
#options(future.globals.maxSize = Inf) # wow memory issues quickly
# Materialize the in-memory data.frame as a disk.frame on disk.
df_con <- disk.frame::as.disk.frame(df)
bench_time({ ##
id3 <- df_con %>% filter(url %in% ex) %>% collect() %>% pull(id)
})
## process real
## 1.59m 3.49m
# Chunked evaluation does not preserve row order, so compare sorted values.
identical(sort(id0), sort(id3))
## [1] TRUE
# thor: an LMDB key-value store, treating the registry as url -> id pairs.
library(thor)
## set map size to ~ 4GB to be safe
## Thor persists to local db, but whole DB must be able to fit in RAM?
dbfile <- tempfile()
# mapsize is in bytes: 1048576 * 4e3 ≈ 4.2e9 (~4 GB).
env <- thor::mdb_env(dbfile, mapsize = 1048576*4e3)
# Bulk-insert all url -> id pairs.
env$mput(df$url, df$id)
## NULL
bench_time({ #
# mget() returns a list of values; as.character() flattens it to a vector.
id4 <- env$mget(ex) %>% as.character()
})
## process real
## 2.01s 2.01s
#identical(id0, id4)
# On-disk size of the LMDB database directory.
fs::dir_info(dbfile) %>% pull(size) %>% sum()
## 894M
arrow
Using .parquet instead of .tsv as the base file is slightly faster than
vroom on a compressed tsv, and it reads in as a standard data.frame.
Arrow offers dplyr syntax (though can we forgo the dplyr dependency?),
but that syntax is not DBI-based and is very hit-or-miss: filter(x %in% ...)
semantics don't always work (but don't error?), and inner_join() is
not implemented…
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
##
## timestamp
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
# Write the table out as a parquet file inside a fresh temp directory
# (tempfile() is used as a directory name; dir.create makes the full path).
pqt <- file.path(tempfile(), "arrow_dir", "df.parquet")
dir <- dirname(pqt)
dir.create(dir, recursive = TRUE)
write_parquet(df, pqt)
## parquet on disk w/ dplyr semantics
# open_dataset() scans the directory lazily; filter is pushed to arrow
# and only collect() materializes an R data.frame.
con_arw <- arrow::open_dataset(dir)
bench_time({ # 8.8s
id5 <- con_arw %>%
# inner_join(tibble(url = ex), copy=TRUE) %>% ## NO inner join
filter(url %in% ex) %>%
collect() %>% pull(id)
})
## process real
## 21.2s 20.6s
identical(id0, id5)
## [1] TRUE
## an in memory data.frame from parquet, but reading is quite fast!
bench_time({# 1.3s
df_pqt <- read_parquet(pqt)
})
## process real
## 3.15s 2.9s
# On-disk size of the parquet file.
fs::file_size(pqt)
## 562M
duckdb
# install.packages("duckdb", repos=c("http://download.duckdb.org/alias/master/rstats/", "http://cran.rstudio.com"))
library(duckdb)
## Loading required package: DBI
ddir <- fs::path(fs::path_temp(), "duckdb", "duckdb1")
fs::dir_create(fs::path_dir(ddir))
# File-backed duckdb database with the registry loaded as table "df".
con <- DBI::dbConnect( duckdb::duckdb(), dbdir = ddir)
DBI::dbWriteTable(con, "df", df)
# copy = TRUE uploads the 1e6-row lookup tibble into the database so the
# join runs entirely in duckdb.
bench_time({
id6 <- tbl(con, "df") %>% inner_join(tibble(url = ex), by="url", copy = TRUE) %>% pull(id)
})
## process real
## 1.56s 1.59s
# FALSE, as with the in-memory inner_join above — presumably row-order
# differences; confirm.
identical(id0, id6)
## [1] FALSE
# filter(%in%) translates the 1e6-element vector into a literal SQL
# IN (...) list; preparing that enormous query fails (transcript below),
# so id6b is never created.
bench_time({
id6b <- tbl(con, "df") %>% filter(url %in% ex) %>% pull(id)
})
## Error in .local(conn, statement, ...): duckdb_prepare_R: Failed to prepare query SELECT "id"
## FROM "df"
## WHERE ("url" IN ('https://merritt-aws.cdlib.org:8084/mn/v2/object/ark%3A%2F13030%2Fm5m32tpn%2F3%2Fcadwsap-s3610110-006-main.csv', 'https://cn.dataone.org/cn/v2/object/doi%3A10.6085%2FAA%2FALEXXX_015MTBD003R00_20051021.50.3', 'https://arcticdata.io/metacat/d1/mn/v2/object/urn%3Auuid%3A99277a51-a5ed-40b7-bb4d-9c1e9d1fae8a', 'https://datadryad.org/mn/v2/object/https%3A%2F%2Fdoi.org%2F10.5061%2Fdryad.5gk51p0%3Fformat%3Dd1rem%26ver%3D2018-08-21T23%3A18%3A15.438%2B00%3A00', 'https://gmn.lternet.edu/mn/v2/object/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Freport%2Feml%2Flter-landsat-ledaps%2F6127%2F1', 'https://cn.dataone.org/cn/v2/object/http%3A%2F%2Fdx.doi.org%2F10.5061%2Fdryad.gj51n%2F1%3Fver%3D2016-03-01T19%3A23%3A37.195-05%3A00', 'https://data.piscoweb.org/catalog/d1/mn/v2/object/doi%3A10.6085%2FAA%2FBBYX00_XXXITBDXLSR03_20060227.40.1', 'https://pangaea-orc-1.dataone.org/mn/v2/object/1cdd1b536b1556ecd6734364fe47a8de', 'http
identical(id0, id6b)
## Error in identical(id0, id6b): object 'id6b' not found
dbDisconnect(con, shutdown=TRUE)
# On-disk size of the duckdb database directory.
fs::dir_info(fs::path_dir(ddir)) %>% pull(size) %>% sum()
## 562M
MonetDBLite
# install.packages("MonetDBLite", repo = "https://cboettig.github.io/drat")
library(MonetDBLite)
library(DBI)
library(dplyr)
mdir <- tempfile()
# Embedded MonetDB database in a temp directory, registry loaded as "df".
con2 <- DBI::dbConnect( MonetDBLite() , dbname = mdir)
DBI::dbWriteTable(con2, "df", df)
# copy = TRUE uploads the lookup tibble so the join runs in-database;
# the join key is inferred (see "Joining, by = \"url\"" below).
bench_time({
id7 <- tbl(con2, "df") %>% inner_join(tibble(url = ex), copy = TRUE) %>% pull(id)
})
## Joining, by = "url"
## process real
## 7.64s 1.44s
# FALSE, consistent with the other inner_join variants above.
identical(id0, id7)
## [1] FALSE
### fails if ex is a big vector
# bench_time({
# id7b <- tbl(con2, "df") %>% filter(url %in% ex) %>% pull(id)
# })
# identical(id0, id7b)
DBI::dbDisconnect(con2, shutdown=TRUE)
rm(con2)
# Total on-disk size of the MonetDBLite database directory.
fs::dir_info(mdir, recurse=TRUE) %>% pull(size) %>% sum()
## 519M
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.