Benchmark some possible backends for the registry.
library(bench)
library(contentid) # remotes::install_github("cboettig/contentid", upgrade = TRUE)
knitr::opts_chunk$set(error=TRUE)
# Resolve the registry snapshot by its content hash and read it in as TSV.
ref <- contentid::resolve("hash://sha256/598032f108d602a8ad9d1031a2bdc4bca1d5dca468981fa29592e1660c8f4883")
df <- read.delim(ref, stringsAsFactors = FALSE)
# Keep only the two columns exercised below, renamed to url / id.
df <- dplyr::select(df, url = contentURL, id = checksum)
# Query workload: one million URLs sampled from the table itself,
# so every lookup should hit.
ex <- sample(df$url, 1e6)
# Baseline: base-R logical subsetting of the in-memory data.frame.
# id0 is the reference answer the other backends are compared against.
bench_time({
id0 <- df[df$url %in% ex,]$id
})
## process real
## 651ms 652ms
dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Same lookup expressed with dplyr verbs on the in-memory data.frame.
bench_time({
id1 <- df %>% filter(url %in% ex) %>% pull(id)
})
## process real
## 662ms 663ms
# Sanity check: filter/pull reproduces the base-R baseline exactly.
identical(id0, id1)
## [1] TRUE
## inner join is slower, and not literally the same thing
# NOTE(review): no `by =` is supplied, so the key is inferred (see the
# "Joining, by = \"url\"" message below). A join can also reorder rows
# and duplicate matches, which is presumably why identical() is FALSE
# below even though the same urls are matched — confirm.
bench_time({
id2 <- df %>% inner_join(tibble(url = ex)) %>% pull(id)
})
## Joining, by = "url"
## process real
## 2.36s 2.37s
identical(id0, id2)
## [1] FALSE
disk.frame
An fst-file backed on-disk storage format with lightweight dplyr semantics.
# disk.frame: chunked, fst-backed on-disk storage queried with dplyr verbs.
library(disk.frame, warn.conflicts = FALSE, quietly = TRUE)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
##
## ## Message from disk.frame:
## We have 1 workers to use with disk.frame.
## To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.
##
##
## It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:
##
##
## ```r
## # this will set up disk.frame with multiple workers
## setup_disk.frame()
## # this will allow unlimited amount of data to be passed from worker to worker
## options(future.globals.maxSize = Inf)
## ```
# Use the default worker setup (12 workers on this machine, per the output).
setup_disk.frame()
## The number of workers available for disk.frame is 12
#options(future.globals.maxSize = Inf) # wow memory issues quickly
# Materialize the in-memory data.frame as a disk.frame on disk.
df_con <- disk.frame::as.disk.frame(df)
bench_time({ ##
id3 <- df_con %>% filter(url %in% ex) %>% collect() %>% pull(id)
})
## process real
## 1.59m 3.49m
# Chunked evaluation does not preserve row order, so compare sorted values.
identical(sort(id0), sort(id3))
## [1] TRUE
# thor: an LMDB key-value store, treating the registry as url -> id pairs.
library(thor)
## set map size to ~ 4GB to be safe
## Thor persists to local db, but whole DB must be able to fit in RAM?
dbfile <- tempfile()
# mapsize is in bytes: 1048576 * 4e3 ≈ 4.2e9 (~4 GB).
env <- thor::mdb_env(dbfile, mapsize = 1048576*4e3)
# Bulk-insert all url -> id pairs.
env$mput(df$url, df$id)
## NULL
bench_time({ #
# mget() returns a list of values; as.character() flattens it to a vector.
id4 <- env$mget(ex) %>% as.character()
})
## process real
## 2.01s 2.01s
#identical(id0, id4)
# On-disk size of the LMDB database directory.
fs::dir_info(dbfile) %>% pull(size) %>% sum()
## 894M
arrow
Using .parquet instead of .tsv as the base file is slightly faster than
vroom on a compressed tsv, and it reads in as a standard data.frame.
Arrow offers dplyr syntax (though can we forgo the dplyr dependency?),
but that syntax is not DBI-based and is very hit-or-miss: filter(x %in% ...)
semantics don't always work (but don't error?), and inner_join() is
not implemented…
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
##
## timestamp
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
# Write the table out as a parquet file inside a fresh temp directory
# (tempfile() is used as a directory name; dir.create makes the full path).
pqt <- file.path(tempfile(), "arrow_dir", "df.parquet")
dir <- dirname(pqt)
dir.create(dir, recursive = TRUE)
write_parquet(df, pqt)
## parquet on disk w/ dplyr semantics
# open_dataset() scans the directory lazily; filter is pushed to arrow
# and only collect() materializes an R data.frame.
con_arw <- arrow::open_dataset(dir)
bench_time({ # 8.8s
id5 <- con_arw %>%
# inner_join(tibble(url = ex), copy=TRUE) %>% ## NO inner join
filter(url %in% ex) %>%
collect() %>% pull(id)
})
## process real
## 21.2s 20.6s
identical(id0, id5)
## [1] TRUE
## an in memory data.frame from parquet, but reading is quite fast!
bench_time({# 1.3s
df_pqt <- read_parquet(pqt)
})
## process real
## 3.15s 2.9s
# On-disk size of the parquet file.
fs::file_size(pqt)
## 562M
duckdb
# install.packages("duckdb", repos=c("http://download.duckdb.org/alias/master/rstats/", "http://cran.rstudio.com"))
library(duckdb)
## Loading required package: DBI
ddir <- fs::path(fs::path_temp(), "duckdb", "duckdb1")
fs::dir_create(fs::path_dir(ddir))
# File-backed duckdb database with the registry loaded as table "df".
con <- DBI::dbConnect( duckdb::duckdb(), dbdir = ddir)
DBI::dbWriteTable(con, "df", df)
# copy = TRUE uploads the 1e6-row lookup tibble into the database so the
# join runs entirely in duckdb.
bench_time({
id6 <- tbl(con, "df") %>% inner_join(tibble(url = ex), by="url", copy = TRUE) %>% pull(id)
})
## process real
## 1.56s 1.59s
# FALSE, as with the in-memory inner_join above — presumably row-order
# differences; confirm.
identical(id0, id6)
## [1] FALSE
# filter(%in%) translates the 1e6-element vector into a literal SQL
# IN (...) list; preparing that enormous query fails (transcript below),
# so id6b is never created.
bench_time({
id6b <- tbl(con, "df") %>% filter(url %in% ex) %>% pull(id)
})
## Error in .local(conn, statement, ...): duckdb_prepare_R: Failed to prepare query SELECT "id"
## FROM "df"
## WHERE ("url" IN ('https://merritt-aws.cdlib.org:8084/mn/v2/object/ark%3A%2F13030%2Fm5m32tpn%2F3%2Fcadwsap-s3610110-006-main.csv', 'https://cn.dataone.org/cn/v2/object/doi%3A10.6085%2FAA%2FALEXXX_015MTBD003R00_20051021.50.3', 'https://arcticdata.io/metacat/d1/mn/v2/object/urn%3Auuid%3A99277a51-a5ed-40b7-bb4d-9c1e9d1fae8a', 'https://datadryad.org/mn/v2/object/https%3A%2F%2Fdoi.org%2F10.5061%2Fdryad.5gk51p0%3Fformat%3Dd1rem%26ver%3D2018-08-21T23%3A18%3A15.438%2B00%3A00', 'https://gmn.lternet.edu/mn/v2/object/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Freport%2Feml%2Flter-landsat-ledaps%2F6127%2F1', 'https://cn.dataone.org/cn/v2/object/http%3A%2F%2Fdx.doi.org%2F10.5061%2Fdryad.gj51n%2F1%3Fver%3D2016-03-01T19%3A23%3A37.195-05%3A00', 'https://data.piscoweb.org/catalog/d1/mn/v2/object/doi%3A10.6085%2FAA%2FBBYX00_XXXITBDXLSR03_20060227.40.1', 'https://pangaea-orc-1.dataone.org/mn/v2/object/1cdd1b536b1556ecd6734364fe47a8de', 'http
identical(id0, id6b)
## Error in identical(id0, id6b): object 'id6b' not found
dbDisconnect(con, shutdown=TRUE)
# On-disk size of the duckdb database directory.
fs::dir_info(fs::path_dir(ddir)) %>% pull(size) %>% sum()
## 562M
MonetDBLite
# install.packages("MonetDBLite", repo = "https://cboettig.github.io/drat")
library(MonetDBLite)
library(DBI)
library(dplyr)
mdir <- tempfile()
# Embedded MonetDB database in a temp directory, registry loaded as "df".
con2 <- DBI::dbConnect( MonetDBLite() , dbname = mdir)
DBI::dbWriteTable(con2, "df", df)
# copy = TRUE uploads the lookup tibble so the join runs in-database;
# the join key is inferred (see "Joining, by = \"url\"" below).
bench_time({
id7 <- tbl(con2, "df") %>% inner_join(tibble(url = ex), copy = TRUE) %>% pull(id)
})
## Joining, by = "url"
## process real
## 7.64s 1.44s
# FALSE, consistent with the other inner_join variants above.
identical(id0, id7)
## [1] FALSE
### fails if ex is a big vector
# bench_time({
# id7b <- tbl(con2, "df") %>% filter(url %in% ex) %>% pull(id)
# })
# identical(id0, id7b)
DBI::dbDisconnect(con2, shutdown=TRUE)
rm(con2)
# Total on-disk size of the MonetDBLite database directory.
fs::dir_info(mdir, recurse=TRUE) %>% pull(size) %>% sum()
## 519M
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.