# tests/testthat/test-bulk-import.R

library(arkdb)
library(testthat)

# Label the tests in this file as "Bulk importers" in testthat's output.
# NOTE(review): context() is deprecated in testthat 3rd edition (the file
# name is used instead) -- confirm which edition this package targets.
testthat::context("Bulk importers")
# Exercises unark() against a file-backed duckdb database:
#   1. a first import is expected to announce the native bulk importer,
#   2. a re-import with overwrite = TRUE must NOT emit the "falling back on
#      R-based parser" message,
#   3. with try_native = FALSE the import reports reading in "line chunks",
#   4. a compressed file can be imported straight from a URL.
test_that("We can do fast bulk import with duckdb in most cases", {
  skip_on_cran()
  skip_if_offline()
  skip_if_not_installed("duckdb")
  skip_if_not_installed("dplyr")
  skip_if_not_installed("nycflights13")
  skip_if_not_installed("readr")

  # Work in a dedicated scratch directory. Using tempdir() itself as the
  # database directory and then unlinking it recursively would delete the
  # whole R session temp directory and break every later tempfile() call.
  work_dir <- fs::dir_create(tempfile("arkdb-duckdb-"))
  on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)

  duck_file <- fs::path(work_dir, "flights", ext = "db")
  db_con <- DBI::dbConnect(duckdb::duckdb(), duck_file)
  # Registered with after = FALSE so the connection is closed before the
  # directory holding the database file is removed, even if the test errors.
  on.exit(
    DBI::dbDisconnect(db_con, shutdown = TRUE),
    add = TRUE, after = FALSE
  )

  # Reference the datasets by namespace instead of attaching nycflights13,
  # so the test leaves the search path untouched.
  table_names <- c("airlines", "planes", "flights")
  files <- file.path(work_dir, paste0(table_names, ".tsv.xz"))
  readr::write_tsv(nycflights13::airlines, files[[1]])
  readr::write_tsv(nycflights13::planes, files[[2]])
  readr::write_tsv(nycflights13::flights, files[[3]])

  expect_message(unark(files, db_con), "Native bulk importer found")
  # Overwriting existing tables must not produce the fallback message.
  expect_failure(expect_message(
    suppressWarnings(unark(files, db_con, overwrite = TRUE)),
    "Native import failed, falling back on R-based parser"
  ))

  who <- DBI::dbListTables(db_con)
  expect_identical(sort(who), sort(table_names))
  remote_flights <- dplyr::tbl(db_con, "flights")
  expect_s3_class(remote_flights, "tbl_duckdb_connection")
  # Start from an empty database before testing the R-based importer.
  for (tbl in who) {
    DBI::dbRemoveTable(db_con, tbl)
  }

  expect_message(
    suppressWarnings(unark(files, db_con, try_native = FALSE)),
    "line chunks"
  )

  who <- DBI::dbListTables(db_con)
  expect_identical(sort(who), sort(table_names))
  remote_flights <- dplyr::tbl(db_con, "flights")
  expect_s3_class(remote_flights, "tbl_duckdb_connection")

  ## import from URL
  url <- paste0(
    "https://github.com/ropensci/piggyback",
    "/releases/download/v0.0.3/mtcars.tsv.xz"
  )
  expect_message(unark(url, db_con), "Native bulk importer found")

  remote_cars <- dplyr::collect(dplyr::tbl(db_con, "mtcars"))
  expect_s3_class(remote_cars, "data.frame")
  expect_equivalent(mtcars, remote_cars)
})


# Exercises unark() against an embedded MonetDBLite database:
#   1. a first import is expected to announce the native bulk importer,
#   2. a re-import with overwrite = TRUE must NOT emit the "falling back on
#      R-based parser" message,
#   3. with try_native = FALSE the import reports reading in "line chunks",
#   4. a compressed file can be imported straight from a URL.
test_that("We can do fast bulk import with MonetDBLite in most cases", {
  skip_on_cran()
  skip_if_offline()
  skip_if_not_installed("MonetDBLite")
  skip_if_not_installed("dplyr")
  skip_if_not_installed("nycflights13")
  skip_if_not_installed("readr")

  MonetDBLite::monetdblite_shutdown()

  # Keep the database and the exported files inside one dedicated scratch
  # directory so a single recursive unlink removes everything; a plain
  # unlink() on tempdir() is a no-op and would leave the files behind.
  work_dir <- fs::dir_create(tempfile("arkdb-monet-"))
  on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)

  monet_dir <- fs::dir_create(fs::path(work_dir, "monet"))
  db_con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), monet_dir)
  # Registered with after = FALSE so the connection is closed before the
  # database directory is removed, even if the test errors part-way.
  on.exit(DBI::dbDisconnect(db_con), add = TRUE, after = FALSE)

  # Reference the datasets by namespace instead of attaching nycflights13,
  # so the test leaves the search path untouched.
  table_names <- c("airlines", "planes", "flights")
  files <- file.path(work_dir, paste0(table_names, ".tsv.xz"))
  readr::write_tsv(nycflights13::airlines, files[[1]])
  readr::write_tsv(nycflights13::planes, files[[2]])
  readr::write_tsv(nycflights13::flights, files[[3]])

  expect_message(unark(files, db_con), "Native bulk importer found")
  # Overwriting existing tables must not produce the fallback message.
  expect_failure(
    expect_message(
      suppressWarnings(unark(files, db_con, overwrite = TRUE)),
      "Native import failed, falling back on R-based parser"
    )
  )

  who <- DBI::dbListTables(db_con)
  expect_identical(sort(who), sort(table_names))
  remote_flights <- dplyr::tbl(db_con, "flights")
  expect_s3_class(remote_flights, "tbl_MonetDBEmbeddedConnection")
  # Start from an empty database before testing the R-based importer.
  for (tbl in who) {
    DBI::dbRemoveTable(db_con, tbl)
  }

  expect_message(
    suppressWarnings(
      unark(files, db_con, try_native = FALSE, overwrite = TRUE)
    ),
    "line chunks"
  )

  who <- DBI::dbListTables(db_con)
  expect_identical(sort(who), sort(table_names))
  remote_flights <- dplyr::tbl(db_con, "flights")
  # Check the re-imported table as well, matching the duckdb test.
  expect_s3_class(remote_flights, "tbl_MonetDBEmbeddedConnection")
  for (tbl in who) {
    DBI::dbRemoveTable(db_con, tbl)
  }

  ## import from URL
  url <- paste0(
    "https://github.com/ropensci/piggyback",
    "/releases/download/v0.0.3/mtcars.tsv.xz"
  )
  unark(url, db_con)

  remote_cars <- dplyr::collect(dplyr::tbl(db_con, "mtcars"))
  expect_s3_class(remote_cars, "data.frame")
  expect_equivalent(mtcars, remote_cars)
})
# ropensci/arkdb documentation built on Jan. 27, 2024, 4:47 p.m.