tests/testthat/test-store.R

test_that("ragnar_store_update/insert, v1", {
  store <- ragnar_store_create(
    version = 1,
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)
  expect_true(grepl("^store_[0-9]+$", store@name))

  chunks <- data.frame(
    origin = "foo",
    hash = "foo",
    text = "foo"
  )
  ragnar_store_update(store, chunks)

  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(val, chunks)

  # now try to update the store again - without changing the hash
  chunks2 <- data.frame(
    origin = "foo",
    hash = "foo",
    text = "bar"
  )
  ragnar_store_update(store, chunks2)

  # Expect that the text is not updated, because the hash is the same
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(val, chunks)

  # now try to update the store again - changing the hash
  chunks2 <- data.frame(
    origin = "foo",
    hash = "bar",
    text = "bar"
  )
  ragnar_store_update(store, chunks2)

  # Expect that the text is updated
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(val, chunks2)

  # Finally, try adding a new origin - even with the same hash
  chunks2 <- data.frame(
    origin = "bar",
    hash = "bar",
    text = "bar"
  )
  ragnar_store_update(store, chunks2)

  # Expect the origin is added
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(nrow(val), 2)

  # Try adding with insert
  chunks2 <- data.frame(
    origin = "bar",
    hash = "bar",
    text = "bar"
  )
  ragnar_store_insert(store, chunks2)

  # Since we used insert, there's no checking if the hash is the same
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(nrow(val), 3)
})

test_that("ragnar_store_update/insert", {
  store <- ragnar_store_create(
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)
  expect_true(grepl("^store_[0-9]+$", store@name))

  doc <- MarkdownDocument(c("foo", "bar", "faz"), 'someorigin')
  chunks <- doc |> markdown_chunk(target_size = 4, target_overlap = 0)
  ragnar_store_update(store, chunks)

  val <- dbGetQuery(
    store@con,
    'select start, "end", "context", text, from chunks'
  )
  expect_equal(val, as_bare_df(chunks))

  # now try to update the store with the same content
  chunks2 <- doc |> markdown_chunk(target_size = 4, target_overlap = 0)
  ragnar_store_update(store, chunks2)

  # Expect that the text is not updated
  val <- dbGetQuery(store@con, "select origin, text from chunks order by start")
  expect_equal(val$text, chunks$text)

  # now try to update the store again - with different content
  doc <- MarkdownDocument(c("abc", "def", "ghi"), 'someorigin')
  chunks <- doc |> markdown_chunk(target_size = 4, target_overlap = 0)
  ragnar_store_update(store, chunks)

  # Expect that the text is updated
  val <- dbGetQuery(
    store@con,
    'select start, "end", context, text, origin, from chunks'
  )
  expect_equal(val, chunks |> as_bare_df() |> mutate(origin = 'someorigin'))
  expect_equal(nrow(val), 3)

  # Finally, try adding a new origin - even with the same text
  doc2 <- doc |> set_props(origin = "some other origin")
  chunks2 <- doc2 |> markdown_chunk(target_size = 4, target_overlap = 0)
  ragnar_store_update(store, chunks2)

  hoist_origin <- function(x) {
    x$origin <- x@document@origin
    x[unique(c("origin", names(x)))]
  }

  # Expect the origin is added
  val <- dbGetQuery(
    store@con,
    "select * exclude (chunk_id, doc_id, embedding) from chunks"
  )
  expect_equal(nrow(val), 6)
  val <- val |> arrange(origin, start) |> as_bare_df()
  expected <- vec_rbind(
    chunks |> hoist_origin() |> as_bare_df(),
    chunks2 |> hoist_origin() |> as_bare_df()
  ) |>
    arrange(origin, start)

  expect_equal(expected, val)
})

test_that("behavior when no hash/origin are provided", {
  store <- ragnar_store_create(
    version = 1,
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)

  chunks <- data.frame(
    text = "foo"
  )
  # users must explicitly set origin and hash when updating!
  expect_error(
    ragnar_store_update(store, chunks),
    "chunks` must have `origin` and `hash`"
  )
  # they can insert though
  ragnar_store_insert(store, chunks)

  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(
    val,
    data.frame(origin = NA_character_, hash = rlang::hash("foo"), text = "foo")
  )

  # if they insert again, even though the text has the same hash, we don't update anything
  ragnar_store_insert(store, chunks)

  # Expect that the text is not updated, because the hash is the same
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(
    val,
    rbind(
      data.frame(
        origin = NA_character_,
        hash = rlang::hash("foo"),
        text = "foo"
      ),
      data.frame(
        origin = NA_character_,
        hash = rlang::hash("foo"),
        text = "foo"
      )
    )
  )
})

test_that("additional columns", {
  store <- ragnar_store_create(
    version = 1,
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)),
    extra_cols = data.frame(h1 = character(0), int = integer(0))
  )
  maybe_set_threads(store)

  chunks <- data.frame(
    text = "foo"
  )
  # You can't insert chunks that miss the column
  expect_error(ragnar_store_insert(store, chunks), regexp = "Missing columns:")

  # Can't insert if they don't match types
  chunks <- data.frame(
    text = "foo",
    h1 = 1L, # h1 should be character, not integer
    int = 1L
  )
  expect_error(ragnar_store_insert(store, chunks), regexp = "Can't convert")

  # We should include if there's the correct data
  chunks <- data.frame(
    text = "foo",
    h1 = "hello",
    int = 1 # numerics should be allowed and converted to integer
  )
  ragnar_store_insert(store, chunks)
  val <- dbGetQuery(store@con, "select text, h1 from chunks")
  expect_equal(val, chunks |> select(-int))

  # It's fine to insert a chunk if it has an additional column. It's
  # simply ignored.
  chunks <- data.frame(
    text = "foo",
    h1 = "hello",
    h2 = "bye",
    int = 1L
  )
  ragnar_store_insert(store, chunks)
  val <- dbGetQuery(store@con, "select text, h1 from chunks")
  expect_equal(nrow(val), 2)
})

test_that("Allow a NULL embedding function", {
  skip_on_cran() # See comment in test-retrieve.R
  store <- ragnar_store_create(embed = NULL, version = 1)
  maybe_set_threads(store)
  chunks <- data.frame(
    origin = c("foo", "bar"),
    hash = c("foo", "bar"),
    text = c("foo", "bar")
  )
  ragnar_store_update(store, chunks)
  # no error, vss is ignored
  expect_no_error(ragnar_store_build_index(store))

  ragnar_store_build_index(store, type = "fts")
  ragnar_retrieve_bm25(store, "bar")
  expect_no_error(ragnar_retrieve(store, "bar"))
})

test_that("works with MotherDuck", {
  skip_if_cant_use_motherduck()
  skip_on_cran() # See comment in test-retrieve.R

  store <- ragnar_store_create(
    version = 1,
    "md:ragnartest",
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)),
    overwrite = TRUE
  )
  maybe_set_threads(store)

  chunks <- data.frame(
    origin = "foo",
    hash = "foo",
    text = "foo"
  )

  expect_error(ragnar_store_insert(store, chunks), regexp = NA)
  expect_warning(
    ragnar_store_build_index(store),
    regexp = "MotherDuck does not support"
  )
  expect_error(ragnar_retrieve(store, "hello"), regexp = NA)

  # Since we used insert, there's no checking if the hash is the same
  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(nrow(val), 1)

  # connect to the motherduck store
  store <- ragnar_store_connect("md:ragnartest")
  maybe_set_threads(store)
  expect_error(ragnar_retrieve(store, "hello"), regexp = NA)

  val <- dbGetQuery(store@con, "select origin, hash, text from chunks")
  expect_equal(nrow(val), 1)
})

test_that("embed functions get the defaults stored", {
  store <- ragnar_store_create(embed = function(x) ragnar::embed_openai(x))
  expect_snapshot(store@embed)

  # here embed_openai is implicitly obtained from ragnar::embed_openai
  store <- ragnar_store_create(embed = function(x) embed_openai(x))
  expect_snapshot(store@embed)

  # if the embed function takes ..., they're preserved
  store <- ragnar_store_create(
    embed = function(x, ...) ragnar::embed_openai(x, ...)
  )
  expect_snapshot(store@embed)

  # when using the partialized version, we should also add the defaults
  store <- ragnar_store_create(embed = embed_openai())
  expect_snapshot(store@embed)

  # test other embed funcs
  skip_if(inherits(try(silent = TRUE, ragnar::embed_ollama("hi")), "try-error"))
  store <- ragnar_store_create(embed = function(x) ragnar::embed_ollama(x))
  expect_snapshot(store@embed)
})

test_that("store v1 accepts markdown chunks (from v2)", {
  skip_on_cran() # See comment in test-retrieve.R
  store <- ragnar_store_create(
    version = 1,
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)

  chunks <- test_doc() |>
    read_as_markdown() |>
    markdown_chunk()

  # insert directly ignores al other columns
  expect_no_error(ragnar_store_insert(store, chunks))
  ragnar_store_build_index(store)
  expect_equal(ragnar_retrieve(store, "r")$origin[1], chunks@document@origin)

  # when the store has start, end and origin as extra cols they get included
  store <- ragnar_store_create(
    version = 1,
    extra_cols = data.frame(
      start = integer(0),
      end = integer(0),
      context = character(0)
    ),
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)

  chunks <- test_doc() |>
    read_as_markdown() |>
    markdown_chunk()

  expect_no_error(ragnar_store_insert(store, chunks))
  ragnar_store_build_index(store)
  expect_equal(ragnar_retrieve(store, "r")$origin[1], chunks@document@origin)

  # when the chunks data.frame already contains an origin column, it's used instead.
  store <- ragnar_store_create(
    version = 1,
    embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100))
  )
  maybe_set_threads(store)

  chunks2 <- chunks |> mutate(origin = "a")
  expect_no_error(ragnar_store_insert(store, chunks2))
  ragnar_store_build_index(store)
  expect_equal(ragnar_retrieve(store, "r")$origin[1], "a")
})

Try the ragnar package in your browser

Any scripts or data that you put into this service are public.

ragnar documentation built on Aug. 8, 2025, 7:07 p.m.