tests/testthat/test-translate.R

test_that("expect_valid_log fails when log file is nonsense", {
  # use a file name that "looks like" it could be a real Inspect log so that
  # Inspect tries to read it
  tmp_dir <- withr::local_tempdir()
  tmp_file <- file.path(
    tmp_dir,
    "2025-04-02T16-49-36-05-00_simpleaddition_e1e56aeef83a77f6392787.json"
  )
  file.create(tmp_file)
  writeLines(c("beep", "bop", "boop"), tmp_file)

  expect_error(
    suppressWarnings(expect_valid_log(tmp_file)),
    regexp = "Expecting value: line 1 column 1"
  )
})

test_that("vitals writes valid eval logs (basic, openai)", {
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))
  tmp_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
  withr::local_options(cli.default_handler = function(...) {
  })
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa()
  )
  tsk$eval()
  expect_valid_log(tsk$log())
})

test_that("vitals writes valid eval logs (basic, claude)", {
  skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
  tmp_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
  withr::local_options(cli.default_handler = function(...) {
  })
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_anthropic(
      model = "claude-3-7-sonnet-latest"
    )),
    scorer = model_graded_qa()
  )
  tsk$eval()
  expect_valid_log(tsk$log())
})

test_that("vitals writes valid eval logs (basic, gemini)", {
  skip_if(identical(Sys.getenv("GOOGLE_API_KEY"), ""))
  tmp_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
  withr::local_options(cli.default_handler = function(...) {
  })
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_google_gemini(model = "gemini-2.0-flash")),
    scorer = model_graded_qa()
  )
  tsk$eval()
  expect_valid_log(tsk$log())
})


test_that("vitals writes valid eval logs (solver tool calls, claude)", {
  skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
  tmp_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
  withr::local_options(cli.default_handler = function(...) {
  })
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  current_date <- tibble::tibble(
    input = c("What's the current date?"),
    target = c("2024-01-01")
  )

  ch <- chat_anthropic(model = "claude-3-7-sonnet-latest")
  ch$register_tool(tool(function() "2024-01-01", "Return the current date"))

  tsk <- Task$new(
    dataset = current_date,
    solver = generate(ch),
    scorer = function(samples) {
      list(score = factor("C", levels = c("I", "C"), ordered = TRUE))
    }
  )
  tsk$eval()
  expect_valid_log(tsk$log())
})

test_that("vitals writes valid eval logs (solver errors on tool call, claude)", {
  skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
  tmp_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
  withr::local_options(cli.default_handler = function(...) {
  })
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  current_date <- tibble::tibble(
    input = c("What's the current date?"),
    target = c("2024-01-01")
  )

  ch <- chat_anthropic(model = "claude-3-7-sonnet-latest")
  ch$register_tool(
    tool(function() stop("Couldn't find the date"), "Return the current date")
  )

  tsk <- Task$new(
    dataset = current_date,
    solver = generate(ch),
    scorer = function(samples) {
      list(score = factor("C", levels = c("I", "C"), ordered = TRUE))
    }
  )

  # a tool call will fail here and raise a warning; this is intentional.
  # since raised from ellmer, we don't expect anything specific about it.
  suppressWarnings(tsk$eval())

  log_file <- list.files(tmp_dir, full.names = TRUE)
  expect_gte(length(log_file), 1)

  expect_valid_log(log_file[1])
})

test_that("as_metadata can make lists into a named vector (#69)", {
  res <- as_metadata(mtcars[1:2, 1:2])
  expect_type(res, "list")
  expect_named(res, c("mpg", "cyl"))
  expect_snapshot(res)

  res <- as_metadata(tibble(x = list(list(a = 1, b = 2)), y = 3))
  expect_type(res, "list")
  expect_named(res, c("x", "y"))
  expect_snapshot(res)

  res <- as_metadata(list(1:3, b = 2))
  expect_type(res, "list")
  expect_named(res, c("1", "b"))
  expect_snapshot(res)

  res <- as_metadata(list(1:3, b = 2))
  expect_type(res, "list")
  expect_named(res, c("1", "b"))
  expect_snapshot(res)

  vec <- c(a = 1, b = 2)
  expect_equal(as_metadata(vec), as.list(vec))
})

Try the vitals package in your browser

Any scripts or data that you put into this service are public.

vitals documentation built on June 24, 2025, 9:08 a.m.