Nothing
test_that("expect_valid_log fails when log file is nonsense", {
# use a file name that "looks like" it could be a real Inspect log so that
# Inspect tries to read it
tmp_dir <- withr::local_tempdir()
tmp_file <- file.path(
tmp_dir,
"2025-04-02T16-49-36-05-00_simpleaddition_e1e56aeef83a77f6392787.json"
)
file.create(tmp_file)
writeLines(c("beep", "bop", "boop"), tmp_file)
expect_error(
suppressWarnings(expect_valid_log(tmp_file)),
regexp = "Expecting value: line 1 column 1"
)
})
test_that("vitals writes valid eval logs (basic, openai)", {
skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))
tmp_dir <- withr::local_tempdir()
withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
withr::local_options(cli.default_handler = function(...) {
})
local_mocked_bindings(interactive = function(...) FALSE)
simple_addition <- tibble::tibble(
input = c("What's 2+2?", "What's 2+3?"),
target = c("4", "5")
)
tsk <- Task$new(
dataset = simple_addition,
solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
scorer = model_graded_qa()
)
tsk$eval()
expect_valid_log(tsk$log())
})
test_that("vitals writes valid eval logs (basic, claude)", {
skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
tmp_dir <- withr::local_tempdir()
withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
withr::local_options(cli.default_handler = function(...) {
})
local_mocked_bindings(interactive = function(...) FALSE)
simple_addition <- tibble::tibble(
input = c("What's 2+2?", "What's 2+3?"),
target = c("4", "5")
)
tsk <- Task$new(
dataset = simple_addition,
solver = generate(ellmer::chat_anthropic(
model = "claude-3-7-sonnet-latest"
)),
scorer = model_graded_qa()
)
tsk$eval()
expect_valid_log(tsk$log())
})
test_that("vitals writes valid eval logs (basic, gemini)", {
skip_if(identical(Sys.getenv("GOOGLE_API_KEY"), ""))
tmp_dir <- withr::local_tempdir()
withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
withr::local_options(cli.default_handler = function(...) {
})
local_mocked_bindings(interactive = function(...) FALSE)
simple_addition <- tibble::tibble(
input = c("What's 2+2?", "What's 2+3?"),
target = c("4", "5")
)
tsk <- Task$new(
dataset = simple_addition,
solver = generate(ellmer::chat_google_gemini(model = "gemini-2.0-flash")),
scorer = model_graded_qa()
)
tsk$eval()
expect_valid_log(tsk$log())
})
test_that("vitals writes valid eval logs (solver tool calls, claude)", {
skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
tmp_dir <- withr::local_tempdir()
withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
withr::local_options(cli.default_handler = function(...) {
})
local_mocked_bindings(interactive = function(...) FALSE)
library(ellmer)
current_date <- tibble::tibble(
input = c("What's the current date?"),
target = c("2024-01-01")
)
ch <- chat_anthropic(model = "claude-3-7-sonnet-latest")
ch$register_tool(tool(function() "2024-01-01", "Return the current date"))
tsk <- Task$new(
dataset = current_date,
solver = generate(ch),
scorer = function(samples) {
list(score = factor("C", levels = c("I", "C"), ordered = TRUE))
}
)
tsk$eval()
expect_valid_log(tsk$log())
})
test_that("vitals writes valid eval logs (solver errors on tool call, claude)", {
skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))
tmp_dir <- withr::local_tempdir()
withr::local_envvar(list(VITALS_LOG_DIR = tmp_dir))
withr::local_options(cli.default_handler = function(...) {
})
local_mocked_bindings(interactive = function(...) FALSE)
library(ellmer)
current_date <- tibble::tibble(
input = c("What's the current date?"),
target = c("2024-01-01")
)
ch <- chat_anthropic(model = "claude-3-7-sonnet-latest")
ch$register_tool(
tool(function() stop("Couldn't find the date"), "Return the current date")
)
tsk <- Task$new(
dataset = current_date,
solver = generate(ch),
scorer = function(samples) {
list(score = factor("C", levels = c("I", "C"), ordered = TRUE))
}
)
# a tool call will fail here and raise a warning; this is intentional.
# since raised from ellmer, we don't expect anything specific about it.
suppressWarnings(tsk$eval())
log_file <- list.files(tmp_dir, full.names = TRUE)
expect_gte(length(log_file), 1)
expect_valid_log(log_file[1])
})
test_that("as_metadata can make lists into a named vector (#69)", {
res <- as_metadata(mtcars[1:2, 1:2])
expect_type(res, "list")
expect_named(res, c("mpg", "cyl"))
expect_snapshot(res)
res <- as_metadata(tibble(x = list(list(a = 1, b = 2)), y = 3))
expect_type(res, "list")
expect_named(res, c("x", "y"))
expect_snapshot(res)
res <- as_metadata(list(1:3, b = 2))
expect_type(res, "list")
expect_named(res, c("1", "b"))
expect_snapshot(res)
res <- as_metadata(list(1:3, b = 2))
expect_type(res, "list")
expect_named(res, c("1", "b"))
expect_snapshot(res)
vec <- c(a = 1, b = 2)
expect_equal(as_metadata(vec), as.list(vec))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.