# test-task.R
# In vitals: Large Language Model Evaluation

test_that("Task R6 class works", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Point logging at a temporary directory, install a no-op cli handler, and
  # make `interactive()` report FALSE for the duration of the test.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa()
  )

  # Before evaluation: an R6 object of class `Task` whose samples are the
  # dataset columns plus `id`.
  expect_true(R6::is.R6(tsk))
  expect_true(inherits(tsk, "Task"))
  expect_snapshot(tsk)

  fresh_samples <- tsk$get_samples()
  expect_equal(nrow(fresh_samples), nrow(simple_addition))
  expect_named(fresh_samples, c("input", "target", "id"))

  # After evaluation: the log validates and the samples carry the solver and
  # scorer columns listed below.
  tsk$eval()
  expect_valid_log(tsk$log())
  expect_snapshot(tsk)

  evaluated_cols <- c(
    "input",
    "target",
    "id",
    "result",
    "solver_chat",
    "score",
    "scorer",
    "scorer_chat",
    "scorer_metadata"
  )
  expect_named(tsk$get_samples(), evaluated_cols, ignore.order = TRUE)

  expect_equal(tsk, .last_task)
})

test_that("Task with epochs works", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa()
  )

  # Two epochs are requested at evaluation time.
  tsk$eval(epochs = 2)
  expect_valid_log(tsk$log())

  # One row per input per epoch, with an `epoch` column among the results.
  samples <- tsk$get_samples()
  expect_equal(nrow(samples), 2 * nrow(simple_addition))

  epoch_cols <- c(
    "input",
    "target",
    "id",
    "epoch",
    "result",
    "solver_chat",
    "score",
    "scorer",
    "scorer_chat",
    "scorer_metadata"
  )
  expect_named(samples, epoch_cols, ignore.order = TRUE)
})

test_that("Task respects `$new(epochs)`", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # Epochs are given at construction; `$eval()` is then called without any.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa(),
    epochs = 2
  )

  tsk$eval()
  expect_valid_log(tsk$log())

  # The sample count should be the input count times the two epochs.
  n_inputs <- nrow(simple_addition)
  expect_equal(nrow(tsk$get_samples()), 2 * n_inputs)
})

test_that("`$eval(epochs)` takes precedence over `$new(epochs)`", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # Constructed with two epochs, but evaluated with an explicit single epoch.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa(),
    epochs = 2
  )

  tsk$eval(epochs = 1)
  expect_valid_log(tsk$log())

  # With one epoch in effect there is exactly one sample row per input.
  n_inputs <- nrow(simple_addition)
  expect_equal(nrow(tsk$get_samples()), n_inputs)
})

test_that("check_dataset works", {
  # Datasets missing `target`, missing `input`, or missing both are expected
  # to make `Task$new()` error; the messages are pinned by snapshots.
  expect_snapshot(
    Task$new(
      dataset = data.frame(input = 1),
      solver = function() {
      },
      scorer = function() {
      }
    ),
    error = TRUE
  )
  expect_snapshot(
    Task$new(
      dataset = data.frame(target = 1),
      solver = function() {
      },
      scorer = function() {
      }
    ),
    error = TRUE
  )
  expect_snapshot(
    Task$new(
      dataset = data.frame(x = 1),
      solver = function() {
      },
      scorer = function() {
      }
    ),
    error = TRUE
  )

  # A dataset with both columns should come back unchanged. The computed value
  # is passed first (`object`) and the reference second (`expected`) so that a
  # failure labels actual and expected correctly.
  d <- data.frame(input = "hey", target = "there")
  expect_equal(check_dataset(d), d)
})

test_that("join_epochs() works", {
  task_data <- data.frame(something = "here", id = 1:3)

  # A single epoch should hand the data back as-is.
  expect_equal(join_epochs(task_data, 1), task_data)

  # Two epochs: twice as many rows, with epochs cycling 1, 2 within each id.
  two_epochs <- join_epochs(task_data, 2)
  expect_equal(nrow(two_epochs), 2 * nrow(task_data))
  expect_equal(two_epochs$epoch, rep(1:2, times = 3))
  expect_equal(two_epochs$id, rep(1:3, each = 2))
})

test_that("set_id_column works", {
  # Case 1: the input has no `id` column, so sequential ids are expected.
  without_id <- tibble::tibble(input = c("a", "b"), target = c("c", "d"))
  with_new_id <- set_id_column(without_id)

  expect_equal(nrow(with_new_id), 2)
  expect_true("id" %in% names(with_new_id))
  expect_equal(with_new_id$id, 1:2)

  # Case 2: the input already has an `id` column, which should be kept.
  supplied_ids <- c(5, 10)
  has_id <- tibble::tibble(
    input = c("a", "b"),
    target = c("c", "d"),
    id = supplied_ids
  )
  with_kept_id <- set_id_column(has_id)

  expect_equal(nrow(with_kept_id), 2)
  expect_true("id" %in% names(with_kept_id))
  expect_equal(with_kept_id$id, supplied_ids)
})

test_that("Task preserves existing id column", {
  # Temporary log directory; `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  local_mocked_bindings(interactive = function(...) FALSE)

  # The dataset brings its own, non-sequential ids.
  d <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5"),
    id = c(10, 20)
  )

  # Solver and scorer are empty placeholders; this test never runs them.
  tsk <- Task$new(
    dataset = d,
    solver = function() NULL,
    scorer = function() NULL
  )

  sample_ids <- tsk$get_samples()$id
  expect_equal(sample_ids, c(10, 20))
})

test_that("Task errors informatively with duplicate ids", {
  # Temporary log directory for the duration of the test.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))

  # Both rows carry the same `id` value.
  d <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5"),
    id = rep(10, 2)
  )

  # Construction is expected to fail; the message is pinned by the snapshot.
  expect_snapshot(
    Task$new(
      dataset = d,
      solver = function() {
      },
      scorer = function() {
      }
    ),
    error = TRUE
  )
})

# solver ------------------------------------------------------------------
test_that("set_solver works", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    result = c("4", "5"),
    target = c("4", "5")
  )

  # Start from placeholder solver/scorer functions; the solver is replaced
  # through `$set_solver()` before anything is solved.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = function() NULL,
    scorer = function() NULL
  )

  # Builds a fresh pair of chat objects each time a solver is invoked.
  nano_chats <- function() {
    lapply(1:2, function(i) ellmer::chat_openai(model = "gpt-4.1-nano"))
  }

  # First solver: returns results and chats only.
  new_solver <- function(inputs) {
    list(result = c("4", "5"), solver_chat = nano_chats())
  }
  tsk$set_solver(new_solver)
  tsk$solve()

  solved_samples <- tsk$get_samples()
  expect_equal(solved_samples$result, c("4", "5"))
  expect_false("solver_metadata" %in% names(solved_samples))

  # Second solver: additionally returns per-sample metadata.
  new_solver <- function(inputs) {
    list(
      result = c("4", "5"),
      solver_chat = nano_chats(),
      solver_metadata = c("boop!", "bop!")
    )
  }
  tsk$set_solver(new_solver)

  # Swapping the solver should leave no solver columns from the earlier run.
  solver_cols <- c("solver_chat", "solver_metadata")
  expect_false(any(solver_cols %in% names(tsk$get_samples())))

  tsk$solve()
  expect_true("solver_metadata" %in% names(tsk$get_samples()))
})

# NOTE(review): this test has the same description and performs the same
# checks as the `set_solver works` test directly above it. It looks like an
# accidental copy; consider removing it or changing what it covers.
test_that("set_solver works", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    result = c("4", "5"),
    target = c("4", "5")
  )

  # Placeholder solver/scorer; the solver is replaced via `$set_solver()`.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = function() NULL,
    scorer = function() NULL
  )

  # Builds a fresh pair of chat objects each time a solver is invoked.
  nano_chats <- function() {
    lapply(1:2, function(i) ellmer::chat_openai(model = "gpt-4.1-nano"))
  }

  # Solver without metadata.
  new_solver <- function(inputs) {
    list(result = c("4", "5"), solver_chat = nano_chats())
  }
  tsk$set_solver(new_solver)
  tsk$solve()

  solved_samples <- tsk$get_samples()
  expect_equal(solved_samples$result, c("4", "5"))
  expect_false("solver_metadata" %in% names(solved_samples))

  # Solver with metadata.
  new_solver <- function(inputs) {
    list(
      result = c("4", "5"),
      solver_chat = nano_chats(),
      solver_metadata = c("boop!", "bop!")
    )
  }
  tsk$set_solver(new_solver)

  # No solver columns should remain right after the solver is swapped.
  solver_cols <- c("solver_chat", "solver_metadata")
  expect_false(any(solver_cols %in% names(tsk$get_samples())))

  tsk$solve()
  expect_true("solver_metadata" %in% names(tsk$get_samples()))
})

# scorer ------------------------------------------------------------------
test_that("set_scorer works", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    result = c("4", "5"),
    target = c("4", "5")
  )

  # Builds a fresh pair of chat objects on each call.
  nano_chats <- function() {
    lapply(1:2, function(i) ellmer::chat_openai(model = "gpt-4.1-nano"))
  }

  solver <- function(inputs) {
    list(result = c("4", "5"), solver_chat = nano_chats())
  }

  # The scorer starts as a placeholder and is replaced three times below.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = solver,
    scorer = function() NULL
  )

  tsk$solve()

  scorer_cols <- c("scorer_chat", "scorer_metadata")

  # Step 1: a scorer that returns nothing but the score.
  scorer_minimal <- function(samples) {
    list(score = c(1, 1))
  }
  tsk$set_scorer(scorer_minimal)
  tsk$score()

  minimal_samples <- tsk$get_samples()
  expect_equal(minimal_samples$score, c(1, 1))
  expect_false(any(scorer_cols %in% names(minimal_samples)))

  # Step 2: a scorer that also returns chats. Setting it should blank the
  # existing scores until `$score()` runs again.
  scorer_chat <- function(samples) {
    list(score = c(1, 1), scorer_chat = nano_chats())
  }
  tsk$set_scorer(scorer_chat)
  expect_true(all(is.na(tsk$get_samples()$score)))
  tsk$score()
  expect_true("scorer_chat" %in% names(tsk$get_samples()))

  # Step 3: a scorer that returns chats and metadata. Setting it should blank
  # the scores and drop the scorer columns until it is run.
  scorer_metadata <- function(samples) {
    list(
      score = c(1, 1),
      scorer_chat = nano_chats(),
      scorer_metadata = c("beep", "bop")
    )
  }
  tsk$set_scorer(scorer_metadata)
  expect_true(all(is.na(tsk$get_samples()$score)))
  expect_false(any(scorer_cols %in% names(tsk$get_samples())))
  tsk$score()
  expect_true(all(scorer_cols %in% names(tsk$get_samples())))
})

# metrics ------------------------------------------------------------------
test_that("default metrics are applied effectively", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # The scorer marks both samples "C" on the I/P/C factor scale; no `metrics`
  # argument is supplied.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
    scorer = function(...) {
      list(
        score = factor(c("C", "C"), levels = c("I", "P", "C"))
      )
    }
  )

  tsk$eval()

  # With no metrics supplied, a single `accuracy` value of 100 is expected.
  expected_metrics <- c("accuracy" = 100)
  expect_equal(tsk$metrics, expected_metrics)
  expect_valid_log(tsk$log())
})

test_that("task applies non-default metrics", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # Route 1: a custom metric handed to `Task$new()`.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
    scorer = function(...) {
      list(
        score = factor(c("C", "C"), levels = c("I", "P", "C"))
      )
    },
    metrics = list(pct_correct = function(scores) {
      mean(scores == "C") * 100
    })
  )

  tsk$eval()

  expect_equal(tsk$metrics, c("pct_correct" = 100))
  expect_valid_log(tsk$log())

  # Route 2: a custom metric installed afterwards with `$set_metrics()`.
  # Metrics should be NULL until `$measure()` recomputes them.
  tsk$set_metrics(list(prop_correct = function(scores) {
    mean(scores == "C")
  }))
  expect_null(tsk$metrics)

  tsk$measure()
  expect_equal(tsk$metrics, c("prop_correct" = 1))
})

test_that("task errors informatively with bad metrics", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # Failure 1: `metrics` passed to `$new()` as a bare function, not a list.
  expect_snapshot(
    tsk <- Task$new(
      dataset = simple_addition,
      solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
      scorer = function(...) {
        list(
          score = factor(c("C", "C"), levels = c("I", "P", "C"))
        )
      },
      metrics = function(scores) {
        mean(scores == "C") * 100
      }
    ),
    error = TRUE
  )

  # A correctly constructed task for the remaining two failure modes.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(ellmer::chat_openai(model = "gpt-4.1-nano")),
    scorer = function(...) {
      list(
        score = factor(c("C", "C"), levels = c("I", "P", "C"))
      )
    },
    metrics = list(pct_correct = function(scores) {
      mean(scores == "C") * 100
    })
  )

  # Failure 2: a bare function passed to `$set_metrics()`.
  expect_snapshot(
    tsk$set_metrics(function(...) "boop bop"),
    error = TRUE
  )

  # Failure 3: a properly shaped list whose metric returns a string.
  expect_snapshot(
    {
      tsk$set_metrics(list(
        bad_metric = function(scores) "this is not a numeric"
      ))
      tsk$eval()
    },
    error = TRUE
  )
})

# misc ------------------------------------------------------------------
test_that("task ids are deterministic", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Reads the private `task_id` field off a task object.
  private_task_id <- function(task) {
    task$.__enclos_env__$private$task_id
  }

  # Two tasks constructed from identical arguments.
  tsk_1 <- Task$new(
    dataset = are,
    solver = generate(),
    scorer = model_graded_qa()
  )
  tsk_2 <- Task$new(
    dataset = are,
    solver = generate(),
    scorer = model_graded_qa()
  )

  id_1 <- private_task_id(tsk_1)
  id_2 <- private_task_id(tsk_2)

  # The ids should match and be 22 characters long.
  expect_equal(id_1, id_2)
  expect_equal(nchar(id_1), 22)
})

test_that("Task completeness is tracked and preserved", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # Scorer stub: a single score of 1 plus a one-element `metadata` list.
  mock_scorer <- function(samples) {
    list(
      score = 1,
      metadata = list(NULL)
    )
  }

  # Accessors for the private completeness flags.
  is_solved <- function(task) task$.__enclos_env__$private$solved
  is_scored <- function(task) task$.__enclos_env__$private$scored

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = mock_scorer
  )

  # Neither flag is set on a new task; each is set by its own step.
  expect_false(is_solved(tsk))
  expect_false(is_scored(tsk))

  tsk$solve()
  expect_true(is_solved(tsk))

  tsk$score()
  expect_true(is_scored(tsk))

  # Replacing the solver clears `solved` until the task is solved again.
  tsk$set_solver(generate(chat_openai(model = "gpt-4.1-nano")))
  expect_false(is_solved(tsk))

  tsk$solve()
  expect_true(is_solved(tsk))

  # Replacing the scorer clears `scored`.
  tsk$set_scorer(mock_scorer)
  expect_false(is_scored(tsk))

  tsk$solve()
  tsk$score()

  # Evaluating a clone should leave the original's results and scores as
  # they were.
  tsk_clone <- tsk$clone()
  samples_before_clone_eval <- tsk$get_samples()
  original_results <- samples_before_clone_eval$result
  original_scores <- samples_before_clone_eval$score

  tsk_clone$eval()
  # TODO: expect_valid_log(tsk_clone$log())
  expect_equal(nrow(tsk_clone$get_samples()), nrow(simple_addition))

  samples_after_clone_eval <- tsk$get_samples()
  expect_equal(samples_after_clone_eval$result, original_results)
  expect_equal(samples_after_clone_eval$score, original_scores)

  # Re-evaluating with a different epoch count resizes the samples.
  tsk_epochs <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = mock_scorer
  )
  n_inputs <- nrow(simple_addition)

  tsk_epochs$eval(epochs = 2)
  # TODO: expect_valid_log(tsk_epochs$log())
  expect_equal(nrow(tsk_epochs$get_samples()), 2 * n_inputs)
  expect_true("epoch" %in% names(tsk_epochs$get_samples()))

  tsk_epochs$eval(epochs = 3)
  # TODO: expect_valid_log(tsk_epochs$log())
  expect_equal(nrow(tsk_epochs$get_samples()), 3 * n_inputs)
  expect_true("epoch" %in% names(tsk_epochs$get_samples()))
})

test_that("Task errors informatively with bad solver output", {
  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # This solver returns a single, misnamed element and no `solver_chat`.
  bad_solver_missing_fields <- function(inputs) {
    list(wrong_name = c("4", "5"))
  }

  tsk <- Task$new(
    dataset = simple_addition,
    solver = bad_solver_missing_fields,
    scorer = function() NULL
  )

  # Solving is expected to fail; the message is pinned by the snapshot.
  expect_snapshot(tsk$solve(), error = TRUE)
})

test_that("Task detects non-Chat objects in solver_chat", {
  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # The field names are right, but `solver_chat` holds plain strings.
  bad_solver_wrong_type <- function(inputs) {
    not_chats <- list("not a Chat object", "also not a Chat object")
    list(result = c("4", "5"), solver_chat = not_chats)
  }

  tsk <- Task$new(
    dataset = simple_addition,
    solver = bad_solver_wrong_type,
    scorer = function() NULL
  )

  # Solving is expected to fail; the message is pinned by the snapshot.
  expect_snapshot(tsk$solve(), error = TRUE)
})

test_that("Task errors informatively with bad scorer output", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # The scorer's only element is misnamed, so no `score` is returned.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = function(samples) {
      list(wrong_name = c("4", "5"))
    }
  )

  # Evaluation is expected to fail; the message is pinned by the snapshot.
  expect_snapshot(tsk$eval(), error = TRUE)
})

test_that("Task detects non-Chat objects in scorer_chat", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # The scorer names its fields correctly, but `scorer_chat` holds strings.
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = function(samples) {
      list(
        score = c("4", "5"),
        scorer_chat = list("not a Chat object", "also not a Chat object")
      )
    }
  )

  # Evaluation is expected to fail; the message is pinned by the snapshot.
  expect_snapshot(tsk$eval(), error = TRUE)
})

test_that("token usage is logged correctly", {
  # Skipped unless an OpenAI key is available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # ellmer's running token usage, restricted to the model used in this test.
  nano_usage <- function() {
    dplyr::filter(ellmer::token_usage(), model == "gpt-4.1-nano")
  }

  # Spend a few tokens first so the usage table is non-NULL.
  chat_openai(model = "gpt-4.1-nano")$chat("hey!", echo = "none")
  usage_before <- nano_usage()

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa()
  )

  # After solving, the task's cost should equal the usage added since the
  # baseline.
  tsk$solve()
  usage_after_solve <- nano_usage()
  cost_after_solve <- tsk$get_cost()

  solve_input <- usage_after_solve$input - usage_before$input
  solve_output <- usage_after_solve$output - usage_before$output
  expect_equal(cost_after_solve$input, solve_input)
  expect_equal(cost_after_solve$output, solve_output)

  # After scoring, the cost should hold the solve usage followed by the
  # usage added during scoring.
  tsk$score()
  usage_after_score <- nano_usage()
  cost_after_score <- tsk$get_cost()

  score_input <- usage_after_score$input - usage_after_solve$input
  score_output <- usage_after_score$output - usage_after_solve$output
  expect_equal(cost_after_score$input, c(solve_input, score_input))
  expect_equal(cost_after_score$output, c(solve_output, score_output))
})


test_that("token usage is logged correctly (with unrelated token usage)", {
  # Skipped unless both an OpenAI and an Anthropic key are available.
  skip_if(identical(Sys.getenv("OPENAI_API_KEY"), ""))
  skip_if(identical(Sys.getenv("ANTHROPIC_API_KEY"), ""))

  # Temporary log directory, no-op cli handler, `interactive()` mocked to FALSE.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  withr::local_options(list(cli.default_handler = function(...) NULL))
  local_mocked_bindings(interactive = function(...) FALSE)
  library(ellmer)

  simple_addition <- tibble::tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # ellmer's running token usage, restricted to the model the task uses.
  nano_usage <- function() {
    dplyr::filter(ellmer::token_usage(), model == "gpt-4.1-nano")
  }

  # Spend a few tokens on the task's model and on an unrelated model, so the
  # usage table is non-NULL and has usage the task did not incur.
  chat_openai(model = "gpt-4.1-nano")$chat("hey!", echo = "none")
  chat_anthropic(model = "claude-3-7-sonnet-latest")$chat("hey!", echo = "none")
  usage_before <- nano_usage()

  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_openai(model = "gpt-4.1-nano")),
    scorer = model_graded_qa()
  )

  tsk$solve()
  usage_after_solve <- nano_usage()
  cost_after_solve <- tsk$get_cost()

  # The cost should have one row, equal to the usage added since the baseline.
  expect_equal(nrow(cost_after_solve), 1)

  solve_input <- usage_after_solve$input - usage_before$input
  solve_output <- usage_after_solve$output - usage_before$output
  expect_equal(cost_after_solve$input, solve_input)
  expect_equal(cost_after_solve$output, solve_output)
})