# tests/testthat/test-aaa-metrics.R

set.seed(1311)
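# Simulated three-class data: predicted classes are sampled at random and the
# class probability matrix has each row rescaled to sum to 1, so the columns
# behave like a proper probability distribution.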
three_class <- data.frame(
  obs = iris$Species,
  pred = sample(iris$Species, replace = TRUE)
)
probs <- matrix(runif(150 * 3), nrow = 150)
probs <- t(apply(probs, 1, function(x) x / sum(x)))
colnames(probs) <- levels(iris$Species)
three_class <- cbind(three_class, as.data.frame(probs))


###################################################################

test_that("correct metrics returned", {
  expect_equal(
    metrics(two_class_example, truth, predicted)[[".metric"]],
    c("accuracy", "kap")
  )
  expect_equal(
    metrics(two_class_example, truth, predicted, Class1)[[".metric"]],
    c("accuracy", "kap", "mn_log_loss", "roc_auc")
  )
  expect_equal(
    metrics(three_class, "obs", "pred", setosa, versicolor, virginica)[[".metric"]],
    c("accuracy", "kap", "mn_log_loss", "roc_auc")
  )
  expect_equal(
    metrics(three_class, "obs", "pred", setosa, versicolor, virginica)[[".estimator"]],
    c("multiclass", "multiclass", "multiclass", "hand_till")
  )
  expect_equal(
    metrics(solubility_test, solubility, "prediction")[[".metric"]],
    c("rmse", "rsq", "mae")
  )
})

###################################################################

test_that("bad args", {
  expect_snapshot(
    error = TRUE,
    metrics(two_class_example, truth, Class1)
  )
  expect_snapshot(
    error = TRUE,
    metrics(two_class_example, Class1, truth)
  )
  expect_snapshot(
    error = TRUE,
    metrics(three_class, "obs", "pred", setosa, versicolor)
  )
})

###################################################################

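# Reference results computed one metric at a time; the "correct results" and
# "numeric metric sets" tests below check that metrics() and metric_set()
# reproduce these values.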
class_res_1 <- dplyr::bind_rows(
  accuracy(two_class_example, truth, predicted),
  kap(two_class_example, truth, predicted),
  mn_log_loss(two_class_example, truth, Class1),
  roc_auc(two_class_example, truth, Class1)
)

reg_res_1 <- dplyr::bind_rows(
  rmse(solubility_test, solubility, "prediction"),
  rsq(solubility_test, solubility, prediction),
  mae(solubility_test, solubility, prediction)
)


test_that("correct results", {
  class_idx <- which(class_res_1$.metric %in% c("accuracy", "kap"))

  expect_equal(
    metrics(two_class_example, truth, predicted)[[".estimate"]],
    class_res_1[class_idx, ][[".estimate"]]
  )
  expect_equal(
    metrics(two_class_example, truth, predicted, Class1)[[".estimate"]],
    class_res_1[[".estimate"]]
  )
  expect_equal(
    metrics(solubility_test, solubility, prediction)[[".estimate"]],
    reg_res_1[[".estimate"]]
  )
})

###################################################################

test_that("metrics() - `options` is deprecated", {
  skip_if(getRversion() <= "3.5.3", "Base R used a different deprecated warning class.")
  rlang::local_options(lifecycle_verbosity = "warning")

  expect_snapshot({
    out <- metrics(two_class_example, truth, predicted, Class1, options = 1)
  })

  expect_identical(
    out,
    metrics(two_class_example, truth, predicted, Class1)
  )
})

###################################################################

test_that("numeric metric sets", {
  reg_set <- metric_set(rmse, rsq, mae)

  expect_equal(
    reg_set(solubility_test, solubility, prediction),
    reg_res_1
  )
  # ensure helpful messages are printed
  expect_snapshot(
    error = TRUE,
    metric_set(rmse, "x")
  )

  # Can mix class and class prob together
  mixed_set <- metric_set(accuracy, roc_auc)
  expect_no_error(
    mixed_set(two_class_example, truth, Class1, estimate = predicted)
  )
})

test_that("mixing bad metric sets", {
  expect_snapshot(
    error = TRUE,
    metric_set(rmse, accuracy)
  )
})

test_that("can mix class and class prob metrics together", {
  expect_no_error(
    mixed_set <- metric_set(accuracy, roc_auc)
  )
  expect_no_error(
    mixed_set(two_class_example, truth, Class1, estimate = predicted)
  )
})

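# brier_survival() is a dynamic survival metric that takes the `.pred` list
# column of survival probabilities at evaluation times, while
# concordance_survival() is a static metric that takes a `.pred_time` estimate.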
test_that("dynamic survival metric sets", {
  my_set <- metric_set(brier_survival)

  expect_equal(
    my_set(lung_surv, surv_obj, .pred),
    brier_survival(lung_surv, surv_obj, .pred)
  )
})

test_that("can mix dynamic and static survival metrics together", {
  expect_no_error(
    mixed_set <- metric_set(brier_survival, concordance_survival)
  )
  expect_no_error(
    mixed_set(lung_surv, surv_obj, .pred, estimate = .pred_time)
  )
})

test_that("can mix dynamic, static, and integrated survival metrics together", {
  expect_no_error(
    mixed_set <- metric_set(
      brier_survival,
      concordance_survival,
      brier_survival_integrated
    )
  )
  expect_no_error(
    mixed_set(lung_surv, surv_obj, .pred, estimate = .pred_time)
  )
})

test_that("can supply `event_level` even with metrics that don't use it", {
  df <- two_class_example

  df_rev <- df
  df_rev$truth <- stats::relevel(df_rev$truth, "Class2")
  df_rev$predicted <- stats::relevel(df_rev$predicted, "Class2")

  # accuracy doesn't use it, and doesn't have it as an argument
  set <- metric_set(accuracy, recall, roc_auc)

  expect_equal(
    as.data.frame(set(df, truth, Class1, estimate = predicted)),
    as.data.frame(set(df_rev, truth, Class1, estimate = predicted, event_level = "second"))
  )
})

test_that("metric set functions are classed", {
  expect_s3_class(
    metric_set(accuracy, roc_auc),
    "class_prob_metric_set"
  )
  expect_s3_class(
    metric_set(mae),
    "numeric_metric_set"
  )
  expect_s3_class(
    metric_set(accuracy, roc_auc),
    "metric_set"
  )
  expect_s3_class(
    metric_set(mae),
    "metric_set"
  )
})

test_that("metric set functions retain class/prob metric functions", {
  fns <- attr(metric_set(accuracy, roc_auc), "metrics")

  expect_equal(
    names(fns),
    c("accuracy", "roc_auc")
  )

  expect_equal(
    class(fns[[1]]),
    c("class_metric", "metric", "function")
  )

  expect_equal(
    class(fns[[2]]),
    c("prob_metric", "metric", "function")
  )

  expect_equal(
    vapply(fns, function(fn) attr(fn, "direction"), character(1)),
    c(accuracy = "maximize", roc_auc = "maximize")
  )
})

test_that("metric set functions retain numeric metric functions", {
  fns <- attr(metric_set(mae, rmse), "metrics")

  expect_equal(
    names(fns),
    c("mae", "rmse")
  )

  expect_equal(
    class(fns[[1]]),
    c("numeric_metric", "metric", "function")
  )

  expect_equal(
    class(fns[[2]]),
    c("numeric_metric", "metric", "function")
  )

  expect_equal(
    vapply(fns, function(fn) attr(fn, "direction"), character(1)),
    c(mae = "minimize", rmse = "minimize")
  )
})

test_that("`metric_set()` labeling removes namespaces", {
  x <- metric_set(yardstick::mase, rmse)
  expect_identical(names(attr(x, "metrics")), c("mase", "rmse"))
})

test_that("print metric_set works", {
  expect_snapshot(metric_set(rmse, rsq, ccc))
})

test_that("metric_set can be coerced to a tibble", {
  x <- metric_set(roc_auc, pr_auc, accuracy)
  expect_s3_class(dplyr::as_tibble(x), "tbl_df")
})

test_that("`metric_set()` errors contain env name for unknown functions (#128)", {
  foobar <- function() {}

  # Store env name in `name` attribute for `environmentName()` to find it
  env <- rlang::new_environment(parent = globalenv())
  attr(env, "name") <- "test"

  rlang::fn_env(foobar) <- env

  expect_snapshot(
    error = TRUE,
    metric_set(accuracy, foobar, sens, rlang::abort)
  )
})

test_that("`metric_set()` gives an informative error for a single non-metric function (#181)", {
  foobar <- function() {}

  # Store env name in `name` attribute for `environmentName()` to find it
  env <- rlang::new_environment(parent = globalenv())
  attr(env, "name") <- "test"
  rlang::fn_env(foobar) <- env

  expect_snapshot(
    error = TRUE,
    metric_set(foobar)
  )
})

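# Fairness metrics like demographic_parity() are metric factories: they must
# first be called on a grouping column (e.g. demographic_parity(group)) to
# create a metric before being added to a metric_set().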
test_that("errors informatively for unevaluated metric factories", {
  # one bad metric
  expect_snapshot(
    error = TRUE,
    metric_set(demographic_parity)
  )

  expect_snapshot(
    error = TRUE,
    metric_set(demographic_parity, roc_auc)
  )

  # two bad metrics
  expect_snapshot(
    error = TRUE,
    metric_set(demographic_parity, equal_opportunity)
  )

  expect_snapshot(
    error = TRUE,
    metric_set(demographic_parity, equal_opportunity, roc_auc)
  )
})

test_that("all class metrics - `metric_set()` works with `case_weights`", {
  # Mock a metric that doesn't support weights
  accuracy_no_weights <- function(data, truth, estimate, na_rm = TRUE, ...) {
    # Eat the `...` silently
    accuracy(
      data = data,
      truth = !!enquo(truth),
      estimate = !!enquo(estimate),
      na_rm = na_rm
    )
  }
  accuracy_no_weights <- new_class_metric(accuracy_no_weights, "maximize")

  set <- metric_set(accuracy, accuracy_no_weights)

  df <- data.frame(
    truth = factor(c("x", "x", "y"), levels = c("x", "y")),
    estimate = factor(c("x", "y", "x"), levels = c("x", "y")),
    case_weights = c(1L, 1L, 2L)
  )

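  # Weighted accuracy: the single correct prediction has weight 1 out of a
  # total weight of 4 (1/4); the unweighted mock sees 1 correct out of 3 (1/3).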
  expect_identical(
    set(df, truth, estimate = estimate, case_weights = case_weights)[[".estimate"]],
    c(1 / 4, 1 / 3)
  )
})

test_that("all numeric metrics - `metric_set()` works with `case_weights`", {
  # Mock a metric that doesn't support weights
  rmse_no_weights <- function(data, truth, estimate, na_rm = TRUE, ...) {
    # Eat the `...` silently
    rmse(
      data = data,
      truth = !!enquo(truth),
      estimate = !!enquo(estimate),
      na_rm = na_rm
    )
  }
  rmse_no_weights <- new_numeric_metric(rmse_no_weights, "minimize")

  set <- metric_set(rmse, rmse_no_weights)

  solubility_test$weight <- read_weights_solubility_test()

  expect <- c(
    rmse(solubility_test, solubility, prediction, case_weights = weight)[[".estimate"]],
    rmse(solubility_test, solubility, prediction)[[".estimate"]]
  )

  expect_identical(
    set(solubility_test, solubility, prediction, case_weights = weight)[[".estimate"]],
    expect
  )
})

test_that("class and prob metrics - `metric_set()` works with `case_weights`", {
  # Mock a metric that doesn't support weights
  accuracy_no_weights <- function(data, truth, estimate, na_rm = TRUE, ...) {
    # Eat the `...` silently
    accuracy(
      data = data,
      truth = !!enquo(truth),
      estimate = !!enquo(estimate),
      na_rm = na_rm
    )
  }
  accuracy_no_weights <- new_class_metric(accuracy_no_weights, "maximize")

  set <- metric_set(accuracy, accuracy_no_weights, roc_auc)

  two_class_example$weight <- read_weights_two_class_example()

  expect <- c(
    accuracy(two_class_example, truth, predicted, case_weights = weight)[[".estimate"]],
    accuracy(two_class_example, truth, predicted)[[".estimate"]],
    roc_auc(two_class_example, truth, Class1, case_weights = weight)[[".estimate"]]
  )

  expect_identical(
    set(two_class_example, truth, Class1, estimate = predicted, case_weights = weight)[[".estimate"]],
    expect
  )
})

test_that("propagates 'caused by' error message when specifying the wrong column name", {
  set <- metric_set(accuracy, kap)

  # There is no `weight` column!
  expect_snapshot(error = TRUE, {
    set(two_class_example, truth, Class1, estimate = predicted, case_weights = weight)
  })
})

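# metric_tweak() presets optional arguments of an existing metric (e.g. the
# estimator or event_level), so differently configured copies of the same
# metric can coexist in one metric_set().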
test_that("metric_tweak and metric_set play nicely together (#351)", {
  # Classification
  multi_ex <- data_three_by_three()

  ref <- dplyr::bind_rows(
    j_index(multi_ex, estimator = "macro"),
    j_index(multi_ex, estimator = "micro")
  )

  j_index_macro <- metric_tweak("j_index", j_index, estimator = "macro")
  j_index_micro <- metric_tweak("j_index", j_index, estimator = "micro")

  expect_identical(
    metric_set(j_index_macro, j_index_micro)(multi_ex),
    ref
  )

  # Probability
  ref <- dplyr::bind_rows(
    roc_auc(two_class_example, truth, Class1, event_level = "first"),
    roc_auc(two_class_example, truth, Class1, event_level = "second")
  )

  roc_auc_first <- metric_tweak("roc_auc", roc_auc, event_level = "first")
  roc_auc_second <- metric_tweak("roc_auc", roc_auc, event_level = "second")

  expect_identical(
    metric_set(roc_auc_first, roc_auc_second)(two_class_example, truth, Class1),
    ref
  )

  # regression
  ref <- dplyr::bind_rows(
    ccc(mtcars, truth = mpg, estimate = disp, bias = TRUE),
    ccc(mtcars, truth = mpg, estimate = disp, bias = FALSE)
  )

  ccc_bias <- metric_tweak("ccc", ccc, bias = TRUE)
  ccc_no_bias <- metric_tweak("ccc", ccc, bias = FALSE)

  expect_identical(
    metric_set(ccc_bias, ccc_no_bias)(mtcars, truth = mpg, estimate = disp),
    ref
  )

  # Static survival
  lung_surv_na <- lung_surv
  lung_surv_na$.pred_time[1] <- NA

  ref <- dplyr::bind_rows(
    concordance_survival(lung_surv_na, surv_obj, .pred_time, na_rm = TRUE),
    concordance_survival(lung_surv_na, surv_obj, .pred_time, na_rm = FALSE)
  )

  concordance_survival_na_rm <- metric_tweak(
    "concordance_survival",
    concordance_survival,
    na_rm = TRUE
  )
  concordance_survival_no_na_rm <- metric_tweak(
    "concordance_survival",
    concordance_survival,
    na_rm = FALSE
  )

  expect_identical(
    metric_set(concordance_survival_na_rm, concordance_survival_no_na_rm)(
      lung_surv_na, truth = surv_obj, estimate = .pred_time
    ),
    ref
  )

  # dynamic survival
  lung_surv_na <- lung_surv
  lung_surv_na$surv_obj[1] <- NA

  ref <- dplyr::bind_rows(
    brier_survival(lung_surv_na, surv_obj, .pred, na_rm = TRUE),
    brier_survival(lung_surv_na, surv_obj, .pred, na_rm = FALSE)
  )

  brier_survival_na_rm <- metric_tweak(
    "brier_survival",
    brier_survival,
    na_rm = TRUE
  )
  brier_survival_no_na_rm <- metric_tweak(
    "brier_survival",
    brier_survival,
    na_rm = FALSE
  )

  expect_identical(
    metric_set(brier_survival_na_rm, brier_survival_no_na_rm)(
      lung_surv_na, truth = surv_obj, .pred
    ),
    ref
  )
})