# tests/pycompare/generate-pycompare.R

# reticulate::install_python("3.10.4")
# reticulate::virtualenv_create("yardstick-environment", version = "3.10.4")
# reticulate::use_virtualenv("yardstick-environment")
# reticulate::py_install("scikit-learn") # 1.2.0 is what was downloaded
# reticulate::py_install("scikit-survival") # 1.2.0 is what was downloaded
# reticulate::py_install("fairlearn") # 0.8.0 is what was downloaded
library(reticulate)

# Inside yardstick
devtools::load_all()

use_virtualenv("yardstick-environment")

skmetrics <- import("sklearn.metrics")
sksurv_metrics <- import("sksurv.metrics")
sksurv_util <- import("sksurv.util", convert = FALSE)
fairlearn <- import("fairlearn.metrics")

data("hpc_cv")
data("two_class_example")
data("solubility_test")

weights_hpc_cv <- read_weights_hpc_cv()
weights_two_class_example <- read_weights_two_class_example()
weights_solubility_test <- read_weights_solubility_test()

save_metric_results <- function(nm, fn, ..., average = c("macro", "micro", "weighted")) {
  # Two class metrics
  res <- list(fn(two_class_example$truth, two_class_example$predicted, ..., pos_label = "Class1"))

  # Multiclass metrics
  res2 <- lapply(average, function(.x) fn(hpc_cv$obs, hpc_cv$pred, ..., average = .x))

  # Two class weighted metrics
  case_weight <- list(fn(
    two_class_example$truth,
    two_class_example$predicted,
    ...,
    pos_label = "Class1",
    sample_weight = weights_two_class_example
  ))

  # Multiclass weighted metrics
  case_weight2 <- lapply(
    average,
    function(.x) fn(hpc_cv$obs, hpc_cv$pred, ..., average = .x, sample_weight = weights_hpc_cv)
  )

  case_weight <- c(case_weight, case_weight2)
  names(case_weight) <- c("binary", average)

  res <- c(res, res2, list(case_weight))
  names(res) <- c("binary", average, "case_weight")

  saveRDS(res, test_path("py-data", paste0("py-", nm, ".rds")), version = 2)
}

# ------------------------------------------------------------------------------
# Table statistics used in other metrics

# Note: sklearn puts truth down the rows and predictions across the columns,
# the reverse of yardstick's table orientation, but it matches most sklearn
# references.
py_binary_weighted_confusion <- skmetrics$confusion_matrix(
  y_true = two_class_example$truth,
  y_pred = two_class_example$predicted,
  sample_weight = weights_two_class_example
)
py_multiclass_weighted_confusion <- skmetrics$confusion_matrix(
  y_true = hpc_cv$obs,
  y_pred = hpc_cv$pred,
  sample_weight = weights_hpc_cv
)

py_binary_weighted_FP <- colSums(py_binary_weighted_confusion) - diag(py_binary_weighted_confusion)
py_binary_weighted_FN <- rowSums(py_binary_weighted_confusion) - diag(py_binary_weighted_confusion)
py_binary_weighted_TP <- diag(py_binary_weighted_confusion)
py_binary_weighted_TN <- sum(py_binary_weighted_confusion) - (py_binary_weighted_FP + py_binary_weighted_FN + py_binary_weighted_TP)

py_multiclass_weighted_FP <- colSums(py_multiclass_weighted_confusion) - diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_FN <- rowSums(py_multiclass_weighted_confusion) - diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_TP <- diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_TN <- sum(py_multiclass_weighted_confusion) - (py_multiclass_weighted_FP + py_multiclass_weighted_FN + py_multiclass_weighted_TP)
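
# Optional orientation check (an illustrative sketch, not part of the saved
# reference data): with truth on the rows and predictions on the columns, the
# unweighted sklearn matrix equals the transpose of table(pred, truth). This
# relies on the factor levels already being in alphabetical order, which is how
# sklearn sorts its labels.
if (FALSE) {
  py_binary_confusion <- skmetrics$confusion_matrix(
    y_true = two_class_example$truth,
    y_pred = two_class_example$predicted
  )
  all(py_binary_confusion == t(table(two_class_example$predicted, two_class_example$truth)))
}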

# ------------------------------------------------------------------------------

# Save metrics results
save_metric_results("precision", skmetrics$precision_score)
save_metric_results("recall", skmetrics$recall_score)
save_metric_results("f_meas", skmetrics$f1_score)
save_metric_results("f_meas_beta_.5", skmetrics$fbeta_score, beta = .5)

# MCC
py_mcc <- list(
  binary = skmetrics$matthews_corrcoef(two_class_example$truth, two_class_example$predicted),
  multiclass = skmetrics$matthews_corrcoef(hpc_cv$obs, hpc_cv$pred),
  case_weight = list(
    binary = skmetrics$matthews_corrcoef(two_class_example$truth, two_class_example$predicted, sample_weight = weights_two_class_example),
    multiclass = skmetrics$matthews_corrcoef(hpc_cv$obs, hpc_cv$pred, sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_mcc, test_path("py-data", "py-mcc.rds"), version = 2)

# Accuracy
py_accuracy <- list(
  binary = skmetrics$accuracy_score(two_class_example$truth, two_class_example$predicted),
  multiclass = skmetrics$accuracy_score(hpc_cv$obs, hpc_cv$pred)
)
saveRDS(py_accuracy, test_path("py-data", "py-accuracy.rds"), version = 2)

# Kappa
py_kap <- list(
  binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth)),
  multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs)),
  linear_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "linear"),
  linear_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "linear"),
  quadratic_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "quadratic"),
  quadratic_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "quadratic"),
  case_weight = list(
    binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), sample_weight = weights_two_class_example),
    multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), sample_weight = weights_hpc_cv),
    linear_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "linear", sample_weight = weights_two_class_example),
    linear_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "linear", sample_weight = weights_hpc_cv),
    quadratic_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "quadratic", sample_weight = weights_two_class_example),
    quadratic_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "quadratic", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_kap, test_path("py-data", "py-kap.rds"), version = 2)

# RMSE
py_rmse <- list(
  case_weight = sqrt(skmetrics$mean_squared_error(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  ))
)
saveRDS(py_rmse, test_path("py-data", "py-rmse.rds"), version = 2)

# MAE
py_mae <- list(
  case_weight = skmetrics$mean_absolute_error(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  )
)
saveRDS(py_mae, test_path("py-data", "py-mae.rds"), version = 2)

# MAPE
# (Zeros purposefully cause infinite results in our R metrics, see #271)
zero_solubility <- solubility_test$solubility == 0
solubility_test_not_zero <- solubility_test[!zero_solubility, ]
py_mape <- list(
  case_weight = skmetrics$mean_absolute_percentage_error(
    y_true = solubility_test_not_zero$solubility,
    y_pred = solubility_test_not_zero$prediction,
    sample_weight = weights_solubility_test[!zero_solubility]
  )
)
saveRDS(py_mape, test_path("py-data", "py-mape.rds"), version = 2)

# R^2 Traditional
py_rsq_trad <- list(
  case_weight = skmetrics$r2_score(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  )
)
saveRDS(py_rsq_trad, test_path("py-data", "py-rsq-trad.rds"), version = 2)

# Balanced Accuracy
# Not comparing multiclass against sklearn here, because sklearn uses a
# different multiclass definition
py_bal_accuracy <- list(
  case_weight = list(
    binary = skmetrics$balanced_accuracy_score(
      y_true = two_class_example$truth,
      y_pred = two_class_example$predicted,
      sample_weight = weights_two_class_example
    )
  )
)
saveRDS(py_bal_accuracy, test_path("py-data", "py-bal-accuracy.rds"), version = 2)

# NPV
py_binary_weighted_NPV <- py_binary_weighted_TN / (py_binary_weighted_TN + py_binary_weighted_FN)
py_multiclass_weighted_NPV <- py_multiclass_weighted_TN / (py_multiclass_weighted_TN + py_multiclass_weighted_FN)
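# For the binary case, element 1 is "Class1" (sklearn sorts labels
# alphabetically), i.e. the event level, so [[1]] below gives the binary
# estimate; mean() over the per-class multiclass values gives the macro
# estimate. The same indexing applies to PPV.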

py_npv <- list(
  case_weight = list(
    binary = py_binary_weighted_NPV[[1]],
    macro = mean(py_multiclass_weighted_NPV)
  )
)
saveRDS(py_npv, test_path("py-data", "py-npv.rds"), version = 2)

# PPV
py_binary_weighted_PPV <- py_binary_weighted_TP / (py_binary_weighted_TP + py_binary_weighted_FP)
py_multiclass_weighted_PPV <- py_multiclass_weighted_TP / (py_multiclass_weighted_TP + py_multiclass_weighted_FP)

py_ppv <- list(
  case_weight = list(
    binary = py_binary_weighted_PPV[[1]],
    macro = mean(py_multiclass_weighted_PPV)
  )
)
saveRDS(py_ppv, test_path("py-data", "py-ppv.rds"), version = 2)

# mn_log_loss
# log_loss() requires labels in alphabetical order, because that is what LabelBinarizer() does
log_loss_two_class_example_levels <- c("Class1", "Class2")
log_loss_two_class_example_estimate <- cbind(Class1 = two_class_example$Class1, Class2 = two_class_example$Class2)

log_loss_hpc_cv_levels <- c("F", "L", "M", "VF")
log_loss_hpc_cv_estimate <- as.matrix(hpc_cv[log_loss_hpc_cv_levels])

mn_log_loss_eps <- .Machine$double.eps
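# (sklearn clips probabilities to [eps, 1 - eps]; machine epsilon keeps that
# clipping as mild as possible)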

py_mn_log_loss <- list(
  binary = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, eps = mn_log_loss_eps),
  multiclass = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps),
  binary_sum = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, normalize = FALSE),
  multiclass_sum = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, normalize = FALSE),
  case_weight = list(
    binary = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, eps = mn_log_loss_eps, sample_weight = weights_two_class_example),
    multiclass = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, sample_weight = weights_hpc_cv),
    binary_sum = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, normalize = FALSE, sample_weight = weights_two_class_example),
    multiclass_sum = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, normalize = FALSE, sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_mn_log_loss, test_path("py-data", "py-mn_log_loss.rds"), version = 2)

# PR Curve
make_py_two_class_pr_curve <- function(case_weights) {
  # - sklearn puts them in decreasing order of recall
  # - sklearn doesn't add the `Inf` threshold value but does add the `0` recall and `1` precision value
  # - sklearn stops the curve once we hit the first `recall == 1` value, we don't
  # - reticulate brings all inputs in as arrays

  if (!is.logical(case_weights) || length(case_weights) != 1L) {
    stop("`case_weights` must be a single logical.")
  }

  if (case_weights) {
    curve <- skmetrics$precision_recall_curve(
      y_true = two_class_example$truth,
      probas_pred = two_class_example$Class1,
      pos_label = "Class1",
      sample_weight = weights_two_class_example
    )
  } else {
    curve <- skmetrics$precision_recall_curve(
      y_true = two_class_example$truth,
      probas_pred = two_class_example$Class1,
      pos_label = "Class1"
    )
  }

  names(curve) <- c("precision", "recall", ".threshold")

  curve$recall <- as.vector(curve$recall)
  curve$precision <- as.vector(curve$precision)
  curve$.threshold <- as.vector(curve$.threshold)

  curve$.threshold <- c(curve$.threshold, Inf)

  curve <- tibble::tibble(!!!curve)
  curve <- dplyr::select(curve, .threshold, recall, precision)

  # Reverse rows to match yardstick
  curve <- curve[rev(seq_len(nrow(curve))), ]

  class(curve) <- c("pr_df", class(curve))

  curve
}

py_pr_curve <- list(
  binary = make_py_two_class_pr_curve(case_weights = FALSE),
  case_weight = list(
    binary = make_py_two_class_pr_curve(case_weights = TRUE)
  )
)
saveRDS(py_pr_curve, test_path("py-data", "py-pr-curve.rds"), version = 2)

# Average precision
# sklearn doesn't support >2 levels in `truth` directly, but does support
# "multi-label" `truth`, which ends up being the same thing here.
average_precision_hpc_cv_truth <- model.matrix(~ obs - 1, hpc_cv)
colnames(average_precision_hpc_cv_truth) <- levels(hpc_cv$obs)
average_precision_hpc_cv_estimate <- as.matrix(hpc_cv[levels(hpc_cv$obs)])
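
# Quick optional check: each row of the indicator matrix has exactly one 1, so
# the "multi-label" encoding above is just a one-hot version of `obs`.
if (FALSE) {
  all(rowSums(average_precision_hpc_cv_truth) == 1)
}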

py_average_precision <- list(
  binary = skmetrics$average_precision_score(two_class_example$truth, two_class_example$Class1, pos_label = "Class1"),
  macro = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "macro"),
  macro_weighted = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "weighted"),
  case_weight = list(
    binary = skmetrics$average_precision_score(two_class_example$truth, two_class_example$Class1, pos_label = "Class1", sample_weight = weights_two_class_example),
    macro = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "macro", sample_weight = weights_hpc_cv),
    macro_weighted = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "weighted", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_average_precision, test_path("py-data", "py-average-precision.rds"), version = 2)

# ROC Curve
make_py_two_class_roc_curve <- function(case_weights) {
  # - sklearn puts them in decreasing order of specificity
  # - sklearn needs the "first" row and a tweak to the threshold of the "last" row
  # - sklearn has a `drop_intermediate = TRUE` default to trim the curve
  # - reticulate brings all inputs in as arrays

  if (!is.logical(case_weights) || length(case_weights) != 1L) {
    stop("`case_weights` must be a single logical.")
  }

  if (case_weights) {
    curve <- skmetrics$roc_curve(
      y_true = two_class_example$truth,
      y_score = two_class_example$Class1,
      pos_label = "Class1",
      sample_weight = weights_two_class_example,
      drop_intermediate = FALSE
    )
  } else {
    curve <- skmetrics$roc_curve(
      y_true = two_class_example$truth,
      y_score = two_class_example$Class1,
      pos_label = "Class1",
      drop_intermediate = FALSE
    )
  }

  names(curve) <- c("fpr", "tpr", ".threshold")

  curve$fpr <- as.vector(curve$fpr)
  curve$tpr <- as.vector(curve$tpr)
  curve$.threshold <- as.vector(curve$.threshold)

  curve$sensitivity <- curve$tpr
  curve$specificity <- 1 - curve$fpr

  curve$fpr <- NULL
  curve$tpr <- NULL

  # Reverse rows to match yardstick
  curve$.threshold <- rev(curve$.threshold)
  curve$sensitivity <- rev(curve$sensitivity)
  curve$specificity <- rev(curve$specificity)

  # Add "first" row to match yardstick
  curve$.threshold <- c(-Inf, curve$.threshold)
  curve$sensitivity <- c(1, curve$sensitivity)
  curve$specificity <- c(0, curve$specificity)

  # sklearn adds the "last" row, but their threshold value is
  # `last_threshold + 1` rather than `Inf`
  curve$.threshold[length(curve$.threshold)] <- Inf

  curve <- tibble::tibble(!!!curve)
  curve <- dplyr::select(curve, .threshold, specificity, sensitivity)

  class(curve) <- c("roc_df", class(curve))

  curve
}

py_roc_curve <- list(
  binary = make_py_two_class_roc_curve(case_weights = FALSE),
  case_weight = list(
    binary = make_py_two_class_roc_curve(case_weights = TRUE)
  )
)
saveRDS(py_roc_curve, test_path("py-data", "py-roc-curve.rds"), version = 2)

# ROC AUC Score
# - For binary, sklearn expects the probability of the class with the "greater
#   label", so if you sort c("Class1", "Class2") and take the 2nd one, you get
#   Class2, which is what we have to provide.
# - Neither sklearn nor yardstick support case weights in combination with the
#   Hand Till method
roc_auc_hpc_cv_levels <- sort(levels(hpc_cv$obs))
roc_auc_hpc_cv_estimate <- as.matrix(dplyr::select(hpc_cv, !!!syms(roc_auc_hpc_cv_levels)))

py_roc_auc <- list(
  binary = skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class2),
  macro = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovr"),
  macro_weighted = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "weighted", labels = roc_auc_hpc_cv_levels, multi_class = "ovr"),
  hand_till = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovo"),
  case_weight = list(
    binary = skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class2, sample_weight = weights_two_class_example),
    macro = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovr", sample_weight = weights_hpc_cv),
    macro_weighted = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "weighted", labels = roc_auc_hpc_cv_levels, multi_class = "ovr", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_roc_auc, test_path("py-data", "py-roc-auc.rds"), version = 2)
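
# Optional spot check (a sketch, not part of the saved output): yardstick's
# default event level is Class1, and flipping both the event level and the
# probability column leaves the AUC unchanged, so this should match the sklearn
# value computed from the Class2 column above.
if (FALSE) {
  all.equal(py_roc_auc$binary, roc_auc(two_class_example, truth, Class1)$.estimate)
}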

# Brier survival
lung_surv <- data_lung_surv()
lung_surv_unnest <- tidyr::unnest(lung_surv, cols = c(.pred))

sksurv_obj <- sksurv_util$Surv$from_arrays(
  event = lung_surv$surv_obj[, "status"],
  time = lung_surv$surv_obj[, "time"]
)

eval_time_unique <- unique(lung_surv_unnest$.eval_time)

py_brier_survival <- vapply(
  X = eval_time_unique,
  FUN.VALUE = numeric(1),
  FUN = function(x) {
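    # sksurv's brier_score() returns a pair of (times, scores); element 2 holds
    # the Brier score at the single evaluation time `x`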
    sksurv_metrics$brier_score(
      sksurv_obj, sksurv_obj,
      dplyr::filter(lung_surv_unnest, .eval_time == x)$.pred_survival,
      x
    )[[2]]
  }
)

names(py_brier_survival) <- eval_time_unique
saveRDS(py_brier_survival, test_path("py-data", "py-brier-survival.rds"), version = 2)

# Fairness metrics via fairlearn
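# hpc_cv is collapsed to a binary "VF" vs. not-"VF" problem, with the resample
# id as the sensitive feature.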
py_demographic_parity <-
  list(
    binary =
      fairlearn$demographic_parity_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample
      ),
    weighted =
      fairlearn$demographic_parity_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample,
        sample_weight = weights_hpc_cv
      )
  )

saveRDS(
  py_demographic_parity,
  test_path("py-data", "py-demographic_parity.rds"),
  version = 2
)

py_equalized_odds <-
  list(
    binary =
      fairlearn$equalized_odds_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample
      ),
    weighted =
      fairlearn$equalized_odds_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample,
        sample_weight = weights_hpc_cv
      )
  )

saveRDS(
  py_equalized_odds,
  test_path("py-data", "py-equalized_odds.rds"),
  version = 2
)

py_equal_opportunity <-
  list(
    binary =
      fairlearn$true_positive_rate_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample
      ),
    weighted =
      fairlearn$true_positive_rate_difference(
        hpc_cv$obs == "VF",
        hpc_cv$pred == "VF",
        sensitive_features = hpc_cv$Resample,
        sample_weight = weights_hpc_cv
      )
  )

saveRDS(
  py_equal_opportunity,
  test_path("py-data", "py-equal_opportunity.rds"),
  version = 2
)