# reticulate::install_python("3.10.4")
# reticulate::virtualenv_create("yardstick-environment", version = "3.10.4")
# reticulate::use_virtualenv("yardstick-environment")
# reticulate::py_install("scikit-learn") # 1.2.0 is what was downloaded
# reticulate::py_install("scikit-survival") # 1.2.0 is what was downloaded
# reticulate::py_install("fairlearn") # 0.8.0 is what was downloaded
library(reticulate)
# Inside yardstick
devtools::load_all()
use_virtualenv("yardstick-environment")
skmetrics <- import("sklearn.metrics")
sksurv_metrics <- import("sksurv.metrics")
sksurv_util <- import("sksurv.util", convert = FALSE)
fairlearn <- import("fairlearn.metrics")
data("hpc_cv")
data("two_class_example")
data("solubility_test")
weights_hpc_cv <- read_weights_hpc_cv()
weights_two_class_example <- read_weights_two_class_example()
weights_solubility_test <- read_weights_solubility_test()
save_metric_results <- function(nm, fn, ..., average = c("macro", "micro", "weighted")) {
  # Two class metrics
  res <- list(fn(two_class_example$truth, two_class_example$predicted, ..., pos_label = "Class1"))

  # Multiclass metrics
  res2 <- lapply(average, function(.x) fn(hpc_cv$obs, hpc_cv$pred, ..., average = .x))

  # Two class weighted metrics
  case_weight <- list(fn(
    two_class_example$truth,
    two_class_example$predicted,
    ...,
    pos_label = "Class1",
    sample_weight = weights_two_class_example
  ))

  # Multiclass weighted metrics
  case_weight2 <- lapply(
    average,
    function(.x) fn(hpc_cv$obs, hpc_cv$pred, ..., average = .x, sample_weight = weights_hpc_cv)
  )

  case_weight <- c(case_weight, case_weight2)
  names(case_weight) <- c("binary", average)

  res <- c(res, res2, list(case_weight))
  names(res) <- c("binary", average, "case_weight")

  saveRDS(res, test_path("py-data", paste0("py-", nm, ".rds")), version = 2)
}
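
# Each save_metric_results() call below writes a list shaped roughly like this
# (illustrative sketch of the structure only; values omitted):
# list(
#   binary = <two-class score>,
#   macro = <...>, micro = <...>, weighted = <...>,
#   case_weight = list(binary = <...>, macro = <...>, micro = <...>, weighted = <...>)
# )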
# ------------------------------------------------------------------------------
# Table statistics used in other metrics
# Note: Truth down the left side, Predicted across the top, unlike
# yardstick's table function. But this matches most sklearn references better.
py_binary_weighted_confusion <- skmetrics$confusion_matrix(
  y_true = two_class_example$truth,
  y_pred = two_class_example$predicted,
  sample_weight = weights_two_class_example
)
py_multiclass_weighted_confusion <- skmetrics$confusion_matrix(
  y_true = hpc_cv$obs,
  y_pred = hpc_cv$pred,
  sample_weight = weights_hpc_cv
)
py_binary_weighted_FP <- colSums(py_binary_weighted_confusion) - diag(py_binary_weighted_confusion)
py_binary_weighted_FN <- rowSums(py_binary_weighted_confusion) - diag(py_binary_weighted_confusion)
py_binary_weighted_TP <- diag(py_binary_weighted_confusion)
py_binary_weighted_TN <- sum(py_binary_weighted_confusion) - (py_binary_weighted_FP + py_binary_weighted_FN + py_binary_weighted_TP)
py_multiclass_weighted_FP <- colSums(py_multiclass_weighted_confusion) - diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_FN <- rowSums(py_multiclass_weighted_confusion) - diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_TP <- diag(py_multiclass_weighted_confusion)
py_multiclass_weighted_TN <- sum(py_multiclass_weighted_confusion) - (py_multiclass_weighted_FP + py_multiclass_weighted_FN + py_multiclass_weighted_TP)
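
# A quick, illustrative check of the orientation note above (not saved, kept
# commented out): the transpose of the *unweighted* sklearn matrix should match
# base R's table(predicted, truth), which uses yardstick's orientation.
# py_binary_unweighted_confusion <- skmetrics$confusion_matrix(
#   y_true = two_class_example$truth,
#   y_pred = two_class_example$predicted
# )
# all.equal(
#   unname(unclass(table(two_class_example$predicted, two_class_example$truth))),
#   t(py_binary_unweighted_confusion)
# )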
# ------------------------------------------------------------------------------
# Save metrics results
save_metric_results("precision", skmetrics$precision_score)
save_metric_results("recall", skmetrics$recall_score)
save_metric_results("f_meas", skmetrics$f1_score)
save_metric_results("f_meas_beta_.5", skmetrics$fbeta_score, beta = .5)
# MCC
py_mcc <- list(
  binary = skmetrics$matthews_corrcoef(two_class_example$truth, two_class_example$predicted),
  multiclass = skmetrics$matthews_corrcoef(hpc_cv$obs, hpc_cv$pred),
  case_weight = list(
    binary = skmetrics$matthews_corrcoef(two_class_example$truth, two_class_example$predicted, sample_weight = weights_two_class_example),
    multiclass = skmetrics$matthews_corrcoef(hpc_cv$obs, hpc_cv$pred, sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_mcc, test_path("py-data", "py-mcc.rds"), version = 2)
# Accuracy
py_accuracy <- list(
  binary = skmetrics$accuracy_score(two_class_example$truth, two_class_example$predicted),
  multiclass = skmetrics$accuracy_score(hpc_cv$obs, hpc_cv$pred)
)
saveRDS(py_accuracy, test_path("py-data", "py-accuracy.rds"), version = 2)
# Kappa
py_kap <- list(
  binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth)),
  multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs)),
  linear_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "linear"),
  linear_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "linear"),
  quadratic_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "quadratic"),
  quadratic_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "quadratic"),
  case_weight = list(
    binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), sample_weight = weights_two_class_example),
    multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), sample_weight = weights_hpc_cv),
    linear_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "linear", sample_weight = weights_two_class_example),
    linear_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "linear", sample_weight = weights_hpc_cv),
    quadratic_binary = skmetrics$cohen_kappa_score(two_class_example$truth, two_class_example$predicted, labels = levels(two_class_example$truth), weights = "quadratic", sample_weight = weights_two_class_example),
    quadratic_multiclass = skmetrics$cohen_kappa_score(hpc_cv$obs, hpc_cv$pred, labels = levels(hpc_cv$obs), weights = "quadratic", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_kap, test_path("py-data", "py-kap.rds"), version = 2)
# RMSE
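# sklearn's mean_squared_error() returns the MSE, so sqrt() is applied below to
# get an RMSE comparable to yardstick's rmse(). In scikit-learn 1.2 the same
# value can also be requested directly (equivalent, shown for reference only):
# skmetrics$mean_squared_error(
#   y_true = solubility_test$solubility,
#   y_pred = solubility_test$prediction,
#   sample_weight = weights_solubility_test,
#   squared = FALSE
# )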
py_rmse <- list(
  case_weight = sqrt(skmetrics$mean_squared_error(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  ))
)
saveRDS(py_rmse, test_path("py-data", "py-rmse.rds"), version = 2)
# MAE
py_mae <- list(
  case_weight = skmetrics$mean_absolute_error(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  )
)
saveRDS(py_mae, test_path("py-data", "py-mae.rds"), version = 2)
# MAPE
# (Zeros purposefully cause infinite results in our R metrics, see #271)
zero_solubility <- solubility_test$solubility == 0
solubility_test_not_zero <- solubility_test[!zero_solubility, ]
py_mape <- list(
  case_weight = skmetrics$mean_absolute_percentage_error(
    y_true = solubility_test_not_zero$solubility,
    y_pred = solubility_test_not_zero$prediction,
    sample_weight = weights_solubility_test[!zero_solubility]
  )
)
saveRDS(py_mape, test_path("py-data", "py-mape.rds"), version = 2)
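
# Why the zero-truth rows are dropped above: MAPE divides by the true value, so
# a zero in `truth` produces an infinite result. A minimal illustration with
# made-up numbers (not saved):
# mean(abs((c(0, 1) - c(0.1, 0.9)) / c(0, 1))) * 100  # Inf, driven by the zero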
# R^2 Traditional
py_rsq_trad <- list(
  case_weight = skmetrics$r2_score(
    y_true = solubility_test$solubility,
    y_pred = solubility_test$prediction,
    sample_weight = weights_solubility_test
  )
)
saveRDS(py_rsq_trad, test_path("py-data", "py-rsq-trad.rds"), version = 2)
# Balanced Accuracy
# Not comparing multiclass against sklearn here, because they use a different definition
py_bal_accuracy <- list(
  case_weight = list(
    binary = skmetrics$balanced_accuracy_score(
      y_true = two_class_example$truth,
      y_pred = two_class_example$predicted,
      sample_weight = weights_two_class_example
    )
  )
)
saveRDS(py_bal_accuracy, test_path("py-data", "py-bal-accuracy.rds"), version = 2)
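
# A hedged sketch of the definitional difference, for reference only (nothing
# here is saved): sklearn's balanced accuracy is the mean of the per-class
# recalls, while yardstick's macro bal_accuracy() averages per-class
# (sensitivity + specificity) / 2, so the two only coincide in the binary case.
# sklearn-style, from the weighted multiclass confusion matrix:
# mean(diag(py_multiclass_weighted_confusion) / rowSums(py_multiclass_weighted_confusion))
# yardstick-style, from the one-vs-all counts computed earlier:
# mean((py_multiclass_weighted_TP / (py_multiclass_weighted_TP + py_multiclass_weighted_FN) +
#       py_multiclass_weighted_TN / (py_multiclass_weighted_TN + py_multiclass_weighted_FP)) / 2)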
# NPV
py_binary_weighted_NPV <- py_binary_weighted_TN / (py_binary_weighted_TN + py_binary_weighted_FN)
py_multiclass_weighted_NPV <- py_multiclass_weighted_TN / (py_multiclass_weighted_TN + py_multiclass_weighted_FN)
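# The `[[1]]` indexing below (and in the PPV block) picks out the first class.
# sklearn sorts labels alphabetically and the matrices come back from
# reticulate without dimnames, so for two_class_example the first element
# corresponds to "Class1", the event level used throughout this script.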
py_npv <- list(
  case_weight = list(
    binary = py_binary_weighted_NPV[[1]],
    macro = mean(py_multiclass_weighted_NPV)
  )
)
saveRDS(py_npv, test_path("py-data", "py-npv.rds"), version = 2)
# PPV
py_binary_weighted_PPV <- py_binary_weighted_TP / (py_binary_weighted_TP + py_binary_weighted_FP)
py_multiclass_weighted_PPV <- py_multiclass_weighted_TP / (py_multiclass_weighted_TP + py_multiclass_weighted_FP)
py_ppv <- list(
  case_weight = list(
    binary = py_binary_weighted_PPV[[1]],
    macro = mean(py_multiclass_weighted_PPV)
  )
)
saveRDS(py_ppv, test_path("py-data", "py-ppv.rds"), version = 2)
# mn_log_loss
# log_loss() requires labels in alphabetical order, because that is what LabelBinarizer() does
log_loss_two_class_example_levels <- c("Class1", "Class2")
log_loss_two_class_example_estimate <- cbind(Class1 = two_class_example$Class1, Class2 = two_class_example$Class2)
log_loss_hpc_cv_levels <- c("F", "L", "M", "VF")
log_loss_hpc_cv_estimate <- as.matrix(hpc_cv[log_loss_hpc_cv_levels])
mn_log_loss_eps <- .Machine$double.eps
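
# `eps` is set to machine epsilon, presumably so that sklearn's clipping of
# probabilities to [eps, 1 - eps] is effectively a no-op and does not affect
# the comparison. A minimal sketch of the unweighted binary value computed
# below (illustrative only, not saved):
# p_truth <- ifelse(two_class_example$truth == "Class1",
#                   two_class_example$Class1, two_class_example$Class2)
# mean(-log(p_truth))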
py_mn_log_loss <- list(
  binary = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, eps = mn_log_loss_eps),
  multiclass = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps),
  binary_sum = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, normalize = FALSE),
  multiclass_sum = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, normalize = FALSE),
  case_weight = list(
    binary = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, eps = mn_log_loss_eps, sample_weight = weights_two_class_example),
    multiclass = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, sample_weight = weights_hpc_cv),
    binary_sum = skmetrics$log_loss(two_class_example$truth, log_loss_two_class_example_estimate, labels = log_loss_two_class_example_levels, normalize = FALSE, sample_weight = weights_two_class_example),
    multiclass_sum = skmetrics$log_loss(hpc_cv$obs, log_loss_hpc_cv_estimate, labels = log_loss_hpc_cv_levels, eps = mn_log_loss_eps, normalize = FALSE, sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_mn_log_loss, test_path("py-data", "py-mn_log_loss.rds"), version = 2)
# PR Curve
make_py_two_class_pr_curve <- function(case_weights) {
  # - sklearn puts them in decreasing order of recall
  # - sklearn doesn't add the `Inf` threshold value but does add the `0` recall and `1` precision value
  # - sklearn stops the curve once we hit the first `recall == 1` value, we don't
  # - reticulate brings all inputs in as arrays
  if (!is.logical(case_weights) || length(case_weights) != 1L) {
    stop("`case_weights` must be a single logical.")
  }

  if (case_weights) {
    curve <- skmetrics$precision_recall_curve(
      y_true = two_class_example$truth,
      probas_pred = two_class_example$Class1,
      pos_label = "Class1",
      sample_weight = weights_two_class_example
    )
  } else {
    curve <- skmetrics$precision_recall_curve(
      y_true = two_class_example$truth,
      probas_pred = two_class_example$Class1,
      pos_label = "Class1"
    )
  }

  names(curve) <- c("precision", "recall", ".threshold")

  curve$recall <- as.vector(curve$recall)
  curve$precision <- as.vector(curve$precision)
  curve$.threshold <- as.vector(curve$.threshold)
  curve$.threshold <- c(curve$.threshold, Inf)

  curve <- tibble::tibble(!!!curve)
  curve <- dplyr::select(curve, .threshold, recall, precision)

  # Reverse rows to match yardstick
  curve <- curve[rev(seq_len(nrow(curve))), ]

  class(curve) <- c("pr_df", class(curve))

  curve
}
py_pr_curve <- list(
  binary = make_py_two_class_pr_curve(case_weights = FALSE),
  case_weight = list(
    binary = make_py_two_class_pr_curve(case_weights = TRUE)
  )
)
saveRDS(py_pr_curve, test_path("py-data", "py-pr-curve.rds"), version = 2)
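
# For eyeballing against yardstick (not saved): the native curve can be
# computed with the call below. Per the notes in make_py_two_class_pr_curve(),
# the two curves are expected to differ at the ends, so this is only a rough
# visual check, not an exact comparison.
# pr_curve(two_class_example, truth, Class1)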
# Average precision
# sklearn doesn't support >2 levels in `truth` directly, but does support
# "multi-label" `truth`, which ends up being the same thing here.
average_precision_hpc_cv_truth <- model.matrix(~ obs - 1, hpc_cv)
colnames(average_precision_hpc_cv_truth) <- levels(hpc_cv$obs)
average_precision_hpc_cv_estimate <- as.matrix(hpc_cv[levels(hpc_cv$obs)])
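# model.matrix() one-hot encodes `obs` into an n x 4 indicator matrix, one
# column per class, which is the "multi-label" truth format sklearn accepts.
# Quick look for illustration (not saved):
# head(average_precision_hpc_cv_truth)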
py_average_precision <- list(
  binary = skmetrics$average_precision_score(two_class_example$truth, two_class_example$Class1, pos_label = "Class1"),
  macro = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "macro"),
  macro_weighted = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "weighted"),
  case_weight = list(
    binary = skmetrics$average_precision_score(two_class_example$truth, two_class_example$Class1, pos_label = "Class1", sample_weight = weights_two_class_example),
    macro = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "macro", sample_weight = weights_hpc_cv),
    macro_weighted = skmetrics$average_precision_score(average_precision_hpc_cv_truth, average_precision_hpc_cv_estimate, average = "weighted", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_average_precision, test_path("py-data", "py-average-precision.rds"), version = 2)
# ROC Curve
make_py_two_class_roc_curve <- function(case_weights) {
  # - sklearn puts them in decreasing order of specificity
  # - sklearn needs the "first" row and a tweak to the threshold of the "last" row
  # - sklearn has a `drop_intermediate = TRUE` default to trim the curve
  # - reticulate brings all inputs in as arrays
  if (!is.logical(case_weights) || length(case_weights) != 1L) {
    stop("`case_weights` must be a single logical.")
  }

  if (case_weights) {
    curve <- skmetrics$roc_curve(
      y_true = two_class_example$truth,
      y_score = two_class_example$Class1,
      pos_label = "Class1",
      sample_weight = weights_two_class_example,
      drop_intermediate = FALSE
    )
  } else {
    curve <- skmetrics$roc_curve(
      y_true = two_class_example$truth,
      y_score = two_class_example$Class1,
      pos_label = "Class1",
      drop_intermediate = FALSE
    )
  }

  names(curve) <- c("fpr", "tpr", ".threshold")

  curve$fpr <- as.vector(curve$fpr)
  curve$tpr <- as.vector(curve$tpr)
  curve$.threshold <- as.vector(curve$.threshold)

  curve$sensitivity <- curve$tpr
  curve$specificity <- 1 - curve$fpr
  curve$fpr <- NULL
  curve$tpr <- NULL

  # Reverse rows to match yardstick
  curve$.threshold <- rev(curve$.threshold)
  curve$sensitivity <- rev(curve$sensitivity)
  curve$specificity <- rev(curve$specificity)

  # Add "first" row to match yardstick
  curve$.threshold <- c(-Inf, curve$.threshold)
  curve$sensitivity <- c(1, curve$sensitivity)
  curve$specificity <- c(0, curve$specificity)

  # sklearn adds the "last" row, but their threshold value is
  # `last_threshold + 1` rather than `Inf`
  curve$.threshold[length(curve$.threshold)] <- Inf

  curve <- tibble::tibble(!!!curve)
  curve <- dplyr::select(curve, .threshold, specificity, sensitivity)

  class(curve) <- c("roc_df", class(curve))

  curve
}
py_roc_curve <- list(
  binary = make_py_two_class_roc_curve(case_weights = FALSE),
  case_weight = list(
    binary = make_py_two_class_roc_curve(case_weights = TRUE)
  )
)
saveRDS(py_roc_curve, test_path("py-data", "py-roc-curve.rds"), version = 2)
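
# As with the PR curve, the yardstick curve can be computed directly for a
# rough visual comparison (not saved):
# roc_curve(two_class_example, truth, Class1)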
# ROC AUC Score
# - For binary, sklearn expects the probability of the class with the "greater
# label", so if you sort c("Class1", "Class2") and take the 2nd one, you get
# Class2, which is what we have to provide.
# - Neither sklearn nor yardstick support case weights in combination with the
# Hand Till method
roc_auc_hpc_cv_levels <- sort(levels(hpc_cv$obs))
roc_auc_hpc_cv_estimate <- as.matrix(dplyr::select(hpc_cv, !!!syms(roc_auc_hpc_cv_levels)))
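# A quick sanity check of the "greater label" convention, illustrative only
# (not saved): with two classes, the AUC from the Class1 probabilities should
# equal one minus the AUC from the Class2 probabilities.
# all.equal(
#   skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class1),
#   1 - skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class2)
# )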
py_roc_auc <- list(
  binary = skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class2),
  macro = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovr"),
  macro_weighted = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "weighted", labels = roc_auc_hpc_cv_levels, multi_class = "ovr"),
  hand_till = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovo"),
  case_weight = list(
    binary = skmetrics$roc_auc_score(two_class_example$truth, two_class_example$Class2, sample_weight = weights_two_class_example),
    macro = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "macro", labels = roc_auc_hpc_cv_levels, multi_class = "ovr", sample_weight = weights_hpc_cv),
    macro_weighted = skmetrics$roc_auc_score(hpc_cv$obs, roc_auc_hpc_cv_estimate, average = "weighted", labels = roc_auc_hpc_cv_levels, multi_class = "ovr", sample_weight = weights_hpc_cv)
  )
)
saveRDS(py_roc_auc, test_path("py-data", "py-roc-auc.rds"), version = 2)
# Brier survival
lung_surv <- data_lung_surv()
lung_surv_unnest <- tidyr::unnest(lung_surv, cols = c(.pred))
sksurv_obj <- sksurv_util$Surv$from_arrays(
  event = lung_surv$surv_obj[, "status"],
  time = lung_surv$surv_obj[, "time"]
)
eval_time_unique <- unique(lung_surv_unnest$.eval_time)
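# sksurv's brier_score() returns a (times, scores) pair, so `[[2]]` below
# extracts the score itself; `[[1]]` just echoes the evaluation time. Roughly,
# for a single time point the call looks like the sketch below, where
# `estimate_at_time` is a placeholder for the survival probabilities at that
# time (illustrative only):
# sksurv_metrics$brier_score(sksurv_obj, sksurv_obj, estimate_at_time, time)[[2]]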
py_brier_survival <- vapply(
  X = eval_time_unique,
  FUN.VALUE = numeric(1),
  FUN = function(x) {
    sksurv_metrics$brier_score(
      sksurv_obj, sksurv_obj,
      dplyr::filter(lung_surv_unnest, .eval_time == x)$.pred_survival,
      x
    )[[2]]
  }
)
names(py_brier_survival) <- eval_time_unique
saveRDS(py_brier_survival, test_path("py-data", "py-brier-survival.rds"), version = 2)
# Fairness metrics via fairlearn
py_demographic_parity <- list(
  binary = fairlearn$demographic_parity_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample
  ),
  weighted = fairlearn$demographic_parity_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample,
    sample_weight = weights_hpc_cv
  )
)
saveRDS(
  py_demographic_parity,
  test_path("py-data", "py-demographic_parity.rds"),
  version = 2
)
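
# Hedged sketch of what demographic_parity_difference() measures: with the
# default "between_groups" method it is the spread (max minus min) of the
# selection rate P(pred == "VF") across the sensitive feature's groups.
# Illustrative only, not saved:
# sel_rates <- tapply(hpc_cv$pred == "VF", hpc_cv$Resample, mean)
# max(sel_rates) - min(sel_rates)  # should agree with the unweighted value above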
py_equalized_odds <- list(
  binary = fairlearn$equalized_odds_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample
  ),
  weighted = fairlearn$equalized_odds_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample,
    sample_weight = weights_hpc_cv
  )
)
saveRDS(
  py_equalized_odds,
  test_path("py-data", "py-equalized_odds.rds"),
  version = 2
)
py_equal_opportunity <- list(
  binary = fairlearn$true_positive_rate_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample
  ),
  weighted = fairlearn$true_positive_rate_difference(
    hpc_cv$obs == "VF",
    hpc_cv$pred == "VF",
    sensitive_features = hpc_cv$Resample,
    sample_weight = weights_hpc_cv
  )
)
saveRDS(
  py_equal_opportunity,
  test_path("py-data", "py-equal_opportunity.rds"),
  version = 2
)