#' @title Find the data points that were hardest to predict
#' @aliases hardest
#' @description
#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
#' Finds the data points that, overall, were the most challenging to predict,
#' based on a prediction metric.
#' @param data \code{data.frame} with predictions, targets and observation IDs.
#' Can be grouped by \code{\link[dplyr:group_by]{dplyr::group_by()}}.
#'
#' Predictions can be passed as values, predicted classes or predicted probabilities:
#'
#' \strong{N.B.} Adds \code{\link[base:.Machine]{.Machine$double.eps}} to all probabilities to avoid \code{log(0)}.
#'
#' \subsection{Multinomial}{
#' When \code{`type`} is \code{"multinomial"}, the predictions can be passed in one of two formats.
#'
#' \subsection{Probabilities (Preferable)}{
#'
#' One column per class with the probability of that class.
#' The columns should have the name of their class,
#' as they are named in the target column. E.g.:
#'
#' \tabular{rrrr}{
#' \strong{class_1} \tab \strong{class_2} \tab
#' \strong{class_3} \tab \strong{target}\cr
#' 0.269 \tab 0.528 \tab 0.203 \tab class_2\cr
#' 0.368 \tab 0.322 \tab 0.310 \tab class_3\cr
#' 0.375 \tab 0.371 \tab 0.254 \tab class_2\cr
#' ... \tab ... \tab ... \tab ...}
#' }
#' \subsection{Classes}{
#'
#' A single column of type \code{character} with the predicted classes. E.g.:
#'
#' \tabular{rr}{
#' \strong{prediction} \tab \strong{target}\cr
#' class_2 \tab class_2\cr
#' class_1 \tab class_3\cr
#' class_1 \tab class_2\cr
#' ... \tab ...}
#'
#' }
#' }
#' \subsection{Binomial}{
#' When \code{`type`} is \code{"binomial"}, the predictions can be passed in one of two formats.
#'
#' \subsection{Probabilities (Preferable)}{
#' One column with the \strong{probability of the
#' second class alphabetically}
#' ("dog" if the classes are "cat" and "dog"). E.g.:
#'
#' \tabular{rr}{
#' \strong{prediction} \tab \strong{target}\cr
#' 0.769 \tab "dog"\cr
#' 0.368 \tab "dog"\cr
#' 0.375 \tab "cat"\cr
#' ... \tab ...}
#' }
#'
#' Note: When the class labels are ordered alphabetically, they are treated as type \code{character},
#' which is why e.g. \code{100} would come before \code{7}.
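#'
#' A quick sanity check of this ordering in base R:
#' \preformatted{
#' sort(c("100", "7", "cat"))
#' ## [1] "100" "7"   "cat"
#' }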
#'
#' \subsection{Classes}{
#'
#' A single column of type \code{character} with the predicted classes. E.g.:
#'
#' \tabular{rr}{
#' \strong{prediction} \tab \strong{target}\cr
#' class_0 \tab class_1\cr
#' class_1 \tab class_1\cr
#' class_1 \tab class_0\cr
#' ... \tab ...}
#' }
#' }
#' \subsection{Gaussian}{
#' When \code{`type`} is \code{"gaussian"}, the predictions should be passed as
#' one column with the predicted values. E.g.:
#'
#' \tabular{rr}{
#' \strong{prediction} \tab \strong{target}\cr
#' 28.9 \tab 30.2\cr
#' 33.2 \tab 27.1\cr
#' 23.4 \tab 21.3\cr
#' ... \tab ...}
#' }
#'
#' @param obs_id_col Name of column with observation IDs. This will be used to aggregate
#' the performance of each observation.
#' @param target_col Name of column with the true classes/values in \code{`data`}.
#' @param prediction_cols Name(s) of column(s) with the predictions.
#' @param type Type of task used to get the predictions:
#'
#' \code{"gaussian"} for regression (like linear regression).
#'
#' \code{"binomial"} for binary classification.
#'
#' \code{"multinomial"} for multiclass classification.
#' @param threshold Threshold to filter observations by. Depends on \code{`type`} and \code{`threshold_is`}.
#'
#' The \code{threshold} can either be a \strong{percentage} or a \strong{score}.
#' For percentages, a lower \code{threshold}
#' returns fewer observations. For scores, this depends on \code{`type`}.
#'
#' \subsection{Gaussian}{
#' \subsection{threshold_is "percentage"}{
#' (Approximate) percentage of the observations with the largest root mean square errors
#' to return.
#' }
#' \subsection{threshold_is "score"}{
#' Observations with a root mean square error larger than or equal to the \code{threshold} will be returned.
#' }
#' }
#' \subsection{Binomial, Multinomial}{
#' \subsection{threshold_is "percentage"}{
#' (Approximate) percentage of the observations to return with:
#'
#' \code{MAE}, \code{Cross Entropy}: Highest error scores.
#'
#' \code{Accuracy}: Lowest accuracies.
#' }
#' \subsection{threshold_is "score"}{
#' \code{MAE}, \code{Cross Entropy}: Observations with an error score above or equal to the threshold will be returned.
#'
#' \code{Accuracy}: Observations with an accuracy below or equal to the threshold will be returned.
#'
#' }
#' }
#' @param threshold_is Either \code{"score"} or \code{"percentage"}. See \code{`threshold`}.
#' @param metric The metric to use. If \code{NULL},
#' the default metric depends on the format of the prediction columns.
#'
#' \subsection{Binomial, Multinomial}{
#' \code{"Accuracy"}, \code{"MAE"} or \code{"Cross Entropy"}.
#'
#' When \emph{one} prediction column with predicted \emph{classes} is passed,
#' the default is \code{"Accuracy"}.
#' In this configuration, the other metrics are not calculated.
#'
#' When \emph{one or more} prediction columns with predicted \emph{probabilities} are passed,
#' the default is \code{"MAE"}. This is the Mean Absolute Error of the
#' probability of the target class.
#' }
#'
#' \subsection{Gaussian}{
#' Ignored. Always uses \code{"RMSE"}.
#' }
#' @param cutoff Threshold for predicted classes. (Numeric)
#'
#' N.B. \strong{Binomial only}.
#' @return \code{data.frame} with the most challenging observations and their metrics.
#'
#' The \code{`>=`} / \code{`<=`} column contains the threshold as a score.
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @examples
#' \donttest{
#' # Attach packages
#' library(cvms)
#' library(dplyr)
#'
#' ##
#' ## Multinomial
#' ##
#'
#' # Find the most challenging data points (per classifier)
#' # in the predicted.musicians dataset
#' # which resembles the "Predictions" tibble from the evaluation results
#'
#' # Passing predicted probabilities
#' # Observations with 30% highest MAE scores
#' most_challenging(
#' predicted.musicians,
#' obs_id_col = "ID",
#' prediction_cols = c("A", "B", "C", "D"),
#' type = "multinomial",
#' threshold = 0.30
#' )
#'
#' # Observations with 25% highest Cross Entropy scores
#' most_challenging(
#' predicted.musicians,
#' obs_id_col = "ID",
#' prediction_cols = c("A", "B", "C", "D"),
#' type = "multinomial",
#' threshold = 0.25,
#' metric = "Cross Entropy"
#' )
#'
#' # Passing predicted classes
#' # Observations with 30% lowest Accuracy scores
#' most_challenging(
#' predicted.musicians,
#' obs_id_col = "ID",
#' prediction_cols = "Predicted Class",
#' type = "multinomial",
#' threshold = 0.30
#' )
#'
#' # The 40% with the lowest accuracy, per classifier
#' predicted.musicians %>%
#' dplyr::group_by(Classifier) %>%
#' most_challenging(
#' obs_id_col = "ID",
#' prediction_cols = "Predicted Class",
#' type = "multinomial",
#' threshold = 0.40
#' )
#'
#' # Accuracy scores at or below 0.05
#' most_challenging(
#' predicted.musicians,
#' obs_id_col = "ID",
#' type = "multinomial",
#' threshold = 0.05,
#' threshold_is = "score"
#' )
#'
#' ##
#' ## Binomial
#' ##
#'
#' # Subset the predicted.musicians dataset
#' binom_data <- predicted.musicians %>%
#' dplyr::filter(Target %in% c("A", "B")) %>%
#' dplyr::rename(Prediction = B)
#'
#' # Passing probabilities
#' # Observations with 30% highest MAE
#' most_challenging(
#' binom_data,
#' obs_id_col = "ID",
#' type = "binomial",
#' prediction_cols = "Prediction",
#' threshold = 0.30
#' )
#'
#' # Observations with 30% highest Cross Entropy
#' most_challenging(
#' binom_data,
#' obs_id_col = "ID",
#' type = "binomial",
#' prediction_cols = "Prediction",
#' threshold = 0.30,
#' metric = "Cross Entropy"
#' )
#'
#' # Passing predicted classes
#' # Observations with 30% lowest Accuracy scores
#' most_challenging(
#' binom_data,
#' obs_id_col = "ID",
#' type = "binomial",
#' prediction_cols = "Predicted Class",
#' threshold = 0.30
#' )
#'
#' ##
#' ## Gaussian
#' ##
#'
#' set.seed(1)
#'
#' df <- data.frame(
#' "Observation" = rep(1:10, n = 3),
#' "Target" = rnorm(n = 30, mean = 25, sd = 5),
#' "Prediction" = rnorm(n = 30, mean = 27, sd = 7)
#' )
#'
#' # The 20% highest RMSE scores
#' most_challenging(
#' df,
#' type = "gaussian",
#' threshold = 0.2
#' )
#'
#' # RMSE scores at or above 9
#' most_challenging(
#' df,
#' type = "gaussian",
#' threshold = 9,
#' threshold_is = "score"
#' )
#' }
most_challenging <- function(data,
type,
obs_id_col = "Observation",
target_col = "Target",
prediction_cols = ifelse(
type == "gaussian",
"Prediction",
"Predicted Class"
),
threshold = 0.15,
threshold_is = "percentage",
metric = NULL,
cutoff = 0.5) {
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_data_frame(
x = data, min.rows = 1,
min.cols = 3,
add = assert_collection
)
checkmate::assert_string(x = obs_id_col, min.chars = 1, add = assert_collection)
checkmate::assert_string(x = target_col, min.chars = 1, add = assert_collection)
checkmate::assert_character(x = prediction_cols, min.chars = 1,
min.len = 1, any.missing = FALSE,
add = assert_collection)
checkmate::assert_choice(
x = type, choices = c("gaussian", "binomial", "multinomial"),
add = assert_collection
)
if (type != "gaussian"){
checkmate::assert_choice(
x = metric, choices = c("Accuracy", "Cross Entropy", "MAE"),
null.ok = TRUE,
add = assert_collection
)
} else {
checkmate::assert_choice(
x = metric, choices = c("RMSE"),
null.ok = TRUE,
add = assert_collection
)
}
checkmate::assert_choice(
x = threshold_is, choices = c("score", "percentage"),
add = assert_collection
)
checkmate::assert_number(
x = cutoff,
lower = 0,
upper = 1,
add = assert_collection
)
checkmate::assert_number(x = threshold, add = assert_collection)
if (threshold_is == "percentage" &&
!is_between_(threshold, 0.0, 1.0)) {
assert_collection$push(
"when 'threshold_is' a percentage, 'threshold' must be between 0 and 1."
)
}
checkmate::reportAssertions(assert_collection)
checkmate::assert_names(
x = names(data),
must.include = c(obs_id_col,
target_col,
prediction_cols),
add = assert_collection)
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
# If the dataset is grouped, we need the indices and keys for the groups
# so we can evaluate group wise
# grouping_factor <- dplyr::group_indices(data)
grouping_keys <- dplyr::group_keys(data)
data <- dplyr::ungroup(data)
if (obs_id_col %in% colnames(grouping_keys)){
assert_collection$push(
"'data' cannot be grouped by the 'obs_id_col'. This is done internally."
)
checkmate::reportAssertions(assert_collection)
}
# Select relevant columns
data <- data[, c(
colnames(grouping_keys),
obs_id_col,
target_col,
prediction_cols
)]
# Possibly multiple prediction cols
# that should be transformed to probability of target class
prepped_predictions <-
prepare_predictions(
data = data,
prediction_cols = prediction_cols,
target_col = target_col,
cutoff = cutoff,
type = type
)
data <- prepped_predictions[["data"]]
prediction_col <- prepped_predictions[["prediction_col"]]
predicted_class_col <- prepped_predictions[["predicted_class_col"]]
probability_of_target_col <- prepped_predictions[["probability_of_target_col"]]
if (type %in% c("binomial", "multinomial")) {
most_challenging_classification(
data = data,
obs_id_col = obs_id_col,
target_col = target_col,
predicted_class_col = predicted_class_col,
probability_of_target_col = probability_of_target_col,
threshold = threshold,
threshold_is = threshold_is,
metric = metric,
grouping_keys = grouping_keys
)
} else if (type == "gaussian") {
most_challenging_gaussian(
data = data,
obs_id_col = obs_id_col,
target_col = target_col,
prediction_col = prediction_col,
threshold = threshold,
threshold_is = threshold_is,
grouping_keys = grouping_keys
)
}
}
# Check prediction columns
check_prediction_cols <- function(data,
prediction_cols,
target_col,
type,
cutoff){
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_data_frame(x = data, min.cols = 2, add = assert_collection)
checkmate::assert_character(x = prediction_cols, any.missing = FALSE,
min.len = 1, add = assert_collection)
checkmate::assert_string(x = target_col, min.chars = 1, add = assert_collection)
checkmate::assert_string(x = type, min.chars = 1, add = assert_collection)
checkmate::assert_number(x = cutoff, lower = 0, upper = 1, add = assert_collection)
checkmate::reportAssertions(assert_collection)
if (!checkmate::test_names(x = colnames(data),
must.include = prediction_cols)){
assert_collection$push("'data' does not include all the prediction columns.")
}
if (!checkmate::test_names(x = colnames(data),
must.include = target_col)){
assert_collection$push("'data' does not include the target column.")
}
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
if (type == "gaussian" &&
!is.numeric(data[[target_col]])){
assert_collection$push(paste0(
"When 'type' is 'gaussian', 'data[[target_col]]' must be numeric."
))
checkmate::reportAssertions(assert_collection)
}
if (type != "gaussian" &&
!(is.character(data[[target_col]]) ||
is.factor(data[[target_col]]))) {
assert_collection$push(paste0(
"When 'type' is '",
type,
"', 'data[[target_col]]' must either have type 'character' or 'factor'."
))
checkmate::reportAssertions(assert_collection)
}
# Type-specific checks of the prediction column(s)
if (length(prediction_cols) == 1){
if (type == "gaussian"){
if (!is.double(data[[prediction_cols]])){
assert_collection$push(
"when 'type' is 'gaussian', 'data[[prediction_cols]]' must have type 'double'.")
checkmate::reportAssertions(assert_collection)
}
} else if (type == "binomial") {
if (!(is.double(data[[prediction_cols]]) ||
is.character(data[[prediction_cols]]))){
assert_collection$push(
paste0("when 'type' is 'binomial', 'data[[prediction_cols]]' must b",
"e either of type 'double' (probability of positive class) or",
" 'character' (predicted class)."))
checkmate::reportAssertions(assert_collection)
}
if (is.numeric(data[[prediction_cols]]) &&
!checkmate::test_numeric(data[[prediction_cols]],
lower = 0, upper = 1,
any.missing = FALSE)){
assert_collection$push(
"when 'type' is 'binomial' and 'data[[prediction_cols]]' is 'numeric', it must have values between 0 and 1."
)
checkmate::reportAssertions(assert_collection)
}
if (is.character(data[[target_col]]) &&
length(levels_as_characters(data[[target_col]])) > 2) {
assert_collection$push("found more than two target classes.")
checkmate::reportAssertions(assert_collection)
}
} else if (type == "multinomial"){
if (!is.character(data[[prediction_cols]])){
assert_collection$push(
"when 'type' is 'multinomial' and prediction_cols has length 1, 'data[[prediction_cols]]' must have type 'character'."
)
checkmate::reportAssertions(assert_collection)
}
}
} else {
if (type != "multinomial"){
assert_collection$push(
"'prediction_cols' can only have length > 1 when 'type' is 'multinomial'.")
checkmate::reportAssertions(assert_collection)
}
data_preds <- data[, prediction_cols]
if (any(!unlist(lapply(data_preds, is.numeric)))) {
assert_collection$push(
"when 'prediction_cols' has length > 1, all prediction columns must be numeric.")
checkmate::reportAssertions(assert_collection)
}
if (contains_na(data_preds)) {
assert_collection$push("prediction columns contained NAs (missing data).")
checkmate::reportAssertions(assert_collection)
}
if (!rows_sum_to(data_preds, sum_to = 1, digits = 5)) {
assert_collection$push(
paste0("when 'prediction_cols' has length > 1, prediction columns m",
"ust sum to 1 row-wise."))
checkmate::reportAssertions(assert_collection)
}
if (length(setdiff(levels_as_characters(data[[target_col]]),
prediction_cols)) > 0){
assert_collection$push(
paste0("when 'prediction_cols' has length > 1, all classes in 'data",
"[[target_col]]' must have a prediction column."))
checkmate::reportAssertions(assert_collection)
}
}
}
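
# Illustrative sketch (not part of the original code): the row-sum validation
# above uses the internal helper `rows_sum_to()`. Assuming it compares rounded
# row sums, an equivalent base-R check could look like this:
local({
  # Toy per-class probability columns that should sum to 1 row-wise
  preds <- data.frame(A = c(0.2, 0.5), B = c(0.3, 0.25), C = c(0.5, 0.25))
  # Rough equivalent of rows_sum_to(preds, sum_to = 1, digits = 5)
  all(round(rowSums(preds) - 1, digits = 5) == 0)
})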
# Prepare the prediction column(s) for evaluation
prepare_predictions <- function(data,
prediction_cols,
target_col,
cutoff,
type) {
check_prediction_cols(
data = data,
prediction_cols = prediction_cols,
target_col = target_col,
type = type,
cutoff = cutoff
)
if (type == "binomial"){
cat_levels <- levels_as_characters(data[[target_col]], drop_unused = TRUE, sort_levels = TRUE)
if (length(cat_levels) < 2) {
  stop("Found fewer than 2 levels in the target column.")
}
if (length(cat_levels) > 2) {
  stop("The target column must contain at most 2 levels.")
}
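# The second class alphabetically is used as the positive class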
positive <- cat_levels[[2]]
}
# For gaussian
if (type %ni% c("binomial", "multinomial")) {
return(list(
"data" = data,
"prediction_col" = prediction_cols,
"predicted_class_col" = NULL,
"probability_of_target_col" = NULL
))
}
if (length(prediction_cols) > 1) {
# Extract predicted class
data[["predicted_class_index"]] <-
argmax(data[, prediction_cols])
data[["predicted_class"]] <- purrr::map_chr(
data[["predicted_class_index"]],
.f = function(x) {
prediction_cols[[x]]
}
)
# Extract probability of target class
target_index_map <-
as.list(setNames(seq_along(prediction_cols), prediction_cols))
target_indices <- purrr::map_int(
data[[target_col]],
.f = function(x) {
target_index_map[[x]]
}
)
# Add .Machine$double.eps to the probabilities
data[, prediction_cols] <- data[, prediction_cols] + .Machine$double.eps
# Find probability of targets
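# (indexing with a two-column [row, column] matrix extracts one cell per row,
# i.e. each observation's probability of its own target class)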
data[["probability_of_target"]] <- as.data.frame(data[, prediction_cols])[
cbind(seq_along(target_indices), target_indices)]
prediction_col <- NULL
probability_of_target_col <- "probability_of_target"
predicted_class_col <- "predicted_class"
} else {
# Extract probability of target
if (is.numeric(data[[prediction_cols]])) {
if (type == "binomial"){
negative <- cat_levels[cat_levels != positive]
# Find predicted classes
data[["predicted_class"]] <-
ifelse(data[[prediction_cols]] > cutoff,
positive, negative)
# Add .Machine$double.eps to the probabilities
data[[prediction_cols]] <- data[[prediction_cols]] + .Machine$double.eps
# Find probability of targets
data[["probability_of_target"]] <-
ifelse(data[[target_col]] == positive,
data[[prediction_cols]],
1 - data[[prediction_cols]])
prediction_col <- NULL
probability_of_target_col <- "probability_of_target"
predicted_class_col <- "predicted_class"
}
} else {
probability_of_target_col <- NULL
prediction_col <- NULL
predicted_class_col <- prediction_cols
}
}
list("data" = data,
"prediction_col" = prediction_col,
"predicted_class_col" = predicted_class_col,
"probability_of_target_col" = probability_of_target_col)
}
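
# Illustrative sketch (not part of the original code): how the multinomial
# branch above derives the predicted class and the probability of the target
# class from per-class probability columns. `argmax()` is an internal helper;
# base R's `max.col()` is assumed to behave equivalently here.
local({
  probs <- data.frame(
    class_1 = c(0.269, 0.368),
    class_2 = c(0.528, 0.322),
    class_3 = c(0.203, 0.310)
  )
  targets <- c("class_2", "class_3")
  # Predicted class: the column with the highest probability in each row
  predicted <- colnames(probs)[max.col(probs, ties.method = "first")]
  # Probability of the target class via two-column matrix indexing
  target_idx <- match(targets, colnames(probs))
  p_target <- probs[cbind(seq_along(target_idx), target_idx)]
  list(predicted = predicted, p_target = p_target)
})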
most_challenging_classification <- function(data,
obs_id_col,
target_col,
predicted_class_col,
probability_of_target_col,
threshold,
threshold_is,
metric,
grouping_keys) {
# Ensure same type
data[[target_col]] <- as.character(data[[target_col]])
data[[predicted_class_col]] <- as.character(data[[predicted_class_col]])
# Count correct and incorrect predictions
data[["Correct"]] <- data[[predicted_class_col]] == data[[target_col]]
data[["Incorrect"]] <- data[[predicted_class_col]] != data[[target_col]]
if (!is.null(probability_of_target_col)){
# Clamp probabilities at a small floor so we don't take log(0)
data[[probability_of_target_col]] <- pmax(data[[probability_of_target_col]], 1e-20)
data[["LogProbabilityOfTarget"]] <- log(data[[probability_of_target_col]])
data[["ProbabilityOfTarget"]] <- data[[probability_of_target_col]]
if (is.null(metric))
metric <- "MAE"
} else {
data[["LogProbabilityOfTarget"]] <- 1
data[["ProbabilityOfTarget"]] <- 1
if (is.null(metric))
metric <- "Accuracy"
}
# Calculate metrics per group
by_observation <- data %>%
dplyr::mutate(corr = .data$Correct) %>%
dplyr::group_by_at(c(colnames(grouping_keys), obs_id_col)) %>%
dplyr::summarise(
Correct = sum(.data$Correct),
Incorrect = sum(.data$Incorrect),
Accuracy = mean(.data$corr),
MAE = mean(1 - .data$ProbabilityOfTarget),
`Cross Entropy` = -mean(.data$LogProbabilityOfTarget)
) %>% dplyr::ungroup()
if (is.null(probability_of_target_col)){
by_observation[["Cross Entropy"]] <- NULL
by_observation[["MAE"]] <- NULL
}
# Find the observations that were the most difficult to predict
to_return <- exceeds_threshold(data = by_observation,
threshold = threshold,
threshold_is = threshold_is,
metric_name = metric,
maximize = metric == "Accuracy",
grouping_keys = grouping_keys)
to_return <- to_return %>%
dplyr::rename(`<=` = "Threshold")
if (metric == "Accuracy"){
to_return <- to_return %>%
dplyr::arrange(
!!!rlang::syms(colnames(grouping_keys)),
.data$Accuracy
)
} else if (metric == "MAE"){
to_return <- to_return %>%
dplyr::arrange(
!!!rlang::syms(colnames(grouping_keys)),
dplyr::desc(.data$`MAE`),
dplyr::desc(.data$`Cross Entropy`),
.data$Accuracy
)
} else if (metric == "Cross Entropy"){
to_return <- to_return %>%
dplyr::arrange(
!!!rlang::syms(colnames(grouping_keys)),
dplyr::desc(.data$`Cross Entropy`),
dplyr::desc(.data$`MAE`),
.data$Accuracy
)
}
to_return
}
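
# Illustrative sketch (not part of the original code): the per-observation
# metrics from the summarise() call above, written out for a single
# observation with three (e.g. cross-validated) predictions. The values are
# invented for the example.
local({
  p_target <- c(0.60, 0.25, 0.40)   # probability of the target class
  correct <- c(TRUE, FALSE, FALSE)  # predicted class == target class?
  c(
    Accuracy = mean(correct),
    MAE = mean(1 - p_target),               # mean absolute error of p(target)
    `Cross Entropy` = -mean(log(p_target))  # mean negative log-probability
  )
})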
most_challenging_gaussian <- function(data,
obs_id_col,
target_col,
prediction_col,
threshold,
threshold_is,
grouping_keys) {
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_numeric(
x = data[[target_col]],
any.missing = FALSE,
min.len = 1,
add = assert_collection
)
checkmate::assert_numeric(
x = data[[prediction_col]],
any.missing = FALSE,
min.len = 1,
add = assert_collection
)
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
# Calculate residuals
tmp_residual_var <- create_tmp_name(data, ".__residuals__")
data[[tmp_residual_var]] <- data[[target_col]] - data[[prediction_col]]
# Root mean square (x is the residuals)
root_mean_square <- function(x) sqrt(mean(x^2))
# Calculate metrics per group
by_observation <- data %>%
dplyr::group_by_at(c(colnames(grouping_keys), obs_id_col)) %>%
dplyr::summarise(
`MAE` = mean(abs(!!as.name(tmp_residual_var))),
`RMSE` = root_mean_square(!!as.name(tmp_residual_var))
) %>% dplyr::ungroup()
# Find the observations that were the most difficult to predict
to_return <- exceeds_threshold(data = by_observation,
threshold = threshold,
threshold_is = threshold_is,
metric_name = "RMSE",
maximize = FALSE,
grouping_keys = grouping_keys)
# Reorder and return
to_return %>%
dplyr::arrange(
!!!rlang::syms(colnames(grouping_keys)),
dplyr::desc(.data$RMSE), dplyr::desc(.data$MAE)
) %>%
dplyr::rename(`>=` = "Threshold")
}
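
# Illustrative sketch (not part of the original code): the per-observation
# RMSE computed above, in base R. Assumes three predictions per observation ID.
local({
  ids <- rep(1:2, each = 3)
  residuals <- c(1.2, -0.5, 2.0, 0.1, 0.3, -0.2)  # target - prediction
  tapply(residuals, ids, function(x) sqrt(mean(x^2)))
})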
exceeds_threshold <- function(data,
threshold,
threshold_is,
metric_name,
maximize,
grouping_keys) {
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_data_frame(x = data, add = assert_collection)
if (dplyr::is_grouped_df(data))
assert_collection$push("'data' cannot be grouped at this stage.")
checkmate::assert_data_frame(x = grouping_keys, add = assert_collection)
checkmate::assert_choice(x = threshold_is, choices = c("percentage", "score"),
add = assert_collection)
checkmate::assert_flag(x = maximize, add = assert_collection)
checkmate::reportAssertions(assert_collection)
if (threshold_is == "percentage")
checkmate::assert_number(x = threshold, lower = 0, upper = 1, add = assert_collection)
else {
checkmate::assert_number(x = threshold, add = assert_collection)
}
checkmate::assert_names(x = colnames(data), must.include = c(colnames(grouping_keys), metric_name),
what = "colnames", add = assert_collection)
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
if (threshold_is == "percentage"){
# Calculate thresholds per grouping
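# For error metrics (maximize = FALSE) the cutoff is the (1 - threshold)
# quantile, keeping the highest scores; for Accuracy (maximize = TRUE) it is
# the threshold quantile, keeping the lowest scores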
probs <- ifelse(isTRUE(maximize), threshold, 1 - threshold)
thresholds <- data %>%
dplyr::group_by_at(colnames(grouping_keys)) %>%
dplyr::summarise(Threshold = stats::quantile(
!!as.name(metric_name), probs = probs)) %>%
dplyr::mutate(Threshold = unname(.data$Threshold))
# Add thresholds to data
if (ncol(grouping_keys) > 0){
data <- data %>%
dplyr::left_join(thresholds, by = colnames(grouping_keys))
} else {
data[["Threshold"]] <- thresholds[["Threshold"]]
}
} else if (threshold_is == "score") {
# Add threshold to data
data[["Threshold"]] <- threshold
}
# Find the most difficult observations
if (isTRUE(maximize)){
to_return <- data[data[[metric_name]] <= data[["Threshold"]],]
} else {
to_return <- data[data[[metric_name]] >= data[["Threshold"]],]
}
to_return
}
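
# Illustrative sketch (not part of the original code): how a percentage
# threshold becomes a score cutoff. With `threshold = 0.30` and an error
# metric (maximize = FALSE), roughly the 30% highest scores are kept.
local({
  scores <- seq(0.1, 1, by = 0.1)
  cutoff <- stats::quantile(scores, probs = 1 - 0.30)
  scores[scores >= cutoff]  # ~30% of the observations
})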