# pp_score --------------------------------------------------------------------
#' @title pp_score
#'
#' @description Calculates predictive power score for `x` predicts `y`. Where
#' `x` can be multiple features in `df`. This is a light (wip) implementation
#' which for now only supports binary classification and x as numeric features.
#'
#' @param df data.frame
#' @param x `str` name of features in `df`
#' @param y `str` target feature
#' @param sample_size `int` random sample taken from `df` to speed
#' up calculations. If *NULL* all samples are used.
#' @param cv_folds number of cross validations folds
#' @param repeated_cv `int` number of repeated cross validations
#' @param metric `str` Machine learning metric to evaluate on. Choose between
#' *roc_auc*, *pr_auc*, *F1*
#'
#' @return A list with the predictive power score for each feature in `x`
#'
pp_score <- function(df, x, y, metric = "roc_auc", sample_size = NULL,
cv_folds = 5, repeated_cv = 1) {
stopifnot(is.data.frame(df), c(x, y) %in% names(df))
results <- list()
for (i in x) {
results[i] <- score(df, i, y, sample_size, cv_folds, repeated_cv)
}
return(results)
}
# score -----------------------------------------------------------------------
#' @title Calculates predictive power score
#'
#' @description Calculates predictive power score for `x` predicts `y`. This is
#' a light (wip) implementation which for now only supports binary
#' classification and x as numeric a feature.
#'
#' @details for mere information: https://github.com/8080labs/ppscore
#'
#' @param df `data.frame` input data which contains `x` and `y`
#' @param x `str` name of feature
#' @param y `str` name of target
#' @param sample_size `int` random sample taken from `df` to speed
#' up calculations. If *NULL* all samples are used.
#' @param cv_folds `int` number of cross validations folds
#' @param repeated_cv `int` number of repeated cross validations
#' @param metric `str` Machine learning metric to evaluate on. Choose between
#' *roc_auc*, *pr_auc*, *F1*.
#'
#' @importFrom rpart rpart
#'
#' @return The predictive power score for `x`.
#'
score <- function(df, x, y, metric = "roc_auc", sample_size = NULL,
cv_folds = 5L, repeated_cv = 1L) {
stopifnot(is.numeric(df[[x]]), length(unique(df[[y]])) == 2)
# Removing NA & keeping only x and y
df <- na.omit(df[c(x, y)])
if (nrow(df) == 0) stop("Zero rows in data after removing NA's!")
# Looping over repeated cv and cv folds
results <- list()
for(j in seq(repeated_cv)) {
# Sampling data
df_sampled <- sample_data(df, sample_size)
# Identifying cross validations folds
cv <- sample(seq(cv_folds), size = nrow(df_sampled), replace = TRUE)
for (i in seq(cv_folds)) {
# Splitting data
train <- df_sampled[cv != i, ]
test <- df_sampled[cv == i, ]
# Model
fit <- rpart(paste(y, "~", x), data = train, method = "class")
# Calculation metric
out <- calculate_metric(fit, test[x], test[[y]], metric)
# Save in results
results[[paste0("cv_repeat_", j)]][i] <- out
}
}
mean(unlist(results))
return(list(pp_score = mean(unlist(results)),
eval_metric = metric,
cv_scores = results))
}
# calculate_metric ------------------------------------------------------------
#' @title calculate_metric
#'
#' @param model `object` a model object
#' @param df `data.frame` test data to score
#' @param label `numeric` target vector
#' @param metric `str` Machine learning metric to evaluate on. Choose between
#' *roc_auc*, *pr_auc*, *F1*.
#'
#' @importFrom MLmetrics AUC PRAUC F1_Score
#' @importFrom stats predict
#'
#' @return `numeric` calculated ml metric
#'
calculate_metric <- function(model, df, label, metric) {
# Calculates metric
if (metric == "roc_auc") {
pred <- predict(model, df, type = "prob")[,2]
out <- AUC(pred, label)
} else if (metric == "pr_auc") {
pred <- predict(model, df, type = "prob")[,2]
out <- PRAUC(pred, label)
} else if (metric == "F1") {
pred <- predict(model, df, type = "class")
out <- F1_Score(y_true = label, y_pred = pred)
} else {
stop(paste(metric, "is not a valid option!"))
}
return(out)
}
# sample_data -----------------------------------------------------------------
#' @title sample_data
#'
#' @description Takes a random sample from `df` of size `sample_size`.
#'
#' @param df `data.frame` input data to sample from
#' @param sample_size `int` or `NULL` number of samples to take from `df`.
#'
#' @return `data.frame` with samples from `df` or `df` if `sample_size`
#' is *NULL*.
#'
sample_data <- function(df, sample_size) {
if (!is.null(sample_size)) {
df <- df[sample(nrow(df), sample_size), ]
}
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.