#' Run ranger with the parameters of each data.frame row.
#'
#' This function calls ranger using the parameter values in each row of the
#' provided `master_grid` and the data stored in the corresponding `the_list`
#' element. Please refer to the [ranger documentation](https://cran.r-project.org/web/packages/ranger/ranger.pdf)
#' for an explanation of the ranger-related arguments; they are marked with
#' "ranger" in the parameter descriptions below. Except for `the_list`,
#' `master_grid`, `step` and `.row`, all arguments need to be column names of
#' `master_grid`.
#'
#' @param Target character, the name of the response variable
#' @param ML_object factor or character, the name of the corresponding `the_list` item
#' @param Cycle integer, the current repetition
#' @param Number_of_trees ranger parameter: integer, the number of trees per forest
#' @param Mtry_factor ranger parameter: factor by which the default ranger `mtry` value is multiplied
#' @param .row current row of master_grid
#' @param the_list The input tables list
#' @param master_grid the data frame containing all parameter combinations
#' @param step character declaring `training` or `prediction`
#' @param ... further parameters passed on to subfunctions
#'
#' @return a data frame with results and metrics for each row of the master_grid
#'
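#' @examples
#' \dontrun{
#' # Minimal, illustrative sketch: the `demo` list item, the iris split and
#' # the parameter values are assumptions for demonstration only, not part
#' # of the package. It maps this function over the grid rows with
#' # purrr::pmap_dfr().
#' idx <- seq(1, nrow(iris), by = 2)
#' the_list <- list(demo = list(train_set = iris[idx, ],
#'                              test_set = iris[-idx, ]))
#' master_grid <- expand.grid(Target = "Species", ML_object = "demo",
#'                            Cycle = 1, Number_of_trees = c(100, 500),
#'                            Mtry_factor = 1, stringsAsFactors = FALSE)
#' results <- purrr::pmap_dfr(
#'   cbind(master_grid, .row = seq_len(nrow(master_grid))),
#'   ranger_classification,
#'   the_list = the_list, master_grid = master_grid, step = "prediction")
#' }
#'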
#' @export
ranger_classification <- function(master_grid, Target, ML_object, Cycle,
Number_of_trees, Mtry_factor, .row, the_list, step, ...) {
if(!all(c("Target", "ML_object", "Cycle", "Number_of_trees", "Mtry_factor") %in%
colnames(master_grid))) {
stop("Ranger parameters do not match column names in master_grid")
}
if(is.null(the_list[[ML_object]])) {
stop("Names in the_list and master_grid do not match")
}
if(!is.character(Target)) {
stop("ranger requires Target as character to work with purr::pmap()")
}
if(!is.factor(the_list[[ML_object]][["train_set"]][[Target]])) {
stop("Response variable is not a factor")
}
stopifnot(step == "training" | step == "prediction")
state <- paste("Row", .row, "of", nrow(master_grid))
futile.logger::flog.info(state)
all_vars <- ncol(the_list[[ML_object]][["train_set"]]) - 1
# multiply sqrt of variables with Mtry_factor; if greater than available
# number of variables, select all variables
for_mtry <- ifelse((sqrt(all_vars) * Mtry_factor) < all_vars,
sqrt(all_vars) * Mtry_factor, all_vars)
n_classes <- length(levels(as.factor(the_list[[ML_object]][["train_set"]][[Target]])))
RF_train <- ranger::ranger(
dependent.variable.name = Target, # needs to be character, not a factor
data = the_list[[ML_object]][["train_set"]], # referring to named list item
num.trees = Number_of_trees,
mtry = for_mtry,
importance = "none")
if (step == "prediction") {
RF_prediction <- stats::predict(object = RF_train,
data = the_list[[ML_object]][["test_set"]])
confusion_matrix <- table(true = the_list[[ML_object]][["test_set"]][[Target]],
predicted = RF_prediction$predictions)
store_classification(trained_rf = RF_train, predicted_rf = RF_prediction,
confusion_matrix = confusion_matrix, test_set = the_list[[ML_object]][["test_set"]],
n_classes = n_classes, step = step)
} else {
store_classification(trained_rf = RF_train, confusion_matrix = RF_train$confusion.matrix,
n_classes = n_classes, step = step)
}
}
#' Store results from ranger classification training and prediction
#'
#' This function extracts information from the ranger objects generated by
#' training or prediction and stores it in a data.frame. It calls the functions
#' `classification_metrics` and `prediction_accuracy` to generate metrics on
#' classification performance for each class.
#'
#' @param trained_rf the ranger object generated by training with `ranger()`
#' @param predicted_rf the ranger object generated by prediction with `predict()`,
#' (default: `NULL`)
#' @param confusion_matrix the confusion matrix obtained from
#' `trained_rf$confusion.matrix` or generated for `predicted_rf`
#' @param n_classes the number of classes for classification
#' @param step character declaring whether `training` or `prediction` occurs
#' @param ... parameters passed on to `prediction_accuracy`
#'
#' @return A data frame with one row per ranger run and class
#'
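#' @examples
#' \dontrun{
#' # Illustrative sketch: train a ranger forest on iris directly and
#' # summarise its out-of-bag confusion matrix; data and settings are
#' # assumptions for demonstration only.
#' rf <- ranger::ranger(dependent.variable.name = "Species", data = iris,
#'                      num.trees = 100)
#' store_classification(trained_rf = rf,
#'                      confusion_matrix = rf$confusion.matrix,
#'                      n_classes = 3, step = "training")
#' }
#'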
#' @export
store_classification <- function(trained_rf, predicted_rf = NULL,
confusion_matrix, n_classes, step, ...) {
stopifnot(step == "training" | step == "prediction")
if(!inherits(trained_rf, "ranger")) {
stop("trained_rf is not of class ranger")
}
if(!is.numeric(n_classes)) {
stop("n_classes needs to be numeric")
}
if(!inherits(confusion_matrix, "table")) {
stop("confusion_matrix is not a table")
}
results <- data.frame()
# extract classifications for each class, every class becomes own row
for (class in 1:n_classes) {
results[class, "Class"] <- row.names(confusion_matrix)[class]
results[class, "True_positive"] <- confusion_matrix[class, class]
results[class, "False_positive"] <- sum(confusion_matrix[,class]) -
confusion_matrix[class,class]
results[class, "True_negative"] <- sum(confusion_matrix[-class, -class])
results[class, "False_negative"] <- sum(confusion_matrix[class,]) -
confusion_matrix[class,class]
}
# values differing between training and prediction
if (step == "training") {
results$Prediction_error <- trained_rf$prediction.error * 100
results$Number_of_samples <- as.numeric(trained_rf$num.samples)
} else {
results$Prediction_error <- 100 - prediction_accuracy(predicted_rf, ...)
results$Number_of_samples <- as.numeric(predicted_rf$num.samples)
}
# these values are identical or similar to extract in training and prediction
results$Variables_sampled <- as.numeric(trained_rf$mtry)
results$Number_independent_vars <- as.numeric(trained_rf$num.independent.variables)
results$Tree_type <- trained_rf$treetype
results$Vars_percent <- as.numeric(results$Variables_sampled /
results$Number_independent_vars) * 100
results <- classification_metrics(results)
results
}
#' Calculate the accuracy percentage of a ranger prediction
#'
#' This function calculates the accuracy of a ranger prediction object, i.e.
#' the percentage of correctly predicted samples (true positives and true
#' negatives) in the test set.
#'
#' @param predicted_rf the ranger prediction object
#' @param test_set the test set containing the true values; the response is
#' expected to be the last column
#'
#' @return The accuracy in percentage
#'
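#' @examples
#' \dontrun{
#' # Illustrative sketch with an assumed iris split; the response must be
#' # the last column of test_set, which holds for iris (Species).
#' idx <- seq(1, nrow(iris), by = 2)
#' rf <- ranger::ranger(dependent.variable.name = "Species",
#'                      data = iris[idx, ])
#' pred <- stats::predict(rf, data = iris[-idx, ])
#' prediction_accuracy(predicted_rf = pred, test_set = iris[-idx, ])
#' }
#'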
#' @export
prediction_accuracy <- function(predicted_rf, test_set) {
if(!inherits(predicted_rf, "ranger.prediction")) {
stop("predicted_rf is not of class ranger.prediction")
}
if(!is.data.frame(test_set)) {
stop("test_set is not a data frame")
}
# cbind coerces both factors to their integer codes, so equal codes in a row
# mean the prediction matches the true class (factor levels must correspond)
comparison <- cbind(predicted_rf$predictions, test_set[[ncol(test_set)]])
matches <- ifelse(comparison[, 1] == comparison[, 2], 1, 0)
accuracy <- sum(matches) / length(matches) * 100
accuracy
}
#' Calculate measures of classification performance
#'
#' This function calculates a set of per-class classification performance
#' metrics to help evaluate the quality of the classification. For more
#' information on the calculated values see e.g.
#' [Binary Classification Metrics](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers).
#'
#' @param result_table the result table generated by a `store_*()` function
#' @param Number_of_samples the result table column with the number of samples
#' used to calculate the metrics. The default works for ranger classification;
#' it has to be overwritten with the number of validation samples for keras
#' neural networks
#'
#' @return the result_table with additional metrics
#'
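#' @examples
#' # Illustrative toy example: a hand-made two-class result table with
#' # made-up counts, as a store_*() function would produce it.
#' toy <- data.frame(Class = c("A", "B"),
#'                   True_positive = c(40, 45),
#'                   False_positive = c(5, 10),
#'                   True_negative = c(45, 40),
#'                   False_negative = c(10, 5),
#'                   Number_of_samples = 100)
#' classification_metrics(toy)
#'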
#' @export
classification_metrics <- function(result_table, Number_of_samples =
result_table$Number_of_samples) {
if(length(result_table) == 0) {
stop("result table is empty")
}
if(!is.numeric(Number_of_samples)) {
stop("Number_of_samples needs to be numeric")
}
# general values
result_table$Positive <- result_table$True_positive + result_table$False_negative
result_table$Negative <- result_table$True_negative + result_table$False_positive
result_table$Majority_fraction <- ifelse(
result_table$Positive > result_table$Negative,
result_table$Positive / (result_table$Positive + result_table$Negative) * 100,
100 - (result_table$Positive /
(result_table$Positive + result_table$Negative) * 100))
# metrics
result_table$Accuracy <- (result_table$True_positive + result_table$True_negative) /
Number_of_samples
result_table$True_negative_rate <- result_table$True_negative /
(result_table$True_negative + result_table$False_positive)
result_table$Precision <- result_table$True_positive /
(result_table$True_positive + result_table$False_positive)
result_table$Recall <- result_table$True_positive /
(result_table$True_positive + result_table$False_negative)
result_table$F1_score <- 2 * ((result_table$Precision * result_table$Recall) /
(result_table$Precision + result_table$Recall))
result_table$Balanced_accuracy <- (result_table$Recall +
result_table$True_negative_rate) / 2
result_table$False_discovery_rate <- 1 - result_table$Precision
result_table$Matthews_cor_coef <- (
result_table$True_positive * result_table$True_negative -
result_table$False_positive * result_table$False_negative) /
sqrt(
(result_table$True_positive + result_table$False_positive) *
(result_table$True_positive + result_table$False_negative) *
(result_table$True_negative + result_table$False_positive) *
(result_table$True_negative + result_table$False_negative)
)
if (any(sapply(result_table, is.nan))) {
futile.logger::flog.warn("Classification metrics containing NaN have been
generated probably due to division by 0. Poor classification performances
and/or small class sizes might be the reason.")
}
result_table
}