R/ranger_classification.R

#' Run ranger with parameters of data.frame rows.
#'
#' This function calls ranger using the parameter values in each row of the
#' provided `master_grid`, operating on the data of the corresponding element
#' of `the_list`. Please have a look at the
#' [ranger documentation](https://cran.r-project.org/web/packages/ranger/ranger.pdf)
#' for an explanation of the ranger-related arguments; they are marked with
#' "ranger" in the parameter descriptions below. Except for `the_list`,
#' `master_grid` and `.row`, all arguments need to be column names of `master_grid`.
#'
#' @param Target char, the response variable
#' @param ML_object factor or char, the name of the corresponding `the_list` item
#' @param Cycle integer, the current repetition
#' @param Number_of_trees ranger, integer, number of trees per forest
#' @param Mtry_factor ranger, factor to multiply default ranger mtry argument
#' @param .row current row of master_grid
#' @param the_list The input tables list
#' @param master_grid the data frame containing all parameter combinations
#' @param step character declaring `training` or `prediction`
#' @param ... further parameters passed on to subfunctions
#'
#' @return a data frame with results and metrics for each row of the master_grid
#'
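#' @examples
#' # Illustrative sketch only: builds a minimal `the_list`/`master_grid` from
#' # the iris data set (not part of this package) to show the expected input
#' # structure; names such as "iris_example" are made up for this example.
#' \dontrun{
#' set.seed(42)
#' train_rows <- sample(nrow(iris), 100)
#' the_list <- list(iris_example = list(
#'   train_set = iris[train_rows, ],
#'   test_set = iris[-train_rows, ]))
#' master_grid <- data.frame(
#'   Target = "Species",
#'   ML_object = "iris_example",
#'   Cycle = 1,
#'   Number_of_trees = 100,
#'   Mtry_factor = 1,
#'   stringsAsFactors = FALSE)
#' # call for a single row of master_grid
#' ranger_classification(master_grid = master_grid, Target = "Species",
#'   ML_object = "iris_example", Cycle = 1, Number_of_trees = 100,
#'   Mtry_factor = 1, .row = 1, the_list = the_list, step = "prediction")
#' }
#'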
#' @export
ranger_classification <- function(master_grid, Target, ML_object, Cycle, 
  Number_of_trees, Mtry_factor, .row, the_list, step, ...) {
  
  if(!all(c("Target", "ML_object", "Cycle", "Number_of_trees", "Mtry_factor") %in% 
      colnames(master_grid))) {
    stop("Ranger parameters do not match column names in master_grid")
  }
  if(is.null(the_list[[ML_object]])) {
    stop("Names in the_list and master_grid do not match")
  }
  if(!is.character(Target)) {
    stop("ranger requires Target as character to work with purr::pmap()")
  }
  if(!is.factor(the_list[[ML_object]][["train_set"]][[Target]])) {
    stop("Response variable is not a factor")
  }
  stopifnot(step == "training" | step == "prediction")
  state <- paste("Row", .row, "of", nrow(master_grid))
  futile.logger::flog.info(state)
  all_vars <- ncol(the_list[[ML_object]][["train_set"]]) - 1
  # multiply sqrt of variables with Mtry_factor; if greater than available 
  # number of variables, select all variables
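  # e.g. with 100 predictors and Mtry_factor = 3: sqrt(100) * 3 = 30 < 100, so
  # mtry = 30; a Mtry_factor of 15 would exceed 100 and all 100 would be used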
  for_mtry <- ifelse((sqrt(all_vars) * Mtry_factor) < all_vars,
    sqrt(all_vars) * Mtry_factor, all_vars)
  n_classes <- length(levels(as.factor(the_list[[ML_object]][["train_set"]][[Target]])))

  RF_train <- ranger::ranger(
    dependent.variable.name = Target,  # needs to be a character string, not a factor
    data = the_list[[ML_object]][["train_set"]],  # referring to named list item
    num.trees = Number_of_trees,
    mtry = for_mtry,  
    importance = "none")

  if (step == "prediction") {

    RF_prediction <- stats::predict(object = RF_train, 
      data = the_list[[ML_object]][["test_set"]])
    
    confusion_matrix <- table(true = the_list[[ML_object]][["test_set"]][[Target]], 
      predicted = RF_prediction$predictions)
    store_classification(trained_rf = RF_train, predicted_rf = RF_prediction, 
      confusion_matrix = confusion_matrix, test_set = the_list[[ML_object]][["test_set"]],
      n_classes = n_classes, step = step)
  } else {  
    store_classification(trained_rf = RF_train, confusion_matrix = RF_train$confusion.matrix,
      n_classes = n_classes, step = step)
  }
}

#' Store results from ranger classification training and prediction
#'
#' This function extracts information from the ranger objects generated by
#' training or prediction and stores them in a data.frame. It calls the functions 
#' `classification_metrics` and `prediction_accuracy` to generate metrics on
#' classification performance for each class.
#'
#' @param trained_rf the ranger object generated by training with `ranger()`
#' @param predicted_rf the ranger object generated by prediction with `predict()`,
#'   (default: `NULL`)
#' @param confusion_matrix the confusion matrix obtained from 
#'   `trained_rf$confusion.matrix` or generated for `predicted_rf`
#' @param n_classes the number of classes for classification
#' @param step character declaring whether `training` or `prediction` occurs
#' @param ... parameters passed on to `prediction_accuracy`
#' 
#' @return A data frame with one row per ranger run and class
#'
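#' @examples
#' # Minimal sketch, assuming a classification forest trained on iris (data
#' # not from this package); for step = "training" the out-of-bag confusion
#' # matrix returned by ranger can be passed directly.
#' \dontrun{
#' rf <- ranger::ranger(Species ~ ., data = iris, num.trees = 100)
#' store_classification(trained_rf = rf,
#'   confusion_matrix = rf$confusion.matrix,
#'   n_classes = nlevels(iris$Species), step = "training")
#' }
#'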
#' @export
store_classification <- function(trained_rf, predicted_rf = NULL, 
  confusion_matrix, n_classes, step, ...) {
  
  stopifnot(step == "training" | step == "prediction")
  if(!inherits(trained_rf, "ranger")) {
    stop("trained_rf is not of class ranger")
  }
  if(!is.numeric(n_classes)) {
    stop("n_classes needs to be numeric")
  }
  
  if(!inherits(confusion_matrix, "table")) {
    stop("confusion_matrix is not a table")
  }
  
  results <- data.frame()
  # extract classifications for each class: every class becomes its own row,
  # reading the confusion matrix one-vs-rest (TP on the diagonal, FP from the
  # rest of the column, FN from the rest of the row, TN from everything else)
  for (class in 1:n_classes) {
    results[class, "Class"] <- row.names(confusion_matrix)[class] 
    results[class, "True_positive"] <- confusion_matrix[class, class]
    results[class, "False_positive"] <- sum(confusion_matrix[,class]) - 
      confusion_matrix[class,class]
    results[class, "True_negative"] <- sum(confusion_matrix[-class, -class])
    results[class, "False_negative"] <- sum(confusion_matrix[class,]) - 
      confusion_matrix[class,class]
  }  
  # values differing between training and prediction
  if (step == "training") {
    results$Prediction_error <- trained_rf$prediction.error * 100
    results$Number_of_samples <- as.numeric(trained_rf$num.samples)   
  } else {
    results$Prediction_error <- 100 - prediction_accuracy(predicted_rf, ...)   
    results$Number_of_samples <- as.numeric(predicted_rf$num.samples)
  }
  # these values are extracted in the same way for training and prediction
  results$Variables_sampled <- as.numeric(trained_rf$mtry)
  results$Number_independent_vars <- as.numeric(trained_rf$num.independent.variables)
  results$Tree_type <- trained_rf$treetype
  results$Vars_percent <- as.numeric(results$Variables_sampled / 
    results$Number_independent_vars) * 100
  results <- classification_metrics(results)
  results
}

#' Calculate accuracy percentage for ranger predictions
#'
#' This function calculates the prediction accuracy, i.e. the percentage of
#' test set samples that were classified correctly (true positives and true
#' negatives).
#'
#' @param predicted_rf the ranger prediction object
#' @param test_set the test_set containing the true values
#'
#' @return The accuracy in percentage
#'
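#' @examples
#' # Hedged sketch using a random train/test split of iris (not package data);
#' # note that the response variable is expected in the last column of test_set.
#' \dontrun{
#' set.seed(1)
#' train_rows <- sample(nrow(iris), 100)
#' rf <- ranger::ranger(Species ~ ., data = iris[train_rows, ], num.trees = 100)
#' pred <- stats::predict(rf, data = iris[-train_rows, ])
#' prediction_accuracy(predicted_rf = pred, test_set = iris[-train_rows, ])
#' }
#'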
#' @export
prediction_accuracy <- function(predicted_rf, test_set) {
  
  if(!inherits(predicted_rf, "ranger.prediction")) {
    stop("predicted_rf is not of class ranger.prediction")
  }
  if(!is.data.frame(test_set)) {
    stop("test_set is not a data frame")
  }
  # assumes the response variable is the last column of test_set; the factors
  # are compared via their underlying integer codes, so levels must match
  comparison <- cbind(predicted_rf$predictions, test_set[[ncol(test_set)]])
  matches <- ifelse(comparison[, 1] == comparison[, 2], 1, 0)
  accuracy <- sum(matches) / length(matches) * 100
  accuracy
}

#' Calculate measures for classification performance
#'
#' This function calculates a set of classification performance metrics to help
#' evaluate the quality of the classification. For more information on the
#' calculated values see e.g.
#' [Binary Classification Metrics](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers).
#'
#' @param result_table the result table generated by a `store_*()` function
#' @param Number_of_samples the number of samples used to calculate the metrics.
#'   The default, the `Number_of_samples` column of `result_table`, works for
#'   ranger classification; for keras neural networks it has to be overridden
#'   with the number of validation samples
#'
#' @return the result_table with additional metrics 
#'
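#' @examples
#' # Toy sketch with made-up counts for a single class (100 samples in total),
#' # just to illustrate the required columns of result_table.
#' toy_results <- data.frame(
#'   Class = "ClassA",
#'   True_positive = 40, False_positive = 5,
#'   True_negative = 50, False_negative = 5,
#'   Number_of_samples = 100)
#' classification_metrics(toy_results)
#'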
#' @export
classification_metrics <- function(result_table, Number_of_samples = 
    result_table$Number_of_samples) {
  
  if(length(result_table) == 0) {
    stop("result table is empty")
  }
  if(!is.numeric(Number_of_samples)) {
    stop("Number_of_samples needs to be numeric")
  }
 
  # general values
  result_table$Positive <- result_table$True_positive + result_table$False_negative
  result_table$Negative <- result_table$True_negative + result_table$False_positive
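  # Majority_fraction: percentage of samples in the larger of the two groups
  # (this class vs. all other classes), i.e. the naive baseline accuracy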
  result_table$Majority_fraction <- ifelse(
    result_table$Positive > result_table$Negative, 
    result_table$Positive / (result_table$Positive + result_table$Negative) * 100, 
    100 - (result_table$Positive / 
      (result_table$Positive + result_table$Negative) * 100))
  # metrics
  result_table$Accuracy <- (result_table$True_positive + result_table$True_negative) /
      Number_of_samples
  result_table$True_negative_rate <- result_table$True_negative / 
    (result_table$True_negative + result_table$False_positive)
  result_table$Precision <- result_table$True_positive /
    (result_table$True_positive + result_table$False_positive)
  result_table$Recall <- result_table$True_positive /
    (result_table$True_positive + result_table$False_negative)
  result_table$F1_score <- 2 * ((result_table$Precision * result_table$Recall) / 
    (result_table$Precision + result_table$Recall))
  result_table$Balanced_accuracy <- (result_table$Recall + 
    result_table$True_negative_rate) / 2
  result_table$False_discovery_rate <- 1 - result_table$Precision
  result_table$Matthews_cor_coef <- (
    result_table$True_positive * result_table$True_negative - 
      result_table$False_positive * result_table$False_negative) /
    sqrt(
      (result_table$True_positive + result_table$False_positive) * 
      (result_table$True_positive + result_table$False_negative) *
      (result_table$True_negative + result_table$False_positive) *
      (result_table$True_negative + result_table$False_negative)
      )
  if (any(sapply(result_table, is.nan))) {
    futile.logger::flog.warn(paste(
      "Classification metrics containing NaN have been generated, probably",
      "due to division by 0. Poor classification performance and/or small",
      "class sizes might be the reason."))
  }
  result_table
}