R/evaluate.R


#   __________________ #< 1ee2f435e0cd344bcd4c561d5eb5542d ># __________________
#   Evaluate                                                                ####


#' @title Evaluate your model's performance
#' @description
#'  \Sexpr[results=rd, stage=render]{lifecycle::badge("maturing")}
#'
#'  Evaluate your model's predictions
#'  on a set of evaluation metrics.
#'
#'  Create ID-aggregated evaluations by multiple methods.
#'
#'  Currently supports regression and classification
#'  (binary and multiclass). See \code{`type`}.
#' @param data \code{data.frame} with predictions, targets and (optionally) an ID column.
#'  Can be grouped with \code{\link[dplyr]{group_by}}.
#'
#'  \subsection{Multinomial}{
#'  When \code{`type`} is \code{"multinomial"}, the predictions can be passed in one of two formats.
#'
#'  \subsection{Probabilities (Preferable)}{
#'
#'  One column per class with the probability of that class.
#'  The columns should have the name of their class,
#'  as they are named in the target column. E.g.:
#'
#'  \tabular{rrrrr}{
#'   \strong{class_1} \tab \strong{class_2} \tab
#'   \strong{class_3} \tab \strong{target}\cr
#'   0.269 \tab 0.528 \tab 0.203 \tab class_2\cr
#'   0.368 \tab 0.322 \tab 0.310 \tab class_3\cr
#'   0.375 \tab 0.371 \tab 0.254 \tab class_2\cr
#'   ... \tab ... \tab ... \tab ...}
#'  }
#'  \subsection{Classes}{
#'
#'  A single column of type \code{character} with the predicted classes. E.g.:
#'
#'  \tabular{rrrrr}{
#'   \strong{prediction} \tab \strong{target}\cr
#'   class_2 \tab class_2\cr
#'   class_1 \tab class_3\cr
#'   class_1 \tab class_2\cr
#'   ... \tab ...}
#'
#'  }
#'  }
#'  \subsection{Binomial}{
#'  When \code{`type`} is \code{"binomial"}, the predictions can be passed in one of two formats.
#'
#'  \subsection{Probabilities (Preferable)}{
#'  One column with the \strong{probability of the observation being
#'  the second class alphabetically}
#'  (class \code{1} if the classes are \code{0} and \code{1}). E.g.:
#'
#'  \tabular{rrrrr}{
#'   \strong{prediction} \tab \strong{target}\cr
#'   0.769 \tab 1\cr
#'   0.368 \tab 1\cr
#'   0.375 \tab 0\cr
#'   ... \tab ...}
#'  }
#'
#'  Note: When the class labels are ordered alphabetically, they are treated
#'  as type \code{character}, which is why e.g. \code{100} would come before \code{7}.
#'
#'  \subsection{Classes}{
#'
#'  A single column of type \code{character} with the predicted classes. E.g.:
#'
#'  \tabular{rrrrr}{
#'   \strong{prediction} \tab \strong{target}\cr
#'   class_0 \tab class_1\cr
#'   class_1 \tab class_1\cr
#'   class_1 \tab class_0\cr
#'   ... \tab ...}
#'  }
#'
#'  Note: The prediction column will be converted to the probability \code{0.0}
#'  for the first class alphabetically and \code{1.0} for
#'  the second class alphabetically.
#'  }
#'  \subsection{Gaussian}{
#'  When \code{`type`} is \code{"gaussian"}, the predictions should be passed as
#'  one column with the predicted values. E.g.:
#'
#'  \tabular{rrrrr}{
#'   \strong{prediction} \tab \strong{target}\cr
#'   28.9 \tab 30.2\cr
#'   33.2 \tab 27.1\cr
#'   23.4 \tab 21.3\cr
#'   ... \tab ...}
#'  }
#' @param target_col Name of the column with the true classes/values in \code{`data`}.
#'
#'  When \code{`type`} is \code{"multinomial"}, this column should contain the class names,
#'  not their indices.
#' @param prediction_cols Name(s) of column(s) with the predictions.
#'
#'  Columns can be either numeric or character depending on which format is chosen.
#'  See \code{`data`} for the possible formats.
#' @param id_col Name of ID column to aggregate predictions by.
#'
#'  N.B. Current methods assume that the target class/value is constant within the IDs.
#'
#'  N.B. When aggregating by ID, some metrics may be disabled.
#' @param id_method Method to use when aggregating predictions by ID.
#'  Either \code{"mean"} or \code{"majority"}.
#'
#'  When \code{`type`} is \code{"gaussian"}, only the \code{"mean"} method is available.
#'
#'  \subsection{mean}{
#'  The average prediction (value or probability) is calculated per ID and evaluated.
#'  This method assumes that the target class/value is constant within the IDs.
#'  }
#'  \subsection{majority}{
#'  The most predicted class per ID is found and evaluated. In case of a tie,
#'  the winning classes share the probability (e.g. \code{P = 0.5} each when there are two majority classes).
#'  This method assumes that the target class/value is constant within the IDs.
#'  }
#' @param apply_softmax Whether to apply the softmax function to the
#'  prediction columns when \code{`type`} is \code{"multinomial"}.
#'
#'  N.B. \strong{Multinomial models only}.
#' @param cutoff Threshold for predicted classes. (Numeric)
#'
#'  N.B. \strong{Binomial models only}.
#' @param positive Level from dependent variable to predict.
#'  Either as character (\emph{preferable}) or level index (\code{1} or \code{2} - alphabetically).
#'
#'  E.g. if we have the levels \code{"cat"} and \code{"dog"} and we want \code{"dog"} to be the positive class,
#'  we can either provide \code{"dog"} or \code{2}, as alphabetically, \code{"dog"} comes after \code{"cat"}.
#'
#'  \strong{Note:} For \emph{reproducibility}, it's preferable to \strong{specify the name directly}, as
#'  different \code{\link[base:locales]{locales}} may sort the levels differently.
#'
#'  Used when calculating confusion matrix metrics and creating \code{ROC} curves.
#'
#'  The \code{Process} column in the output can be used to verify this setting.
#'
#'  N.B. Only affects the evaluation metrics. \strong{Does NOT affect what the probabilities are of (always the second class alphabetically).}
#'
#'  N.B. \strong{Binomial models only}.
#' @param parallel Whether to run evaluations in parallel,
#'  when \code{`data`} is grouped with \code{\link[dplyr:group_by]{group_by}}.
#' @param metrics \code{list} for enabling/disabling metrics.
#'
#'   E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the regression results,
#'   and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
#'   to the classification results.
#'   Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.
#'
#'   You can enable/disable all metrics at once by including
#'   \code{"all" = TRUE/FALSE} in the \code{list}. This is done prior to enabling/disabling
#'   individual metrics, so e.g. \code{list("all" = FALSE, "RMSE" = TRUE)}
#'   would return only the \code{RMSE} metric.
#'
#'   The \code{list} can be created with
#'   \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}},
#'   \code{\link[cvms:binomial_metrics]{binomial_metrics()}}, or
#'   \code{\link[cvms:multinomial_metrics]{multinomial_metrics()}}.
#'
#'   Also accepts the string \code{"all"}.
#' @param type Type of evaluation to perform:
#'
#'  \code{"gaussian"} for regression (like linear regression).
#'
#'  \code{"binomial"} for binary classification.
#'
#'  \code{"multinomial"} for multiclass classification.
#' @param include_predictions Whether to include the predictions
#'  in the output as a nested \code{tibble}. (Logical)
#' @param models Deprecated.
#' @details
#'
#'  Packages used:
#'
#'  \strong{Binomial} and \strong{Multinomial}:
#'
#'  \code{ROC} and \code{AUC}:
#'
#'  Binomial: \code{\link[pROC:roc]{pROC::roc}}
#'
#'  Multinomial: \code{\link[pROC:multiclass.roc]{pROC::multiclass.roc}}
#' @return
#'  ----------------------------------------------------------------
#'
#'  \subsection{Gaussian Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  \code{tibble} containing the following metrics by default:
#'
#'  Average \strong{\code{RMSE}}, \strong{\code{MAE}}, \strong{\code{NRMSE(IQR)}},
#'  \strong{\code{RRSE}}, \strong{\code{RAE}}, \strong{\code{RMSLE}}.
#'
#'  See the additional metrics (disabled by default) at
#'  \code{\link[cvms:gaussian_metrics]{?gaussian_metrics}}.
#'
#'  Also includes:
#'
#'  A nested \code{tibble} with the \strong{Predictions} and targets.
#'
#'  A nested \strong{Process} information object with information
#'  about the evaluation.
#'  }
#'
#'  ----------------------------------------------------------------
#'
#'  \subsection{Binomial Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  \code{tibble} with the following evaluation metrics, based on a
#'  \code{confusion matrix} and a \code{ROC} curve fitted to the predictions:
#'
#'  \code{Confusion Matrix}:
#'
#'  \strong{\code{Balanced Accuracy}},
#'  \strong{\code{Accuracy}},
#'  \strong{\code{F1}},
#'  \strong{\code{Sensitivity}},
#'  \strong{\code{Specificity}},
#'  \strong{\code{Positive Predictive Value}},
#'  \strong{\code{Negative Predictive Value}},
#'  \strong{\code{Kappa}},
#'  \strong{\code{Detection Rate}},
#'  \strong{\code{Detection Prevalence}},
#'  \strong{\code{Prevalence}}, and
#'  \strong{\code{MCC}} (Matthews correlation coefficient).
#'
#'  \code{ROC}:
#'
#'  \strong{\code{AUC}}, \strong{\code{Lower CI}}, and \strong{\code{Upper CI}}
#'
#'  Note that the \code{ROC} curve is only computed if \code{AUC} is enabled. See \code{metrics}.
#'
#'  Also includes:
#'
#'  A nested \code{tibble} with the \strong{predictions} and targets.
#'
#'  A \code{list} of \strong{ROC} curve objects (if computed).
#'
#'  A nested \code{tibble} with the \strong{confusion matrix}.
#'  The \code{Pos_} columns tell you whether a row is a
#'  True Positive (\code{TP}), True Negative (\code{TN}),
#'  False Positive (\code{FP}), or False Negative (\code{FN}),
#'  depending on which level is the "\code{positive}" class.
#'  I.e. the level you wish to predict.
#'
#'  A nested \strong{Process} information object with information
#'  about the evaluation.
#'  }
#'
#'  ----------------------------------------------------------------
#'
#'  \subsection{Multinomial Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  For each class, a \emph{one-vs-all} binomial evaluation is performed. This creates
#'  a \strong{Class Level Results} \code{tibble} containing the same metrics as the binomial results
#'  described above (excluding \code{Accuracy}, \code{MCC}, \code{AUC}, \code{Lower CI} and \code{Upper CI}),
#'  along with a count of the class in the target column (\strong{\code{Support}}).
#'  These metrics are used to calculate the \strong{macro-averaged} metrics.
#'  The nested class level results \code{tibble} is also included in the output \code{tibble},
#'  and could be reported along with the macro and overall metrics.
#'
#'  The output \code{tibble} contains the macro and overall metrics.
#'  The metrics that share their name with the metrics in the nested
#'  class level results \code{tibble} are averages of those metrics
#'  (note: does not remove \code{NA}s before averaging).
#'  In addition to these, it also includes the \strong{\code{Overall Accuracy}} and
#'  the multiclass \strong{\code{MCC}}.
#'
#'  \strong{Note:} \strong{\code{Balanced Accuracy}} is the macro-averaged metric,
#'  \emph{not} the macro sensitivity as sometimes used!
#'
#'  Other available metrics (disabled by default, see \code{metrics}):
#'  \strong{\code{Accuracy}},
#'  \emph{multiclass} \strong{\code{AUC}},
#'  \strong{\code{Weighted Balanced Accuracy}},
#'  \strong{\code{Weighted Accuracy}},
#'  \strong{\code{Weighted F1}},
#'  \strong{\code{Weighted Sensitivity}},
#'  \strong{\code{Weighted Specificity}},
#'  \strong{\code{Weighted Pos Pred Value}},
#'  \strong{\code{Weighted Neg Pred Value}},
#'  \strong{\code{Weighted Kappa}},
#'  \strong{\code{Weighted Detection Rate}},
#'  \strong{\code{Weighted Detection Prevalence}}, and
#'  \strong{\code{Weighted Prevalence}}.
#'
#'  Note that the "Weighted" average metrics are weighted by the \code{Support}.
#'
#'  With a large set of classes, consider keeping \code{AUC} disabled.
#'
#'  Also includes:
#'
#'  A nested \code{tibble} with the \strong{Predictions} and targets.
#'
#'  A \code{list} of \strong{ROC} curve objects when \code{AUC} is enabled.
#'
#'  A nested \code{tibble} with the multiclass \strong{Confusion Matrix}.
#'
#'  A nested \strong{Process} information object with information
#'  about the evaluation.
#'
#'  \subsection{Class Level Results}{
#'
#'  Besides the binomial evaluation metrics and the \code{Support},
#'  the nested class level results \code{tibble} also contains a
#'  nested \code{tibble} with the \strong{Confusion Matrix} from the one-vs-all evaluation.
#'  The \code{Pos_} columns tell you whether a row is a
#'  True Positive (\code{TP}), True Negative (\code{TN}),
#'  False Positive (\code{FP}), or False Negative (\code{FN}),
#'  depending on which level is the "positive" class. In our case, \code{1} is the current class
#'  and \code{0} represents all the other classes together.
#'  }
#'  }
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @family evaluation functions
#' @examples
#' \donttest{
#' # Attach packages
#' library(cvms)
#' library(dplyr)
#'
#' # Load data
#' data <- participant.scores
#'
#' # Fit models
#' gaussian_model <- lm(age ~ diagnosis, data = data)
#' binomial_model <- glm(diagnosis ~ score, data = data, family = "binomial")
#'
#' # Add predictions
#' data[["gaussian_predictions"]] <- predict(gaussian_model, data,
#'   type = "response",
#'   allow.new.levels = TRUE
#' )
#' data[["binomial_predictions"]] <- predict(binomial_model, data,
#'   allow.new.levels = TRUE
#' )
#'
#' # Gaussian evaluation
#' evaluate(
#'   data = data, target_col = "age",
#'   prediction_cols = "gaussian_predictions",
#'   type = "gaussian"
#' )
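#'
#' # Metrics can be enabled/disabled via the `metrics` argument.
#' # A sketch: keep only RMSE in the gaussian evaluation
#' evaluate(
#'   data = data, target_col = "age",
#'   prediction_cols = "gaussian_predictions",
#'   metrics = list("all" = FALSE, "RMSE" = TRUE),
#'   type = "gaussian"
#' )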
#'
#' # Binomial evaluation
#' evaluate(
#'   data = data, target_col = "diagnosis",
#'   prediction_cols = "binomial_predictions",
#'   type = "binomial"
#' )
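#'
#' # The `positive` class for the confusion matrix metrics
#' # can be set by name (a sketch; here, diagnosis 1 is
#' # treated as the positive class)
#' evaluate(
#'   data = data, target_col = "diagnosis",
#'   prediction_cols = "binomial_predictions",
#'   positive = "1",
#'   type = "binomial"
#' )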
#'
#' #
#' # Multinomial
#' #
#'
#' # Create a tibble with predicted probabilities and targets
#' data_mc <- multiclass_probability_tibble(
#'   num_classes = 3, num_observations = 45,
#'   apply_softmax = TRUE, FUN = runif,
#'   class_name = "class_",
#'   add_targets = TRUE
#' )
#'
#' class_names <- paste0("class_", 1:3)
#'
#' # Multinomial evaluation
#' evaluate(
#'   data = data_mc, target_col = "Target",
#'   prediction_cols = class_names,
#'   type = "multinomial"
#' )
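#'
#' # The output contains a nested "Class Level Results" column
#' # with the one-vs-all evaluation per class.
#' # A sketch of extracting it (`mn_eval` is just an illustrative name):
#' mn_eval <- evaluate(
#'   data = data_mc, target_col = "Target",
#'   prediction_cols = class_names,
#'   type = "multinomial"
#' )
#' mn_eval[["Class Level Results"]][[1]]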
#'
#' #
#' # ID evaluation
#' #
#'
#' # Gaussian ID evaluation
#' # Note that 'age' is the same for all observations
#' # of a participant
#' evaluate(
#'   data = data, target_col = "age",
#'   prediction_cols = "gaussian_predictions",
#'   id_col = "participant",
#'   type = "gaussian"
#' )
#'
#' # Binomial ID evaluation
#' evaluate(
#'   data = data, target_col = "diagnosis",
#'   prediction_cols = "binomial_predictions",
#'   id_col = "participant",
#'   id_method = "mean", # alternatively: "majority"
#'   type = "binomial"
#' )
#'
#' # Multinomial ID evaluation
#'
#' # Add IDs and new targets (must be constant within IDs)
#' data_mc[["Target"]] <- NULL
#' data_mc[["ID"]] <- rep(1:9, each = 5)
#' id_classes <- tibble::tibble(
#'   "ID" = 1:9,
#'   "Target" = sample(x = class_names, size = 9, replace = TRUE)
#' )
#' data_mc <- data_mc %>%
#'   dplyr::left_join(id_classes, by = "ID")
#'
#' # Perform ID evaluation
#' evaluate(
#'   data = data_mc, target_col = "Target",
#'   prediction_cols = class_names,
#'   id_col = "ID",
#'   id_method = "mean", # alternatively: "majority"
#'   type = "multinomial"
#' )
#'
#' #
#' # Training and evaluating a multinomial model with nnet
#' #
#'
#' # Only run if `nnet` is installed
#' if (requireNamespace("nnet", quietly = TRUE)){
#'
#' # Create a data frame with some predictors and a target column
#' class_names <- paste0("class_", 1:4)
#' data_for_nnet <- multiclass_probability_tibble(
#'   num_classes = 3, # Here, number of predictors
#'   num_observations = 30,
#'   apply_softmax = FALSE,
#'   FUN = rnorm,
#'   class_name = "predictor_"
#' ) %>%
#'   dplyr::mutate(Target = sample(
#'     class_names,
#'     size = 30,
#'     replace = TRUE
#'   ))
#'
#' # Train multinomial model using the nnet package
#' mn_model <- nnet::multinom(
#'   "Target ~ predictor_1 + predictor_2 + predictor_3",
#'   data = data_for_nnet
#' )
#'
#' # Predict the targets in the dataset
#' # (we would usually use a test set instead)
#' predictions <- predict(
#'   mn_model,
#'   data_for_nnet,
#'   type = "probs"
#' ) %>%
#'   dplyr::as_tibble()
#'
#' # Add the targets
#' predictions[["Target"]] <- data_for_nnet[["Target"]]
#'
#' # Evaluate predictions
#' evaluate(
#'   data = predictions,
#'   target_col = "Target",
#'   prediction_cols = class_names,
#'   type = "multinomial"
#' )
#' }
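#'
#' #
#' # Evaluating grouped data
#' #
#'
#' # When `data` is grouped with dplyr::group_by(),
#' # each group is evaluated separately.
#' # A sketch with an illustrative split column
#' # (`half` is just a made-up grouping for this example)
#' data[["half"]] <- rep_len(c(1, 2), nrow(data))
#' data %>%
#'   dplyr::group_by(half) %>%
#'   evaluate(
#'     target_col = "diagnosis",
#'     prediction_cols = "binomial_predictions",
#'     type = "binomial"
#'   )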
#' }
evaluate <- function(data,
                     target_col,
                     prediction_cols,
                     type,
                     id_col = NULL,
                     id_method = "mean",
                     apply_softmax = FALSE,
                     cutoff = 0.5,
                     positive = 2,
                     metrics = list(),
                     include_predictions = TRUE,
                     parallel = FALSE,
                     models = deprecated()) {
  if (!rlang::is_missing(models)) {
    deprecate_stop("1.0.0", "cvms::evaluate(models = )",
      details = "Now only evaluates predictions."
    )
  }

  # Check arguments ####
  assert_collection <- checkmate::makeAssertCollection()
  checkmate::assert_data_frame(
    x = data, min.rows = 1,
    min.cols = 2,
    col.names = "named",
    add = assert_collection
  )
  checkmate::assert_string(
    x = target_col, min.chars = 1,
    add = assert_collection
  )
  checkmate::assert_character(
    x = prediction_cols,
    min.len = 1,
    min.chars = 1,
    add = assert_collection
  )
  checkmate::assert_string(
    x = id_col, min.chars = 1, null.ok = TRUE,
    add = assert_collection
  )

  checkmate::reportAssertions(assert_collection)

  group_columns <- colnames(dplyr::group_keys(data))

  if (target_col %in% prediction_cols) {
    assert_collection$push("'target_col' was in 'prediction_cols'.")
  }

  if (target_col %in% group_columns) {
    assert_collection$push("'data' cannot be grouped by the 'target_col' column.")
  }

  if (length(intersect(prediction_cols, group_columns)) > 0) {
    assert_collection$push("'data' cannot be grouped by a prediction column.")
  }

  if (!is.null(id_col)){
    if (id_col %in% prediction_cols) {
      assert_collection$push("'id_col' was in 'prediction_cols'.")
    }

    if (id_col == target_col) {
      assert_collection$push("'id_col' and 'target_col' cannot be identical.")
    }

    if (id_col %in% group_columns) {
      assert_collection$push("'data' cannot be grouped by the 'id_col' column.")
    }

  }

  checkmate::reportAssertions(assert_collection)
  # End of argument checks ####

  # Remove unnecessary columns
  data <- base_select(
    data,
    c(
      group_columns,
      target_col,
      prediction_cols,
      id_col
    )
  ) %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    dplyr::as_tibble() %>% # removes grouping
    dplyr::group_by_at(group_columns)

  eval <- run_evaluate(
    data = data,
    target_col = target_col,
    prediction_cols = prediction_cols,
    type = type,
    id_col = id_col,
    id_method = id_method,
    models = NULL,
    apply_softmax = apply_softmax,
    cutoff = cutoff,
    positive = positive,
    metrics = metrics,
    include_predictions = include_predictions,
    parallel = parallel,
    caller = "evaluate()"
  )

  # Set extra class
  class(eval) <- c("eval_results", class(eval))
  eval
}

run_evaluate <- function(data,
                         target_col,
                         prediction_cols,
                         type = "gaussian",
                         id_col = NULL,
                         id_method = "mean",
                         models = NULL,
                         apply_softmax = FALSE,
                         cutoff = 0.5,
                         positive = 2,
                         metrics = list(),
                         fold_info_cols = NULL,
                         fold_and_fold_col = NULL,
                         include_predictions = TRUE,
                         include_fold_columns = TRUE,
                         parallel = FALSE,
                         na.rm = NULL,
                         caller = "evaluate()") {
  if (checkmate::test_string(x = metrics, pattern = "^all$")) {
    metrics <- list("all" = TRUE)
  }

  # Check arguments ####
  assert_collection <- checkmate::makeAssertCollection()
  # Data frame
  checkmate::assert_data_frame(
    x = data, min.rows = 1,
    min.cols = 2,
    col.names = "named",
    add = assert_collection
  )

  # String
  checkmate::assert_string(
    x = target_col, min.chars = 1,
    add = assert_collection
  )
  checkmate::assert_character(
    x = prediction_cols,
    min.len = 1,
    min.chars = 1,
    add = assert_collection
  )
  checkmate::assert_string(
    x = id_col, min.chars = 1, null.ok = TRUE,
    add = assert_collection
  )
  checkmate::assert_string(x = caller, add = assert_collection)

  # Names
  checkmate::reportAssertions(assert_collection) # before names check
  checkmate::assert_names(
    x = colnames(data),
    must.include = c(target_col, prediction_cols, id_col),
    what = "colnames",
    add = assert_collection
  )

  # Choice
  checkmate::assert_choice(
    x = type,
    choices = c("gaussian", "binomial", "multinomial"),
    add = assert_collection
  )
  checkmate::assert_choice(
    x = id_method,
    choices = c("mean", "majority"),
    add = assert_collection
  )

  # Flag
  checkmate::assert_flag(x = apply_softmax, add = assert_collection)
  checkmate::assert_flag(x = include_predictions, add = assert_collection)
  checkmate::assert_flag(x = include_fold_columns, add = assert_collection)
  checkmate::assert_flag(x = parallel, add = assert_collection)
  checkmate::assert_flag(x = na.rm, null.ok = TRUE, add = assert_collection)

  # List
  checkmate::assert_list(
    x = models, null.ok = TRUE,
    min.len = 1, names = "unnamed",
    add = assert_collection
  )
  checkmate::assert_list(
    x = metrics,
    types = "logical",
    any.missing = FALSE,
    names = "named",
    add = assert_collection
  )
  checkmate::assert_list(
    x = fold_info_cols,
    types = "character",
    any.missing = FALSE,
    names = "named",
    null.ok = TRUE,
    add = assert_collection
  )
  checkmate::assert_data_frame(
    x = fold_and_fold_col,
    col.names = "named",
    any.missing = FALSE,
    null.ok = TRUE,
    add = assert_collection
  )

  # Number
  checkmate::assert_number(
    x = cutoff,
    lower = 0,
    upper = 1,
    add = assert_collection
  )

  checkmate::assert(
    checkmate::check_choice(
      x = positive,
      choices = c(1, 2)
    ),
    checkmate::check_string(
      x = positive,
      min.chars = 1
    )
  )

  checkmate::reportAssertions(assert_collection)

  # The assertions must be reported above first, so 'type' is not NULL here
  if (type != "multinomial" &&
      length(prediction_cols) != 1){
    assert_collection$push(
      paste0("When 'type' is '", type, "', 'prediction_cols' must have length 1.")
    )
  }

  checkmate::reportAssertions(assert_collection)
  # End of argument checks ####

  # Convert families to the internally used
  family <- type

  # Create basic model_specifics object
  model_specifics <- list(
    model_formula = "",
    family = family,
    REML = NULL,
    link = NULL,
    control = NULL,
    cutoff = cutoff,
    positive = positive,
    model_verbose = FALSE,
    model_fn = NULL,
    predict_fn = NULL,
    preprocess_fn = NULL,
    preprocess_once = NULL,
    for_process = list(
      apply_softmax=apply_softmax,
      id_col = id_col
    ),
    hparams = NULL,
    caller = caller
  ) %>%
    update_model_specifics()

  info_cols <- list("Results" = FALSE)

  # One-hot encode predicted classes, if multinomial
  # and prediction_cols is one column with classes
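  # E.g. a predicted class column like c("class_2", "class_1", ...) becomes
  # one numeric column per class (class_1, class_2, class_3) with
  # 1 for the predicted class and 0 for the others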
  if (type == "multinomial" && length(prediction_cols) == 1) {

    if (!is.character(data[[target_col]]) ||
        !is.character(data[[prediction_cols]])){
      assert_collection$push(paste0(
        "When 'type' is 'multinomial' and 'prediction_cols' has length",
        " 1, both 'data[[target_col]]' and 'data[[prediction_cols]]' must",
        " have type character."))
    }
    if (isTRUE(apply_softmax)) {
      assert_collection$push(paste0(
        "When passing 'prediction_cols' as single column with multiple classes, ",
        "'apply_softmax' should be 'FALSE'."
      ))
    }
    checkmate::reportAssertions(assert_collection)

    # Extract the categorical levels in both target and prediction cols
    c_levels <- union(data[[target_col]], data[[prediction_cols]])
    data <- one_hot_encode(data, prediction_cols,
      c_levels = c_levels,
      use_epsilon = FALSE
    )

    prediction_cols <- sort(c_levels)
  }

  if (type == "binomial"){

    if (!(is.numeric(data[[prediction_cols]]) ||
          is.character(data[[prediction_cols]]))){
      assert_collection$push(
        paste0("When 'type' is 'binomial', 'data[[prediction_cols]]' mus",
               "t be either numeric or character."))
    }

    if (is.numeric(data[[prediction_cols]])){
      if (max(data[[prediction_cols]]) > 1 ||
          min(data[[prediction_cols]]) < 0) {
        assert_collection$push(
          paste0(
            "When 'type' is 'binomial' and 'data[[prediction_cols]]' ",
            "is numeric, the values in 'data[[prediction_cols]]' must be b",
            "etween 0 and 1."
          ))
      }

      # One may believe that setting the `positive` argument to the name
      # of a class should mean that probabilities > `cutoff` would be
      # considered that class, but this is not the case, so we
      # make the user aware of this (once)
      if (is.character(positive) && positive != sort(unique(as.character(data[[target_col]])))[[2]]){
        inform_about_positive_no_effect_on_probs(positive=positive)
      }
    }
    checkmate::reportAssertions(assert_collection)

    if (is.character(data[[prediction_cols]])){
      c_levels <- sort(union(data[[target_col]], data[[prediction_cols]]))
      if (length(c_levels) != 2){
        assert_collection$push(paste0("When 'type' is 'binomial' and 'data[[prediction_cols]]' ",
                                      "has type character, the target and prediction columns must h",
                                      "ave exactly 2 unique values combined. ",
                                      "Did you mean to use type='multinomial'?"))
        checkmate::reportAssertions(assert_collection)
      }
      # Replace with probabilities (1.0 or 0.0)
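      # E.g. with the classes "cat" and "dog", "dog" (second alphabetically)
      # becomes 1.0 and "cat" becomes 0.0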
      data[[prediction_cols]] <- ifelse(data[[prediction_cols]] == c_levels[[2]],
                                        1.0, 0.0)
    }

    # Enable Accuracy if not otherwise specified
    metrics <- add_metric_if_not_specified(metrics, "Accuracy", value=TRUE, check_all=TRUE)
  }

  # Find number of classes if classification
  if (type == "binomial") {
    num_classes <- 2
  } else if (type == "multinomial") {
    # We might not have every available class
    # in the targets, so we rely on the number
    # of prediction cols instead
    num_classes <- length(prediction_cols)
  } else {
    num_classes <- NULL
  }

  # If the dataset is grouped, we need the indices and keys for the groups
  # so we can evaluate group wise

  # Get grouping keys
  grouping_keys <- dplyr::group_keys(data)
  # Make sure the grouping_keys and the dataset are in the same order,
  # as we otherwise risk adding them in the wrong order later
  data <- dplyr::arrange(data, !!!rlang::syms(colnames(grouping_keys)))
  # Get group indices
  grouping_factor <- dplyr::group_indices(data)

  # Map for joining the grouping keys to the indices
  grouping_keys_with_indices <- grouping_keys %>%
    dplyr::mutate(.group = 1:dplyr::n())

  if (!is.null(models) && length(unique(grouping_factor)) != length(models)) {
    stop(paste0(
      "When the dataframe is grouped, ",
      "please provide a fitted model object per group or set models to NULL."
    ))
  }

  # Add grouping factor with a unique tmp var
  local_tmp_grouping_factor_var <- create_tmp_name(data, ".group")
  data[[local_tmp_grouping_factor_var]] <- grouping_factor

  # Now that we've saved the groups
  # we can ungroup the dataset
  data <- data %>% dplyr::ungroup()

  # Create temporary prediction column name
  local_tmp_prediction_col_var <- create_tmp_name(data, "tmp_prediction_col")
  local_tmp_std_col_var <- create_tmp_name(data, "tmp_std_col")

  if (!is.null(id_col)) {

    # ID level evaluation
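    # E.g. with id_method = "mean" and a binomial type,
    # an ID with the predicted probabilities 0.2, 0.3 and 0.7
    # is evaluated as a single prediction of 0.4
    # (assumes the target is constant within the ID)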

    # Currently don't support model object metrics
    # in ID aggregation mode
    if (!is.null(models)){
      stop("When aggregating by ID, 'models' should be NULL.")
    }

    # Prepare data for ID level evaluation
    data_for_id_evaluation <- prepare_id_level_evaluation(
      data = data,
      target_col = target_col,
      prediction_cols = prediction_cols,
      family = family,
      cutoff = cutoff,
      id_col = id_col,
      id_method = id_method,
      groups_col = local_tmp_grouping_factor_var,
      apply_softmax = FALSE,
      new_prediction_col_name = local_tmp_prediction_col_var,
      new_std_col_name = local_tmp_std_col_var
    ) %>%
      dplyr::ungroup() %>%
      dplyr::left_join(grouping_keys_with_indices, by = ".group")

    if (family == "multinomial") {
      prediction_cols <- local_tmp_prediction_col_var
    }

    # Run ID level evaluation
    evaluations <- run_internal_evaluate_wrapper(
      data = data_for_id_evaluation,
      type = family,
      prediction_col = prediction_cols,
      target_col = target_col,
      id_col = id_col,
      id_method = id_method,
      fold_info_cols = fold_info_cols,
      fold_and_fold_col = fold_and_fold_col,
      groups_col = local_tmp_grouping_factor_var,
      grouping_keys = grouping_keys,
      stds_col = local_tmp_std_col_var,
      models = models,
      model_specifics = model_specifics,
      metrics = metrics,
      info_cols = info_cols,
      num_classes = num_classes,
      parallel = parallel,
      include_predictions = include_predictions,
      include_fold_columns = include_fold_columns,
      na.rm = na.rm
    )
  } else {

    # Regular evaluation

    if (family == "multinomial") {

      # Prepare data for multinomial evaluation
      data <- prepare_multinomial_evaluation(
        data = data,
        target_col = target_col,
        prediction_cols = prediction_cols,
        apply_softmax = apply_softmax,
        new_prediction_col_name = local_tmp_prediction_col_var
      )

      prediction_cols <- local_tmp_prediction_col_var
    } else {
      if (length(prediction_cols) > 1) {
        stop(paste0("'prediction_cols' must have length 1 when type is '", type, "'."))
      }
    }

    # Run evaluation
    evaluations <- run_internal_evaluate_wrapper(
      data = data,
      type = family,
      prediction_col = prediction_cols,
      target_col = target_col,
      models = models,
      fold_info_cols = fold_info_cols,
      fold_and_fold_col = fold_and_fold_col,
      groups_col = local_tmp_grouping_factor_var,
      grouping_keys = grouping_keys,
      model_specifics = model_specifics,
      metrics = metrics,
      info_cols = info_cols,
      num_classes = num_classes,
      parallel = parallel,
      include_predictions = include_predictions,
      include_fold_columns = include_fold_columns,
      na.rm = na.rm
    )
  }

  evaluations
}


run_internal_evaluate_wrapper <- function(data,
                                          type,
                                          prediction_col,
                                          target_col,
                                          models,
                                          groups_col,
                                          grouping_keys,
                                          id_col = NULL,
                                          id_method = NULL,
                                          stds_col = NULL,
                                          fold_info_cols = NULL,
                                          fold_and_fold_col = NULL,
                                          model_specifics,
                                          metrics = list(),
                                          info_cols = list(),
                                          include_predictions = TRUE,
                                          include_fold_columns = TRUE,
                                          num_classes = NULL,
                                          na.rm = NULL,
                                          parallel = FALSE) {
  if (type != "gaussian") {
    if (is.null(num_classes)) {
      num_classes <- length(unique(data[[target_col]]))
    }
  }

  if (is.null(fold_info_cols)) {
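    # No fold information was supplied (e.g. when called from evaluate()),
    # so add constant placeholder fold columns to let the downstream
    # evaluation code, which expects fold columns, be reused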
    tmp_fold_cols_obj <- create_tmp_fold_cols(data)
    data <- tmp_fold_cols_obj[["data"]]
    fold_info_cols <- tmp_fold_cols_obj[["fold_info_cols"]]
    include_fold_columns <- FALSE
  } else {
    include_fold_columns <- include_fold_columns
  }

  # Extract unique group identifiers
  unique_group_levels <- unique(data[[groups_col]])

  evaluations <- plyr::llply(seq_along(unique_group_levels),
    .parallel = parallel, function(gr_ind) {
      gr <- unique_group_levels[[gr_ind]]
      data_for_current_group <- data[data[[groups_col]] == gr, ]

      # Assign current model
      if (is.null(models)) {
        model <- NULL
      } else {
        model <- list(models[[gr_ind]])
      }

      internal_evaluate(
        data = data_for_current_group,
        type = type,
        prediction_col = prediction_col,
        target_col = target_col,
        models = model,
        id_col = id_col,
        id_method = id_method,
        fold_info_cols = fold_info_cols,
        fold_and_fold_col = fold_and_fold_col,
        grouping_key_names = colnames(grouping_keys),
        stds_col = stds_col,
        model_specifics = model_specifics,
        metrics = metrics,
        info_cols = info_cols,
        include_fold_columns = include_fold_columns,
        include_predictions = include_predictions,
        na.rm = na.rm
      )
    }
  ) %>%
    dplyr::bind_rows() %>%
    tibble::as_tibble()

  # Add group key to class level results
  if (type == "multinomial") {
    grouping_keys <- repeat_data_frame_if(grouping_keys, 2,
      condition = na.rm == "both"
    )

    # Extract all the class level results tibbles
    # Remove the Results column
    # And add the grouping keys
    class_level_results <- evaluations[["Class Level Results"]] %>%
      # ".__grouping__" is used when na.rm = "both"!
      dplyr::bind_rows(.id = ".__grouping__") %>%
      tibble::as_tibble() %>%
      base_deselect(cols = "Results")
    evaluations[["Class Level Results"]] <- NULL
    grouping_keys[[".__grouping__"]] <- as.character(seq_len(nrow(grouping_keys)))
    evaluations[[".__grouping__"]] <- as.character(seq_len(nrow(evaluations)))
    class_level_results <- grouping_keys %>%
      # TODO This part might not work with na.rm = "both"
      dplyr::right_join(class_level_results, by = ".__grouping__")

    # Nest class level results
    class_level_results <- class_level_results %>%
      dplyr::group_by_at(c(".__grouping__", colnames(grouping_keys))) %>%
      dplyr::group_nest(keep = TRUE) %>%
      # Remove ".__grouping__" again
      dplyr::mutate(data = purrr::map(.data$data,
        .f = ~ .x %>%
          base_deselect(cols = ".__grouping__")
      )) %>%
      base_rename(before = "data", after = "Class Level Results") %>%
      base_select(c(".__grouping__", "Class Level Results"))

    evaluations <- evaluations %>%
      dplyr::left_join(class_level_results, by = ".__grouping__") %>%
      base_deselect(".__grouping__")
  }

  # Add grouping keys
  results <- grouping_keys %>%
    dplyr::bind_cols(evaluations) %>%
    base_deselect(".__grouping__")

  # Move Process last
  if ("Process" %in% colnames(results)){
    results <- results %>%
      dplyr::relocate("Process", .after = dplyr::last_col())
  }

  # If na.rm != "both" and it contains the NAs_removed column
  if ((!is.character(na.rm) || na.rm != "both") &&
    "NAs_removed" %in% names(results)) {
    results[["NAs_removed"]] <- NULL
  }

  results
}

internal_evaluate <- function(data,
                              type = "gaussian",
                              prediction_col = "prediction",
                              target_col = "target",
                              model_was_null_col = NULL,
                              fold_info_cols = list(
                                rel_fold = "rel_fold",
                                abs_fold = "abs_fold",
                                fold_column = "fold_column"
                              ),
                              fold_and_fold_col = NULL,
                              grouping_key_names = NULL,
                              stds_col = NULL,
                              models = NULL,
                              id_col = NULL,
                              id_method = NULL,
                              model_specifics = list(),
                              metrics = list(),
                              info_cols = list(),
                              include_fold_columns = TRUE,
                              include_predictions = TRUE,
                              na.rm = NULL) {
  stopifnot(type %in% c("gaussian", "binomial", "multinomial")) # , "multiclass", "multilabel"))

  # Fill metrics with default values for non-specified metrics
  # and get the names of the metrics
  metrics <- set_metrics(
    family = type, metrics_list = metrics,
    include_model_object_metrics = !is.null(models)
  )
  info_cols <- set_info_cols(family = type, info_cols_list = info_cols)
  if (!is.null(na.rm) && na.rm == "both") {
    info_cols <- c(info_cols, "NAs_removed")
  }

  # Set default na.rm if NULL
  if (is.null(na.rm)) {
    na.rm <- dplyr::case_when(
      type == "gaussian" ~ TRUE,
      type == "binomial" ~ FALSE,
      type == "multinomial" ~ FALSE
    )
  }

  # data is a table with predictions, targets and folds
  # predictions can be values, logits, or classes, depending on evaluation type

  # Unless specified otherwise, we don't care about non-converged models etc. here
  # so we tell the eval that the models were not NULL. (TODO Would we even have predictions otherwise?)
  if (is.null(model_was_null_col)) {
    model_was_null_col <- "model_was_null"
    data[[model_was_null_col]] <- FALSE
  }

  # Extract grouping key info
  if (!is.null(grouping_key_names)) {
    group_info <- data %>%
      base_select(grouping_key_names)
  } else {
    group_info <- NULL
  }

  # Evaluate the predictions
  prediction_evaluation <- internal_evaluate_predictions(
    data = data,
    prediction_col = prediction_col,
    target_col = target_col,
    model_was_null_col = model_was_null_col,
    id_col = id_col,
    id_method = id_method,
    type = type,
    fold_info_cols = fold_info_cols,
    fold_and_fold_col = fold_and_fold_col,
    group_info = group_info,
    stds_col = stds_col,
    model_specifics = model_specifics,
    metrics = metrics,
    include_fold_columns = include_fold_columns,
    include_predictions = include_predictions,
    na.rm = na.rm
  )

  if (!is.null(models)) {
    model_evaluations <- plyr::ldply(seq_len(length(models)), function(i) {
      internal_evaluate_model(
        model = models[[i]],
        train_data = NULL,
        test_data = NULL,
        type = type,
        fold_info = NULL,
        fold_info_cols = fold_info_cols,
        model_specifics = model_specifics,
        metrics = metrics,
        include_fold_columns = include_fold_columns
      )
    })

    output <- dplyr::bind_cols(model_evaluations, prediction_evaluation)
  } else {
    output <- prediction_evaluation
  }

  # Extract ROC from Results col
  if (type == "multinomial") {
    ROCs <- output[["Results"]] %c% "ROC" %>%
      unlist(recursive = FALSE)
    if ("mv.multiclass.roc" %ni% class(ROCs[[1]])){
      ROCs <- unlist(ROCs, recursive = FALSE)
    }
    output[["ROC"]] <- ROCs
  }
  new_col_order <- c(metrics, intersect(info_cols, colnames(output)))
  base_select(output, cols = new_col_order)
}

create_tmp_fold_cols <- function(data) {
  # Create fold columns
  local_tmp_fold_col_var <- create_tmp_name(data, "fold_column")
  local_tmp_rel_fold_col_var <- create_tmp_name(data, "rel_fold")
  local_tmp_abs_fold_col_var <- create_tmp_name(data, "abs_fold")

  data[[local_tmp_fold_col_var]] <- as.character(1)
  data[[local_tmp_rel_fold_col_var]] <- as.character(1)
  data[[local_tmp_abs_fold_col_var]] <- as.character(1)

  fold_info_cols <- list(
    rel_fold = local_tmp_rel_fold_col_var,
    abs_fold = local_tmp_abs_fold_col_var,
    fold_column = local_tmp_fold_col_var
  )

  list(
    "data" = data,
    "fold_info_cols" = fold_info_cols
  )
}