sboost: Machine Learning with AdaBoost on Decision Stumps

Documented in validate

#' sboost Validation Function
#'
#' A k-fold cross validation algorithm for sboost.
#'
#' The rows of \code{features} are split, in their existing order, into
#' \code{k_fold} contiguous folds whose sizes differ by at most one row. Each
#' fold is used exactly once as the testing set while all remaining rows form
#' the training set for that fold.
#'
#' @param features feature set data.frame.
#' @param outcomes outcomes corresponding to the features.
#' @param iterations number of boosts.
#' @param k_fold number of cross-validation subsets; must be a single whole
#'               number between 2 and the number of rows in \code{features}.
#' @param positive is the positive outcome to test for; if NULL, the first in
#'                 alphabetical order will be chosen
#' @param verbose If true, progress bars will be displayed in console.
#' @return An \emph{sboost_validation} S3 object containing:
#' \describe{
#'   \item{\emph{performance}}{Final performance statistics for all stumps.}
#'   \item{\emph{training_summary_statistics}}{Mean and standard deviations for test statistics
#'         generated by \code{\link{assess}} cumulative statistics for each of the training sets.}
#'   \item{\emph{testing_summary_statistics}}{Mean and standard deviations for test statistics
#'         generated by \code{\link{assess}} cumulative statistics for each of the testing sets.}
#'   \item{\emph{training_statistics}}{sboost \emph{sboost_assessment} cumulative statistics objects
#'         used to generate training_summary_statistics.}
#'   \item{\emph{testing_statistics}}{sboost \emph{sboost_assessment} cumulative statistics objects
#'         used to generate testing_summary_statistics.}
#'   \item{\emph{classifier_list}}{sboost \emph{sboost_classifier} objects
#'         created from training sets.}
#'   \item{\emph{outcomes}}{Shows which outcome was considered as positive and which negative.}
#'   \item{\emph{k_fold}}{number of testing and training sets used in the validation.}
#'   \item{\emph{call}}{Shows the parameters that were used for validation.}
#' }
#' @seealso \code{\link{sboost}} documentation.
#' @examples
#' # malware
#' validate(malware[-1], malware[1], iterations = 5, k_fold = 3, positive = 1)
#'
#' # mushrooms
#' validate(mushrooms[-1], mushrooms[1], iterations = 5, k_fold = 3, positive = "p")
#' @export
validate <- function(features, outcomes, iterations = 1, k_fold = 6, positive = NULL, verbose = FALSE) {

  # PREPARE INPUT
  # --------------------------------------------------------------------------------

  # Capture the call once, eagerly, in this function's own frame. Passing
  # match.call() as an unevaluated argument would defer its evaluation to
  # whenever (and wherever) the receiving helper first touches it.
  validation_call <- match.call()

  # test and prepare features and outcomes
  if (is.data.frame(outcomes)) outcomes <- as.vector(outcomes[[1]])
  processed_features <- process_feature_input(features)
  categorical <- find_categorical(features)
  otcm_def <- check_positive_value(outcomes, positive)
  processed_outcomes <- process_outcome_input(outcomes, features, otcm_def)

  # Every fold needs a non-empty testing set AND a non-empty training set,
  # which requires 2 <= k_fold <= number of rows.
  rows <- nrow(features)
  valid_k_fold <- is.numeric(k_fold) && length(k_fold) == 1L && !is.na(k_fold) &&
    k_fold == round(k_fold) && k_fold >= 2 && isTRUE(k_fold <= rows)
  if (!valid_k_fold) {
    stop("k_fold must be a single whole number between 2 and the number of rows in features",
         call. = FALSE)
  }

  # Whole-number fold boundaries: fold i covers rows
  # (fold_bounds[i] + 1) through fold_bounds[i + 1]. Integer division keeps
  # every boundary integral when rows is not a multiple of k_fold, and the
  # last boundary is always `rows`, so no row is left out of testing.
  fold_bounds <- (seq.int(0, k_fold) * as.numeric(rows)) %/% k_fold

  # create variables
  raw_classifier_list <- vector("list", k_fold)
  classifier_list <- vector("list", k_fold)
  training_statistics <- vector("list", k_fold)
  testing_statistics <- vector("list", k_fold)


  # MAIN VALIDATION LOOP
  # --------------------------------------------------------------------------------
  for (i in seq_len(k_fold)) {
    if (verbose) print(paste0("Training classifier ", i, " of ", k_fold, "..."))

    # The training set is the exact complement of the testing fold, expressed
    # as negative indices so that it drops precisely the testing rows.
    testing <- seq.int(fold_bounds[i] + 1, fold_bounds[i + 1])
    training <- -testing

    # create classifier
    raw_classifier_list[[i]] <- make_classifier(processed_features[training, ], processed_outcomes[training], categorical, iterations, verbose)
    classifier_list[[i]] <- process_classifier_output(raw_classifier_list[[i]], features, outcomes, otcm_def, validation_call, training)

    # test classifier
    training_statistics[[i]] <- get_cumulative_statistics(classifier_list[[i]], raw_classifier_list[[i]], processed_features[training, ], processed_outcomes[training])
    testing_statistics[[i]] <- get_cumulative_statistics(classifier_list[[i]], raw_classifier_list[[i]], processed_features[testing, ], processed_outcomes[testing])

  }

  process_validation_output(training_statistics, testing_statistics, classifier_list, k_fold, validation_call)
}