R/SOptim_ClassificationFunctions.R


## --------------------------------------------------------------------- ##
##
## Main functions used for performing object-based image classification
##
## --------------------------------------------------------------------- ##



#' Generates default parameters for classification algorithms
#' 
#' This is an auxiliary function used for generating a list of default parameters used for the available 
#' classification algorithms: Random Forests (RF), K-nearest neighbour (KNN), Flexible Discriminant 
#' Analysis (FDA), Support Vector Machines (SVM) and Generalized Boosted Model (GBM).
#' 
#' @param x The dataset used for classification (this is required to calculate some classifier hyperparameters based on the number of 
#' columns/variables in the data).
#' 
#' @return A nested list object of class \code{classificationParamList} with parameters for the available 
#' algorithms, namely:
#'          
#'          \itemize{
#'            
#'            \item RF - Random Forest parameters:
#'              \itemize{
#'                \item \emph{mtry} is equal to \code{floor(sqrt(ncol(x)-2))} and defines the number of variables randomly sampled 
#'                as candidates at each split
#'                \item \emph{ntree} equals 250 (by default) and is the number of trees to grow
#'              }
#'          
#'            \item KNN - K-nearest neighbour parameters:
#'              \itemize{
#'                \item \emph{k} is equal to \code{floor(sqrt(nrow(x)))} and is the number of neighbours considered
#'              }
#'              
#'            \item FDA - Flexible Discriminant Analysis with MDA-MARS parameters:
#'              \itemize{
#'                \item \emph{degree} is equal to 1 and sets an optional integer specifying the maximum interaction degree 
#'                \item \emph{penalty} is equal to 2 and sets an optional value specifying the cost per degree of freedom charge
#'                \item \emph{prune} is set to TRUE and defines an optional logical value specifying whether the model should be 
#'                pruned in a backward stepwise fashion
#'              }
#'              
#'            \item SVM - Support Vector Machine (with radial-basis kernel) parameters:
#'              \itemize{
#'                \item \emph{gamma} equals \code{1/(ncol(x)-2)} and sets the parameter needed for all kernels except linear
#'                \item \emph{cost} equal to 1 defines the cost of constraints violation - it is the 'C'-constant of the regularization 
#'                term in the Lagrange formulation
#'                \item \emph{probability} is equal to TRUE and defines the output type
#'              }
#'          
#'            \item GBM - Generalized Boosted Modeling parameters:
#'              \itemize{
#'                \item \emph{n.trees} set to 250 defining the total number of trees to fit
#'                \item \emph{interaction.depth} equal to 1 which defines the maximum depth of variable interactions. 1 implies an 
#'                additive model, 2 implies a model with up to 2-way interactions, etc  
#'                \item \emph{shrinkage} set to 0.01 is a shrinkage parameter applied to each tree in the expansion. Also known as the 
#'                learning rate or step-size reduction.
#'                \item \emph{bag.fraction} set to 0.5 and equals the fraction of the training set observations randomly selected to 
#'                propose the next tree in the expansion
#'                \item \emph{distribution} set to bernoulli (if single-class) or multinomial (if multi-class); this parameter defines 
#'                the distribution used for classification
#'              }
#'              
#'          }
#'          
#' @examples 
#' 
#' DF <- data.frame(SID=1:5, train=sample(0:1,5,replace=TRUE), Var_1=rnorm(5), Var_2=rnorm(5))
#' generateDefaultClassifierParams(DF)
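#' 
#' # The returned object is a nested list, so individual defaults can be inspected 
#' # or accessed by algorithm name, e.g.:
#' params <- generateDefaultClassifierParams(DF)
#' params$RF$mtry
#' params$KNN$k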
#' 
#' 
#' @seealso \code{\link{replaceDefaultClassificationParams}}
#' @export


generateDefaultClassifierParams<-function(x){
  
  paramList <- list(
    
    ## Random Forest ---------------------------------------------- ##
    ##
    `RF`=list(
      mtry=floor(sqrt(ncol(x)-2)), # Number of variables randomly sampled as candidates at each split
      ntree=250 # Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times.
      ),
    
    ## K-nearest neighbour ---------------------------------------- ##
    ##
    `KNN`=list(
      k = floor(sqrt(nrow(x))) # number of neighbours considered
      ),
    
    ## Flexible Discriminant Analysis with MDA-MARS --------------- ##
    ##
    `FDA`=list(
      degree=1,  # an optional integer specifying maximum interaction degree (default is 1).
      penalty=2, # an optional value specifying the cost per degree of freedom charge (default is 2).
      prune=TRUE # an optional logical value specifying whether the model should be pruned in a backward stepwise fashion (default is TRUE)
      
    ),
    
    ## Support Vector Machine with Radial-basis kernel function --- ##
    ##
    `SVM`=list(
      gamma=1/(ncol(x)-2), # parameter needed for all kernels except linear (default: 1/(data dimension))
      cost=1,               # cost of constraints violation (default: 1) - it is the 'C'-constant of the regularization term in the Lagrange formulation
      probability=TRUE
    ),
    
    ## Generalized Boosted Regression Modeling -------------------- ##
    ##
    `GBM`=list(
      n.trees=250,               # the total number of trees to fit
      interaction.depth = 1,     # The maximum depth of variable interactions. 1 implies an additive model, 2 implies a model with up to 2-way interactions, etc  
      shrinkage = 0.01,          # a shrinkage parameter applied to each tree in the expansion. Also known as the learning rate or step-size reduction.
      bag.fraction = 0.5,        # the fraction of the training set observations randomly selected to propose the next tree in the expansion
      distribution = "bernoulli") # distribution used for classification
  )
  
  class(paramList) <- append(class(paramList),"classificationParamList")
  return(paramList)
  
}

#' Replace classification parameters
#' 
#' An auxiliary function to replace default parameters of classification algorithms.
#' 
#' @param oldParamSet A parameter list object of class \code{classificationParamList}.  
#' @param newParamSet A nested list with names equal to the parameters defined in \code{\link{generateDefaultClassifierParams}}.
#' 
#' @return A nested list object of class \code{classificationParamList} with parameters for the available algorithms.  
#' 
#' @seealso \code{\link{generateDefaultClassifierParams}} to get the list of parameters that can be changed for each 
#' classification algorithm.
#' 
#' @examples
#' 
#' # x is the classification dataset with one column with the segment ID, 
#' # other for train/test data inputs, and the remaining columns with classification features
#' 
#' x<-data.frame(SID=1:10, train=sample(c(0,1),10, replace=TRUE), feature1=rnorm(10), 
#' feature2=rnorm(10,5,2), feature3=rnorm(10,12,3))
#' 
#' oldParams<-generateDefaultClassifierParams(x)
#' 
#' newParams<-list(RF=list(mtry=3),
#'                GBM=list(n.trees=3000))
#' 
#' replaceDefaultClassificationParams(oldParams, newParams)
#' 
#' @export

replaceDefaultClassificationParams <- function(oldParamSet, newParamSet){
  
  if(!inherits(oldParamSet,"classificationParamList"))
    stop("The object in oldParamSet must be of class classificationParamList")
  
  for(clMethodName in names(newParamSet)){
    for(parName in names(newParamSet[[clMethodName]])){
      oldParamSet[[clMethodName]][[parName]] <- newParamSet[[clMethodName]][[parName]]
    }
  }
  return(oldParamSet)
}

  

#' Generate formula
#' 
#' Generates a formula object to use in classification algorithms.
#' 
#' @param x A calibration dataset with the first two columns containing the segment ID (SID) and train/test 
#' inputs (named as train). The remaining columns will be used as classification features.
#' 
#' @note By default, the first two columns, corresponding to the segment identifier (SID) and the train column, are removed.
#' 
#' @return A \code{formula} object with a similar structure to: \code{train ~ feature_1 + ... + feature_n}.   
#' 
#' @importFrom stats as.formula         
#' 
#' @examples 
#' 
#' DF <- data.frame(SID=1:10, train = 1:10, x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10))
#' generateFormula(DF)
#' 
#' @export

generateFormula <- function(x){
  
  # Get column names and remove SID and train cols
  cn <- colnames(x)[-c(1:2)] 
  
  # Assemble formula
  form <- stats::as.formula(paste("train ~",paste(cn,collapse="+")))
  return(form)
}



#' Create a data split for classification evaluation
#' 
#' An auxiliary/internal function used to create a data split intended to evaluate classification performance. 
#' Used internally in the \code{\link{calibrateClassifier}} function.
#' 
#' @param y A vector containing train inputs.
#' 
#' @param evalMethod A character string defining the evaluation method. The available methods are \code{"10FCV"} (10-fold 
#' cross-validation; the default), \code{"5FCV"}, (5-fold cross-validation), \code{"HOCV"} (holdout cross-validation with 
#' the training percentage defined by \code{trainPerc} and the number of rounds defined in \code{nRounds}), and, \code{"OOB"} 
#' (out-of-bag evaluation; only applicable to random forests algorithm).
#' 
#' @param nRounds Number of evaluation rounds. This is only used for the \code{"HOCV"} method.
#' 
#' @param trainPerc Proportion of data used for training the algorithm (by default equal to 0.8). This is 
#' only used for the \code{"HOCV"} method.
#' 
#' @details Uses function \link[caret]{createDataPartition} from \pkg{caret} package as the 'workhorse' to generate 
#' data splits.
#' 
#' @return A list-like object of class \code{'dataSplit'} to be used in the \code{\link{calibrateClassifier}} function. Each element of the list 
#' contains the row indices used for training. 
#' 
#' @examples 
#' createDataSplits(y=rnorm(100), evalMethod = "10FCV", nRounds = 20, trainPerc = 0.8)
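#' 
#' # A minimal sketch of a holdout split: each list element holds the row indices 
#' # used for training, so the test rows are the remaining ones
#' y <- rnorm(100)
#' dp <- createDataSplits(y, evalMethod = "HOCV", nRounds = 2, trainPerc = 0.8)
#' trainIdx <- dp[[1]]
#' testIdx <- setdiff(seq_along(y), trainIdx)
#' length(trainIdx); length(testIdx)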
#' 
#' @export
#' @importFrom caret createFolds
#' @importFrom caret createDataPartition
#' 

createDataSplits <- function(y, evalMethod = "10FCV", nRounds = 20, trainPerc = 0.8){
  
  # OOB does not create data splits externally, so a single 'resample' holding all row indices is returned
  if(evalMethod == "OOB"){
      dp<-list(resample01=1:length(y))
  }
  
  else if(evalMethod == "5FCV"){
    dp<-caret::createFolds(y, k = 5, list = TRUE, returnTrain = TRUE)
  }
  
  else if(evalMethod == "10FCV"){
    dp<-caret::createFolds(y, k = 10, list = TRUE, returnTrain = TRUE)
  }
  
  else if(evalMethod =="HOCV"){
    dp<-caret::createDataPartition(y, times = nRounds,p = trainPerc,list = TRUE)
  }
  
  else{
    stop("Invalid method set in evaluation method for creating data splits! Please review input parameters")
  }

  # Return output
  attr(dp,"evalMethod")<-evalMethod
  class(dp)<-append(class(dp),"dataSplit")
  return(dp)
  
}

#' Predict k-nearest neighbour output
#' 
#' An auxiliary function used to predict the probabilistic outcome from the knn classifier for 
#' single or multi-class problems.
#' 
#' @param object An object generated from \link[class]{knn} function.
#' 
#' @param type Type of response generated, if \code{"prob"} the probabilistic output for all classes 
#' will be returned. If \code{"response"} then the class labels are returned.
#' 
#' @param newdata A dataset used to predict the class labels.
#' 
#' @param traindata Train data used to calibrate the \link[class]{knn} classifier.
#' 
#' @param cl Factor of true classifications of training set.
#' 
#' @param k Number of neighbours considered.
#' 
#' @param ... Additional arguments passed to \link[class]{knn}.
#' 
#' @inheritParams calibrateClassifier
#' 
#' @return If \code{type = "prob"}, a matrix with class probabilities (one column per class); if 
#' \code{type = "response"}, the predicted class labels.
#' 
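#' @examples 
#' 
#' # A minimal sketch using the iris dataset: fit a k-nearest neighbour model with 
#' # class::knn, tag it with the "knn" class and retrieve class probabilities
#' trainIdx <- c(1:30, 51:80, 101:130)
#' knnObj <- class::knn(train = iris[trainIdx, 1:4], test = iris[-trainIdx, 1:4],
#'                      cl = iris$Species[trainIdx], k = 3, prob = TRUE)
#' class(knnObj) <- c(class(knnObj), "knn")
#' probs <- predict(knnObj, type = "prob")
#' head(probs)
#' 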
#' @method predict knn
#' 
#' @importFrom class knn
#' 
#' @export

predict.knn <- function(object, type="response", newdata=NULL, traindata = NULL, cl=NULL, k=NULL, 
                      balanceTrainData=FALSE, balanceMethod="ubOver", ...){
  
  if(!is.null(newdata) && !is.null(traindata) && !is.null(k) && !is.null(cl)){
    
    if(balanceTrainData){
      traindata <- cbind(cl,traindata)
      colnames(traindata)[1] <- "train"
      traindata <- dataBalancing(x = traindata, method = balanceMethod)
      object <- class::knn(train = traindata[,-1], test = newdata, cl = as.factor(traindata[,"train"]), k = k, prob = TRUE, ...)
    }else{
      object <- class::knn(train = traindata, test = newdata, cl = cl, k = k, prob = TRUE, ...)
    }

  }
  
  if(type=="prob"){
    lv<-levels(object)
    prob<-attr(object,"prob")
    pred<-as.character(object)
    out<-matrix(0,length(object),length(lv),dimnames = list(1:length(object),lv))
    
    for(i in 1:length(prob)){
      posCol<-(1:length(lv))[lv==pred[i]]
      out[i,posCol]<-prob[i]
      out[i,-posCol]<-(1-prob[i])/(length(lv)-1) ## The remaining probability mass is split equally among the other classes
    }
    return(out)

  }else if(type=="response"){
    
    return(object)
  }
}


#' Train and evaluate a classification algorithm
#' 
#' Main function used for classifier training and evaluation for both single and multi-class problems.     
#' 
#' 
#' @param calData An input calibration dataset used for classification. It can be either an object of class \code{matrix}, 
#' \code{data.frame} or \code{SOptim.CalData} generated by \code{prepareCalData} (to use with option \code{runFullCalibration=TRUE}; 
#' see below for more details). If the input is a \code{matrix} or \code{data.frame} then it must contain one column named \code{SID} with 
#' segment IDs, one column named \code{"train"} defining the labels, classes or categories (either {0,1} for single-class problems, 
#' or, {1, 2, ..., n} for multi-class problems) followed by n columns containing features (or variables) used for training the classifier.
#'   
#' @param classificationMethod An input string defining the classification algorithm to be used. Available options are: 
#' \code{"RF"} (random forests), \code{"GBM"} (generalized boosted models), \code{"SVM"} (support vector machines), \code{"KNN"} 
#' (k-nearest neighbour), and, \code{"FDA"} (flexible discriminant analysis).     
#' 
#' @param classificationMethodParams A list object with a customized set of parameters to be used for the classification algorithms 
#' (default = NULL). See also \link{generateDefaultClassifierParams} to see which parameters can be changed and how to structure the 
#' list object.
#' 
#' @param balanceTrainData Defines if data balancing is to be used (only available for single-class problems; default: FALSE). 
#'   
#' @param balanceMethod A character string used to set the data balancing method. Available methods are based on under-sampling 
#' \code{"ubUnder"} or over-sampling \code{"ubOver"} the target class. 
#'  
#' @param evalMethod A character string defining the evaluation method. The available methods are \code{"10FCV"} (10-fold 
#' cross-validation; the default), \code{"5FCV"} (5-fold cross-validation), \code{"HOCV"} (holdout cross-validation with 
#' the training percentage defined by \code{trainPerc} and the number of rounds defined in \code{nRounds}), and, \code{"OOB"} 
#' (out-of-bag evaluation; only applicable to random forests).
#' 
#' @param evalMetric A character string setting the evaluation metric or a function that calculates the performance score 
#' based on two vectors one for observed and the other for predicted values (see below for more details). 
#' This option defines the outcome value of the genetic algorithm fitness function and the output of grid or random search 
#' optimization routines. Check \code{\link{evalPerformanceGeneric}} for available options. When \code{runFullCalibration=TRUE} 
#' this metric will be calculated; however, other evaluation metrics can also be quantified using \link{evalPerformanceClassifier}. 
#' 
#' @param trainPerc A decimal number defining the training proportion (default: 0.8; if \code{"HOCV"} is used).
#' 
#' @param nRounds Number of training rounds used for holdout cross-validation (default: 20; if \code{"HOCV"} is used).
#' 
#' @param minTrainCases The minimum number of training cases used for calibration (default: 30). If the number of rows 
#' in \code{x} is below this number then \code{calibrateClassifier} will not run.
#' 
#' @param minCasesByClassTrain Minimum number of cases by class for each train data split so that the classifier 
#' is able to run.
#' 
#' @param minCasesByClassTest Minimum number of cases by class for each test data split so that the classifier 
#' is able to run.  
#'  
#' @param runFullCalibration Run full calibration? Check \strong{details} section (default: FALSE).  
#' 
#' @param verbose Print progress messages? (default: TRUE)
#' 
#' @details 
#' 
#' Two working modes can be used:   
#' \enumerate{
#'    \item i) for "internal" GA optimization or grid/random search: \code{runFullCalibration = FALSE}, or,     
#'    \item ii) for performing a full segmented image classification: \code{runFullCalibration = TRUE}.
#' }
#' Typically, the first option is used internally for optimizing segmentation parameters in 
#' \code{\link{gaOptimizeSegmentationParams}} where the output value from the selected evaluation metric 
#' is passed as the fitness function outcome for \pkg{GA} optimization.        
#' The second option should be used to perform a final image classification and to get full evaluation 
#' statistics (slot: 'PerfStats'), confusion matrices (slot: 'ConfMat'), train/test partition sets (slot: 'TrainSets'), 
#' classifier objects (slot: 'ClassObj') and parameters (slot: 'ClassParams'). In addition to the evaluation rounds 
#' (depending on the evaluation method selected) this option will also run a "full" round where all the data (i.e., 
#' no train/test split) will be used for training. Results from this option can then be used in \code{\link{predictSegments}}.  
#' This function can also perform data balancing for single-class problems (check out option \code{balanceTrainData} and 
#' \code{balanceMethod}). 
#' 
#' Check \link[unbalanced]{ubBalance} function for further details regarding data balancing.    
#' 
#' For more details about the classification algorithms check out the following functions: 
#' \itemize{
#'   \item \link[randomForest]{randomForest} for random forest algorithm, 
#'   \item \link[gbm]{gbm} for generalized boosted modelling, 
#'   \item \link[e1071]{svm} for details related to support vector machines, 
#'   \item \link[class]{knn} for k-nearest neighbour classification, and, 
#'   \item \link[mda]{fda} for flexible discriminant analysis. 
#' }
#' 
#' @section Defining custom performance evaluation functions:
#' 
#' In argument \code{evalMetric} it is possible to define a custom function. This must take two vectors: one containing 
#' observed/ground-truth values (first argument) and the other with values predicted by the trained classifier (second argument), 
#' both for the test set (from holdout or k-fold CV). If the classification task is single-class (e.g., 1:forest/0:non-forest, 
#' 1:water/0:non-water) then the predicted values will be probabilities (ranging in [0,1]) for the interest class (coded as 
#' 1's). If the task is multi-class, then the predicted values will be integer codes for each class.
#' 
#' To be considered valid, an evaluation function for single-class problems must:
#' 
#' \itemize{
#'    \item Have at least two input arguments (observed and predicted);
#'    \item Produce a non-null and valid numerical result;
#'    \item Return a scalar output;
#'    \item Return an attribute named \code{'thresh'} defining the numerical threshold used to 
#'    binarize the classifier predictions (i.e., to convert from continuous probability 
#'    to discrete {0,1}). This threshold is calculated so as to maximize the 
#'    value of the performance metric instead of using a naive 0.5 cutoff value.
#' }
#' 
#' Here goes an example function used to calculate the maximum value for the overall 
#' accuracy based on multiple threshold values:
#' 
#' \preformatted{
#' 
#' calcMaxAccuracy <- function(obs, pred){
#' 
#'    accuracies <- c()
#'    i <- 0
#'    N <- length(obs)
#'    thresholds <- seq(0, 1, 0.05)
#' 
#'    for(thresh in thresholds){
#'       i <- i + 1   
#'       pred_bin <- as.integer(pred > thresh)
#'       confusionMatrix <- as.matrix(table(obs, pred_bin))
#'       accuracies[i] <- sum(diag(confusionMatrix)) / N
#'    }
#' 
#'    bestAccuracy <- max(accuracies)
#'    attr(bestAccuracy, "thresh") <- thresholds[which.max(accuracies)]
#' 
#'    return(bestAccuracy)
#' }
#' 
#' x <- sample(0:1,100,replace=TRUE)
#' y <- runif(100)
#' calcMaxAccuracy(obs = x, pred = y)
#' 
#' } 
#' 
#' A valid multi-class evaluation function must:
#' 
#' \itemize{
#'    \item Have at least two input arguments (observed and predicted);
#'    \item Produce a non-null and valid numerical result;
#'    \item Return a scalar output.
#' }
#' 
#' An example of a valid custom function to calculate the overall accuracy:
#' 
#' \preformatted{
#' 
#' calcAccuracy <- function(obs, pred){
#' 
#'    N <- length(obs)
#'    confusionMatrix <- as.matrix(table(obs, pred))
#'    acc <- sum(diag(confusionMatrix)) / N
#'    return(acc)
#' }
#' 
#' x <- sample(0:1,100,replace=TRUE)
#' y <- sample(0:1,100,replace=TRUE)
#' calcAccuracy(obs = x, pred = y)
#' 
#' } 
#'
#' @return 
#' 
#' If \code{runFullCalibration = FALSE} then a single average value (across evaluation replicates/folds) for the 
#' selected evaluation metric will be returned (typically used for GA optimization).       
#' If \code{runFullCalibration = TRUE} then an object of class \code{SOptim.Classifier} is returned with the 
#' following elements:      
#' \itemize{
#'    \item \bold{AvgPerf} - average value of the evaluation metric selected;
#'    \item \bold{PerfStats} - numeric vector with performance statistics (for the selected metric) for each 
#'    evaluation round plus one more round using the "full" train dataset;
#'    \item \bold{Thresh} - for single-class problems only; numeric vector with the threshold values (one for 
#'    each round plus the "full" dataset) that maximize the selected evaluation metric;
#'    \item \bold{ConfMat} - a list object with confusion matrices generated at each round; for single-class problems 
#'    this matrix is generated by dichotomizing the probability predictions (into {0,1}) using the threshold that 
#'    optimizes the selected evaluation metric (see 'Thresh' above);
#'    \item \bold{obsTestSet} - observed values for the test set (one integer vector for each evaluation round plus 
#'    the full evaluation round);
#'    \item \bold{predTestSet} - predicted values for the test set (one integer or numeric vector for each evaluation 
#'    round plus the full evaluation round);
#'    \item \bold{TrainSets} - a list object with row indices identifying train splits for each test round;
#'    \item \bold{ClassObj} - a list containing classifier objects for each round;
#'    \item \bold{ClassParams} - classification parameters used for running \code{\link{calibrateClassifier}}. 
#' }
#'
#' @note
#' 1) By default, at least 25\% of the calibration/evaluation rounds must produce valid results; otherwise the 
#' optimization algorithm will return \code{NA}.   
#' 
#' 2) Data balancing is only performed on the train dataset to avoid bias in performance evaluation derived from 
#' this procedure. 
#' 
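#' @examples 
#' \dontrun{
#' 
#' ## A minimal sketch using simulated calibration data (hypothetical feature names); 
#' ## a full calibration (runFullCalibration = TRUE) requires a SOptim.CalData object 
#' ## generated by prepareCalData
#' 
#' calData <- data.frame(SID   = 1:200,
#'                       train = sample(0:1, 200, replace = TRUE),
#'                       f1    = rnorm(200),
#'                       f2    = rnorm(200),
#'                       f3    = rnorm(200))
#' 
#' calibrateClassifier(calData,
#'                     classificationMethod = "RF",
#'                     evalMethod           = "5FCV",
#'                     evalMetric           = "Kappa")
#' }
#' 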
#' @export 
#' 
#' @import class
#' @import gbm
#' @import unbalanced
#' @import randomForest
#' @import e1071
#' @import mda
#' @importFrom stats predict
#' 

calibrateClassifier <- function(calData,
                                classificationMethod = "RF",
                                classificationMethodParams = NULL,
                                balanceTrainData = FALSE,
                                balanceMethod = "ubOver",
                                evalMethod = "HOCV",
                                evalMetric = "Kappa",
                                trainPerc = 0.8,
                                nRounds = 20,
                                minTrainCases = 30,
                                minCasesByClassTrain = 10,
                                minCasesByClassTest = 10,
                                runFullCalibration = FALSE,
                                verbose = TRUE){
  
  
  # Check if the object in calData was generated by prepareCalData if runFullCalibration is TRUE
  if(runFullCalibration){
    if(!inherits(calData, "SOptim.CalData")){
      stop("When in runFullCalibration mode then calData must be an object of class SOptim.CalData generated by prepareCalData!")
    }
    else{
      nClassType <- attr(calData, "nClassType")
      calData <- calData[["calData"]] #Use only the first component of the object for calibration
    }
  }
  
  if(inherits(calData, "SOptim.CalData") && !runFullCalibration){
    stop("Seems that you have inputed a SOptim.CalData object! 
         In this case set option runFullCalibration = TRUE to generate a complete set of results")
  }
  
  if((!is.matrix(calData)) && (!is.data.frame(calData))){
    if(is.na(calData)){
      warning("Input train data in calData cannot be used for running the classification 
              (wrong object type or not enough image objects/segments?)")
      return(NA)
    }
  }

  if(nrow(calData) < minTrainCases){
    warning("The number of training cases is below the minimum value set in minTrainCases!")
    return(NA)
  } 
  
  ## Set nClassType ------------------------------------------------------------------------
  ##
  if(!inherits(calData, "SOptim.CalData")){
    if(is.null(attr(calData, "nClassType"))){
      nClassNum <- length(unique(calData[,"train"]))
      nClassType <- ifelse(nClassNum == 2, "single-class", "multi-class")
    }else{
      nClassType <- attr(calData, "nClassType")
    }
  }

  ## Validate input parameters --------------------------------------------------------------
  ##
  if(nClassType=="multi-class" & balanceTrainData){
    stop("Data balancing is only implemented for single-class! Set balanceTrainData to FALSE to proceed.")
  }
  
  # Validate if evalMetric is a valid function when performing a full calibration
  if(runFullCalibration & is.function(evalMetric)){
    if(!checkEvalFun(evalMetric, nClassType))
      stop("Invalid function defined in evalMetric!")
  }
  
  if(is.function(evalMetric)){
    evalMetricName <- "UserDefEvalMetric"
  }else{
    evalMetricName <- evalMetric
  }

  # Generate default parametrization
  p <- generateDefaultClassifierParams(calData)

  # Replace values in the default parametrization using user-defined values
  if(!is.null(classificationMethodParams)){
    p <- replaceDefaultClassificationParams(p, classificationMethodParams)
  }
    
  
  ## Create CV folds or data partitions depending on the method selected --------------------
  ##
  ##

  if(evalMethod=="OOB" && classificationMethod != "RF")
    stop("Evaluation method OOB is only available for Random Forest algorithm (RF)")
  
  dp <- createDataSplits(y=calData[,"train"], evalMethod, nRounds, trainPerc)
  

  ## Calibrate classifier --------------------------------------------------------------------
  ##
  ##
  
  # Convert input train column to factor (except for GBM..)
  if(classificationMethod != "GBM"){
    calData[,"train"] <- as.factor(calData[,"train"])
  }

  # If runFullCalibration is TRUE then start the list with all trained classifier objects among other outputs
  #
  if(runFullCalibration){
      
      # Number of evaluation rounds plus one full round 
      nRounds <- length(dp) + 1
      
      # Start the vector holding performance scores and other outputs
      perfStats <- vector(length = nRounds, mode = "numeric")
      names(perfStats) <- c(paste("TestSet_",1:(nRounds-1),sep=""),"FULL")
      
      threshVals <- vector(length = nRounds, mode = "numeric")
      names(threshVals) <- c(paste("TestSet_",1:(nRounds-1),sep=""),"FULL")
      
      # Test set containing observed and predicted vectors
      predTestSet <- list()
      obsTestSet <- list()
      
      # List to hold final classifier objects and confusion matrices
      classifierObj <- list()
      confusionMatrices <- list()
      
    }else{
      nRounds <- length(dp)
      
      # Start the vector holding performance scores
      perfStats <- vector(length = nRounds, mode = "numeric")
      names(perfStats) <- paste("TestSet_",1:(nRounds),sep="")
    }
  

  
  
  ## Iterate through data partitions/folds --------------------------------------------------
  ##

  for(i in 1:nRounds){
    
    if(verbose){
      cat("## ---",ifelse((runFullCalibration && (i==nRounds)), "FULL DATASET TRAINING ROUND",
          paste("TRAINING ROUND",i)),"--- ##\n\n")
    }
    
    # Generate temp datasets for train and test
    #
    if(runFullCalibration && i==nRounds){
      # ..TRAIN dataset
      trainDF <- calData
      # ..TEST dataset
      testDF <- calData
    }else{
      # ..TRAIN dataset
      trainDF <- calData[dp[[i]],]
      # ..TEST dataset
      if(evalMethod != "OOB"){
        testDF <- calData[-dp[[i]],]
      }else{
        testDF <- calData[dp[[i]],]
      }
    }
    
    # Check number of classes in train and test --------------------------------------------
    #
    if(length(unique(trainDF$train)) < 2){
      warning("The number of classes in train data is below 2! Unable to perform classifier training. 
                Check trainThresh and evalMethod to change calibration train/test sets?")
      warning(".. Evaluation round #",i," (ERROR: The number of classes in train data is below 2!)","\n\n", sep="")
      
      perfStats[i]<-0 # Changed to reflect bad performance in this type of situations
      next
    }
    
    if(length(unique(testDF$train)) < 2){
      warning("The number of classes in test data is below 2! Unable to do performance evaluation. 
                Check trainThresh and evalMethod to change calibration train/test sets?")
      warning(".. Evaluation round #",i," (ERROR: The number of classes in test data is below 2!)","\n\n", sep="")
      
      perfStats[i]<-0
      next
    }
    
    
    # Verify the number of train cases by class ----------------------------------------------
    # If this is too low training or evaluation will fail...
    #
    tb_train <- table(trainDF$train)
    tb_test  <- table(testDF$train)
    
    if(verbose){
      cat(".. Frequency table by class for train data",ifelse(balanceTrainData," (before data balancing) ",""),":\n",sep="")
      print(tb_train)
      
      cat("\n.. Frequency table by class for test data:\n")
      print(tb_test)
      cat("\n")
    }
    
    if(any(as.vector(tb_train) < minCasesByClassTrain)){
      warning("The number of train cases by class is below minCasesByClassTrain! Unable to perform classifier training.
                Check trainThresh and evalMethod to change calibration train/test sets?")
      warning(".. Evaluation round #",i," (ERROR: The number of train cases by class is below minCasesByClassTrain!)","\n\n", sep="")
      
      if(!runFullCalibration)
        perfStats[i] <- 0 # Changed to reflect bad performance in this type of situations (only for optimization)
      else
        perfStats[i] <- NA
      
      next
    }
    
    if(any(as.vector(tb_test) < minCasesByClassTest)){
      warning("The number of train cases by class is below minCasesByClassTest! Unable to do performance evaluation. 
                Check trainThresh and evalMethod to change calibration train/test sets?")
      warning(".. Evaluation round #",i," (ERROR: The number of test cases by class is below minCasesByClassTest!)","\n\n", sep="")
      
      if(!runFullCalibration)
        perfStats[i] <- 0 # Changed to reflect bad performance in this type of situations (only for optimization)
      else
        perfStats[i] <- NA
      
      next
    }
    
    
    # Perform data balancing on train data only ----------------------------------------------
    # (in the full calibration round the balanced train data is also used as the test set)
    #
    if(balanceTrainData){
      
      trainDF <- try(dataBalancing(x=trainDF, method=balanceMethod))
      
      if(inherits(trainDF,"try-error")){
        warning("An error occurred while performing data balancing!")
        warning(".. Evaluation round #",i," (An error occurred while performing data balancing)","\n", sep="")
        perfStats[i]<-NA
        next
        
      }else{
        # When using balance data this must be equal for the full evaluation round
        if(runFullCalibration && (i==nRounds)){
          testDF <- trainDF
        }
      }
    }

    
    ## Random Forest classification ----------------------------------------------------------
    
    if(classificationMethod=="RF"){
      clObj<-try(randomForest::randomForest(y     = trainDF$train,
                                            x     = trainDF[,-c(1,2)],
                                            mtry  = p$RF$mtry,
                                            ntree = p$RF$ntree))

      ## Make predictions for RF
      if(!inherits(clObj,"try-error")){

        if(nClassType=="single-class"){

          if(evalMethod=="OOB" || (runFullCalibration && i==nRounds)){ # Gets OOB results for the full calibration round
            pred<-try(stats::predict(clObj, type="prob"))
          }
          else{ 
            pred<-try(stats::predict(clObj, newdata = testDF, type = "prob"))
          }
        }
        else if(nClassType=="multi-class"){
          
          if(evalMethod=="OOB" || (runFullCalibration && i==nRounds)){ # Gets OOB results for the full calibration round
            pred<-try(stats::predict(clObj,type="response"))
          }
          else{ 
            pred<-try(stats::predict(clObj, newdata = testDF, type = "response"))
          }
        } 
        else{
          stop("Invalid value in nClassType! Must be either single-class or multi-class")
        }
      }else{
        warning("An error occured while performing ",classificationMethod," classifier training!")
        return(NA)
      }
      
      if(inherits(pred,"try-error")){
        warning("An error occurred while predicting for test dataset with classification method: ",classificationMethod,"!")
        return(NA)
      }
      
    }
    
    
    ## K-Nearest Neighbour classification ----------------------------------------------------
    ##
    else if(classificationMethod=="KNN"){
      
      clObj<-try(class::knn(train      = trainDF[,-c(1:2)], 
                             test      = testDF[,-c(1:2)], 
                             cl        = trainDF$train, 
                             k         = p$KNN$k, 
                             prob      = TRUE))
      
      ## Make predictions for KNN
      if(!inherits(clObj,"try-error")){
  
        # Create object of class knn
        class(clObj) <- append(class(clObj),"knn")
        
        
        if(nClassType == "single-class"){
          pred<-try(stats::predict(clObj, type = "prob"))
          
        }
        else if(nClassType=="multi-class"){
          pred<-try(stats::predict(clObj, type = "response"))
        }
        else{
          stop("Invalid value in nClassType! Must be either single-class or multi-class")
        }
      }
      else{
        warning("An error occured while performing ",classificationMethod," classifier training!")
        return(NA)
      }
      
      if(inherits(pred,"try-error")){
        warning("An error occurred while predicting for test dataset with classification method: ",classificationMethod,"!")
        return(NA)
      }
    }
    
    
    ## Flexible Discriminant Analysis classification ------------------------------------------
    ##
    else if(classificationMethod=="FDA"){
     
      form<-generateFormula(calData)
      
      clObj<-try(mda::fda(formula       = form, 
                           data         = trainDF, 
                           method       = mars, 
                           keep.fitted  = FALSE,
                           degree       = p$FDA$degree,
                           penalty      = p$FDA$penalty,
                           prune        = p$FDA$prune))
      
      ## Make predictions for FDA
      if(!inherits(clObj,"try-error")){
        
        if(nClassType=="single-class"){
          pred<-try(stats::predict(clObj, newdata = testDF, type = "posterior")) # posterior type for probability
          
        }
        else if(nClassType=="multi-class"){
          pred<-try(stats::predict(clObj, newdata = testDF, type = "class"))
        } 
        else{
          stop("Invalid value in nClassType! Must be either single-class or multi-class")
        }
      }
      else{
        warning("An error occured while performing ",classificationMethod," classifier training!")
        return(NA)
      }
       
      if(inherits(pred,"try-error")){
        warning("An error occurred while predicting for test dataset with classification method: ",classificationMethod,"!")
        return(NA)
      }
      
    }
    
    ## Generalized Boosted Regression Modeling ------------------------------------------------
    ##
    else if(classificationMethod=="GBM"){
      
      # Set family to multinomial if it is a multi-class problem
      #
      if(nClassType=="multi-class"){
        p<-replaceDefaultClassificationParams(p,list(GBM=list(distribution="multinomial")))
      }
        
      form<-generateFormula(calData)
      
      clObj<-try(gbm::gbm(formula      = form, 
                     distribution      = p$GBM$distribution,
                     data              = trainDF, 
                     keep.data         = FALSE,
                     n.trees           = p$GBM$n.trees,
                     interaction.depth = p$GBM$interaction.depth,
                     shrinkage         = p$GBM$shrinkage,
                     bag.fraction      = p$GBM$bag.fraction))
      
      ## Make predictions for GBM
      if(!inherits(clObj,"try-error")){
        
        if(nClassType=="single-class"){
          pred<-try(stats::predict(clObj, newdata = testDF, type = "response", n.trees=p$GBM$n.trees)) 
        }
        else if(nClassType=="multi-class"){
          pred<-try(stats::predict(clObj, newdata = testDF, type = "response", n.trees=p$GBM$n.trees)[,,1]) # Gets array -> 1 matrix per n.trees value
          pred<-try(as.integer(colnames(pred))[apply(pred,1,which.max)])
        } 
        else{
          stop("Invalid value in nClassType! Must be either single-class or multi-class")
        }
      }
      else{
        warning("An error occured while performing ",classificationMethod," classifier training!")
        return(NA)
      }

      if(inherits(pred,"try-error")){
        warning("An error occurred while predicting for test dataset with classification method: ",classificationMethod,"!")
        return(NA)
      }
    }
      
    
    ## Support Vector Machines classification ------------------------------------------------
    ##
    else if(classificationMethod=="SVM"){
     
      form<-generateFormula(calData)
      
      
      if(nClassType=="multi-class"){
        p<-replaceDefaultClassificationParams(p,list(SVM=list(probability = FALSE)))
      }
      
      clObj<-try(e1071::svm(formula  = form, 
                     data            = trainDF, 
                     type            = "C-classification",
                     kernel          = "radial",
                     gamma           = p$SVM$gamma,
                     cost            = p$SVM$cost,
                     cachesize       = 512,
                     probability     = p$SVM$probability,
                     fitted          = FALSE))
      
      ## Make predictions for SVM
      if(!inherits(clObj,"try-error")){
        
        if(nClassType=="single-class"){
          # Prob matrix has column with names equal to factor levels, e.g., "0" / "1"
          #
          pred<-try(attr(stats::predict(clObj, newdata = testDF, probability = TRUE), "probabilities")) # Get prob matrix
        }
        else if(nClassType=="multi-class"){
          pred<-try(stats::predict(clObj, newdata = testDF))
        } 
        else{
          stop("Invalid value in nClassType! Must be either single-class or multi-class")
        }
      }
      else{
        warning("An error occured while performing ",classificationMethod," classifier training!")
        return(NA)
      }
      if(inherits(pred,"try-error")){
        warning("An error occurred while predicting for test dataset with classification method: ",classificationMethod,"!")
        return(NA)
      }
    }
    
    #
    # Bad classification method string
    #
    else{
      stop("Unknown classification method defined! Please review input parameters")
    }
    
    
    ## Perform evaluation -------------------------------------------------------------------
    ##
    ## OK!
    if(!inherits(pred,"try-error")){

      ## Observed data for the test set ----------- ##
      ##
      ##
      if(evalMethod=="OOB"){
        obs <- trainDF$train
      }else{
        obs <- testDF$train
      }

      ## Predicted data for the test set --------- ##
      ##
      
      if((nClassType=="single-class") && (is.matrix(pred) || is.data.frame(pred)) && ncol(pred) > 1){
        
        if(classificationMethod=="SVM"){
          # Altered this line to ensure that the right column is found for evaluation
          # when using SVM probability output
          pred <- pred[,"1"]
        }else{
          pred <- as.numeric(pred[,2]) # Use the positive-class probability column only (single-class problems)
        }
        
      }
      
      # Convert factors into integers for making evaluations
      # Works both with single- or multi-class problems
      if(is.factor(obs)){
        # if(nClassType == "single-class"){
        #   obs.int <- as.integer(levels(obs)[as.integer(obs)])
        # }
        # else if(nClassType == "multi-class"){
        #   obs.int <- f2int(obs)
        # }
        
        # Converts factors into integers
        obs.int <- as.integer(levels(obs)[as.integer(obs)])
        
      }else{
        obs.int <- as.integer(obs)
      }
      
      # Convert predicted values to integer if is a factor (required for eval)
      if(is.factor(pred)){
        #pred <- as.integer(pred)
        pred <- as.integer(levels(pred)[as.integer(pred)])
      }
      
      # Remove NA's from predicted and observed values (FDA produces NaN's in some cases...) 
      nv_idx  <- !(is.na(pred) | is.nan(pred) | is.infinite(pred))
      pred    <- pred[nv_idx]
      obs.int <- obs.int[nv_idx]

      # Check if observed and predicted have the same size (FDA sometimes gives problems here!...)
      if((length(pred) != length(obs.int)) || length(pred) == 0){
        
        warning(".. Evaluation round #",i,
            " (ERROR: Different sizes between observed and predicted vectors during performance evaluation or 0-length prediction vector!)","\n\n", sep="")
        
        perfStats[i] <- NA
        next
      }

      # Evaluate performance using the selected evaluation metric
      #
      if(is.character(evalMetric)){
        # Internal metrics
        evalStatValue <- try(evalPerformanceGeneric(obs.int, pred ,stat=evalMetric, nClassType=nClassType))
      }else if(is.function(evalMetric)){
        # User-defined metric
        evalStatValue <- try(evalMetric(obs.int, pred))
      }else{
        stop("Invalid object type in evalMetric!")
      }
      
      
      if(inherits(evalStatValue,"try-error") || is.na(evalStatValue)){ # Invalid result
        
        # On error set NA as the result in performance statistics vector (or other results)
        if(runFullCalibration){
          
          # Save the object generated by the classifier
          classifierObj[[i]] <- NA
          
          # Save observed and predicted values
          obsTestSet[[i]] <- NA
          predTestSet[[i]] <- NA
          
          # Single-class problems with optimized thresholds
          if(nClassType=="single-class"){
            threshVals[i] <- NA
            confusionMatrices[[i]] <- NA
          }
          # Multi-class problems
          if(nClassType=="multi-class"){
            confusionMatrices[[i]] <- NA
          }
        }
        # If evaluation went wrong save results as NA
        perfStats[i] <- NA
      }
      # Save values if all went OK
      #
      else{
        
        if(runFullCalibration){
          
          # Save the object generated by the classifier
          classifierObj[[i]] <- clObj
          
          # Save observed and predicted values
          obsTestSet[[i]] <- obs.int
          predTestSet[[i]] <- pred
          
          # Single-class problems with optimized thresholds
          if(nClassType=="single-class"){
            thresh <- attr(evalStatValue,"thresh")
            threshVals[i] <- thresh
            confusionMatrices[[i]] <- generateConfusionMatrix(obs.int, as.integer(pred > thresh))
          }
          # Multi-class problems
          if(nClassType=="multi-class"){
            confusionMatrices[[i]] <- generateConfusionMatrix(obs.int, pred)
          }
        }
        
        # Valid performance results
        perfStats[i] <- evalStatValue 
        
        if(verbose){
          # Print results per round
          cat(".. Evaluation round #", i, " | N(train) = ",nrow(trainDF), " | N(test) = ",nrow(testDF)," | ",
              evalMetricName," = ",round(evalStatValue,3),"\n\n", sep="") 
        }
        
      }
    }
    # -- KO!
    else{
      perfStats[i] <- NA
    }
  } ## --- End loop
  

  # Check the proportion of successful evaluation rounds; if less than 25% of the rounds 
  # produced valid results then return NA
  if((sum(is.na(perfStats))/length(dp)) >= 0.75){
    
    warning("Unable to evaluate classification performance! Less than 25% of the train/test rounds produced valid results.
         Please review input parameters and calibration data")
    return(NA)
  }
  else{
    
    if(runFullCalibration){
      
      ## Average performance in cross-validation rounds excluding the full evaluation round
      avgPerformance <- mean(perfStats[-nRounds], na.rm=TRUE)
      if(verbose) cat(".. Average ",evalMetricName," = ",round(avgPerformance,3),"\n\n",sep="")
      
      # Define names for the objects contained in lists: classifierObj and confusionMatrices
      names(classifierObj) <- c(paste("TestSet_",1:(nRounds - 1),sep=""),"FULL")
      names(confusionMatrices) <- c(paste("TestSet_",1:(nRounds - 1),sep=""),"FULL")
      names(predTestSet) <- c(paste("TestSet_",1:(nRounds - 1),sep=""),"FULL")
      names(obsTestSet) <- c(paste("TestSet_",1:(nRounds - 1),sep=""),"FULL")
      
      # Build output object data and metadata
      #
      out <- list(AvgPerf     = avgPerformance, 
                  PerfStats   = perfStats, 
                  Thresh      = if(nClassType == "single-class") threshVals else NA,
                  ConfMat     = confusionMatrices,
                  TrainSets   = dp,
                  ClassObj    = classifierObj,
                  obsTestSet  = obsTestSet,
                  predTestSet = predTestSet,
                  ClassParams = list(classificationMethod       = classificationMethod,
                                     classificationMethodParams = p,
                                     balanceTrainData           = balanceTrainData,
                                     balanceMethod              = balanceMethod,
                                     evalMethod                 = evalMethod,
                                     evalMetric                 = evalMetric,
                                     evalMetricName             = evalMetricName,
                                     trainPerc                  = trainPerc,
                                     nRounds                    = nRounds,
                                     nClassType                 = nClassType,
                                     minTrainCases              = minTrainCases,
                                     minCasesByClassTrain       = minCasesByClassTrain,
                                     minCasesByClassTest        = minCasesByClassTest))
      
      class(out) <- "SOptim.Classifier" # Output class when runFullCalibration = TRUE
      return(out)
    }
    else{
      
      ## Average performance in cross-validation rounds
      avgPerformance <- mean(perfStats, na.rm=TRUE)
      if(verbose) cat(".. Average ",evalMetricName," (cross-validation) = ",round(avgPerformance,3),"\n\n",sep="")
      
      return(avgPerformance)
    }
  }
}


#' Summary for SOptim.Classifier objects
#' 
#' Produces a summary of classification performance results for a 
#' \code{SOptim.Classifier} object.
#' 
#' @param object An object of class \code{SOptim.Classifier} created by 
#' \code{\link{calibrateClassifier}} with option \code{runFullCalibration=TRUE}.
#' 
#' @param ... Additional arguments affecting the summary produced.
#' 
#' @return A summary object of class \code{SOptim.ClassifierSummary} 
#' which can be used with \code{print}.
#' 
#' @seealso  \link{calibrateClassifier}
#' 
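#' @examples 
#' \dontrun{
#' ## A minimal sketch (classifObj is a hypothetical object previously created by 
#' ## calibrateClassifier(..., runFullCalibration = TRUE)):
#' summary(classifObj)
#' }
#' 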
#' @method summary SOptim.Classifier
#' 
#' @rdname summary.SOptim.Classifier
#' 
#' @export

summary.SOptim.Classifier <- function(object, ...){
  
  # Calculate all performance evaluation metrics based on object
  #
  evalMatrix <- evalPerformanceClassifier(object)
  nr <- nrow(evalMatrix)
  
  evalMatrix <- rbind(evalMatrix,
                      apply(evalMatrix[-nr,], 2, mean),
                      apply(evalMatrix[-nr,], 2, sd))
  
  rownames(evalMatrix) <- c(rownames(evalMatrix)[-c(nr+1,nr+2)],
                            paste("AVG (",object$ClassParams$evalMethod,")",sep=""),
                            paste("ST-DEV (",object$ClassParams$evalMethod,")",sep=""))
   
  cm <- object$ConfMat$FULL
  
  if(object$ClassParams$nClassType == "multi-class"){
    
    # Calculate user and producer accuracies
    #
    nc <- ncol(cm)
    user.acc <- vector(length = nc)
    prod.acc <- vector(length = nc)
    
    for(i in 1:ncol(cm)){
      col.sum <- sum(cm[,i])
      row.sum <- sum(cm[i,])
      diag.val <- cm[i,i]
      
      prod.acc[i] <- diag.val / col.sum
      user.acc[i] <- diag.val / row.sum
    }

    acc <- data.frame(ClassLabel = colnames(cm), 
                      UserAccuracy      = round(user.acc,3), 
                      ProducerAccuracy  = round(prod.acc,3))

  }else{
    acc <- NA
  }

  out <- list(evalMatrix  = evalMatrix,
            ConfMat       = cm,
            ProdUserAcc   = acc)
  class(out) <- "SOptim.ClassifierSummary"
  
  return(invisible(out))
  
}

#' Print classification summary
#' 
#' Prints an object of class \code{SOptim.ClassifierSummary} generated by 
#' \code{\link{summary.SOptim.Classifier}}.
#' 
#' @param x \code{SOptim.ClassifierSummary} generated by 
#' \code{\link{summary.SOptim.Classifier}}.
#' 
#' @param ... Additional arguments affecting the print produced.
#' 
#' @return None
#' 
#' @rdname print.SOptim.ClassifierSummary
#' 
#' @method print SOptim.ClassifierSummary
#' 
#' @export
#' 

print.SOptim.ClassifierSummary <- function(x,...){
  
  cat("\n\n|| Supervised classification summary ||\n\n")
  
  cat(".: Classification performance:\n\n",sep="")
  print(round(x[["evalMatrix"]], 2), ...)
  
  cat("\n\n.: Confusion matrix for full dataset calibration:\n\n")
  print(x[["ConfMat"]], ...)
  
  if(is.data.frame(x[["ProdUserAcc"]])){
    cat("\n\n.: Producer/user accuracies for full dataset:\n\n")
    print(x[["ProdUserAcc"]], ...)
  }
}