R/modelingSummary.R

#' @title Get modeling metrics
#' @description modelingSummary is an automatic function for modeling data. It returns a dataframe containing the metrics of the modeling using five machine learning algorithms: KNN, SVM, RF, NNET, and Bcart. This function is based on the splitData, tuneTrain, predict, and getMetrics functions.
#' @param data object of class "data.frame" with target variable and predictor variables.
#' @param y character. Target variable.
#' @param p numeric. Proportion of data to be used for training. Default: 0.7
#' @param length integer. Number of values to output for each tuning parameter. If \code{search = "random"} is passed to \code{\link[caret]{trainControl}} through \code{...}, this becomes the maximum number of tuning parameter combinations that are generated by the random search. Default: 10.
#' @param control character. Resampling method to use. Choices include: "boot", "boot632", "optimism_boot", "boot_all", "cv", "repeatedcv", "LOOCV", "LGOCV", "none", "oob", "timeslice", "adaptive_cv", "adaptive_boot", or "adaptive_LGOCV". Default: "repeatedcv". See \code{\link[caret]{train}} for specific details on the resampling methods.
#' @param number integer. Number of cross-validation folds or number of resampling iterations. Default: 10.
#' @param repeats integer. Number of times to repeat k-fold cross-validation if "repeatedcv" is chosen as the resampling method in \code{control}. Default: 10.
#' @param summary expression. Computes performance metrics across resamples. For numeric \code{y}, the mean squared error and R-squared are calculated. For factor \code{y}, the overall accuracy and Kappa are calculated. See \code{\link[caret]{trainControl}} and \code{\link[caret]{defaultSummary}} for details on specification and summary options. Default: multiClassSummary.
#' @param process character. Defines the pre-processing transformation of predictor variables to be done. Options are: "BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", "knnImpute", "bagImpute", "medianImpute", "pca", "ica", or "spatialSign". See \code{\link[caret]{preProcess}} for specific details on each pre-processing transformation. Default: c('center', 'scale').
#' @param positive character. The positive class for the target variable if \code{y} is factor. Usually, it is the first level of the factor.
#' @param parallelComputing logical. Indicates whether to use parallel processing. Default: FALSE.
#' @param classtype integer. Indicates the number of classes of the target trait.
#' @param ... additional arguments to be passed to \code{createDataPartition}, \code{trainControl} and \code{train} functions in the package \code{caret}.
#' @return A dataframe contains the metrics of the modeling of five machine learning algorithms: KNN, SVM, RF, NNET, and Bcart.
#'
#' \code{tuneTrain} relies on package \code{caret} to perform the modeling.
#' @details Types of classification and regression models available for use with \code{tuneTrain} can be found using \code{names(getModelInfo())}. The results given depend on the type of model used.
#' 
#' @author Zakaria Kehel, Khadija Aziz
#' @examples
#' if(interactive()){
#'  data(septoriaDurumWC)
#'  models <- modelingSummary(data = septoriaDurumWC, y = "ST_S", positive = "R", classtype = 2)
#' }
#' @seealso
#'  \code{\link[caret]{createDataPartition}},
#'  \code{\link[caret]{trainControl}},
#'  \code{\link[caret]{train}},
#'  \code{\link[caret]{predict.train}},
#'  \code{\link[caret]{confusionMatrix}}
#' @rdname modelingSummary
#' @export
#' @importFrom caret createDataPartition trainControl train predict.train confusionMatrix
#' @importFrom utils View
#' @importFrom stats predict xtabs

modelingSummary <- function (data, y, p = 0.7, 
                       length = 10, control = "repeatedcv", number = 10, 
                       repeats = 10, process = c('center', 'scale'),
                       summary= multiClassSummary,positive, parallelComputing = FALSE, 
                       classtype, ...){
  
  # Fail fast instead of crashing later with "object 'final.metrics' not found":
  # the original only defined final.metrics for classtype >= 2.
  if (missing(classtype) || !is.numeric(classtype) || classtype < 2) {
    stop("'classtype' must be an integer >= 2.", call. = FALSE)
  }
  
  #### Tuning ####
  # One caret method string per reported model, in reporting order.
  method.map <- c(knn = "knn", svm = "svmLinear2", rf = "rf",
                  nnet = "nnet", bcart = "treebag")
  model.names <- c("K-NN", "SVM", "RF", "NNET", "BCART")
  
  # Forward ALL tuning arguments to tuneTrain. The original accepted p, length,
  # control, number, repeats, process, summary and '...' but silently dropped
  # them, so users could never change the defaults.
  # NOTE(review): assumes tuneTrain() accepts these argument names, as its
  # shared @param documentation suggests — confirm against tuneTrain's signature.
  models <- lapply(method.map, function(m) {
    tuneTrain(data = data, y = y, p = p, method = m, length = length,
              control = control, number = number, repeats = repeats,
              summary = summary, process = process, positive = positive,
              parallelComputing = parallelComputing, ...)
  })
  
  #### PREDICTING ####
  # Predict on the held-out test set; column 1 holds the observed target.
  preds <- lapply(models, function(mod) {
    stats::predict(mod$Model, mod$`Test Data`[ , -1])
  })
  
  # Apply metrics function to yhat and y from each model.
  # (The per-model caret::confusionMatrix calls of the original were dead
  # code — their results were never used — and have been removed.)
  metrics <- Map(function(mod, pred) {
    getMetrics(unlist(mod$`Test Data`[1]), pred, classtype = classtype)
  }, models, preds)
  
  metrics.models <- do.call(cbind, lapply(metrics, function(m) m$Metrics))
  names(metrics.models) <- model.names
  
  if (classtype == 2) {
    # For binary problems, append the confusion-matrix counts. Column 2 of
    # each 2x2 CM is reversed so rows read TP, FP, TN, FN.
    cm.cols <- lapply(metrics, function(m) {
      as.data.frame(c(as.numeric(m$CM[, 1]), as.numeric(m$CM[, 2][c(2, 1)])))
    })
    metrics.CM <- do.call(cbind, cm.cols)
    metrics.CM <- as.data.frame(lapply(metrics.CM[, 1:5], as.factor))
    row.names(metrics.CM) <- c("True Positive", "False Positive",
                               "True Negative", "False Negative")
    names(metrics.CM) <- model.names
    
    final.metrics <- rbind(metrics.models, metrics.CM)
  } else {
    # classtype > 2: no per-class CM breakdown, report metrics only.
    final.metrics <- metrics.models
  }
  
  # View() errors in non-interactive sessions (e.g. R CMD check), so only
  # open the data viewer when a user is actually present.
  if (interactive()) {
    utils::View(final.metrics)
  }
  
  list(knn.mod = models$knn,
       svm.mod = models$svm,
       rf.mod = models$rf,
       nnet.mod = models$nnet,
       bcart.mod = models$bcart,
       metrics = final.metrics)
  
}
khadijaaziz/icardaFIGSr documentation built on Dec. 21, 2021, 6:38 a.m.