#' @title Run smartML function for automatic Supervised Machine Learning.
#'
#' @description Run the smartML main function for automatic classifier algorithm selection, and hyper-parameter tuning.
#'
#' @param maxTime Float numeric of the maximum time budget for reading dataset, preprocessing, calculating meta-features, Algorithm Selection & hyper-parameter tuning process only in minutes(Excluding Model Interpretability) - This is applicable in case of Option = 2 only.
#' @param directory String Character of the training dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
#' @param testDirectory String Character of the testing dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
#' @param classCol String Character of the name of the class label column in the dataset (default = 'class').
#' @param vRatio Float numeric of the validation set ratio that should be splitted out of the training set for the evaluation process (default = 0.1 --> 10\%).
#' @param preProcessF vector of string Character containing the name of the preprocessing algorithms (default = c('standardize', 'zv') --> no preprocessing):
#' \itemize{
#' \item "boxcox" - apply a Box–Cox transform and values must be non-zero and positive in all features,
#' \item "yeo-Johnson" - apply a Yeo-Johnson transform, like a BoxCox, but values can be negative,
#' \item "zv" - remove attributes with a zero variance (all the same value),
#' \item "center" - subtract mean from values,
#' \item "scale" - divide values by standard deviation,
#' \item "standardize" - perform both centering and scaling,
#' \item "normalize" - normalize values,
#' \item "pca" - transform data to the principal components,
#' \item "ica" - transform data to the independent components.
#' }
#' @param featuresToPreProcess Vector of number of features to perform the feature preprocessing on - In case of empty vector, this means to include all features in the dataset file (default = c()) - This vector should be a subset of \code{selectedFeats}.
#' @param nComp Integer numeric of Number of components needed if either "pca" or "ica" feature preprocessors are needed.
#' @param nModels Integer numeric representing the number of classifier algorithms that you want to select based on Meta-Learning and start to tune using Bayesian Optimization (default = 5).
#' @param option Integer numeric representing either Classifier Algorithm Selection is needed only = 1 or Algorithm selection with its parameter tuning is required = 2 which is the default value.
#' @param featureTypes Vector of either 'numerical' or 'categorical' representing the types of features in the dataset (default = c() --> any factor or character features will be considered as categorical otherwise numerical).
#' @param interp Boolean representing if model interpretability (Feature Importance and Interaction) is needed or not (default = FALSE) This option will take more time budget if set to 1.
#' @param missingOpr Boolean variable represents either use median/mode imputation for instances with missing values (FALSE) or apply imputation using "MICE" library which helps you imputing missing values with plausible data values that are drawn from a distribution specifically designed for each missing datapoint (TRUE).
#' @param balance Boolean variable represents if SMOTE class balancing is required or not (default FALSE).
#' @param metric Metric of string character to be used in evaluation:
#' \itemize{
#' \item "acc" - Accuracy,
#' \item "avg-fscore" - Average of F-Score of each label,
#' \item "avg-recall" - Average of Recall of each label,
#' \item "avg-precision" - Average of Precision of each label,
#' \item "fscore" - Micro-Average of F-Score of each label,
#' \item "recall" - Micro-Average of Recall of each label,
#' \item "precision" - Micro-Average of Precision of each label.
#' }
#'
#' @return List of Results
#' \itemize{
#' \item "option=1" - Choosen Classifier Algorithms Names \code{clfs} with their parameters configurations \code{params}, Training DataFrame \code{TRData}, Test DataFrame \code{TEData} in case of \code{option=2},
#' \item "option=2" - Best classifier algorithm name found \code{clfs} with its parameters configuration \code{params}, , Training DataFrame \code{TRData}, Test DataFrame \code{TEData}, model variable \code{model}, predicted values on test set \code{pred}, performance on TestingSet \code{perf}, and Feature Importance \code{interpret$featImp} / Interaction \code{interpret$Interact} plots in case of interpretability \code{interp} = TRUE and chosen model is not knn.
#' }
#'
#' @examples
#' \dontrun{
#' autoRLearn(1, 'sampleDatasets/car/train.arff', \
#' 'sampleDatasets/car/test.arff', option = 2, preProcessF = 'normalize')
#'
#' result <- autoRLearn(10, 'sampleDatasets/shuttle/train.arff', 'sampleDatasets/shuttle/test.arff')
#' }
#'
#' @importFrom tictoc tic toc
#' @importFrom R.utils withTimeout
#' @importFrom graphics plot
#' @import ggplot2
#'
#' @export autoRLearn
autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', metric = 'acc', vRatio = 0.3, preProcessF = c('standardize', 'zv'), featuresToPreProcess = c(), nComp = NA, nModels = 5, option = 2, featureTypes = c(), interp = FALSE, missingOpr = FALSE, balance = FALSE) {
#Set Seed
set.seed(22)
#Read Dataset
datasetReadError <- try(
{
#Read Training Dataset
dataset <- readDataset(directory, testDirectory, classCol = classCol, vRatio = vRatio, preProcessF = preProcessF, featuresToPreProcess = featuresToPreProcess, nComp = nComp, missingOpr = missingOpr, metric = metric, balance = balance)
trainingSet <- dataset$TD
#Read Testing Dataset
testDataset <- dataset$TED
#Read all training Dataset without validation
trainDataset <- dataset$FULLTD
})
if(inherits(datasetReadError, "try-error")){
print('Error: Failed Reading Dataset: Make sure that dataset directory is correct and it is a valid csv/arff file.')
return(-1)
}
#Calculate Meta-Features for the dataset
metaFeaturesError <- try(
{
metaFeatures <- computeMetaFeatures(trainingSet, maxTime, featureTypes)
})
if(inherits(metaFeaturesError, "try-error")){
print('Error: Failed Extracting Dataset MetaFeatures.')
return(-1)
}
splitError <- try(
{
#Convert Categorical Features to Numerical Ones and split the dataset
B <- max(10, as.integer((metaFeatures$nInstances) / 2000)) #Number of folds to work on for the dataset and trees in SMAC forest model
dataset <- convertCategorical(dataset, trainDataset, testDataset, B = B)
validationSet <- dataset$VD #Validation set
trainingSet <- dataset$TD #Training Set
foldedSet <- dataset$FD #Folded sets of Training Data.
#Convert for all TrainingSet
trainDataset <- dataset$FULLTD
#Convert for all TestingSet
testDataset <- dataset$TED
})
if(inherits(splitError, "try-error")){
print('Error: Failed Splitting Dataset.')
return(-1)
}
#Generate candidate classifiers
candidateClfsError <- try(
{
nClassifiers <- 15
output <- getCandidateClassifiers(maxTime, metaFeatures, min(c(nModels, nClassifiers)) )
algorithms <- output$c #Classifier Algorithm names selected.
tRatio <- output$r #Time ratio between all classifiers.
algorithmsParams <- output$p #Initial Parameter configuration of each classifier.
})
if(inherits(candidateClfsError, "try-error")){
print('Error: Can not generate Candidate classifiers.')
return(-1)
}
tryCatch({
#Option 1: Only Candidate Classifiers with initial parameters will be resulted (No Hyper-parameter tuning)
if(option == 1 && length(algorithms) == length(algorithmsParams))
return (list(clfs = algorithms, params = algorithmsParams, TRData = dataset$FULLTD, TEData = dataset$TED))
else if(option == 1)
return ('Error: Failed to Connect to KnowledgeBase, Option 1 can not be executed')
#Option 2: Classifier Algorithm Selection + Parameter Tuning
res <- withTimeout({
#variables to hold best classifiers
bestAlgorithm <- '' #bestClassifierName.
bestAlgorithmPerf <- 0 #bestClassifierPerformance.
bestAlgorithmParams <- list() #Parameters of best Classifier.
#loop over each classifier
for(i in 1:length(algorithms)){
classifierAlgorithm <- algorithms[[i]]
if (i <= length(algorithmsParams))
classifierAlgorithmParams <- algorithmsParams[[i]]
else
classifierAlgorithmParams <- '' #use the default initial parameter configuration
#Read maxTime for the current classifier algorithm and convert to seconds
maxClfTime <- tRatio[i] * 60
#Read the current classifier default parameter configuration
classifierConf <- getClassifierConf(classifierAlgorithm)
cat('\nStart Tuning Classifier Algorithm: ', classifierAlgorithm, '\n')
#initialize step
R <- initialize(classifierAlgorithm, classifierConf, classifierAlgorithmParams)
cntParams <- R[, -which(names(R) == "performance")]
#start hyperParameter tuning till maximum Time
tic(quiet = TRUE)
timeTillNow <- 0
#Regression Random Forest Trees for training set folds
tree <- data.frame(fold=integer(), parent=integer(), params=character(), rightChild=integer(), leftChild=integer(), performance=double(), rowN = integer())
bestParams <- cntParams
bestPerf <- c()
classifierFailureCounter <- 0
repeat{
gc()
#Fit Model
output <- fitModel(bestParams, bestPerf, trainingSet, validationSet, foldedSet, classifierAlgorithm, tree, B = B)
#Check if this classifer failed for more than 5 times, skip to the next classifier
if((length(bestPerf) > 0 && mean(bestPerf) == 0) || length(bestPerf) == 0){
classifierFailureCounter <- classifierFailureCounter + 1
if(classifierFailureCounter > 2) break
}
tree <- output$t
bestPerf <- output$p
bestParams <- output$bp
#Select Candidate Classifier Configurations
candidateConfs <- selectConfiguration(R, classifierAlgorithm, tree, bestParams, B = B)
#Intensify
if(nrow(candidateConfs) > 0){
output <- intensify(R, bestParams, bestPerf, candidateConfs, foldedSet, trainingSet, validationSet, classifierAlgorithm, maxClfTime, timeTillNow, B = B, metric = metric)
bestParams <- output$params
bestPerf <- output$perf
timeTillNow <- output$timeTillNow
classifierFailureCounter <- classifierFailureCounter + output$fails
R <- output$r
}
#Check if execution time exceeded the allowed time or not
t <- toc(quiet = TRUE)
timeTillNow <- timeTillNow + t$toc - t$tic
tic(quiet = TRUE)
if(timeTillNow > maxClfTime){
if(mean(bestPerf) > mean(bestAlgorithmPerf)){
bestAlgorithmPerf <- bestPerf
bestAlgorithm <- classifierAlgorithm
bestAlgorithmParams <- bestParams
#cat('Best Classifier:', bestAlgorithm, ' --> Performance:', bestAlgorithmPerf, '\n')
}
break
}
}
}
},timeout = maxTime * 60, cpu = maxTime * 60)
}, TimeoutException = function(ex) {
message("NOTE: Time Budget allowed has been finished.")
})
print("Time Limit for Tuning process has been reached out. Training the best classifier found over whole Training set now.")
if (bestAlgorithm != '')
bestAlgorithmParams <- bestAlgorithmParams[,names(bestAlgorithmParams) != "EI" & names(bestAlgorithmParams) != "performance"]
else{
bestAlgorithm <- algorithms[[1]]
bestAlgorithmParams <- algorithmsParams[[1]]
}
trainFinalModelError <- try(
{
#Run Classifier over all training set and check performance on testing set
finalResult <- runClassifier(trainingSet = trainDataset, validationSet = testDataset, params = bestAlgorithmParams, classifierAlgorithm = bestAlgorithm, metric = metric, interp = interp)
finalResult$clfs <- bestAlgorithm
finalResult$params <- bestAlgorithmParams
#save results to Temporary File
query <- sendToTmp(metaFeatures, bestAlgorithm, bestAlgorithmParams, finalResult$perf, nModels, metric)
#check internet connection and send data in tmp file to database if connection exists
if(checkInternet() == TRUE){
sendToDatabase(query)
}
})
if(inherits(trainFinalModelError, "try-error")){
print('Error: No Enough Computational Resources. Can not build a model over the current dataset!')
}
finalResult$TRData = dataset$FULLTD
finalResult$TEData = dataset$TED
return(finalResult)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.