R/ApplyModels.R

Defines functions ApplyModels

Documented in ApplyModels

#' Title Apply various ML models to your datasets and compare the results
#'
#' @param working_df input dataset for classification
#' @param model_names a list of models to be appied on the data
#' @param split_ratio training to test ratio
#' @param save_results_on_disk if set to true results are saved
#' @param return_plots if true returns plots confusion matrix
#' @param shrink the fraction of the data to be used for modeling. If modeling takes to long reduce this.
#' @param scale_center If true, centers and scale the data before modeling
#' @param cv_folds number of folds for cross-validation. Set to zero for not using cross-validation
#' @param RF_mtry minimum number of featuresto be used by the random forest algorithm
#' @param cores number of cores. For parallel computing set it to an integer greater than 1
#' @param min_node_size minimum number of nodes for the random forest model
#'
#' @return output a list containing a dataframe containing model names and accuracies and a list of plots and feature importance
#'
#' @import caret
#' @importFrom rlang .data
#' @importFrom dplyr slice
#' @importFrom tictoc tic
#' @importFrom tictoc toc
#' @importFrom parallel makePSOCKcluster
#' @importFrom doParallel registerDoParallel
#' @importFrom parallel stopCluster
#' @importFrom data.table transpose
#' @import ggplot2
#' @export
#'
ApplyModels <-
    function(working_df,
             model_names = c("RF", "LDA", "NB", "SVM", "KNN" , "DT", "XGB"),
             split_ratio = 0.66,
             scale_center = FALSE,
             cv_folds = 0,
             shrink = 1,
             save_results_on_disk = TRUE,
             return_plots = TRUE,
             RF_mtry = 2,
             min_node_size = 50,
             cores = 1) {
        # # Parallel and time to see if caret parallel works
        tic("Preprocessing")

        if (cores != 1) {
            message("Parallel computing is activated!")
            cl <- makePSOCKcluster(cores)
            registerDoParallel(cl)
        }
        else {
            message("parallel computing is not activated. For parallel computing set it to an integer greater than 1")
        }


        # Create train and test to train and evalute the model
        seed <- 2020
        set.seed(seed)

        working_df %<>% dplyr::sample_frac(shrink)
        message(paste0(shrink * 100, " % of the data will be used"))

        training_indices <-
            createDataPartition(working_df$trimmed_activity,
                                p = split_ratio,
                                list = FALSE)

        training_df <- working_df %>% dplyr::slice(training_indices)
        testing_df <- working_df %>% dplyr::slice(-training_indices)

        if (scale_center) {
            preProcValues <- preProcess(training_df, method = c("center", "scale"))
            training_df <- predict(preProcValues, training_df)
            testing_df <- predict(preProcValues, testing_df)
            message("Data is scaled and centered")

        }

        message(paste0("Data is devided into training and test set "))
        message(paste0(
            "Training set has ",
            nrow(training_df),
            " rows and ",
            ncol(training_df),
            " columns"
        ))
        message(paste0(
            "Testing set has ",
            nrow(testing_df),
            " rows and ",
            ncol(testing_df),
            " columns"
        ))


        # To store the model and all the performance metric
        results <- NULL

        # To store plots
        plts <- NULL

        # To store confusion matrix
        cf_mat <- NULL

        # To return main results for each model
        accuracies <- NULL

        # to return RF feature importance
        importance <- NULL

        # The end of preprocessing step
        toc()


        if (cv_folds <= 0) {
            fitControl <-
                trainControl(method = "none", classProbs = TRUE)
            message("Cross-validation is not being used, set cv_folds to a positive number to use cross-validation")
        } else if (cv_folds > 0)
        {
            cv_folds  %<>%  ceiling()
            fitControl <-
                trainControl(method = "cv", number = cv_folds, classProbs = TRUE)
            message(paste0(cv_folds, " fold cross-validation is being used"))
        }


        # ------------------------------------- LDA --------------------------------------
        if ("LDA" %in% model_names) {
            tic("LDA took")
            message("Starting LDA")
            model_name <- "lda"
            train_control_method <- "none"
            model_parameter <- 10

            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                tuneGrid = data.frame(parameter = model_parameter),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "LDA"
            accuracies  %<>%  rbind(metrics)

            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # create a list of the model and the results to save
            results[["LDA"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_parameter),
                    cf_matrix = cf_matrix
                )

            if (return_plots) {
                plts[["LDA"]] <-  cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("LDA")
            }
            toc()
        }




        #------------------------------ Random Forest _ Ranger package----------------------
        if ("RF" %in% model_names) {
            # Ranger is a fast implementation of random forests (Breiman 2001)
            # The method is none becuase we have test and train data
            tic("RF took")
            message("Starting RF")

            model_name <- "ranger"
            train_control_method <- "none"
            model_mtry <- RF_mtry
            model_splitrule <- "extratrees"
            model_min_node_size <- min_node_size


            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                importance = "impurity",
                tuneGrid = data.frame(
                    mtry = model_mtry,
                    splitrule = model_splitrule,
                    min.node.size = model_min_node_size
                ),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "RF"
            accuracies  %<>%  rbind(metrics)


            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # Create a list of the model and the results to save
            results[["RF"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_mtry, model_splitrule, model_min_node_size),
                    cf_matrix = cf_matrix
                )


            if (return_plots) {
                plts[["RF"]] <- cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("RF")
            }
            importance <- caret::varImp(model_A,scale = FALSE)
            toc()
        }

        #------------------------------ Boosting Trees----------------------
        if ("XGB" %in% model_names) {
            tic("XGB took")
            message("XGB")
            model_name <- "xgbTree"
            train_control_method <- "none"
            model_max_depth <- 4
            model_nrounds<- 500
            model_eta <- 0.4
            model_min_child_weight <- 1
            model_subsample <- 1
            model_gamma <- 1
            model_colsample_bytree <- 1


            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                importance = "impurity",
                tuneGrid = data.frame(
                    max_depth = model_max_depth,
                    nrounds = model_nrounds,
                    eta = model_eta,
                    min_child_weight = model_min_child_weight,
                    subsample = model_subsample,
                    gamma = model_gamma,
                    colsample_bytree = model_colsample_bytree

                ),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "XGB"
            accuracies  %<>%  rbind(metrics)


            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # Create a list of the model and the results to save
            results[["XGB"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_max_depth, model_nrounds, model_eta,
                                        model_min_child_weight,model_subsample,
                                        model_gamma,model_colsample_bytree),
                    cf_matrix = cf_matrix
                )


            if (return_plots) {
                plts[["XGB"]] <- cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("XGB")
            }
            importance <- caret::varImp(model_A,scale = FALSE)
            toc()
        }




        #---------------------------------- Naive Bayes Classifier ----------------------------
        if ("NB" %in% model_names) {
            # The method is none becuase we have test and train data
            tic("NB took:")
            message("Starting NB")
            model_name <- "nb"
            train_control_method <- "none"
            model_fL <- 3
            model_usekernel <- TRUE
            model_adjust <- 1.5

            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                tuneGrid = data.frame(
                    fL = model_fL ,
                    usekernel = model_usekernel ,
                    adjust = model_adjust
                ),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "NB"
                        accuracies  %<>%  rbind(metrics)

            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # Create a list of the model and the results to save
            results[["NB"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_fL, model_usekernel, model_adjust),
                    cf_matrix = cf_matrix
                )
            if (return_plots) {
                plts[["NB"]] <-  cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("NB")
            }
            toc()
        }


        #----------------------------------- k-Nearest Neighbors -----------------------------------
        if ("KNN" %in% model_names) {
            # using kknn package
            # The method is none becuase we have test and train data
            message("Starting KNN")
            tic("KNN took")


            model_name <- "kknn"
            train_control_method <- "none"
            model_kmax <- 3
            model_kernel <- "optimal" # Normal unweighted KNN
            model_distance <-
                1 # 1 for Manhatan , 2 for Euclidean distance


            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                tuneGrid = data.frame(
                    kmax = model_kmax,
                    kernel = model_kernel,
                    distance = model_distance
                ),
                metric = "ROC"
            )


            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "KNN"
            accuracies  %<>%  rbind(metrics)

            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # Create a list of the model and the results to save
            results[["KNN"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_kmax, model_kernel, model_distance),
                    cf_matrix = cf_matrix
                )

            if (return_plots) {
                plts[["KNN"]] <- cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("KNN")
            }
            toc()
        }

        #------------------------------ Support Vector Machines with Polynomial Kernel ----------------------
        if ("SVM" %in% model_names) {
            # using  kernlab package
            # The method is none becuase we have test and train data
            tic("SVM took")
            message("Starting SVM")

            model_name <- "svmPoly"
            train_control_method <- "none"
            model_degree <- 3
            model_scale <- 1
            model_C <- 0.01

            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                tuneGrid = data.frame(
                    degree = model_degree,
                    scale = model_scale,
                    C = model_C
                ),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "SVM"
                        accuracies  %<>%  rbind(metrics)

            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )


            # Create a list of the model and the results to save
            results[["SVM"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_degree, model_scale, model_C),
                    cf_matrix = cf_matrix
                )

            if (return_plots) {
                plts[["SVM"]] <- cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("SVM")
            }
            toc()
        }

        # -------------------------- DT ------------------------
        if ("DT" %in% model_names) {
            tic("DT took")
            message("Starting DT")
            model_name <- "C5.0"
            train_control_method <- "none"
            model_trails <- 10
            model_model <- "C5.0"
            model_winnow <- FALSE

            model_A <- train(
                trimmed_activity ~ .,
                data = training_df,
                method = model_name,
                trControl = fitControl,
                verbose = FALSE,
                tuneGrid = data.frame(
                    trials = model_trails,
                    model = model_model,
                    winnow = model_winnow
                ),
                metric = "ROC"
            )

            pred <- stats::predict(model_A, newdata = testing_df)

            # To calculate area AUC we need probabilies and predicted classes in a single dataframe
            pred_prob <-
                data.frame(obs =  testing_df$trimmed_activity,
                           pred = pred)
            pred <-
                stats::predict(model_A, newdata = testing_df, type = "prob")
            pred_prob <- bind_cols(pred_prob, pred)

            # Calculate different metrics
            metrics <-
                multiClassSummary(data = pred_prob,
                                  lev = levels(testing_df$trimmed_activity)) %>%
                as.data.frame()
            # Return the metric in a nicer format
            metric_names <- rownames(metrics)
            metrics  %<>% data.table::transpose()
            colnames(metrics) <- metric_names
            rownames(metrics) <- "DT"
                        accuracies  %<>%  rbind(metrics)

            # CF need a different format of prediction results so recalcuate
            pred <- stats::predict(model_A, newdata = testing_df)

            # Calculate confusion matrix
            cf_matrix <-
                confusionMatrix(
                    data = pred,
                    reference = testing_df$trimmed_activity,
                    mode = "prec_recall"
                )



            # Create a list of the model and the results to save
            results[["DT"]] <-
                list(
                    split_seed = seed,
                    model_name = model_name,
                    model = model_A,
                    train_control_method = train_control_method,
                    tune_parameters = c(model_trails, model_model, model_winnow),
                    cf_matrix = cf_matrix
                )

            if (return_plots) {
                plts[["DT"]] <- cf_matrix$table %>%
                    data.frame() %>%
                    ggplot2::ggplot(aes(Prediction, Reference)) +
                    geom_tile(aes(fill = Freq), colour = "gray50") +
                    scale_fill_gradient(low = "beige", high = muted("chocolate")) +
                    geom_text(aes(label = Freq)) +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                    ggtitle("DT")
            }
            toc()
        }


        # ---------------------------- save the results ----------------------

        if (save_results_on_disk) {
            fname <- paste0("Model_results_", as.numeric(now()), ".RData")
            save(results,
                 file = fname)
            message(paste0("The models are stored in ", fname))
        }


        if (cores != 1) {
            # To stop parallel calculation
            stopCluster(cl)
        }



        output <-
            list(
                "Model-Accuracy" = accuracies,
                "Plots" = plts,
                "RF_Feature_Importance" = importance
            )
        return(output)
    }
Javad-mun/Beap documentation built on July 22, 2020, 3:11 p.m.