Nothing
#' classification—function to perform classification analysis and return results to the user.
#' @param data a data set that includes classification data. For example, the Carseats data in the ISLR package
#' @param colnum the number of the column. For example, in the Carseats data this is column 7, ShelveLoc with three values, Good, Medium and Bad
#' @param numresamples the number of times to resample the analysis
#' @param how_to_handle_strings Converts strings to factor levels
#' @param predict_on_new_data asks if the user has new data to be analyzed using the trained models that were just developed
#' @param set_seed Gives the user the option to set a seed
#' @param remove_VIF_above Removes columns with Variance Inflation Factors above the level chosen by the user
#' @param scale_all_numeric_predictors_in_data Scales all numeric predictors in the original data
#' @param save_all_trained_models Gives the user the option to save all trained models in the Environment
#' @param save_all_plots Saves all plots in the user's chosen format
#' @param use_parallel "Y" or "N" for parallel processing
#' @param train_amount set the amount for the training data
#' @param test_amount set the amount for the testing data
#' @param validation_amount Set the amount for the validation data
#' @returns a full analysis, including data visualizations, statistical summaries, and a full report on the results of 35 models on the data
#' @export Classification
#' @importFrom C50 C5.0
#' @importFrom car vif
#' @importFrom caret confusionMatrix
#' @importFrom corrplot corrplot
#' @importFrom dplyr across count mutate relocate select
#' @importFrom doParallel registerDoParallel
#' @importFrom e1071 svm
#' @importFrom ggplot2 geom_boxplot geom_histogram ggplot facet_wrap labs theme_bw labs aes
#' @importFrom graphics hist pairs panel.smooth par rect
#' @importFrom gt gt
#' @importFrom ipred bagging
#' @importFrom MachineShop fit
#' @importFrom magrittr %>%
#' @importFrom parallel makeCluster
#' @importFrom pls plsr
#' @importFrom purrr keep
#' @importFrom randomForest randomForest
#' @importFrom ranger ranger
#' @importFrom reactable reactable
#' @importFrom reactablefmtr add_title
#' @importFrom scales percent
#' @importFrom stats complete.cases cor predict reorder sd
#' @importFrom tidyr gather last_col pivot_longer
#' @importFrom tree tree cv.tree prune.misclass
#' @importFrom utils head read.csv str
Classification <- function(data, colnum, numresamples, predict_on_new_data = c("Y", "N"), set_seed = c("Y", "N"), remove_VIF_above, scale_all_numeric_predictors_in_data,
how_to_handle_strings = c(0("No strings"), 1("Strings as factors")), save_all_trained_models = c("Y", "N"),
save_all_plots, use_parallel = c("Y", "N"), train_amount, test_amount, validation_amount) {
use_parallel <- 0
no_cores <- 0
if (use_parallel == "Y") {
cl <- parallel::makeCluster(no_cores, type = "FORK")
doParallel::registerDoParallel(cl)
}
if(set_seed == "Y"){
seed = as.integer(readline("Which integer would you like to use for the seed? "))
}
colnames(data)[colnum] <- "y"
df <- data %>% dplyr::relocate(y, .after = tidyr::last_col()) # Moves the target column to the last column on the right
df <- df[sample(nrow(df)), ]
VIF <- car::vif(stats::lm(as.numeric(df$y) ~ ., data = df[, 1:ncol(df)]))
for (i in 1:ncol(df)) {
if(max(VIF) > remove_VIF_above){
df <- df %>% dplyr::select(-which.max(VIF))
VIF <- car::vif(stats::lm(as.numeric(df$y) ~ ., data = df[, 1:ncol(df)]))
}
}
VIF <- reactable::reactable(as.data.frame(VIF),
searchable = TRUE, pagination = FALSE, wrap = TRUE, rownames = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, resizable = TRUE
)%>%
reactablefmtr::add_title("Variance Inflation Factor")
head_df <- head(df) %>% dplyr::mutate_if(is.numeric, round, digits = 4)
head_df <- reactable::reactable(head_df,
searchable = TRUE, pagination = FALSE, wrap = TRUE, rownames = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, resizable = TRUE
)%>%
reactablefmtr::add_title("Head of the data frame")
data_summary <- summary(df)
if(save_all_plots == "Y"){
width = as.numeric(readline("Width of the graphics: "))
height = as.numeric(readline("Height of the graphics: "))
units = readline("Which units? You may use in, cm, mm or px. ")
scale = as.numeric(readline("What multiplicative scaling factor? "))
device = readline("Which device to use? You may enter eps, jpeg, pdf, png, svg or tiff: ")
dpi <- as.numeric(readline("Plot resolution. Applies only to raster output types (jpeg, png, tiff): "))
}
tempdir1 <- tempdir()
#### Barchart of the data against y ####
barchart <- df %>%
dplyr::mutate(dplyr::across(-y, as.numeric)) %>%
tidyr::pivot_longer(!y) %>%
dplyr::summarise(dplyr::across(value, sum), .by = c(y, name)) %>%
dplyr::mutate(perc = proportions(value), .by = c(name)) %>%
ggplot2::ggplot(ggplot2::aes(x = y, y = value)) +
ggplot2::geom_col() +
ggplot2::geom_text(aes(label = round(value, 4)),
vjust = -.5) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, hjust=1)) +
ggplot2::facet_wrap(~ name, scales = "free") +
ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0.1, 0.25)))
barchart2 <- df %>%
dplyr::mutate(dplyr::across(-y, as.numeric)) %>%
tidyr::pivot_longer(!y) %>%
dplyr::summarise(dplyr::across(value, sum), .by = c(y, name)) %>%
dplyr::mutate(perc = proportions(value), .by = c(name)) %>%
ggplot2::ggplot(ggplot2::aes(x = y, y = value)) +
ggplot2::geom_col() +
ggplot2::geom_text(aes(label = scales::percent(round(perc, 3)),
vjust = -0.5)) +
ggplot2::facet_wrap(~ name, scales = "free") +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, hjust=1)) +
ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0.1, 0.25)))
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("barchart.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("barchart.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("barchart.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("barchart.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("barchart.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("barchart.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
data_summary <- summary(df)
head_data <- head(df)
#### Correlation plot of numeric data ####
df1 <- df %>% purrr::keep(is.numeric)
M1 <- cor(df1)
title <- "Correlation plot of the numerical data"
corrplot::corrplot(M1, method = "number", title = title, mar = c(0, 0, 1, 0)) # http://stackoverflow.com/a/14754408/54964)
corrplot::corrplot(M1, method = "circle", title = title, mar = c(0, 0, 1, 0)) # http://stackoverflow.com/a/14754408/54964)
#### message correlation matrix of numeric data ####
correlation_marix <- reactable::reactable(round(cor(df1), 4),
searchable = TRUE, pagination = FALSE, wrap = TRUE, rownames = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, resizable = TRUE
)%>%
reactablefmtr::add_title("Correlation of the data")
#### Boxplots of the numeric data ####
boxplots <- df1 %>%
tidyr::gather(key = "var", value = "value") %>%
ggplot2::ggplot(ggplot2::aes(x = "", y = value)) +
ggplot2::geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
ggplot2::facet_wrap(~var, scales = "free") +
ggplot2::theme_bw() +
ggplot2::labs(title = "Boxplots of the numeric data")
# Thanks to https://rstudio-pubs-static.s3.amazonaws.com/388596_e21196f1adf04e0ea7cd68edd9eba966.html
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("boxplots.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("boxplots.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("boxplots.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("boxplots.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("boxplots.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("boxplots.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
#### Histograms of the numeric data ####
histograms <- ggplot2::ggplot(tidyr::gather(df1, cols, value), ggplot2::aes(x = value)) +
ggplot2::geom_histogram(bins = round(nrow(df1) / 10)) +
ggplot2::facet_wrap(. ~ cols, scales = "free") +
ggplot2::labs(title = "Histograms of each numeric column. Each bar = 10 rows of data")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("histograms.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("histograms.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("histograms.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("histograms.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("histograms.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("histograms.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if (use_parallel == "Y") {
cl <- parallel::makeCluster(no_cores, type = "FORK")
doParallel::registerDoParallel(cl)
}
y <- 0
colnames(data)[colnum] <- "y"
df <- data %>% dplyr::relocate(y, .after = tidyr::last_col()) # Moves the target column to the last column on the right
df <- df[sample(nrow(df)), ]
if (how_to_handle_strings == 1) {
df <- dplyr::mutate_if(df, is.character, as.factor)
}
if (how_to_handle_strings == 2) {
df <- dplyr::mutate_if(df, is.character, as.factor)
df <- dplyr::mutate_if(df, is.factor, as.numeric)
}
if (predict_on_new_data == "Y") {
new_data <- readline("What is the URL of the new data? ")
new_data <- read.csv(new_data, stringsAsFactors = TRUE)
y <- 0
colnames(new_data)[colnum] <- "y"
new_data <- new_data %>% dplyr::relocate(y, .after = tidyr::last_col()) # Moves the target column to the last column on the right
}
if (predict_on_new_data == "Y") {
new_data <- readline("What is the URL of the new data? ")
new_data <- read.csv(new_data, stringsAsFactors = TRUE)
y <- 0
colnames(new_data)[colnum] <- "y"
new_data <- new_data %>% dplyr::relocate(y, .after = tidyr::last_col()) # Moves the target column to the last column on the right
}
if (scale_all_numeric_predictors_in_data == "Y"){
df <- as.data.frame(scale(df[, 1:ncol(df) -1]) %>% cbind(y = df$y))
}
#### Set accuracy values to zero ####
bagging_train_accuracy <- 0
bagging_test_accuracy <- 0
bagging_validation_accuracy <- 0
bagging_holdout_vs_train <- 0
bagging_holdout <- 0
bagging_residuals <- 0
bagging_duration <- 0
bagging_true_positive_rate <- 0
bagging_true_negative_rate <- 0
bagging_false_positive_rate <- 0
bagging_false_negative_rate <- 0
bagging_F1_score <- 0
bagging_table_total <- 0
bagging_classification_error <- 0
bagging_classification_error_mean <- 0
bagging_positive_pred_value <- 0
bagging_negative_pred_value <- 0
bagging_prevalence <- 0
bagging_detection_rate <- 0
bagging_detection_prevalence <- 0
bag_rf_train_accuracy <- 0
bag_rf_test_accuracy <- 0
bag_rf_validation_accuracy <- 0
bag_rf_holdout_vs_train <- 0
bag_rf_holdout <- 0
bag_rf_residuals <- 0
bag_rf_duration <- 0
bag_rf_true_positive_rate <- 0
bag_rf_true_negative_rate <- 0
bag_rf_false_positive_rate <- 0
bag_rf_false_negative_rate <- 0
bag_rf_F1_score <- 0
bag_rf_table_total <- 0
bag_rf_classification_error <- 0
bag_rf_classification_error_mean <- 0
bag_rf_positive_pred_value <- 0
bag_rf_negative_pred_value <- 0
bag_rf_prevalence <- 0
bag_rf_detection_rate <- 0
bag_rf_detection_prevalence <- 0
C50_train_accuracy <- 0
C50_test_accuracy <- 0
C50_validation_accuracy <- 0
C50_holdout_vs_train <- 0
C50_holdout <- 0
C50_residuals <- 0
C50_duration <- 0
C50_true_positive_rate <- 0
C50_true_negative_rate <- 0
C50_false_positive_rate <- 0
C50_false_negative_rate <- 0
C50_F1_score <- 0
C50_table_total <- 0
C50_classification_error <- 0
C50_classification_error_mean <- 0
C50_positive_pred_value <- 0
C50_negative_pred_value <- 0
C50_prevalence <- 0
C50_detection_rate <- 0
C50_detection_prevalence <- 0
linear_train_accuracy <- 0
linear_validation_accuracy <- 0
linear_test_accuracy <- 0
linear_test_accuracy_mean <- 0
linear_validation_accuracy_mean <- 0
linear_holdout_vs_train <- 0
linear_holdout <- 0
linear_residuals <- 0
linear_duration <- 0
linear_true_positive_rate <- 0
linear_true_negative_rate <- 0
linear_false_positive_rate <- 0
linear_false_negative_rate <- 0
linear_F1_score <- 0
linear_table_total <- 0
linear_classification_error <- 0
linear_classification_error_mean <- 0
linear_positive_pred_value <- 0
linear_negative_pred_value <- 0
linear_prevalence <- 0
linear_detection_rate <- 0
linear_detection_prevalence <- 0
n_bayes_train_accuracy <- 0
n_bayes_test_accuracy <- 0
n_bayes_validation_accuracy <- 0
n_bayes_accuracy <- 0
n_bayes_test_accuracy_mean <- 0
n_bayes_validation_accuracy_mean <- 0
n_bayes_holdout_vs_train <- 0
n_bayes_holdout <- 0
n_bayes_residuals <- 0
n_bayes_duration <- 0
n_bayes_true_positive_rate <- 0
n_bayes_true_negative_rate <- 0
n_bayes_false_positive_rate <- 0
n_bayes_false_negative_rate <- 0
n_bayes_F1_score <- 0
n_bayes_table_total <- 0
n_bayes_classification_error <- 0
n_bayes_classification_error_mean <- 0
n_bayes_positive_pred_value <- 0
n_bayes_negative_pred_value <- 0
n_bayes_prevalence <- 0
n_bayes_detection_rate <- 0
n_bayes_detection_prevalence <- 0
pls_train_accuracy <- 0
pls_test_accuracy <- 0
pls_test_accuracy_mean <- 0
pls_validation_accuracy <- 0
pls_validation_accuracy_mean <- 0
pls_holdout_vs_train <- 0
pls_holdout <- 0
pls_residuals <- 0
pls_duration <- 0
pls_true_positive_rate <- 0
pls_true_negative_rate <- 0
pls_false_positive_rate <- 0
pls_false_negative_rate <- 0
pls_F1_score <- 0
pls_table_total <- 0
pls_classification_error <- 0
pls_classification_error_mean <- 0
pls_positive_pred_value <- 0
pls_negative_pred_value <- 0
pls_prevalence <- 0
pls_detection_rate <- 0
pls_detection_prevalence <- 0
pda_train_accuracy <- 0
pda_test_accuracy <- 0
pda_test_accuracy_mean <- 0
pda_validation_accuracy <- 0
pda_validation_accuracy_mean <- 0
pda_holdout_vs_train <- 0
pda_holdout <- 0
pda_residuals <- 0
pda_duration <- 0
pda_true_positive_rate <- 0
pda_true_negative_rate <- 0
pda_false_positive_rate <- 0
pda_false_negative_rate <- 0
pda_F1_score <- 0
pda_table_total <- 0
pda_classification_error <- 0
pda_classification_error_mean <- 0
pda_positive_pred_value <- 0
pda_negative_pred_value <- 0
pda_prevalence <- 0
pda_detection_rate <- 0
pda_detection_prevalence <- 0
rf_train_accuracy <- 0
rf_test_accuracy <- 0
rf_test_accuracy_mean <- 0
rf_validation_accuracy <- 0
rf_validation_accuracy_mean <- 0
rf_holdout_vs_train <- 0
rf_holdout_vs_train_holdout <- 0
rf_holdout <- 0
rf_residuals <- 0
rf_duration <- 0
rf_true_positive_rate <- 0
rf_true_negative_rate <- 0
rf_false_positive_rate <- 0
rf_false_negative_rate <- 0
rf_F1_score <- 0
rf_table_total <- 0
rf_classification_error <- 0
rf_classification_error_mean <- 0
rf_positive_pred_value <- 0
rf_negative_pred_value <- 0
rf_prevalence <- 0
rf_detection_rate <- 0
rf_detection_prevalence <- 0
ranger_train_accuracy <- 0
ranger_test_accuracy <- 0
ranger_test_accuracy_mean <- 0
ranger_validation_accuracy <-
ranger_validation_accuracy_mean <- 0
ranger_holdout_vs_train <- 0
ranger_holdout <- 0
ranger_residuals <- 0
ranger_duration <- 0
ranger_true_positive_rate <- 0
ranger_true_negative_rate <- 0
ranger_false_positive_rate <- 0
ranger_false_negative_rate <- 0
ranger_F1_score <- 0
ranger_table_total <- 0
ranger_classification_error <- 0
ranger_classification_error_mean <- 0
ranger_positive_pred_value <- 0
ranger_negative_pred_value <- 0
ranger_prevalence <- 0
ranger_detection_rate <- 0
ranger_detection_prevalence <- 0
rpart_train_accuracy <- 0
rpart_test_accuracy <- 0
rpart_test_accuracy_mean <- 0
rpart_validation_accuracy <- 0
rpart_validation_accuracy_mean <- 0
rpart_holdout_vs_train <- 0
rpart_holdout <- 0
rpart_residuals <- 0
rpart_duration <- 0
rpart_true_positive_rate <- 0
rpart_true_negative_rate <- 0
rpart_false_positive_rate <- 0
rpart_false_negative_rate <- 0
rpart_F1_score <- 0
rpart_table_total <- 0
rpart_classification_error <- 0
rpart_classification_error_mean <- 0
rpart_positive_pred_value <- 0
rpart_negative_pred_value <- 0
rpart_prevalence <- 0
rpart_detection_rate <- 0
rpart_detection_prevalence <- 0
svm_train_accuracy <- 0
svm_test_accuracy <- 0
svm_test_accuracy_mean <- 0
svm_validation_accuracy <- 0
svm_validation_accuracy_mean <- 0
svm_holdout_vs_train <- 0
svm_holdout <- 0
svm_residuals <- 0
svm_duration <- 0
svm_true_positive_rate <- 0
svm_true_negative_rate <- 0
svm_false_positive_rate <- 0
svm_false_negative_rate <- 0
svm_F1_score <- 0
svm_table_total <- 0
svm_classification_error <- 0
svm_classification_error_mean <- 0
svm_positive_pred_value <- 0
svm_negative_pred_value <- 0
svm_prevalence <- 0
svm_detection_rate <- 0
svm_detection_prevalence <- 0
tree_train_accuracy <- 0
tree_test_accuracy <- 0
tree_test_accuracy_mean <- 0
tree_validation_accuracy <- 0
tree_validation_accuracy_mean <- 0
tree_holdout_vs_train <- 0
tree_holdout <- 0
tree_residuals <- 0
tree_duration <- 0
tree_true_positive_rate <- 0
tree_true_negative_rate <- 0
tree_false_positive_rate <- 0
tree_false_negative_rate <- 0
tree_F1_score <- 0
tree_table_total <- 0
tree_classification_error <- 0
tree_classification_error_mean <- 0
tree_positive_pred_value <- 0
tree_negative_pred_value <- 0
tree_prevalence <- 0
tree_detection_rate <- 0
tree_detection_prevalence <- 0
ensemble_adaboost_train_accuracy <- 0
ensemble_adaboost_train_accuracy_mean <- 0
ensemble_adaboost_test_accuracy <- 0
ensemble_adaboost_test_accuracy_mean <- 0
ensemble_adaboost_validation_accuracy <- 0
ensemble_adaboost_validation_accuracy_mean <- 0
ensemble_adaboost_holdout_vs_train <- 0
ensemble_adaboost_holdout <- 0
ensemble_adaboost_residuals <- 0
ensemble_adaboost_duration <- 0
ensemble_adaboost_true_positive_rate <- 0
ensemble_adaboost_true_negative_rate <- 0
ensemble_adaboost_false_positive_rate <- 0
ensemble_adaboost_false_negative_rate <- 0
ensemble_adaboost_F1_score <- 0
ensemble_adaboost_table_total <- 0
ensemble_adaboost_classification_error <- 0
ensemble_adaboost_classification_error_mean <- 0
ensemble_adaboost_positive_pred_value <- 0
ensemble_adaboost_negative_pred_value <- 0
ensemble_adaboost_prevalence <- 0
ensemble_adaboost_detection_rate <- 0
ensemble_adaboost_detection_prevalence <- 0
ensemble_bag_cart_train_accuracy <- 0
ensemble_bag_cart_train_accuracy_mean <- 0
ensemble_bag_cart_test_accuracy <- 0
ensemble_bag_cart_test_accuracy_mean <- 0
ensemble_bag_cart_validation_accuracy <- 0
ensemble_bag_cart_validation_accuracy_mean <- 0
ensemble_bag_cart_holdout_vs_train <- 0
ensemble_bag_cart_holdout <- 0
ensemble_bag_cart_residuals <- 0
ensemble_bag_cart_duration <- 0
ensemble_bag_cart_true_positive_rate <- 0
ensemble_bag_cart_true_negative_rate <- 0
ensemble_bag_cart_false_positive_rate <- 0
ensemble_bag_cart_false_negative_rate <- 0
ensemble_bag_cart_F1_score <- 0
ensemble_bag_cart_table_total <- 0
ensemble_bag_cart_classification_error <- 0
ensemble_bag_cart_classification_error_mean <- 0
ensemble_bag_cart_positive_pred_value <- 0
ensemble_bag_cart_negative_pred_value <- 0
ensemble_bag_cart_prevalence <- 0
ensemble_bag_cart_detection_rate <- 0
ensemble_bag_cart_detection_prevalence <- 0
ensemble_bag_rf_train_accuracy <- 0
ensemble_bag_rf_train_accuracy_mean <- 0
ensemble_bag_rf_test_accuracy <- 0
ensemble_bag_rf_test_accuracy_mean <- 0
ensemble_bag_rf_validation_accuracy <- 0
ensemble_bag_rf_validation_accuracy_mean <- 0
ensemble_bag_rf_holdout_vs_train <- 0
ensemble_bag_rf_holdout <- 0
ensemble_bag_rf_residuals <- 0
ensemble_bag_rf_duration <- 0
ensemble_bag_rf_true_positive_rate <- 0
ensemble_bag_rf_true_negative_rate <- 0
ensemble_bag_rf_false_positive_rate <- 0
ensemble_bag_rf_false_negative_rate <- 0
ensemble_bag_rf_F1_score <- 0
ensemble_bag_rf_table_total <- 0
ensemble_bag_rf_classification_error <- 0
ensemble_bag_rf_classification_error_mean <- 0
ensemble_bag_rf_positive_pred_value <- 0
ensemble_bag_rf_negative_pred_value <- 0
ensemble_bag_rf_prevalence <- 0
ensemble_bag_rf_detection_rate <- 0
ensemble_bag_rf_detection_prevalence <- 0
ensemble_C50_train_accuracy <- 0
ensemble_C50_train_accuracy_mean <- 0
ensemble_C50_test_accuracy <- 0
ensemble_C50_test_accuracy_mean <- 0
ensemble_C50_validation_accuracy <- 0
ensemble_C50_validation_accuracy_mean <- 0
ensemble_C50_holdout_vs_train <- 0
ensemble_C50_holdout <- 0
ensemble_C50_residuals <- 0
ensemble_C50_duration <- 0
ensemble_C50_true_positive_rate <- 0
ensemble_C50_true_negative_rate <- 0
ensemble_C50_false_positive_rate <- 0
ensemble_C50_false_negative_rate <- 0
ensemble_C50_F1_score <- 0
ensemble_C50_table_total <- 0
ensemble_C50_classification_error <- 0
ensemble_C50_classification_error_mean <- 0
ensemble_C50_positive_pred_value <- 0
ensemble_C50_negative_pred_value <- 0
ensemble_C50_prevalence <- 0
ensemble_C50_detection_rate <- 0
ensemble_C50_detection_prevalence <- 0
ensemble_n_bayes_train_accuracy <- 0
ensemble_n_bayes_train_accuracy_mean <- 0
ensemble_n_bayes_test_accuracy <- 0
ensemble_n_bayes_test_accuracy_mean <- 0
ensemble_n_bayes_validation_accuracy <- 0
ensemble_n_bayes_validation_accuracy_mean <- 0
ensemble_n_bayes_holdout_vs_train <- 0
ensemble_n_bayes_holdout <- 0
ensemble_n_bayes_residuals <- 0
ensemble_n_bayes_duration <- 0
ensemble_n_bayes_true_positive_rate <- 0
ensemble_n_bayes_true_negative_rate <- 0
ensemble_n_bayes_false_positive_rate <- 0
ensemble_n_bayes_false_negative_rate <- 0
ensemble_n_bayes_F1_score <- 0
ensemble_n_bayes_table_total <- 0
ensemble_n_bayes_classification_error <- 0
ensemble_n_bayes_classification_error_mean <- 0
ensemble_n_bayes_positive_pred_value <- 0
ensemble_n_bayes_negative_pred_value <- 0
ensemble_n_bayes_prevalence <- 0
ensemble_n_bayes_detection_rate <- 0
ensemble_n_bayes_detection_prevalence <- 0
ensemble_ranger_train_accuracy <- 0
ensemble_ranger_train_accuracy_mean <- 0
ensemble_ranger_test_accuracy <- 0
ensemble_ranger_test_accuracy_mean <- 0
ensemble_ranger_validation_accuracy <- 0
ensemble_ranger_validation_accuracy_mean <- 0
ensemble_ranger_holdout_vs_train <- 0
ensemble_ranger_holdout <- 0
ensemble_ranger_residuals <- 0
ensemble_ranger_duration <- 0
ensemble_ranger_true_positive_rate <- 0
ensemble_ranger_true_negative_rate <- 0
ensemble_ranger_false_positive_rate <- 0
ensemble_ranger_false_negative_rate <- 0
ensemble_ranger_F1_score <- 0
ensemble_ranger_table_total <- 0
ensemble_ranger_classification_error <- 0
ensemble_ranger_classification_error_mean <- 0
ensemble_ranger_positive_pred_value <- 0
ensemble_ranger_negative_pred_value <- 0
ensemble_ranger_prevalence <- 0
ensemble_ranger_detection_rate <- 0
ensemble_ranger_detection_prevalence <- 0
ensemble_rf_train_accuracy <- 0
ensemble_rf_train_accuracy_mean <- 0
ensemble_rf_test_accuracy <- 0
ensemble_rf_test_accuracy_mean <- 0
ensemble_rf_validation_accuracy <- 0
ensemble_rf_validation_accuracy_mean <- 0
ensemble_rf_holdout_vs_train <- 0
ensemble_rf_holdout <- 0
ensemble_rf_residuals <- 0
ensemble_rf_duration <- 0
ensemble_rf_true_positive_rate <- 0
ensemble_rf_true_negative_rate <- 0
ensemble_rf_false_positive_rate <- 0
ensemble_rf_false_negative_rate <- 0
ensemble_rf_F1_score <- 0
ensemble_rf_table_total <- 0
ensemble_rf_classification_error <- 0
ensemble_rf_classification_error_mean <- 0
ensemble_rf_positive_pred_value <- 0
ensemble_rf_negative_pred_value <- 0
ensemble_rf_prevalence <- 0
ensemble_rf_detection_rate <- 0
ensemble_rf_detection_prevalence <- 0
ensemble_svm_train_accuracy <- 0
ensemble_svm_train_accuracy_mean <- 0
ensemble_svm_test_accuracy <- 0
ensemble_svm_test_accuracy_mean <- 0
ensemble_svm_validation_accuracy <- 0
ensemble_svm_validation_accuracy_mean <- 0
ensemble_svm_holdout_vs_train <- 0
ensemble_svm_holdout <- 0
ensemble_svm_residuals <- 0
ensemble_svm_duration <- 0
ensemble_svm_true_positive_rate <- 0
ensemble_svm_true_negative_rate <- 0
ensemble_svm_false_positive_rate <- 0
ensemble_svm_false_negative_rate <- 0
ensemble_svm_F1_score <- 0
ensemble_svm_table_total <- 0
ensemble_svm_classification_error <- 0
ensemble_svm_classification_error_mean <- 0
ensemble_svm_positive_pred_value <- 0
ensemble_svm_negative_pred_value <- 0
ensemble_svm_prevalence <- 0
ensemble_svm_detection_rate <- 0
ensemble_svm_detection_prevalence <- 0
ensemble_tree_train_accuracy <- 0
ensemble_tree_train_accuracy_mean <- 0
ensemble_tree_test_accuracy <- 0
ensemble_tree_test_accuracy_mean <- 0
ensemble_tree_validation_accuracy <- 0
ensemble_tree_validation_accuracy_mean <- 0
ensemble_tree_holdout_vs_train <- 0
ensemble_tree_holdout <- 0
ensemble_tree_residuals <- 0
ensemble_tree_duration <- 0
ensemble_tree_true_positive_rate <- 0
ensemble_tree_true_negative_rate <- 0
ensemble_tree_false_positive_rate <- 0
ensemble_tree_false_negative_rate <- 0
ensemble_tree_F1_score <- 0
ensemble_tree_table_total <- 0
ensemble_tree_classification_error <- 0
ensemble_tree_classification_error_mean <- 0
ensemble_tree_positive_pred_value <- 0
ensemble_tree_negative_pred_value <- 0
ensemble_tree_prevalence <- 0
ensemble_tree_detection_rate <- 0
ensemble_tree_detection_prevalence <- 0
value <- 0
cols <- 0
Mean_Holdout_Accuracy <- 0
count <- 0
model <- 0
holdout <- 0
name <- 0
perc <- 0
Model <- 0
Holdout_vs_train <- 0
Holdout_vs_train_St_Dev <- 0
Duration <- 0
Accuracy_Holdout_Std.Dev <- 0
holdout_vs_train_St_Dev <- 0
Duration_St_Dev <- 0
seed <- 0
#### Random resampling starts here ####
for (i in 1:numresamples) {
message(noquote(""))
message(paste0("Resampling number ", i, " of ", numresamples, sep = ','))
message(noquote(""))
if(set_seed == "Y"){
train <- df[1:round(train_amount*nrow(df)), ]
test <- df[round(train_amount*nrow(df)) +1:round(test_amount*nrow(df)), ]
validation <- df[(nrow(test) + nrow(train) +1) : nrow(df), ]
}
if(set_seed == "N"){
idx <- sample(seq(1, 3), size = nrow(df), replace = TRUE, prob = c(train_amount, test_amount, validation_amount))
train <- df[idx == 1, ]
test <- df[idx == 2, ]
validation <- df[idx == 3, ]
}
# train <- df[index == 1, ]
# test <- df[index == 2, ]
# validation <- df[index == 3, ]
#
train01 <- train
test01 <- test
validation01 <- validation
#
y_train <- train$y
y_test <- test$y
y_validation <- validation$y
#
# train <- df[index == 1, ] %>% dplyr::select(-y)
# test <- df[index == 2, ] %>% dplyr::select(-y)
# validation <- df[index == 3, ] %>% dplyr::select(-y)
#### 1. Bagging ####
bagging_start <- Sys.time()
message("Working on Bagging analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
bagging_train_fit <- ipred::bagging(formula = y ~ ., data = train)
}
if(set_seed == "N"){
bagging_train_fit <- ipred::bagging(formula = y ~ ., data = train)
}
bagging_train_pred <- predict(object = bagging_train_fit, newdata = train)
bagging_train_table <- table(bagging_train_pred, y_train)
bagging_train_accuracy[i] <- sum(diag(bagging_train_table)) / sum(bagging_train_table)
bagging_train_accuracy_mean <- mean(bagging_train_accuracy)
bagging_train_mean <- mean(diag(bagging_train_table)) / mean(bagging_train_table)
bagging_train_sd <- sd(diag(bagging_train_table)) / sd(bagging_train_table)
bagging_train_diag <- sum(diag(bagging_train_table))
sum_diag_train_bagging <- sum(diag(bagging_train_table))
bagging_train_prop <- diag(prop.table(bagging_train_table))
bagging_test_pred <- predict(object = bagging_train_fit, newdata = test)
bagging_test_table <- table(bagging_test_pred, y_test)
bagging_test_accuracy[i] <- sum(diag(bagging_test_table)) / sum(bagging_test_table)
bagging_test_accuracy_mean <- mean(bagging_test_accuracy)
bagging_test_mean <- mean(diag(bagging_test_table)) / mean(bagging_test_table)
bagging_test_sd <- sd(diag(bagging_test_table)) / sd(bagging_test_table)
bagging_test_diag <- sum(diag(bagging_test_table))
sum_diag_test_bagging <- sum(diag(bagging_test_table))
bagging_test_prop <- diag(prop.table(bagging_test_table))
bagging_validation_pred <- predict(object = bagging_train_fit, newdata = validation)
bagging_validation_table <- table(bagging_validation_pred, y_validation)
bagging_validation_accuracy[i] <- sum(diag(bagging_validation_table)) / sum(bagging_validation_table)
bagging_validation_accuracy_mean <- mean(bagging_validation_accuracy)
bagging_validation_mean <- mean(diag(bagging_validation_table)) / mean(bagging_validation_table)
bagging_validation_sd <- sd(diag(bagging_validation_table)) / sd(bagging_validation_table)
bagging_validation_diag <- sum(diag(bagging_validation_table))
sum_diag_validation_bagging <- sum(diag(bagging_validation_table))
bagging_validation_prop <- diag(prop.table(bagging_validation_table))
bagging_holdout[i] <- mean(c(bagging_test_accuracy_mean, bagging_validation_accuracy_mean))
bagging_holdout_mean <- mean(bagging_holdout)
bagging_residuals[i] <- 1 - bagging_holdout[i]
bagging_residuals_mean <- mean(bagging_residuals)
bagging_holdout_sd <- sd(bagging_holdout)
bagging_holdout_vs_train[i] <- bagging_holdout_mean / bagging_train_accuracy_mean
bagging_holdout_vs_train_mean <- mean(bagging_holdout_vs_train)
bagging_holdout_vs_train_range <- range(bagging_holdout_vs_train)
bagging_holdout_vs_train_sd <- sd(bagging_holdout_vs_train)
bagging_table <- bagging_test_table + bagging_validation_table
bagging_table_total <- bagging_table_total + bagging_table
bagging_table_sum_diag <- sum(diag(bagging_table))
bagging_confusion_matrix <- caret::confusionMatrix(bagging_table_total)
bagging_confusion_matrix_summary <- colMeans(bagging_confusion_matrix$byClass, na.rm = TRUE)
bagging_true_positive_rate[i] <- bagging_confusion_matrix_summary[1]
bagging_true_positive_rate_mean <- mean(bagging_true_positive_rate[i])
bagging_true_negative_rate[i] <- bagging_confusion_matrix_summary[2]
bagging_true_negative_rate_mean <- mean(bagging_true_negative_rate)
bagging_false_negative_rate[i] <- 1 - bagging_true_positive_rate[i]
bagging_false_negative_rate_mean <- mean(bagging_false_positive_rate)
bagging_false_positive_rate[i] <- 1 - bagging_true_negative_rate[i]
bagging_false_positive_rate_mean <- mean(bagging_false_positive_rate)
bagging_positive_pred_value[i] <- bagging_confusion_matrix_summary[3]
bagging_positive_pred_value_mean <- mean(bagging_positive_pred_value)
bagging_negative_pred_value[i] <- bagging_confusion_matrix_summary[4]
bagging_negative_pred_value_mean <- mean(bagging_negative_pred_value)
bagging_prevalence[i] <- bagging_confusion_matrix_summary[8]
bagging_prevalence_mean <- mean(bagging_prevalence)
bagging_detection_rate[i] <- bagging_confusion_matrix_summary[9]
bagging_detection_rate_mean <- mean(bagging_detection_rate)
bagging_detection_prevalence[i] <- bagging_confusion_matrix_summary[10]
bagging_detection_prevalence_mean <- mean(bagging_detection_prevalence)
bagging_classification_error[i] <- 1 - bagging_holdout[i]
bagging_classification_error_mean <- mean(bagging_classification_error)
bagging_F1_score[i] <- bagging_confusion_matrix_summary[7]
bagging_F1_score_mean <- mean(bagging_F1_score[i])
bagging_end <- Sys.time()
bagging_duration[i] <- bagging_end - bagging_start
bagging_duration_mean <- mean(bagging_duration)
bagging_duration_sd <- sd(bagging_duration)
#### 2. Bagged Random Forest ####
bag_rf_start <- Sys.time()
message("Working on Bagged Random Forest analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
bag_rf_train_fit <- randomForest::randomForest(y ~ ., data = train01, mtry = ncol(train01) -1)
}
if(set_seed == "N"){
bag_rf_train_fit <- randomForest::randomForest(y ~ ., data = train01, mtry = ncol(train01) -1)
}
bag_rf_train_pred <- predict(bag_rf_train_fit, train, type = "class")
bag_rf_train_table <- table(bag_rf_train_pred, y_train)
bag_rf_train_accuracy[i] <- sum(diag(bag_rf_train_table)) / sum(bag_rf_train_table)
bag_rf_train_accuracy_mean <- mean(bag_rf_train_accuracy)
bag_rf_train_diag <- sum(bag_rf_train_table)
bag_rf_train_mean <- mean(diag(bag_rf_train_table)) / mean(bag_rf_train_table)
bag_rf_train_sd <- sd(diag(bag_rf_train_table)) / sd(bag_rf_train_table)
sum_diag_bag_train_rf <- sum(diag(bag_rf_train_table))
bag_rf_train_prop <- diag(prop.table(bag_rf_train_table, margin = 1))
bag_rf_test_pred <- predict(bag_rf_train_fit, test, type = "class")
bag_rf_test_table <- table(bag_rf_test_pred, y_test)
bag_rf_test_accuracy[i] <- sum(diag(bag_rf_test_table)) / sum(bag_rf_test_table)
bag_rf_test_accuracy_mean <- mean(bag_rf_test_accuracy)
bag_rf_test_diag <- sum(bag_rf_test_table)
bag_rf_test_mean <- mean(diag(bag_rf_test_table)) / mean(bag_rf_test_table)
bag_rf_test_sd <- sd(diag(bag_rf_test_table)) / sd(bag_rf_test_table)
sum_diag_bag_test_rf <- sum(diag(bag_rf_test_table))
bag_rf_test_prop <- diag(prop.table(bag_rf_test_table, margin = 1))
bag_rf_validation_pred <- predict(bag_rf_train_fit, validation, type = "class")
bag_rf_validation_table <- table(bag_rf_validation_pred, y_validation)
bag_rf_validation_accuracy[i] <- sum(diag(bag_rf_validation_table)) / sum(bag_rf_validation_table)
bag_rf_validation_accuracy_mean <- mean(bag_rf_validation_accuracy)
bag_rf_validation_diag <- sum(bag_rf_validation_table)
bag_rf_validation_mean <- mean(diag(bag_rf_validation_table)) / mean(bag_rf_validation_table)
bag_rf_validation_sd <- sd(diag(bag_rf_validation_table)) / sd(bag_rf_validation_table)
sum_diag_bag_validation_rf <- sum(diag(bag_rf_validation_table))
bag_rf_validation_prop <- diag(prop.table(bag_rf_validation_table, margin = 1))
bag_rf_holdout[i] <- mean(c(bag_rf_test_accuracy_mean, bag_rf_validation_accuracy_mean))
bag_rf_holdout_mean <- mean(bag_rf_holdout)
bag_rf_residuals[i] <- 1 - bag_rf_holdout[i]
bag_rf_residuals_mean <- mean(bag_rf_residuals)
bag_rf_holdout_sd <- sd(bag_rf_holdout)
bag_rf_holdout_vs_train[i] <- bag_rf_holdout_mean / bag_rf_train_accuracy_mean
bag_rf_holdout_vs_train_mean <- mean(bag_rf_holdout_vs_train)
bag_rf_holdout_vs_train_range <- range(bag_rf_holdout_vs_train)
bag_rf_holdout_vs_train_sd <- sd(bag_rf_holdout_vs_train)
bag_rf_table <- bag_rf_test_table + bag_rf_validation_table
bag_rf_table_total <- bag_rf_table_total + bag_rf_table
bag_rf_table_sum_diag <- sum(diag(bag_rf_table))
bag_rf_confusion_matrix <- caret::confusionMatrix(bag_rf_table_total)
bag_rf_confusion_matrix_summary <- colMeans(bag_rf_confusion_matrix$byClass, na.rm = TRUE)
bag_rf_true_positive_rate[i] <- bag_rf_confusion_matrix_summary[1]
bag_rf_true_positive_rate_mean <- mean(bag_rf_true_positive_rate[i])
bag_rf_true_negative_rate[i] <- bag_rf_confusion_matrix_summary[2]
bag_rf_true_negative_rate_mean <- mean(bag_rf_true_negative_rate)
bag_rf_false_negative_rate[i] <- 1 - bag_rf_true_positive_rate[i]
bag_rf_false_negative_rate_mean <- mean(bag_rf_false_positive_rate)
bag_rf_false_positive_rate[i] <- 1 - bag_rf_true_negative_rate[i]
bag_rf_false_positive_rate_mean <- mean(bag_rf_false_positive_rate)
bag_rf_positive_pred_value[i] <- bag_rf_confusion_matrix_summary[3]
bag_rf_positive_pred_value_mean <- mean(bag_rf_positive_pred_value)
bag_rf_negative_pred_value[i] <- bag_rf_confusion_matrix_summary[4]
bag_rf_negative_pred_value_mean <- mean(bag_rf_negative_pred_value)
bag_rf_prevalence[i] <- bag_rf_confusion_matrix_summary[8]
bag_rf_prevalence_mean <- mean(bag_rf_prevalence)
bag_rf_detection_rate[i] <- bag_rf_confusion_matrix_summary[9]
bag_rf_detection_rate_mean <- mean(bag_rf_detection_rate)
bag_rf_detection_prevalence[i] <- bag_rf_confusion_matrix_summary[10]
bag_rf_detection_prevalence_mean <- mean(bag_rf_detection_prevalence)
bag_rf_classification_error[i] <- 1 - bag_rf_holdout[i]
bag_rf_classification_error_mean <- mean(bag_rf_classification_error)
bag_rf_F1_score[i] <- bag_rf_confusion_matrix_summary[7]
bag_rf_F1_score_mean <- mean(bag_rf_F1_score[i])
bag_rf_end <- Sys.time()
bag_rf_duration[i] <- bag_rf_end - bag_rf_start
bag_rf_duration_mean <- mean(bag_rf_duration)
bag_rf_duration_sd <- sd(bag_rf_duration)
#### 3. C50 ####
C50_start <- Sys.time()
message("Working on C50 analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
C50_train_fit <- C50::C5.0(as.factor(y_train) ~ ., data = train)
}
if(set_seed == "N"){
C50_train_fit <- C50::C5.0(as.factor(y_train) ~ ., data = train)
}
C50_train_pred <- predict(C50_train_fit, train)
C50_train_table <- table(C50_train_pred, y_train)
C50_train_accuracy[i] <- sum(diag(C50_train_table)) / sum(C50_train_table)
C50_train_accuracy_mean <- mean(C50_train_accuracy)
C50_train_mean <- mean(diag(C50_train_table)) / mean(C50_train_table)
C50_train_sd <- sd(diag(C50_train_table)) / sd(C50_train_table)
sum_diag_train_C50 <- sum(diag(C50_train_table))
C50_train_prop <- diag(prop.table(C50_train_table, margin = 1))
C50_test_pred <- predict(C50_train_fit, test)
C50_test_table <- table(C50_test_pred, y_test)
C50_test_accuracy[i] <- sum(diag(C50_test_table)) / sum(C50_test_table)
C50_test_accuracy_mean <- mean(C50_test_accuracy)
C50_test_mean <- mean(diag(C50_test_table)) / mean(C50_test_table)
C50_test_sd <- sd(diag(C50_test_table)) / sd(C50_test_table)
sum_diag_test_C50 <- sum(diag(C50_test_table))
C50_test_prop <- diag(prop.table(C50_test_table, margin = 1))
C50_validation_pred <- predict(C50_train_fit, validation)
C50_validation_table <- table(C50_validation_pred, y_validation)
C50_validation_accuracy[i] <- sum(diag(C50_validation_table)) / sum(C50_validation_table)
C50_validation_accuracy_mean <- mean(C50_validation_accuracy)
C50_validation_mean <- mean(diag(C50_validation_table)) / mean(C50_validation_table)
C50_validation_sd <- sd(diag(C50_validation_table)) / sd(C50_validation_table)
sum_diag_validation_C50 <- sum(diag(C50_validation_table))
C50_validation_prop <- diag(prop.table(C50_validation_table, margin = 1))
C50_holdout[i] <- mean(c(C50_test_accuracy_mean, C50_validation_accuracy_mean))
C50_holdout_mean <- mean(C50_holdout)
C50_residuals[i] <- 1 - C50_holdout[i]
C50_residuals_mean <- mean(C50_residuals)
C50_holdout_sd <- sd(C50_holdout)
C50_holdout_vs_train[i] <- C50_holdout_mean / C50_train_accuracy_mean
C50_holdout_vs_train_mean <- mean(C50_holdout_vs_train)
C50_holdout_vs_train_range <- range(C50_holdout_vs_train)
C50_holdout_vs_train_sd <- sd(C50_holdout_vs_train)
C50_table <- C50_test_table + C50_validation_table
C50_table_total <- C50_table_total + C50_table
C50_table_sum_diag <- sum(diag(C50_table))
C50_confusion_matrix <- caret::confusionMatrix(C50_table_total)
C50_confusion_matrix_summary <- colMeans(C50_confusion_matrix$byClass, na.rm = TRUE)
C50_true_positive_rate[i] <- C50_confusion_matrix_summary[1]
C50_true_positive_rate_mean <- mean(C50_true_positive_rate[i])
C50_true_negative_rate[i] <- C50_confusion_matrix_summary[2]
C50_true_negative_rate_mean <- mean(C50_true_negative_rate)
C50_false_negative_rate[i] <- 1 - C50_true_positive_rate[i]
C50_false_negative_rate_mean <- mean(C50_false_positive_rate)
C50_false_positive_rate[i] <- 1 - C50_true_negative_rate[i]
C50_false_positive_rate_mean <- mean(C50_false_positive_rate)
C50_positive_pred_value[i] <- C50_confusion_matrix_summary[3]
C50_positive_pred_value_mean <- mean(C50_positive_pred_value)
C50_negative_pred_value[i] <- C50_confusion_matrix_summary[4]
C50_negative_pred_value_mean <- mean(C50_negative_pred_value)
C50_prevalence[i] <- C50_confusion_matrix_summary[8]
C50_prevalence_mean <- mean(C50_prevalence)
C50_detection_rate[i] <- C50_confusion_matrix_summary[9]
C50_detection_rate_mean <- mean(C50_detection_rate)
C50_detection_prevalence[i] <- C50_confusion_matrix_summary[10]
C50_detection_prevalence_mean <- mean(C50_detection_prevalence)
C50_classification_error[i] <- 1 - C50_holdout[i]
C50_classification_error_mean <- mean(C50_classification_error)
C50_F1_score[i] <- C50_confusion_matrix_summary[7]
C50_F1_score_mean <- mean(C50_F1_score[i])
C50_end <- Sys.time()
C50_duration[i] <- C50_end - C50_start
C50_duration_mean <- mean(C50_duration)
C50_duration_sd <- sd(C50_duration)
#### 4. Linear Model ####
linear_start <- Sys.time()
message("Working on Linear analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
linear_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "LMModel")
}
if(set_seed == "N"){
linear_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "LMModel")
}
linear_train_pred <- predict(object = linear_train_fit, newdata = train01)
linear_train_table <- table(linear_train_pred, y_train)
linear_train_accuracy[i] <- sum(diag(linear_train_table)) / sum(linear_train_table)
linear_train_accuracy_mean <- mean(linear_train_accuracy)
linear_train_mean <- mean(diag(linear_train_table)) / mean(linear_train_table)
linear_train_sd <- sd(diag(linear_train_table)) / sd(linear_train_table)
sum_diag_train_linear <- sum(diag(linear_train_table))
linear_train_prop <- diag(prop.table(linear_train_table, margin = 1))
linear_test_pred <- predict(object = linear_train_fit, newdata = test01)
linear_test_table <- table(linear_test_pred, y_test)
linear_test_accuracy[i] <- sum(diag(linear_test_table)) / sum(linear_test_table)
linear_test_accuracy_mean <- mean(linear_test_accuracy)
linear_test_mean <- mean(diag(linear_test_table)) / mean(linear_test_table)
linear_test_sd <- sd(diag(linear_test_table)) / sd(linear_test_table)
sum_diag_test_linear <- sum(diag(linear_test_table))
linear_test_prop <- diag(prop.table(linear_test_table, margin = 1))
linear_validation_pred <- predict(object = linear_train_fit, newdata = validation01)
linear_validation_table <- table(linear_validation_pred, y_validation)
linear_validation_accuracy[i] <- sum(diag(linear_validation_table)) / sum(linear_validation_table)
linear_validation_accuracy_mean <- mean(linear_validation_accuracy)
linear_validation_mean <- mean(diag(linear_validation_table)) / mean(linear_validation_table)
linear_validation_sd <- sd(diag(linear_validation_table)) / sd(linear_validation_table)
sum_diag_validation_linear <- sum(diag(linear_validation_table))
linear_validation_prop <- diag(prop.table(linear_validation_table, margin = 1))
linear_holdout[i] <- mean(c(linear_test_accuracy_mean, linear_validation_accuracy_mean))
linear_holdout_mean <- mean(linear_holdout)
linear_residuals[i] <- 1 - linear_holdout[i]
linear_residuals_mean <- mean(linear_residuals)
linear_holdout_sd <- sd(linear_holdout)
linear_holdout_vs_train[i] <- linear_holdout_mean / linear_train_accuracy_mean
linear_holdout_vs_train_mean <- mean(linear_holdout_vs_train)
linear_holdout_vs_train_range <- range(linear_holdout_vs_train)
linear_holdout_vs_train_sd <- sd(linear_holdout_vs_train)
linear_table <- linear_test_table + linear_validation_table
linear_table_total <- linear_table_total + linear_table
linear_table_sum_diag <- sum(diag(linear_table))
linear_confusion_matrix <- caret::confusionMatrix(linear_table_total)
linear_confusion_matrix_summary <- colMeans(linear_confusion_matrix$byClass, na.rm = TRUE)
linear_true_positive_rate[i] <- linear_confusion_matrix_summary[1]
linear_true_positive_rate_mean <- mean(linear_true_positive_rate[i])
linear_true_negative_rate[i] <- linear_confusion_matrix_summary[2]
linear_true_negative_rate_mean <- mean(linear_true_negative_rate)
linear_false_negative_rate[i] <- 1 - linear_true_positive_rate[i]
linear_false_negative_rate_mean <- mean(linear_false_positive_rate)
linear_false_positive_rate[i] <- 1 - linear_true_negative_rate[i]
linear_false_positive_rate_mean <- mean(linear_false_positive_rate)
linear_positive_pred_value[i] <- linear_confusion_matrix_summary[3]
linear_positive_pred_value_mean <- mean(linear_positive_pred_value)
linear_negative_pred_value[i] <- linear_confusion_matrix_summary[4]
linear_negative_pred_value_mean <- mean(linear_negative_pred_value)
linear_prevalence[i] <- linear_confusion_matrix_summary[8]
linear_prevalence_mean <- mean(linear_prevalence)
linear_detection_rate[i] <- linear_confusion_matrix_summary[9]
linear_detection_rate_mean <- mean(linear_detection_rate)
linear_detection_prevalence[i] <- linear_confusion_matrix_summary[10]
linear_detection_prevalence_mean <- mean(linear_detection_prevalence)
linear_classification_error[i] <- 1 - linear_holdout[i]
linear_classification_error_mean <- mean(linear_classification_error)
linear_F1_score[i] <- linear_confusion_matrix_summary[7]
linear_F1_score_mean <- mean(linear_F1_score[i])
linear_end <- Sys.time()
linear_duration[i] <- linear_end - linear_start
linear_duration_mean <- mean(linear_duration)
linear_duration_sd <- sd(linear_duration)
#### 5. Naive Bayes ####
n_bayes_start <- Sys.time()
message("Working on Naive Bayes analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
n_bayes_train_fit <- e1071::naiveBayes(y_train ~ ., data = train)
}
if(set_seed == "N"){
n_bayes_train_fit <- e1071::naiveBayes(y_train ~ ., data = train)
}
n_bayes_train_pred <- predict(n_bayes_train_fit, train)
n_bayes_train_table <- table(n_bayes_train_pred, y_train)
n_bayes_train_accuracy[i] <- sum(diag(n_bayes_train_table)) / sum(n_bayes_train_table)
n_bayes_train_accuracy_mean <- mean(n_bayes_train_accuracy)
n_bayes_train_diag <- sum(diag(n_bayes_train_table))
n_bayes_train_mean <- mean(diag(n_bayes_train_table)) / mean(n_bayes_train_table)
n_bayes_train_sd <- sd(diag(n_bayes_train_table)) / sd(n_bayes_train_table)
sum_diag_n_train_bayes <- sum(diag(n_bayes_train_table))
n_bayes_train_prop <- diag(prop.table(n_bayes_train_table, margin = 1))
n_bayes_test_pred <- predict(n_bayes_train_fit, test)
n_bayes_test_table <- table(n_bayes_test_pred, y_test)
n_bayes_test_accuracy[i] <- sum(diag(n_bayes_test_table)) / sum(n_bayes_test_table)
n_bayes_test_accuracy_mean <- mean(n_bayes_test_accuracy)
n_bayes_test_diag <- sum(diag(n_bayes_test_table))
n_bayes_test_mean <- mean(diag(n_bayes_test_table)) / mean(n_bayes_test_table)
n_bayes_test_sd <- sd(diag(n_bayes_test_table)) / sd(n_bayes_test_table)
sum_diag_n_test_bayes <- sum(diag(n_bayes_test_table))
n_bayes_test_prop <- diag(prop.table(n_bayes_test_table, margin = 1))
n_bayes_validation_pred <- predict(n_bayes_train_fit, validation)
n_bayes_validation_table <- table(n_bayes_validation_pred, y_validation)
n_bayes_validation_accuracy[i] <- sum(diag(n_bayes_validation_table)) / sum(n_bayes_validation_table)
n_bayes_validation_accuracy_mean <- mean(n_bayes_validation_accuracy)
n_bayes_validation_diag <- sum(diag(n_bayes_validation_table))
n_bayes_validation_mean <- mean(diag(n_bayes_validation_table)) / mean(n_bayes_validation_table)
n_bayes_validation_sd <- sd(diag(n_bayes_validation_table)) / sd(n_bayes_validation_table)
sum_diag_n_validation_bayes <- sum(diag(n_bayes_validation_table))
n_bayes_validation_prop <- diag(prop.table(n_bayes_validation_table, margin = 1))
n_bayes_holdout[i] <- mean(c(n_bayes_test_accuracy_mean, n_bayes_validation_accuracy_mean))
n_bayes_holdout_mean <- mean(n_bayes_holdout)
n_bayes_residuals[i] <- 1 - n_bayes_holdout[i]
n_bayes_residuals_mean <- mean(n_bayes_residuals)
n_bayes_holdout_sd <- sd(n_bayes_holdout)
n_bayes_holdout_vs_train[i] <- n_bayes_holdout_mean / n_bayes_train_accuracy_mean
n_bayes_holdout_vs_train_mean <- mean(n_bayes_holdout_vs_train)
n_bayes_holdout_vs_train_range <- range(n_bayes_holdout_vs_train)
n_bayes_holdout_vs_train_sd <- sd(n_bayes_holdout_vs_train)
n_bayes_table <- n_bayes_test_table + n_bayes_validation_table
n_bayes_table_total <- n_bayes_table_total + n_bayes_table
n_bayes_table_sum_diag <- sum(diag(n_bayes_table))
n_bayes_confusion_matrix <- caret::confusionMatrix(n_bayes_table_total)
n_bayes_confusion_matrix_summary <- colMeans(n_bayes_confusion_matrix$byClass, na.rm = TRUE)
n_bayes_true_positive_rate[i] <- n_bayes_confusion_matrix_summary[1]
n_bayes_true_positive_rate_mean <- mean(n_bayes_true_positive_rate[i])
n_bayes_true_negative_rate[i] <- n_bayes_confusion_matrix_summary[2]
n_bayes_true_negative_rate_mean <- mean(n_bayes_true_negative_rate)
n_bayes_false_negative_rate[i] <- 1 - n_bayes_true_positive_rate[i]
n_bayes_false_negative_rate_mean <- mean(n_bayes_false_positive_rate)
n_bayes_false_positive_rate[i] <- 1 - n_bayes_true_negative_rate[i]
n_bayes_false_positive_rate_mean <- mean(n_bayes_false_positive_rate)
n_bayes_positive_pred_value[i] <- n_bayes_confusion_matrix_summary[3]
n_bayes_positive_pred_value_mean <- mean(n_bayes_positive_pred_value)
n_bayes_negative_pred_value[i] <- n_bayes_confusion_matrix_summary[4]
n_bayes_negative_pred_value_mean <- mean(n_bayes_negative_pred_value)
n_bayes_prevalence[i] <- n_bayes_confusion_matrix_summary[8]
n_bayes_prevalence_mean <- mean(n_bayes_prevalence)
n_bayes_detection_rate[i] <- n_bayes_confusion_matrix_summary[9]
n_bayes_detection_rate_mean <- mean(n_bayes_detection_rate)
n_bayes_detection_prevalence[i] <- n_bayes_confusion_matrix_summary[10]
n_bayes_detection_prevalence_mean <- mean(n_bayes_detection_prevalence)
n_bayes_classification_error[i] <- 1 - n_bayes_holdout[i]
n_bayes_classification_error_mean <- mean(n_bayes_classification_error)
n_bayes_F1_score[i] <- n_bayes_confusion_matrix_summary[7]
n_bayes_F1_score_mean <- mean(n_bayes_F1_score[i])
n_bayes_end <- Sys.time()
n_bayes_duration[i] <- n_bayes_end - n_bayes_start
n_bayes_duration_mean <- mean(n_bayes_duration)
n_bayes_duration_sd <- sd(n_bayes_duration)
#### 6. Partial Least Squares ####
pls_start <- Sys.time()
message("Working on Partial Least Squares analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
pls_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "PLSModel")
}
if(set_seed == "N"){
pls_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "PLSModel")
}
pls_train_predict <- predict(object = pls_train_fit, newdata = train01)
pls_train_table <- table(pls_train_predict, y_train)
pls_train_accuracy[i] <- sum(diag(pls_train_table)) / sum(pls_train_table)
pls_train_accuracy_mean <- mean(pls_train_accuracy)
pls_train_pred <- pls_train_predict
pls_train_mean <- mean(diag(pls_train_table)) / sum(pls_train_table)
pls_train_sd <- sd(diag(pls_train_table)) / sd(pls_train_table)
sum_diag_train_pls <- sum(diag(pls_train_table))
pls_train_prop <- diag(prop.table(pls_train_table, margin = 1))
pls_test_predict <- predict(object = pls_train_fit, newdata = test01)
pls_test_table <- table(pls_test_predict, y_test)
pls_test_accuracy[i] <- sum(diag(pls_test_table)) / sum(pls_test_table)
pls_test_accuracy_mean <- mean(pls_test_accuracy)
pls_test_pred <- pls_test_predict
pls_test_mean <- mean(diag(pls_test_table)) / sum(pls_test_table)
pls_test_sd <- sd(diag(pls_test_table)) / sd(pls_test_table)
sum_diag_test_pls <- sum(diag(pls_test_table))
pls_test_prop <- diag(prop.table(pls_test_table, margin = 1))
pls_validation_predict <- predict(object = pls_train_fit, newdata = validation01)
pls_validation_table <- table(pls_validation_predict, y_validation)
pls_validation_accuracy[i] <- sum(diag(pls_validation_table)) / sum(pls_validation_table)
pls_validation_accuracy_mean <- mean(pls_validation_accuracy)
pls_validation_pred <- pls_validation_predict
pls_validation_mean <- mean(diag(pls_validation_table)) / sum(pls_validation_table)
pls_validation_sd <- sd(diag(pls_validation_table)) / sd(pls_validation_table)
sum_diag_validation_pls <- sum(diag(pls_validation_table))
pls_validation_prop <- diag(prop.table(pls_validation_table, margin = 1))
pls_holdout[i] <- mean(c(pls_test_accuracy_mean, pls_validation_accuracy_mean))
pls_holdout_mean <- mean(pls_holdout)
pls_residuals[i] <- 1 - pls_holdout[i]
pls_residuals_mean <- mean(pls_residuals)
pls_holdout_sd <- sd(pls_holdout)
pls_holdout_vs_train[i] <- pls_holdout_mean / pls_train_accuracy_mean
pls_holdout_vs_train_mean <- mean(pls_holdout_vs_train)
pls_holdout_vs_train_range <- range(pls_holdout_vs_train)
pls_holdout_vs_train_sd <- sd(pls_holdout_vs_train)
pls_table <- pls_test_table + pls_validation_table
pls_table_total <- pls_table_total + pls_table
pls_table_sum_diag <- sum(diag(pls_table))
pls_confusion_matrix <- caret::confusionMatrix(pls_table_total)
pls_confusion_matrix_summary <- colMeans(pls_confusion_matrix$byClass, na.rm = TRUE)
pls_true_positive_rate[i] <- pls_confusion_matrix_summary[1]
pls_true_positive_rate_mean <- mean(pls_true_positive_rate[i])
pls_true_negative_rate[i] <- pls_confusion_matrix_summary[2]
pls_true_negative_rate_mean <- mean(pls_true_negative_rate)
pls_false_negative_rate[i] <- 1 - pls_true_positive_rate[i]
pls_false_negative_rate_mean <- mean(pls_false_positive_rate)
pls_false_positive_rate[i] <- 1 - pls_true_negative_rate[i]
pls_false_positive_rate_mean <- mean(pls_false_positive_rate)
pls_positive_pred_value[i] <- pls_confusion_matrix_summary[3]
pls_positive_pred_value_mean <- mean(pls_positive_pred_value)
pls_negative_pred_value[i] <- pls_confusion_matrix_summary[4]
pls_negative_pred_value_mean <- mean(pls_negative_pred_value)
pls_prevalence[i] <- pls_confusion_matrix_summary[8]
pls_prevalence_mean <- mean(pls_prevalence)
pls_detection_rate[i] <- pls_confusion_matrix_summary[9]
pls_detection_rate_mean <- mean(pls_detection_rate)
pls_detection_prevalence[i] <- pls_confusion_matrix_summary[10]
pls_detection_prevalence_mean <- mean(pls_detection_prevalence)
pls_classification_error[i] <- 1 - pls_holdout[i]
pls_classification_error_mean <- mean(pls_classification_error)
pls_F1_score[i] <- pls_confusion_matrix_summary[7]
pls_F1_score_mean <- mean(pls_F1_score[i])
pls_end <- Sys.time()
pls_duration[i] <- pls_end - pls_start
pls_duration_mean <- mean(pls_duration)
pls_duration_sd <- sd(pls_duration)
#### 7. Penalized Discriminant Analysis Model ####
pda_start <- Sys.time()
message("Working on Penalized Discriminant analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
pda_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "PDAModel")
}
if(set_seed == "N"){
pda_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "PDAModel")
}
pda_train_predict <- predict(object = pda_train_fit, newdata = train01)
pda_train_table <- table(pda_train_predict, y_train)
pda_train_accuracy[i] <- sum(diag(pda_train_table)) / sum(pda_train_table)
pda_train_accuracy_mean <- mean(pda_train_accuracy)
pda_train_pred <- pda_train_predict
pda_train_mean <- mean(diag(pda_train_table)) / sum(pda_train_table)
pda_train_sd <- sd(diag(pda_train_table)) / sd(pda_train_table)
sum_diag_train_pda <- sum(diag(pda_train_table))
pda_train_prop <- diag(prop.table(pda_train_table, margin = 1))
pda_test_predict <- predict(object = pda_train_fit, newdata = test01)
pda_test_table <- table(pda_test_predict, y_test)
pda_test_accuracy[i] <- sum(diag(pda_test_table)) / sum(pda_test_table)
pda_test_accuracy_mean <- mean(pda_test_accuracy)
pda_test_pred <- pda_test_predict
pda_test_mean <- mean(diag(pda_test_table)) / sum(pda_test_table)
pda_test_sd <- sd(diag(pda_test_table)) / sd(pda_test_table)
sum_diag_test_pda <- sum(diag(pda_test_table))
pda_test_prop <- diag(prop.table(pda_test_table, margin = 1))
pda_validation_predict <- predict(object = pda_train_fit, newdata = validation01)
pda_validation_table <- table(pda_validation_predict, y_validation)
pda_validation_accuracy[i] <- sum(diag(pda_validation_table)) / sum(pda_validation_table)
pda_validation_accuracy_mean <- mean(pda_validation_accuracy)
pda_validation_pred <- pda_validation_predict
pda_validation_mean <- mean(diag(pda_validation_table)) / sum(pda_validation_table)
pda_validation_sd <- sd(diag(pda_validation_table)) / sd(pda_validation_table)
sum_diag_validation_pda <- sum(diag(pda_validation_table))
pda_validation_prop <- diag(prop.table(pda_validation_table, margin = 1))
pda_holdout[i] <- mean(c(pda_test_accuracy_mean, pda_validation_accuracy_mean))
pda_holdout_mean <- mean(pda_holdout)
pda_residuals[i] <- 1 - pda_holdout[i]
pda_residuals_mean <- mean(pda_residuals)
pda_holdout_sd <- sd(pda_holdout)
pda_holdout_vs_train[i] <- pda_holdout_mean / pda_train_accuracy_mean
pda_holdout_vs_train_mean <- mean(pda_holdout_vs_train)
pda_holdout_vs_train_range <- range(pda_holdout_vs_train)
pda_holdout_vs_train_sd <- sd(pda_holdout_vs_train)
pda_table <- pda_test_table + pda_validation_table
pda_table_total <- pda_table_total + pda_table
pda_table_sum_diag <- sum(diag(pda_table))
pda_confusion_matrix <- caret::confusionMatrix(pda_table_total)
pda_confusion_matrix_summary <- colMeans(pda_confusion_matrix$byClass, na.rm = TRUE)
pda_true_positive_rate[i] <- pda_confusion_matrix_summary[1]
pda_true_positive_rate_mean <- mean(pda_true_positive_rate[i])
pda_true_negative_rate[i] <- pda_confusion_matrix_summary[2]
pda_true_negative_rate_mean <- mean(pda_true_negative_rate)
pda_false_negative_rate[i] <- 1 - pda_true_positive_rate[i]
pda_false_negative_rate_mean <- mean(pda_false_positive_rate)
pda_false_positive_rate[i] <- 1 - pda_true_negative_rate[i]
pda_false_positive_rate_mean <- mean(pda_false_positive_rate)
pda_positive_pred_value[i] <- pda_confusion_matrix_summary[3]
pda_positive_pred_value_mean <- mean(pda_positive_pred_value)
pda_negative_pred_value[i] <- pda_confusion_matrix_summary[4]
pda_negative_pred_value_mean <- mean(pda_negative_pred_value)
pda_prevalence[i] <- pda_confusion_matrix_summary[8]
pda_prevalence_mean <- mean(pda_prevalence)
pda_detection_rate[i] <- pda_confusion_matrix_summary[9]
pda_detection_rate_mean <- mean(pda_detection_rate)
pda_detection_prevalence[i] <- pda_confusion_matrix_summary[10]
pda_detection_prevalence_mean <- mean(pda_detection_prevalence)
pda_classification_error[i] <- 1 - pda_holdout[i]
pda_classification_error_mean <- mean(pda_classification_error)
pda_F1_score[i] <- pda_confusion_matrix_summary[7]
pda_F1_score_mean <- mean(pda_F1_score[i])
pda_end <- Sys.time()
pda_duration[i] <- pda_end - pda_start
pda_duration_mean <- mean(pda_duration)
pda_duration_sd <- sd(pda_duration)
#### 8. Random Forest ####
rf_start <- Sys.time()
message("Working on Random Forest analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
rf_train_fit <- randomForest::randomForest(x = train, y = y_train, data = df)
}
if(set_seed == "N"){
rf_train_fit <- randomForest::randomForest(x = train, y = y_train, data = df)
}
rf_train_fit <- randomForest::randomForest(x = train, y = y_train, data = df)
rf_train_pred <- predict(rf_train_fit, train, type = "class")
rf_train_table <- table(rf_train_pred, y_train)
rf_train_accuracy[i] <- sum(diag(rf_train_table)) / sum(rf_train_table)
rf_train_accuracy_mean <- mean(rf_train_accuracy)
rf_train_diag <- sum(diag(rf_train_table))
rf_train_mean <- mean(diag(rf_train_table)) / mean(rf_train_table)
rf_train_sd <- sd(diag(rf_train_table)) / sd(rf_train_table)
sum_diag_train_rf <- sum(diag(rf_train_table))
rf_train_prop <- diag(prop.table(rf_train_table, margin = 1))
rf_test_pred <- predict(rf_train_fit, test, type = "class")
rf_test_table <- table(rf_test_pred, y_test)
rf_test_accuracy[i] <- sum(diag(rf_test_table)) / sum(rf_test_table)
rf_test_accuracy_mean <- mean(rf_test_accuracy)
rf_test_diag <- sum(diag(rf_test_table))
rf_test_mean <- mean(diag(rf_test_table)) / mean(rf_test_table)
rf_test_sd <- sd(diag(rf_test_table)) / sd(rf_test_table)
sum_diag_test_rf <- sum(diag(rf_test_table))
rf_test_prop <- diag(prop.table(rf_test_table, margin = 1))
rf_validation_pred <- predict(rf_train_fit, validation, type = "class")
rf_validation_table <- table(rf_validation_pred, y_validation)
rf_validation_accuracy[i] <- sum(diag(rf_validation_table)) / sum(rf_validation_table)
rf_validation_accuracy_mean <- mean(rf_validation_accuracy)
rf_validation_diag <- sum(diag(rf_validation_table))
rf_validation_mean <- mean(diag(rf_validation_table)) / mean(rf_validation_table)
rf_validation_sd <- sd(diag(rf_validation_table)) / sd(rf_validation_table)
sum_diag_validation_rf <- sum(diag(rf_validation_table))
rf_validation_prop <- diag(prop.table(rf_validation_table, margin = 1))
rf_holdout[i] <- mean(c(rf_test_accuracy_mean, rf_validation_accuracy_mean))
rf_holdout_mean <- mean(rf_holdout)
rf_residuals[i] <- 1 - rf_holdout[i]
rf_residuals_mean <- mean(rf_residuals)
rf_holdout_sd <- sd(rf_holdout)
rf_holdout_vs_train[i] <- rf_holdout_mean / rf_train_accuracy_mean
rf_holdout_vs_train_mean <- mean(rf_holdout_vs_train)
rf_holdout_vs_train_range <- range(rf_holdout_vs_train)
rf_holdout_vs_train_sd <- sd(rf_holdout_vs_train)
rf_table <- rf_test_table + rf_validation_table
rf_table_total <- rf_table_total + rf_table
rf_table_sum_diag <- sum(diag(rf_table))
rf_confusion_matrix <- caret::confusionMatrix(rf_table_total)
rf_confusion_matrix_summary <- colMeans(rf_confusion_matrix$byClass, na.rm = TRUE)
rf_true_positive_rate[i] <- rf_confusion_matrix_summary[1]
rf_true_positive_rate_mean <- mean(rf_true_positive_rate[i])
rf_true_negative_rate[i] <- rf_confusion_matrix_summary[2]
rf_true_negative_rate_mean <- mean(rf_true_negative_rate)
rf_false_negative_rate[i] <- 1 - rf_true_positive_rate[i]
rf_false_negative_rate_mean <- mean(rf_false_positive_rate)
rf_false_positive_rate[i] <- 1 - rf_true_negative_rate[i]
rf_false_positive_rate_mean <- mean(rf_false_positive_rate)
rf_positive_pred_value[i] <- rf_confusion_matrix_summary[3]
rf_positive_pred_value_mean <- mean(rf_positive_pred_value)
rf_negative_pred_value[i] <- rf_confusion_matrix_summary[4]
rf_negative_pred_value_mean <- mean(rf_negative_pred_value)
rf_prevalence[i] <- rf_confusion_matrix_summary[8]
rf_prevalence_mean <- mean(rf_prevalence)
rf_detection_rate[i] <- rf_confusion_matrix_summary[9]
rf_detection_rate_mean <- mean(rf_detection_rate)
rf_detection_prevalence[i] <- rf_confusion_matrix_summary[10]
rf_detection_prevalence_mean <- mean(rf_detection_prevalence)
rf_classification_error[i] <- 1 - rf_holdout[i]
rf_classification_error_mean <- mean(rf_classification_error)
rf_F1_score[i] <- rf_confusion_matrix_summary[7]
rf_F1_score_mean <- mean(rf_F1_score[i])
rf_end <- Sys.time()
rf_duration[i] <- rf_end - rf_start
rf_duration_mean <- mean(rf_duration)
rf_duration_sd <- sd(rf_duration)
#### 9. Ranger Model ####
ranger_start <- Sys.time()
message("Working on Ranger analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ranger_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "RangerModel")
}
if(set_seed == "N"){
ranger_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "RangerModel")
}
ranger_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "RangerModel")
ranger_train_predict <- predict(object = ranger_train_fit, newdata = train01)
ranger_train_table <- table(ranger_train_predict, y_train)
ranger_train_accuracy[i] <- sum(diag(ranger_train_table)) / sum(ranger_train_table)
ranger_train_accuracy_mean <- mean(ranger_train_accuracy)
ranger_train_pred <- ranger_train_predict
ranger_train_mean <- mean(diag(ranger_train_table)) / sum(ranger_train_table)
ranger_train_sd <- sd(diag(ranger_train_table)) / sd(ranger_train_table)
sum_diag_train_ranger <- sum(diag(ranger_train_table))
ranger_train_prop <- diag(prop.table(ranger_train_table, margin = 1))
ranger_test_predict <- predict(object = ranger_train_fit, newdata = test01)
ranger_test_table <- table(ranger_test_predict, y_test)
ranger_test_accuracy[i] <- sum(diag(ranger_test_table)) / sum(ranger_test_table)
ranger_test_accuracy_mean <- mean(ranger_test_accuracy)
ranger_test_pred <- ranger_test_predict
ranger_test_mean <- mean(diag(ranger_test_table)) / sum(ranger_test_table)
ranger_test_sd <- sd(diag(ranger_test_table)) / sd(ranger_test_table)
sum_diag_test_ranger <- sum(diag(ranger_test_table))
ranger_test_prop <- diag(prop.table(ranger_test_table, margin = 1))
ranger_validation_predict <- predict(object = ranger_train_fit, newdata = validation01)
ranger_validation_table <- table(ranger_validation_predict, y_validation)
ranger_validation_accuracy[i] <- sum(diag(ranger_validation_table)) / sum(ranger_validation_table)
ranger_validation_accuracy_mean <- mean(ranger_validation_accuracy)
ranger_validation_pred <- ranger_validation_predict
ranger_validation_mean <- mean(diag(ranger_validation_table)) / sum(ranger_validation_table)
ranger_validation_sd <- sd(diag(ranger_validation_table)) / sd(ranger_validation_table)
sum_diag_validation_ranger <- sum(diag(ranger_validation_table))
ranger_validation_prop <- diag(prop.table(ranger_validation_table, margin = 1))
ranger_holdout[i] <- mean(c(ranger_test_accuracy_mean, ranger_validation_accuracy_mean))
ranger_holdout_mean <- mean(ranger_holdout)
ranger_residuals[i] <- 1 - ranger_holdout[i]
ranger_residuals_mean <- mean(ranger_residuals)
ranger_holdout_sd <- sd(ranger_holdout)
ranger_holdout_vs_train[i] <- ranger_holdout_mean / ranger_train_accuracy_mean
ranger_holdout_vs_train_mean <- mean(ranger_holdout_vs_train)
ranger_holdout_vs_train_range <- range(ranger_holdout_vs_train)
ranger_holdout_vs_train_sd <- sd(ranger_holdout_vs_train)
ranger_table <- ranger_test_table + ranger_validation_table
ranger_table_total <- ranger_table_total + ranger_table
ranger_table_sum_diag <- sum(diag(ranger_table))
ranger_confusion_matrix <- caret::confusionMatrix(ranger_table_total)
ranger_confusion_matrix_summary <- colMeans(ranger_confusion_matrix$byClass, na.rm = TRUE)
ranger_true_positive_rate[i] <- ranger_confusion_matrix_summary[1]
ranger_true_positive_rate_mean <- mean(ranger_true_positive_rate[i])
ranger_true_negative_rate[i] <- ranger_confusion_matrix_summary[2]
ranger_true_negative_rate_mean <- mean(ranger_true_negative_rate)
ranger_false_negative_rate[i] <- 1 - ranger_true_positive_rate[i]
ranger_false_negative_rate_mean <- mean(ranger_false_positive_rate)
ranger_false_positive_rate[i] <- 1 - ranger_true_negative_rate[i]
ranger_false_positive_rate_mean <- mean(ranger_false_positive_rate)
ranger_positive_pred_value[i] <- ranger_confusion_matrix_summary[3]
ranger_positive_pred_value_mean <- mean(ranger_positive_pred_value)
ranger_negative_pred_value[i] <- ranger_confusion_matrix_summary[4]
ranger_negative_pred_value_mean <- mean(ranger_negative_pred_value)
ranger_prevalence[i] <- ranger_confusion_matrix_summary[8]
ranger_prevalence_mean <- mean(ranger_prevalence)
ranger_detection_rate[i] <- ranger_confusion_matrix_summary[9]
ranger_detection_rate_mean <- mean(ranger_detection_rate)
ranger_detection_prevalence[i] <- ranger_confusion_matrix_summary[10]
ranger_detection_prevalence_mean <- mean(ranger_detection_prevalence)
ranger_classification_error[i] <- 1 - ranger_holdout[i]
ranger_classification_error_mean <- mean(ranger_classification_error)
ranger_F1_score[i] <- ranger_confusion_matrix_summary[7]
ranger_F1_score_mean <- mean(ranger_F1_score[i])
ranger_end <- Sys.time()
ranger_duration[i] <- ranger_end - ranger_start
ranger_duration_mean <- mean(ranger_duration)
ranger_duration_sd <- sd(ranger_duration)
#### 10. RPart Model ####
rpart_start <- Sys.time()
message("Working on RPart analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
rpart_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "RPartModel")
}
if(set_seed == "N"){
rpart_train_fit <- MachineShop::fit(y ~ ., data = train01, model = "RPartModel")
}
rpart_train_predict <- predict(object = rpart_train_fit, newdata = train01)
rpart_train_table <- table(rpart_train_predict, y_train)
rpart_train_accuracy[i] <- sum(diag(rpart_train_table)) / sum(rpart_train_table)
rpart_train_accuracy_mean <- mean(rpart_train_accuracy)
rpart_train_pred <- rpart_train_predict
rpart_train_mean <- mean(diag(rpart_train_table)) / sum(rpart_train_table)
rpart_train_sd <- sd(diag(rpart_train_table)) / sd(rpart_train_table)
sum_diag_train_rpart <- sum(diag(rpart_train_table))
rpart_train_prop <- diag(prop.table(rpart_train_table, margin = 1))
rpart_test_predict <- predict(object = rpart_train_fit, newdata = test01)
rpart_test_table <- table(rpart_test_predict, y_test)
rpart_test_accuracy[i] <- sum(diag(rpart_test_table)) / sum(rpart_test_table)
rpart_test_accuracy_mean <- mean(rpart_test_accuracy)
rpart_test_pred <- rpart_test_predict
rpart_test_mean <- mean(diag(rpart_test_table)) / sum(rpart_test_table)
rpart_test_sd <- sd(diag(rpart_test_table)) / sd(rpart_test_table)
sum_diag_test_rpart <- sum(diag(rpart_test_table))
rpart_test_prop <- diag(prop.table(rpart_test_table, margin = 1))
rpart_validation_predict <- predict(object = rpart_train_fit, newdata = validation01)
rpart_validation_table <- table(rpart_validation_predict, y_validation)
rpart_validation_accuracy[i] <- sum(diag(rpart_validation_table)) / sum(rpart_validation_table)
rpart_validation_accuracy_mean <- mean(rpart_validation_accuracy)
rpart_validation_pred <- rpart_validation_predict
rpart_validation_mean <- mean(diag(rpart_validation_table)) / sum(rpart_validation_table)
rpart_validation_sd <- sd(diag(rpart_validation_table)) / sd(rpart_validation_table)
sum_diag_validation_rpart <- sum(diag(rpart_validation_table))
rpart_validation_prop <- diag(prop.table(rpart_validation_table, margin = 1))
rpart_holdout[i] <- mean(c(rpart_test_accuracy_mean, rpart_validation_accuracy_mean))
rpart_holdout_mean <- mean(rpart_holdout)
rpart_residuals[i] <- 1 - rpart_holdout[i]
rpart_residuals_mean <- mean(rpart_residuals)
rpart_holdout_sd <- sd(rpart_holdout)
rpart_holdout_vs_train[i] <- rpart_holdout_mean / rpart_train_accuracy_mean
rpart_holdout_vs_train_mean <- mean(rpart_holdout_vs_train)
rpart_holdout_vs_train_range <- range(rpart_holdout_vs_train)
rpart_holdout_vs_train_sd <- sd(rpart_holdout_vs_train)
rpart_table <- rpart_test_table + rpart_validation_table
rpart_table_total <- rpart_table_total + rpart_table
rpart_table_sum_diag <- sum(diag(rpart_table))
rpart_confusion_matrix <- caret::confusionMatrix(rpart_table_total)
rpart_confusion_matrix_summary <- colMeans(rpart_confusion_matrix$byClass, na.rm = TRUE)
rpart_true_positive_rate[i] <- rpart_confusion_matrix_summary[1]
rpart_true_positive_rate_mean <- mean(rpart_true_positive_rate[i])
rpart_true_negative_rate[i] <- rpart_confusion_matrix_summary[2]
rpart_true_negative_rate_mean <- mean(rpart_true_negative_rate)
rpart_false_negative_rate[i] <- 1 - rpart_true_positive_rate[i]
rpart_false_negative_rate_mean <- mean(rpart_false_positive_rate)
rpart_false_positive_rate[i] <- 1 - rpart_true_negative_rate[i]
rpart_false_positive_rate_mean <- mean(rpart_false_positive_rate)
rpart_positive_pred_value[i] <- rpart_confusion_matrix_summary[3]
rpart_positive_pred_value_mean <- mean(rpart_positive_pred_value)
rpart_negative_pred_value[i] <- rpart_confusion_matrix_summary[4]
rpart_negative_pred_value_mean <- mean(rpart_negative_pred_value)
rpart_prevalence[i] <- rpart_confusion_matrix_summary[8]
rpart_prevalence_mean <- mean(rpart_prevalence)
rpart_detection_rate[i] <- rpart_confusion_matrix_summary[9]
rpart_detection_rate_mean <- mean(rpart_detection_rate)
rpart_detection_prevalence[i] <- rpart_confusion_matrix_summary[10]
rpart_detection_prevalence_mean <- mean(rpart_detection_prevalence)
rpart_classification_error[i] <- 1 - rpart_holdout[i]
rpart_classification_error_mean <- mean(rpart_classification_error)
rpart_F1_score[i] <- rpart_confusion_matrix_summary[7]
rpart_F1_score_mean <- mean(rpart_F1_score[i])
rpart_end <- Sys.time()
rpart_duration[i] <- rpart_end - rpart_start
rpart_duration_mean <- mean(rpart_duration)
rpart_duration_sd <- sd(rpart_duration)
#### 11. Support Vector Machines ####
svm_start <- Sys.time()
message("Working on Support Vector Machine analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
svm_train_fit <- e1071::svm(y_train ~ ., data = train, kernel = "radial", gamma = 1, cost = 1)
}
if(set_seed == "N"){
svm_train_fit <- e1071::svm(y_train ~ ., data = train, kernel = "radial", gamma = 1, cost = 1)
}
svm_train_pred <- predict(svm_train_fit, train, type = "class")
svm_train_table <- table(svm_train_pred, y_train)
svm_train_accuracy[i] <- sum(diag(svm_train_table)) / sum(svm_train_table)
svm_train_accuracy_mean <- mean(svm_train_accuracy)
svm_train_diag <- sum(diag(svm_train_table))
svm_train_mean <- mean(diag(svm_train_table)) / mean(svm_train_table)
svm_train_sd <- sd(diag(svm_train_table)) / sd(svm_train_table)
sum_diag_train_svm <- sum(diag(svm_train_table))
svm_train_prop <- diag(prop.table(svm_train_table, margin = 1))
svm_test_pred <- predict(svm_train_fit, test, type = "class")
svm_test_table <- table(svm_test_pred, y_test)
svm_test_accuracy[i] <- sum(diag(svm_test_table)) / sum(svm_test_table)
svm_test_accuracy_mean <- mean(svm_test_accuracy)
svm_test_diag <- sum(diag(svm_test_table))
svm_test_mean <- mean(diag(svm_test_table)) / mean(svm_test_table)
svm_test_sd <- sd(diag(svm_test_table)) / sd(svm_test_table)
sum_diag_test_svm <- sum(diag(svm_test_table))
svm_test_prop <- diag(prop.table(svm_test_table, margin = 1))
svm_validation_pred <- predict(svm_train_fit, validation, type = "class")
svm_validation_table <- table(svm_validation_pred, y_validation)
svm_validation_accuracy[i] <- sum(diag(svm_validation_table)) / sum(svm_validation_table)
svm_validation_accuracy_mean <- mean(svm_validation_accuracy)
svm_validation_diag <- sum(diag(svm_validation_table))
svm_validation_mean <- mean(diag(svm_validation_table)) / mean(svm_validation_table)
svm_validation_sd <- sd(diag(svm_validation_table)) / sd(svm_validation_table)
sum_diag_validation_svm <- sum(diag(svm_validation_table))
svm_validation_prop <- diag(prop.table(svm_validation_table, margin = 1))
svm_holdout[i] <- mean(c(svm_test_accuracy_mean, svm_validation_accuracy_mean))
svm_holdout_mean <- mean(svm_holdout)
svm_residuals[i] <- 1 - svm_holdout[i]
svm_residuals_mean <- mean(svm_residuals)
svm_holdout_sd <- sd(svm_holdout)
svm_holdout_vs_train[i] <- svm_holdout_mean / svm_train_accuracy_mean
svm_holdout_vs_train_mean <- mean(svm_holdout_vs_train)
svm_holdout_vs_train_range <- range(svm_holdout_vs_train)
svm_holdout_vs_train_sd <- sd(svm_holdout_vs_train)
svm_table <- svm_test_table + svm_validation_table
svm_table_total <- svm_table_total + svm_table
svm_table_sum_diag <- sum(diag(svm_table))
svm_confusion_matrix <- caret::confusionMatrix(svm_table_total)
svm_confusion_matrix_summary <- colMeans(svm_confusion_matrix$byClass, na.rm = TRUE)
svm_true_positive_rate[i] <- svm_confusion_matrix_summary[1]
svm_true_positive_rate_mean <- mean(svm_true_positive_rate[i])
svm_true_negative_rate[i] <- svm_confusion_matrix_summary[2]
svm_true_negative_rate_mean <- mean(svm_true_negative_rate)
svm_false_negative_rate[i] <- 1 - svm_true_positive_rate[i]
svm_false_negative_rate_mean <- mean(svm_false_positive_rate)
svm_false_positive_rate[i] <- 1 - svm_true_negative_rate[i]
svm_false_positive_rate_mean <- mean(svm_false_positive_rate)
svm_positive_pred_value[i] <- svm_confusion_matrix_summary[3]
svm_positive_pred_value_mean <- mean(svm_positive_pred_value)
svm_negative_pred_value[i] <- svm_confusion_matrix_summary[4]
svm_negative_pred_value_mean <- mean(svm_negative_pred_value)
svm_prevalence[i] <- svm_confusion_matrix_summary[8]
svm_prevalence_mean <- mean(svm_prevalence)
svm_detection_rate[i] <- svm_confusion_matrix_summary[9]
svm_detection_rate_mean <- mean(svm_detection_rate)
svm_detection_prevalence[i] <- svm_confusion_matrix_summary[10]
svm_detection_prevalence_mean <- mean(svm_detection_prevalence)
svm_classification_error[i] <- 1 - svm_holdout[i]
svm_classification_error_mean <- mean(svm_classification_error)
svm_F1_score[i] <- svm_confusion_matrix_summary[7]
svm_F1_score_mean <- mean(svm_F1_score[i])
svm_end <- Sys.time()
svm_duration[i] <- svm_end - svm_start
svm_duration_mean <- mean(svm_duration)
svm_duration_sd <- sd(svm_duration)
#### 12. Trees ####
tree_start <- Sys.time()
message("Working on Trees analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
tree_train_fit <- tree::tree(y_train ~ ., data = train)
}
if(set_seed == "N"){
tree_train_fit <- tree::tree(y_train ~ ., data = train)
}
tree_train_pred <- predict(tree_train_fit, train, type = "class")
tree_train_table <- table(tree_train_pred, y_train)
tree_train_accuracy[i] <- sum(diag(tree_train_table)) / sum(tree_train_table)
tree_train_accuracy_mean <- mean(tree_train_accuracy)
tree_train_diag <- sum(diag(tree_train_table))
tree_train_mean <- mean(diag(tree_train_table)) / mean(tree_train_table)
tree_train_sd <- sd(diag(tree_train_table)) / sd(tree_train_table)
sum_diag_train_tree <- sum(diag(tree_train_table))
tree_train_prop <- diag(prop.table(tree_train_table, margin = 1))
tree_test_pred <- predict(tree_train_fit, test, type = "class")
tree_test_table <- table(tree_test_pred, y_test)
tree_test_accuracy[i] <- sum(diag(tree_test_table)) / sum(tree_test_table)
tree_test_accuracy_mean <- mean(tree_test_accuracy)
tree_test_diag <- sum(diag(tree_test_table))
tree_test_mean <- mean(diag(tree_test_table)) / mean(tree_test_table)
tree_test_sd <- sd(diag(tree_test_table)) / sd(tree_test_table)
sum_diag_test_tree <- sum(diag(tree_test_table))
tree_test_prop <- diag(prop.table(tree_test_table, margin = 1))
tree_validation_pred <- predict(tree_train_fit, validation, type = "class")
tree_validation_table <- table(tree_validation_pred, y_validation)
tree_validation_accuracy[i] <- sum(diag(tree_validation_table)) / sum(tree_validation_table)
tree_validation_accuracy_mean <- mean(tree_validation_accuracy)
tree_validation_diag <- sum(diag(tree_validation_table))
tree_validation_mean <- mean(diag(tree_validation_table)) / mean(tree_validation_table)
tree_validation_sd <- sd(diag(tree_validation_table)) / sd(tree_validation_table)
sum_diag_validation_tree <- sum(diag(tree_validation_table))
tree_validation_prop <- diag(prop.table(tree_validation_table, margin = 1))
tree_holdout[i] <- mean(c(tree_test_accuracy_mean, tree_validation_accuracy_mean))
tree_holdout_mean <- mean(tree_holdout)
tree_residuals[i] <- 1 - tree_holdout[i]
tree_residuals_mean <- mean(tree_residuals)
tree_holdout_sd <- sd(tree_holdout)
tree_holdout_vs_train[i] <- tree_holdout_mean / tree_train_accuracy_mean
tree_holdout_vs_train_mean <- mean(tree_holdout_vs_train)
tree_holdout_vs_train_range <- range(tree_holdout_vs_train)
tree_holdout_vs_train_sd <- sd(tree_holdout_vs_train)
tree_table <- tree_test_table + tree_validation_table
tree_table_total <- tree_table_total + tree_table
tree_table_sum_diag <- sum(diag(tree_table))
tree_confusion_matrix <- caret::confusionMatrix(tree_table_total)
tree_confusion_matrix_summary <- colMeans(tree_confusion_matrix$byClass, na.rm = TRUE)
tree_true_positive_rate[i] <- tree_confusion_matrix_summary[1]
tree_true_positive_rate_mean <- mean(tree_true_positive_rate[i])
tree_true_negative_rate[i] <- tree_confusion_matrix_summary[2]
tree_true_negative_rate_mean <- mean(tree_true_negative_rate)
tree_false_negative_rate[i] <- 1 - tree_true_positive_rate[i]
tree_false_negative_rate_mean <- mean(tree_false_positive_rate)
tree_false_positive_rate[i] <- 1 - tree_true_negative_rate[i]
tree_false_positive_rate_mean <- mean(tree_false_positive_rate)
tree_positive_pred_value[i] <- tree_confusion_matrix_summary[3]
tree_positive_pred_value_mean <- mean(tree_positive_pred_value)
tree_negative_pred_value[i] <- tree_confusion_matrix_summary[4]
tree_negative_pred_value_mean <- mean(tree_negative_pred_value)
tree_prevalence[i] <- tree_confusion_matrix_summary[8]
tree_prevalence_mean <- mean(tree_prevalence)
tree_detection_rate[i] <- tree_confusion_matrix_summary[9]
tree_detection_rate_mean <- mean(tree_detection_rate)
tree_detection_prevalence[i] <- tree_confusion_matrix_summary[10]
tree_detection_prevalence_mean <- mean(tree_detection_prevalence)
tree_classification_error[i] <- 1 - tree_holdout[i]
tree_classification_error_mean <- mean(tree_classification_error)
tree_F1_score[i] <- tree_confusion_matrix_summary[7]
tree_F1_score_mean <- mean(tree_F1_score[i])
tree_end <- Sys.time()
tree_duration[i] <- tree_end - tree_start
tree_duration_mean <- mean(tree_duration)
tree_duration_sd <- sd(tree_duration)
#### Ensembles start here ####
ensemble1 <- data.frame(
"Bagged_Random_Forest" = c(bag_rf_test_pred, bag_rf_validation_pred),
"Bagging" = c(bagging_test_pred, bagging_validation_pred),
"C50" = c(C50_test_pred, C50_validation_pred),
"Linear" = c(linear_test_pred, linear_validation_pred),
"Naive_Bayes" = c(n_bayes_test_pred, n_bayes_validation_pred),
"Partial_Least_Squares" = c(pls_test_pred, pls_validation_pred),
"Penalized_Discriminant_Analysis" = c(pda_test_pred, pda_validation_pred),
"Random_Forest" = c(rf_test_pred, rf_validation_pred),
"Ranger" = c(ranger_test_pred, ranger_validation_pred),
"RPart" = c(rpart_test_pred, rpart_validation_pred),
"Support_Vector_Machines" = c(svm_test_pred, svm_validation_pred),
"Trees" = c(tree_test_pred, tree_validation_pred)
)
ensemble_row_numbers <- as.numeric(row.names(ensemble1))
ensemble1$y <- df[ensemble_row_numbers, "y"]
ensemble1 <- ensemble1[complete.cases(ensemble1), ]
head_ensemble <- reactable::reactable(head(ensemble1),
searchable = TRUE, pagination = FALSE, wrap = TRUE, rownames = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, resizable = TRUE
)%>%
reactablefmtr::add_title("Head of the ensemble")
ensemble_index <- sample(c(1:3), nrow(ensemble1), replace = TRUE, prob = c(train_amount, test_amount, validation_amount))
ensemble_train <- ensemble1[ensemble_index == 1, ]
ensemble_test <- ensemble1[ensemble_index == 2, ]
ensemble_validation <- ensemble1[ensemble_index == 3, ]
ensemble_y_train <- ensemble_train$y
ensemble_y_test <- ensemble_test$y
ensemble_y_validation <- ensemble_validation$y
message(noquote(""))
message("Working on the Ensembles section")
message(noquote(""))
#### 13. Ensemble Bagged CART ####
ensemble_bag_cart_start <- Sys.time()
message("Working on Ensemble Bagged CART analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_bag_cart_train_fit <- ipred::bagging(y ~ ., data = ensemble_train)
}
if(set_seed == "N"){
ensemble_bag_cart_train_fit <- ipred::bagging(y ~ ., data = ensemble_train)
}
ensemble_bag_cart_train_pred <- predict(ensemble_bag_cart_train_fit, ensemble_train)
ensemble_bag_cart_train_table <- table(ensemble_bag_cart_train_pred, ensemble_train$y)
ensemble_bag_cart_train_accuracy[i] <- sum(diag(ensemble_bag_cart_train_table)) / sum(ensemble_bag_cart_train_table)
ensemble_bag_cart_train_accuracy_mean <- mean(ensemble_bag_cart_train_accuracy)
ensemble_bag_cart_train_mean <- mean(diag(ensemble_bag_cart_train_table)) / mean(ensemble_bag_cart_train_table)
ensemble_bag_cart_train_sd <- sd(diag(ensemble_bag_cart_train_table)) / sd(ensemble_bag_cart_train_table)
ensemble_sum_diag_bag_train_cart <- sum(diag(ensemble_bag_cart_train_table))
ensemble_bag_cart_train_prop <- diag(prop.table(ensemble_bag_cart_train_table, margin = 1))
ensemble_bag_cart_test_pred <- predict(ensemble_bag_cart_train_fit, ensemble_test)
ensemble_bag_cart_test_table <- table(ensemble_bag_cart_test_pred, ensemble_test$y)
ensemble_bag_cart_test_accuracy[i] <- sum(diag(ensemble_bag_cart_test_table)) / sum(ensemble_bag_cart_test_table)
ensemble_bag_cart_test_accuracy_mean <- mean(ensemble_bag_cart_test_accuracy)
ensemble_bag_cart_test_mean <- mean(diag(ensemble_bag_cart_test_table)) / mean(ensemble_bag_cart_test_table)
ensemble_bag_cart_test_sd <- sd(diag(ensemble_bag_cart_test_table)) / sd(ensemble_bag_cart_test_table)
ensemble_sum_diag_bag_test_cart <- sum(diag(ensemble_bag_cart_test_table))
ensemble_bag_cart_test_prop <- diag(prop.table(ensemble_bag_cart_test_table, margin = 1))
ensemble_bag_cart_validation_pred <- predict(ensemble_bag_cart_train_fit, ensemble_validation)
ensemble_bag_cart_validation_table <- table(ensemble_bag_cart_validation_pred, ensemble_validation$y)
ensemble_bag_cart_validation_accuracy[i] <- sum(diag(ensemble_bag_cart_validation_table)) / sum(ensemble_bag_cart_validation_table)
ensemble_bag_cart_validation_accuracy_mean <- mean(ensemble_bag_cart_validation_accuracy)
ensemble_bag_cart_validation_mean <- mean(diag(ensemble_bag_cart_validation_table)) / mean(ensemble_bag_cart_validation_table)
ensemble_bag_cart_validation_sd <- sd(diag(ensemble_bag_cart_validation_table)) / sd(ensemble_bag_cart_validation_table)
ensemble_sum_diag_bag_validation_cart <- sum(diag(ensemble_bag_cart_validation_table))
ensemble_bag_cart_validation_prop <- diag(prop.table(ensemble_bag_cart_validation_table, margin = 1))
ensemble_bag_cart_holdout[i] <- mean(c(ensemble_bag_cart_test_accuracy_mean, ensemble_bag_cart_validation_accuracy_mean))
ensemble_bag_cart_holdout_mean <- mean(ensemble_bag_cart_holdout)
ensemble_bag_cart_residuals[i] <- 1-ensemble_bag_cart_holdout[i]
ensemble_bag_cart_residuals_mean <- mean(ensemble_bag_cart_residuals)
ensemble_bag_cart_holdout_sd <- sd(ensemble_bag_cart_holdout)
ensemble_bag_cart_holdout_vs_train[i] <- ensemble_bag_cart_holdout_mean / ensemble_bag_cart_train_accuracy_mean
ensemble_bag_cart_holdout_vs_train_mean <- mean(ensemble_bag_cart_holdout_vs_train)
ensemble_bag_cart_holdout_vs_train_range <- range(ensemble_bag_cart_holdout_vs_train)
ensemble_bag_cart_holdout_vs_train_sd <- sd(ensemble_bag_cart_holdout_vs_train)
ensemble_bag_cart_table <- ensemble_bag_cart_test_table + ensemble_bag_cart_validation_table
ensemble_bag_cart_table_total <- ensemble_bag_cart_table_total + ensemble_bag_cart_table
ensemble_bag_cart_table_sum_diag <- sum(diag(ensemble_bag_cart_table))
ensemble_bag_cart_confusion_matrix <- caret::confusionMatrix(ensemble_bag_cart_table_total)
ensemble_bag_cart_confusion_matrix_summary <- colMeans(ensemble_bag_cart_confusion_matrix$byClass, na.rm = TRUE)
ensemble_bag_cart_true_positive_rate[i] <- ensemble_bag_cart_confusion_matrix_summary[1]
ensemble_bag_cart_true_positive_rate_mean <- mean(ensemble_bag_cart_true_positive_rate[i])
ensemble_bag_cart_true_negative_rate[i] <- ensemble_bag_cart_confusion_matrix_summary[2]
ensemble_bag_cart_true_negative_rate_mean <- mean(ensemble_bag_cart_true_negative_rate)
ensemble_bag_cart_false_negative_rate[i] <- 1 - ensemble_bag_cart_true_positive_rate[i]
ensemble_bag_cart_false_negative_rate_mean <- mean(ensemble_bag_cart_false_positive_rate)
ensemble_bag_cart_false_positive_rate[i] <- 1 - ensemble_bag_cart_true_negative_rate[i]
ensemble_bag_cart_false_positive_rate_mean <- mean(ensemble_bag_cart_false_positive_rate)
ensemble_bag_cart_positive_pred_value[i] <- ensemble_bag_cart_confusion_matrix_summary[3]
ensemble_bag_cart_positive_pred_value_mean <- mean(ensemble_bag_cart_positive_pred_value)
ensemble_bag_cart_negative_pred_value[i] <- ensemble_bag_cart_confusion_matrix_summary[4]
ensemble_bag_cart_negative_pred_value_mean <- mean(ensemble_bag_cart_negative_pred_value)
ensemble_bag_cart_prevalence[i] <- ensemble_bag_cart_confusion_matrix_summary[8]
ensemble_bag_cart_prevalence_mean <- mean(ensemble_bag_cart_prevalence)
ensemble_bag_cart_detection_rate[i] <- ensemble_bag_cart_confusion_matrix_summary[9]
ensemble_bag_cart_detection_rate_mean <- mean(ensemble_bag_cart_detection_rate)
ensemble_bag_cart_detection_prevalence[i] <- ensemble_bag_cart_confusion_matrix_summary[10]
ensemble_bag_cart_detection_prevalence_mean <- mean(ensemble_bag_cart_detection_prevalence)
ensemble_bag_cart_classification_error[i] <- 1 - ensemble_bag_cart_holdout[i]
ensemble_bag_cart_classification_error_mean <- mean(ensemble_bag_cart_classification_error)
ensemble_bag_cart_F1_score[i] <- ensemble_bag_cart_confusion_matrix_summary[7]
ensemble_bag_cart_F1_score_mean <- mean(ensemble_bag_cart_F1_score[i])
ensemble_bag_cart_end <- Sys.time()
ensemble_bag_cart_duration[i] <- ensemble_bag_cart_end - ensemble_bag_cart_start
ensemble_bag_cart_duration_mean <- mean(ensemble_bag_cart_duration)
ensemble_bag_cart_duration_sd <- sd(ensemble_bag_cart_duration)
#### 14. Ensemble Bagged Random Forest ####
ensemble_bag_rf_start <- Sys.time()
message("Working on Ensemble Bagged Random Forest analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_bag_train_rf <- randomForest::randomForest(ensemble_y_train ~ ., data = ensemble_train, mtry = ncol(ensemble_train) - 1)
}
if(set_seed == "N"){
ensemble_bag_train_rf <- randomForest::randomForest(ensemble_y_train ~ ., data = ensemble_train, mtry = ncol(ensemble_train) - 1)
}
ensemble_bag_rf_train_pred <- predict(ensemble_bag_train_rf, ensemble_train, type = "class")
ensemble_bag_rf_train_table <- table(ensemble_bag_rf_train_pred, ensemble_train$y)
ensemble_bag_rf_train_accuracy[i] <- sum(diag(ensemble_bag_rf_train_table)) / sum(ensemble_bag_rf_train_table)
ensemble_bag_rf_train_accuracy_mean <- mean(ensemble_bag_rf_train_accuracy)
ensemble_bag_rf_train_diag <- sum(diag(ensemble_bag_rf_train_table))
ensemble_bag_rf_train_mean <- mean(diag(ensemble_bag_rf_train_table)) / mean(ensemble_bag_rf_train_table)
ensemble_bag_rf_train_sd <- sd(diag(ensemble_bag_rf_train_table)) / sd(ensemble_bag_rf_train_table)
sum_ensemble_bag_train_rf <- sum(diag(ensemble_bag_rf_train_table))
ensemble_bag_rf_train_prop <- diag(prop.table(ensemble_bag_rf_train_table, margin = 1))
ensemble_bag_rf_test_pred <- predict(ensemble_bag_train_rf, ensemble_test, type = "class")
ensemble_bag_rf_test_table <- table(ensemble_bag_rf_test_pred, ensemble_test$y)
ensemble_bag_rf_test_accuracy[i] <- sum(diag(ensemble_bag_rf_test_table)) / sum(ensemble_bag_rf_test_table)
ensemble_bag_rf_test_accuracy_mean <- mean(ensemble_bag_rf_test_accuracy)
ensemble_bag_rf_test_diag <- sum(diag(ensemble_bag_rf_test_table))
ensemble_bag_rf_test_mean <- mean(diag(ensemble_bag_rf_test_table)) / mean(ensemble_bag_rf_test_table)
ensemble_bag_rf_test_sd <- sd(diag(ensemble_bag_rf_test_table)) / sd(ensemble_bag_rf_test_table)
sum_ensemble_bag_test_rf <- sum(diag(ensemble_bag_rf_test_table))
ensemble_bag_rf_test_prop <- diag(prop.table(ensemble_bag_rf_test_table, margin = 1))
ensemble_bag_rf_validation_pred <- predict(ensemble_bag_train_rf, ensemble_validation, type = "class")
ensemble_bag_rf_validation_table <- table(ensemble_bag_rf_validation_pred, ensemble_validation$y)
ensemble_bag_rf_validation_accuracy[i] <- sum(diag(ensemble_bag_rf_validation_table)) / sum(ensemble_bag_rf_validation_table)
ensemble_bag_rf_validation_accuracy_mean <- mean(ensemble_bag_rf_validation_accuracy)
ensemble_bag_rf_validation_diag <- sum(diag(ensemble_bag_rf_validation_table))
ensemble_bag_rf_validation_mean <- mean(diag(ensemble_bag_rf_validation_table)) / mean(ensemble_bag_rf_validation_table)
ensemble_bag_rf_validation_sd <- sd(diag(ensemble_bag_rf_validation_table)) / sd(ensemble_bag_rf_validation_table)
sum_ensemble_bag_validation_rf <- sum(diag(ensemble_bag_rf_validation_table))
ensemble_bag_rf_validation_prop <- diag(prop.table(ensemble_bag_rf_validation_table, margin = 1))
ensemble_bag_rf_holdout[i] <- mean(c(ensemble_bag_rf_test_accuracy_mean, ensemble_bag_rf_validation_accuracy_mean))
ensemble_bag_rf_holdout_mean <- mean(ensemble_bag_rf_holdout)
ensemble_bag_rf_residuals[i] <- 1 - ensemble_bag_rf_holdout[i]
ensemble_bag_rf_residuals_mean <- mean(ensemble_bag_rf_residuals)
ensemble_bag_rf_holdout_sd <- sd(ensemble_bag_rf_holdout)
ensemble_bag_rf_holdout_vs_train[i] <- ensemble_bag_rf_holdout_mean / ensemble_bag_rf_train_accuracy_mean
ensemble_bag_rf_holdout_vs_train_mean <- mean(ensemble_bag_rf_holdout_vs_train)
ensemble_bag_rf_holdout_vs_train_range <- range(ensemble_bag_rf_holdout_vs_train)
ensemble_bag_rf_holdout_vs_train_sd <- sd(ensemble_bag_rf_holdout_vs_train)
ensemble_bag_rf_table <- ensemble_bag_rf_test_table + ensemble_bag_rf_validation_table
ensemble_bag_rf_table_total <- ensemble_bag_rf_table_total + ensemble_bag_rf_table
ensemble_bag_rf_table_sum_diag <- sum(diag(ensemble_bag_rf_table))
ensemble_bag_rf_confusion_matrix <- caret::confusionMatrix(ensemble_bag_rf_table_total)
ensemble_bag_rf_confusion_matrix_summary <- colMeans(ensemble_bag_rf_confusion_matrix$byClass, na.rm = TRUE)
ensemble_bag_rf_true_positive_rate[i] <- ensemble_bag_rf_confusion_matrix_summary[1]
ensemble_bag_rf_true_positive_rate_mean <- mean(ensemble_bag_rf_true_positive_rate[i])
ensemble_bag_rf_true_negative_rate[i] <- ensemble_bag_rf_confusion_matrix_summary[2]
ensemble_bag_rf_true_negative_rate_mean <- mean(ensemble_bag_rf_true_negative_rate)
ensemble_bag_rf_false_negative_rate[i] <- 1 - ensemble_bag_rf_true_positive_rate[i]
ensemble_bag_rf_false_negative_rate_mean <- mean(ensemble_bag_rf_false_positive_rate)
ensemble_bag_rf_false_positive_rate[i] <- 1 - ensemble_bag_rf_true_negative_rate[i]
ensemble_bag_rf_false_positive_rate_mean <- mean(ensemble_bag_rf_false_positive_rate)
ensemble_bag_rf_positive_pred_value[i] <- ensemble_bag_rf_confusion_matrix_summary[3]
ensemble_bag_rf_positive_pred_value_mean <- mean(ensemble_bag_rf_positive_pred_value)
ensemble_bag_rf_negative_pred_value[i] <- ensemble_bag_rf_confusion_matrix_summary[4]
ensemble_bag_rf_negative_pred_value_mean <- mean(ensemble_bag_rf_negative_pred_value)
ensemble_bag_rf_prevalence[i] <- ensemble_bag_rf_confusion_matrix_summary[8]
ensemble_bag_rf_prevalence_mean <- mean(ensemble_bag_rf_prevalence)
ensemble_bag_rf_detection_rate[i] <- ensemble_bag_rf_confusion_matrix_summary[9]
ensemble_bag_rf_detection_rate_mean <- mean(ensemble_bag_rf_detection_rate)
ensemble_bag_rf_detection_prevalence[i] <- ensemble_bag_rf_confusion_matrix_summary[10]
ensemble_bag_rf_detection_prevalence_mean <- mean(ensemble_bag_rf_detection_prevalence)
ensemble_bag_rf_classification_error[i] <- 1 - ensemble_bag_rf_holdout[i]
ensemble_bag_rf_classification_error_mean <- mean(ensemble_bag_rf_classification_error)
ensemble_bag_rf_F1_score[i] <- ensemble_bag_rf_confusion_matrix_summary[7]
ensemble_bag_rf_F1_score_mean <- mean(ensemble_bag_rf_F1_score[i])
ensemble_bag_rf_end <- Sys.time()
ensemble_bag_rf_duration[i] <- ensemble_bag_rf_end - ensemble_bag_rf_start
ensemble_bag_rf_duration_mean <- mean(ensemble_bag_rf_duration)
ensemble_bag_rf_duration_sd <- sd(ensemble_bag_rf_duration)
#### 15. Ensemble C50 ####
ensemble_C50_start <- Sys.time()
message("Working on Ensemble C50 analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_C50_train_fit <- C50::C5.0(ensemble_y_train ~ ., data = ensemble_train)
}
if(set_seed == "N"){
ensemble_C50_train_fit <- C50::C5.0(ensemble_y_train ~ ., data = ensemble_train)
}
ensemble_C50_train_pred <- predict(ensemble_C50_train_fit, ensemble_train)
ensemble_C50_train_table <- table(ensemble_C50_train_pred, ensemble_y_train)
ensemble_C50_train_accuracy[i] <- sum(diag(ensemble_C50_train_table)) / sum(ensemble_C50_train_table)
ensemble_C50_train_accuracy_mean <- mean(ensemble_C50_train_accuracy)
ensemble_C50_train_mean <- mean(diag(ensemble_C50_train_table)) / mean(ensemble_C50_train_table)
ensemble_C50_train_sd <- sd(diag(ensemble_C50_train_table)) / sd(ensemble_C50_train_table)
sum_diag_ensemble_train_C50 <- sum(diag(ensemble_C50_train_table))
ensemble_C50_train_prop <- diag(prop.table(ensemble_C50_train_table, margin = 1))
ensemble_C50_test_pred <- predict(ensemble_C50_train_fit, ensemble_test)
ensemble_C50_test_table <- table(ensemble_C50_test_pred, ensemble_y_test)
ensemble_C50_test_accuracy[i] <- sum(diag(ensemble_C50_test_table)) / sum(ensemble_C50_test_table)
ensemble_C50_test_accuracy_mean <- mean(ensemble_C50_test_accuracy)
ensemble_C50_test_mean <- mean(diag(ensemble_C50_test_table)) / mean(ensemble_C50_test_table)
ensemble_C50_test_sd <- sd(diag(ensemble_C50_test_table)) / sd(ensemble_C50_test_table)
sum_diag_ensemble_test_C50 <- sum(diag(ensemble_C50_test_table))
ensemble_C50_test_prop <- diag(prop.table(ensemble_C50_test_table, margin = 1))
ensemble_C50_validation_pred <- predict(ensemble_C50_train_fit, ensemble_validation)
ensemble_C50_validation_table <- table(ensemble_C50_validation_pred, ensemble_y_validation)
ensemble_C50_validation_accuracy[i] <- sum(diag(ensemble_C50_validation_table)) / sum(ensemble_C50_validation_table)
ensemble_C50_validation_accuracy_mean <- mean(ensemble_C50_validation_accuracy)
ensemble_C50_validation_mean <- mean(diag(ensemble_C50_validation_table)) / mean(ensemble_C50_validation_table)
ensemble_C50_validation_sd <- sd(diag(ensemble_C50_validation_table)) / sd(ensemble_C50_validation_table)
sum_diag_ensemble_validation_C50 <- sum(diag(ensemble_C50_validation_table))
ensemble_C50_validation_prop <- diag(prop.table(ensemble_C50_validation_table, margin = 1))
ensemble_C50_holdout[i] <- mean(c(ensemble_C50_test_accuracy_mean, ensemble_C50_validation_accuracy_mean))
ensemble_C50_holdout_mean <- mean(ensemble_C50_holdout)
ensemble_C50_residuals[i] <- 1 - ensemble_C50_holdout[i]
ensemble_C50_residuals_mean <- mean(ensemble_C50_residuals)
ensemble_C50_holdout_sd <- sd(ensemble_C50_holdout)
ensemble_C50_holdout_vs_train[i] <- ensemble_C50_holdout_mean / ensemble_C50_train_accuracy_mean
ensemble_C50_holdout_vs_train_mean <- mean(ensemble_C50_holdout_vs_train)
ensemble_C50_holdout_vs_train_range <- range(ensemble_C50_holdout_vs_train)
ensemble_C50_holdout_vs_train_sd <- sd(ensemble_C50_holdout_vs_train)
ensemble_C50_table <- ensemble_C50_test_table + ensemble_C50_validation_table
ensemble_C50_table_total <- ensemble_C50_table_total + ensemble_C50_table
ensemble_C50_table_sum_diag <- sum(diag(ensemble_C50_table))
ensemble_C50_confusion_matrix <- caret::confusionMatrix(ensemble_C50_table_total)
ensemble_C50_confusion_matrix_summary <- colMeans(ensemble_C50_confusion_matrix$byClass, na.rm = TRUE)
ensemble_C50_true_positive_rate[i] <- ensemble_C50_confusion_matrix_summary[1]
ensemble_C50_true_positive_rate_mean <- mean(ensemble_C50_true_positive_rate[i])
ensemble_C50_true_negative_rate[i] <- ensemble_C50_confusion_matrix_summary[2]
ensemble_C50_true_negative_rate_mean <- mean(ensemble_C50_true_negative_rate)
ensemble_C50_false_negative_rate[i] <- 1 - ensemble_C50_true_positive_rate[i]
ensemble_C50_false_negative_rate_mean <- mean(ensemble_C50_false_positive_rate)
ensemble_C50_false_positive_rate[i] <- 1 - ensemble_C50_true_negative_rate[i]
ensemble_C50_false_positive_rate_mean <- mean(ensemble_C50_false_positive_rate)
ensemble_C50_positive_pred_value[i] <- ensemble_C50_confusion_matrix_summary[3]
ensemble_C50_positive_pred_value_mean <- mean(ensemble_C50_positive_pred_value)
ensemble_C50_negative_pred_value[i] <- ensemble_C50_confusion_matrix_summary[4]
ensemble_C50_negative_pred_value_mean <- mean(ensemble_C50_negative_pred_value)
ensemble_C50_prevalence[i] <- ensemble_C50_confusion_matrix_summary[8]
ensemble_C50_prevalence_mean <- mean(ensemble_C50_prevalence)
ensemble_C50_detection_rate[i] <- ensemble_C50_confusion_matrix_summary[9]
ensemble_C50_detection_rate_mean <- mean(ensemble_C50_detection_rate)
ensemble_C50_detection_prevalence[i] <- ensemble_C50_confusion_matrix_summary[10]
ensemble_C50_detection_prevalence_mean <- mean(ensemble_C50_detection_prevalence)
ensemble_C50_classification_error[i] <- 1 - ensemble_C50_holdout[i]
ensemble_C50_classification_error_mean <- mean(ensemble_C50_classification_error)
ensemble_C50_F1_score[i] <- ensemble_C50_confusion_matrix_summary[7]
ensemble_C50_F1_score_mean <- mean(ensemble_C50_F1_score[i])
ensemble_C50_end <- Sys.time()
ensemble_C50_duration[i] <- ensemble_C50_end - ensemble_C50_start
ensemble_C50_duration_mean <- mean(ensemble_C50_duration)
ensemble_C50_duration_sd <- sd(ensemble_C50_duration)
#### 16. Ensemble Naive Bayes ####
ensemble_n_bayes_start <- Sys.time()
message("Working on Ensembles using Naive Bayes analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_n_bayes_train_fit <- e1071::naiveBayes(ensemble_y_train ~ ., data = ensemble_train)
}
if(set_seed == "N"){
ensemble_n_bayes_train_fit <- e1071::naiveBayes(ensemble_y_train ~ ., data = ensemble_train)
}
ensemble_n_bayes_train_pred <- predict(ensemble_n_bayes_train_fit, ensemble_train)
ensemble_n_bayes_train_table <- table(ensemble_n_bayes_train_pred, ensemble_y_train)
ensemble_n_bayes_train_accuracy[i] <- sum(diag(ensemble_n_bayes_train_table)) / sum(ensemble_n_bayes_train_table)
ensemble_n_bayes_train_accuracy_mean <- mean(ensemble_n_bayes_train_accuracy)
ensemble_n_bayes_train_diag <- sum(diag(ensemble_n_bayes_train_table))
ensemble_n_bayes_train_mean <- mean(diag(ensemble_n_bayes_train_table)) / mean(ensemble_n_bayes_train_table)
ensemble_n_bayes_train_sd <- sd(diag(ensemble_n_bayes_train_table)) / sd(ensemble_n_bayes_train_table)
sum_ensemble_n_train_bayes <- sum(diag(ensemble_n_bayes_train_table))
ensemble_n_bayes_train_prop <- diag(prop.table(ensemble_n_bayes_train_table, margin = 1))
ensemble_n_bayes_test_fit <- e1071::naiveBayes(ensemble_y_train ~ ., data = ensemble_train)
ensemble_n_bayes_test_pred <- predict(ensemble_n_bayes_test_fit, ensemble_test)
ensemble_n_bayes_test_table <- table(ensemble_n_bayes_test_pred, ensemble_y_test)
ensemble_n_bayes_test_accuracy[i] <- sum(diag(ensemble_n_bayes_test_table)) / sum(ensemble_n_bayes_test_table)
ensemble_n_bayes_test_accuracy_mean <- mean(ensemble_n_bayes_test_accuracy)
ensemble_n_bayes_test_diag <- sum(diag(ensemble_n_bayes_test_table))
ensemble_n_bayes_test_mean <- mean(diag(ensemble_n_bayes_test_table)) / mean(ensemble_n_bayes_test_table)
ensemble_n_bayes_test_sd <- sd(diag(ensemble_n_bayes_test_table)) / sd(ensemble_n_bayes_test_table)
sum_ensemble_n_test_bayes <- sum(diag(ensemble_n_bayes_test_table))
ensemble_n_bayes_test_prop <- diag(prop.table(ensemble_n_bayes_test_table, margin = 1))
ensemble_n_bayes_validation_fit <- e1071::naiveBayes(ensemble_y_train ~ ., data = ensemble_train)
ensemble_n_bayes_validation_pred <- predict(ensemble_n_bayes_validation_fit, ensemble_validation)
ensemble_n_bayes_validation_table <- table(ensemble_n_bayes_validation_pred, ensemble_y_validation)
ensemble_n_bayes_validation_accuracy[i] <- sum(diag(ensemble_n_bayes_validation_table)) / sum(ensemble_n_bayes_validation_table)
ensemble_n_bayes_validation_accuracy_mean <- mean(ensemble_n_bayes_validation_accuracy)
ensemble_n_bayes_validation_diag <- sum(diag(ensemble_n_bayes_validation_table))
ensemble_n_bayes_validation_mean <- mean(diag(ensemble_n_bayes_validation_table)) / mean(ensemble_n_bayes_validation_table)
ensemble_n_bayes_validation_sd <- sd(diag(ensemble_n_bayes_validation_table)) / sd(ensemble_n_bayes_validation_table)
sum_ensemble_n_validation_bayes <- sum(diag(ensemble_n_bayes_validation_table))
ensemble_n_bayes_validation_prop <- diag(prop.table(ensemble_n_bayes_validation_table, margin = 1))
ensemble_n_bayes_holdout[i] <- mean(c(ensemble_n_bayes_test_accuracy_mean, ensemble_n_bayes_validation_accuracy_mean))
ensemble_n_bayes_holdout_mean <- mean(ensemble_n_bayes_holdout)
ensemble_n_bayes_residuals[i] <- 1 - ensemble_n_bayes_holdout[i]
ensemble_n_bayes_residuals_mean <- mean(ensemble_n_bayes_residuals)
ensemble_n_bayes_holdout_sd <- sd(ensemble_n_bayes_holdout)
ensemble_n_bayes_holdout_vs_train[i] <- ensemble_n_bayes_holdout_mean / ensemble_n_bayes_train_accuracy_mean
ensemble_n_bayes_holdout_vs_train_mean <- mean(ensemble_n_bayes_holdout_vs_train)
ensemble_n_bayes_holdout_vs_train_range <- range(ensemble_n_bayes_holdout_vs_train)
ensemble_n_bayes_holdout_vs_train_sd <- sd(ensemble_n_bayes_holdout_vs_train)
ensemble_n_bayes_table <- ensemble_n_bayes_test_table + ensemble_n_bayes_validation_table
ensemble_n_bayes_table_total <- ensemble_n_bayes_table_total + ensemble_n_bayes_table
ensemble_n_bayes_table_sum_diag <- sum(diag(ensemble_n_bayes_table))
ensemble_n_bayes_confusion_matrix <- caret::confusionMatrix(ensemble_n_bayes_table_total)
ensemble_n_bayes_confusion_matrix_summary <- colMeans(ensemble_n_bayes_confusion_matrix$byClass, na.rm = TRUE)
ensemble_n_bayes_true_positive_rate[i] <- ensemble_n_bayes_confusion_matrix_summary[1]
ensemble_n_bayes_true_positive_rate_mean <- mean(ensemble_n_bayes_true_positive_rate[i])
ensemble_n_bayes_true_negative_rate[i] <- ensemble_n_bayes_confusion_matrix_summary[2]
ensemble_n_bayes_true_negative_rate_mean <- mean(ensemble_n_bayes_true_negative_rate)
ensemble_n_bayes_false_negative_rate[i] <- 1 - ensemble_n_bayes_true_positive_rate[i]
ensemble_n_bayes_false_negative_rate_mean <- mean(ensemble_n_bayes_false_positive_rate)
ensemble_n_bayes_false_positive_rate[i] <- 1 - ensemble_n_bayes_true_negative_rate[i]
ensemble_n_bayes_false_positive_rate_mean <- mean(ensemble_n_bayes_false_positive_rate)
ensemble_n_bayes_positive_pred_value[i] <- ensemble_n_bayes_confusion_matrix_summary[3]
ensemble_n_bayes_positive_pred_value_mean <- mean(ensemble_n_bayes_positive_pred_value)
ensemble_n_bayes_negative_pred_value[i] <- ensemble_n_bayes_confusion_matrix_summary[4]
ensemble_n_bayes_negative_pred_value_mean <- mean(ensemble_n_bayes_negative_pred_value)
ensemble_n_bayes_prevalence[i] <- ensemble_n_bayes_confusion_matrix_summary[8]
ensemble_n_bayes_prevalence_mean <- mean(ensemble_n_bayes_prevalence)
ensemble_n_bayes_detection_rate[i] <- ensemble_n_bayes_confusion_matrix_summary[9]
ensemble_n_bayes_detection_rate_mean <- mean(ensemble_n_bayes_detection_rate)
ensemble_n_bayes_detection_prevalence[i] <- ensemble_n_bayes_confusion_matrix_summary[10]
ensemble_n_bayes_detection_prevalence_mean <- mean(ensemble_n_bayes_detection_prevalence)
ensemble_n_bayes_classification_error[i] <- 1 - ensemble_n_bayes_holdout[i]
ensemble_n_bayes_classification_error_mean <- mean(ensemble_n_bayes_classification_error)
ensemble_n_bayes_F1_score[i] <- ensemble_n_bayes_confusion_matrix_summary[7]
ensemble_n_bayes_F1_score_mean <- mean(ensemble_n_bayes_F1_score[i])
ensemble_n_bayes_end <- Sys.time()
ensemble_n_bayes_duration[i] <- ensemble_n_bayes_end - ensemble_n_bayes_start
ensemble_n_bayes_duration_mean <- mean(ensemble_n_bayes_duration)
ensemble_n_bayes_duration_sd <- sd(ensemble_n_bayes_duration)
#### 17. Ensemble Ranger Model #####
ensemble_ranger_start <- Sys.time()
message("Working on Ensembles using Ranger analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_ranger_train_fit <- MachineShop::fit(y ~ ., data = ensemble_train, model = "RangerModel")
}
if(set_seed == "N"){
ensemble_ranger_train_fit <- MachineShop::fit(y ~ ., data = ensemble_train, model = "RangerModel")
}
ensemble_ranger_train_pred <- predict(ensemble_ranger_train_fit, newdata = ensemble_train)
ensemble_ranger_train_table <- table(ensemble_ranger_train_pred, ensemble_y_train)
ensemble_ranger_train_accuracy[i] <- sum(diag(ensemble_ranger_train_table)) / sum(ensemble_ranger_train_table)
ensemble_ranger_train_accuracy_mean <- mean(ensemble_ranger_train_accuracy)
ensemble_ranger_train_diag <- sum(diag(ensemble_ranger_train_table))
ensemble_ranger_train_mean <- mean(diag(ensemble_ranger_train_table)) / mean(ensemble_ranger_train_table)
ensemble_ranger_train_sd <- sd(diag(ensemble_ranger_train_table)) / sd(diag(ensemble_ranger_train_table))
sum_ensemble_train_ranger <- sum(diag(ensemble_ranger_train_table))
ensemble_ranger_train_prop <- diag(prop.table(ensemble_ranger_train_table, margin = 1))
ensemble_ranger_test_fit <- MachineShop::fit(y ~ ., data = ensemble_train, model = "RangerModel")
ensemble_ranger_test_pred <- predict(ensemble_ranger_test_fit, newdata = ensemble_test)
ensemble_ranger_test_table <- table(ensemble_ranger_test_pred, ensemble_y_test)
ensemble_ranger_test_accuracy[i] <- sum(diag(ensemble_ranger_test_table)) / sum(ensemble_ranger_test_table)
ensemble_ranger_test_accuracy_mean <- mean(ensemble_ranger_test_accuracy)
ensemble_ranger_test_diag <- sum(diag(ensemble_ranger_test_table))
ensemble_ranger_test_mean <- mean(diag(ensemble_ranger_test_table)) / mean(ensemble_ranger_test_table)
ensemble_ranger_test_sd <- sd(diag(ensemble_ranger_test_table)) / sd(diag(ensemble_ranger_test_table))
sum_ensemble_test_ranger <- sum(diag(ensemble_ranger_test_table))
ensemble_ranger_test_prop <- diag(prop.table(ensemble_ranger_test_table, margin = 1))
ensemble_ranger_validation_fit <- MachineShop::fit(y ~ ., data = ensemble_train, model = "RangerModel")
ensemble_ranger_validation_pred <- predict(ensemble_ranger_validation_fit, newdata = ensemble_validation)
ensemble_ranger_validation_table <- table(ensemble_ranger_validation_pred, ensemble_y_validation)
ensemble_ranger_validation_accuracy[i] <- sum(diag(ensemble_ranger_validation_table)) / sum(ensemble_ranger_validation_table)
ensemble_ranger_validation_accuracy_mean <- mean(ensemble_ranger_validation_accuracy)
ensemble_ranger_validation_diag <- sum(diag(ensemble_ranger_validation_table))
ensemble_ranger_validation_mean <- mean(diag(ensemble_ranger_validation_table)) / mean(ensemble_ranger_validation_table)
ensemble_ranger_validation_sd <- sd(diag(ensemble_ranger_validation_table)) / sd(diag(ensemble_ranger_validation_table))
sum_ensemble_validation_ranger <- sum(diag(ensemble_ranger_validation_table))
ensemble_ranger_validation_prop <- diag(prop.table(ensemble_ranger_validation_table, margin = 1))
ensemble_ranger_holdout[i] <- mean(c(ensemble_ranger_test_accuracy_mean, ensemble_ranger_validation_accuracy_mean))
ensemble_ranger_holdout_mean <- mean(ensemble_ranger_holdout)
ensemble_ranger_residuals[i] <- 1 - ensemble_ranger_holdout[i]
ensemble_ranger_residuals_mean <- mean(ensemble_ranger_residuals)
ensemble_ranger_holdout_sd <- sd(ensemble_ranger_holdout)
ensemble_ranger_holdout_vs_train[i] <- ensemble_ranger_holdout_mean / ensemble_ranger_train_accuracy_mean
ensemble_ranger_holdout_vs_train_mean <- mean(ensemble_ranger_holdout_vs_train)
ensemble_ranger_holdout_vs_train_range <- range(ensemble_ranger_holdout_vs_train)
ensemble_ranger_holdout_vs_train_sd <- sd(ensemble_ranger_holdout_vs_train)
ensemble_ranger_table <- ensemble_ranger_test_table + ensemble_ranger_validation_table
ensemble_ranger_table_total <- ensemble_ranger_table_total + ensemble_ranger_table
ensemble_ranger_table_sum_diag <- sum(diag(ensemble_ranger_table))
ensemble_ranger_confusion_matrix <- caret::confusionMatrix(ensemble_ranger_table_total)
ensemble_ranger_confusion_matrix_summary <- colMeans(ensemble_ranger_confusion_matrix$byClass, na.rm = TRUE)
ensemble_ranger_true_positive_rate[i] <- ensemble_ranger_confusion_matrix_summary[1]
ensemble_ranger_true_positive_rate_mean <- mean(ensemble_ranger_true_positive_rate[i])
ensemble_ranger_true_negative_rate[i] <- ensemble_ranger_confusion_matrix_summary[2]
ensemble_ranger_true_negative_rate_mean <- mean(ensemble_ranger_true_negative_rate)
ensemble_ranger_false_negative_rate[i] <- 1 - ensemble_ranger_true_positive_rate[i]
ensemble_ranger_false_negative_rate_mean <- mean(ensemble_ranger_false_positive_rate)
ensemble_ranger_false_positive_rate[i] <- 1 - ensemble_ranger_true_negative_rate[i]
ensemble_ranger_false_positive_rate_mean <- mean(ensemble_ranger_false_positive_rate)
ensemble_ranger_positive_pred_value[i] <- ensemble_ranger_confusion_matrix_summary[3]
ensemble_ranger_positive_pred_value_mean <- mean(ensemble_ranger_positive_pred_value)
ensemble_ranger_negative_pred_value[i] <- ensemble_ranger_confusion_matrix_summary[4]
ensemble_ranger_negative_pred_value_mean <- mean(ensemble_ranger_negative_pred_value)
ensemble_ranger_prevalence[i] <- ensemble_ranger_confusion_matrix_summary[8]
ensemble_ranger_prevalence_mean <- mean(ensemble_ranger_prevalence)
ensemble_ranger_detection_rate[i] <- ensemble_ranger_confusion_matrix_summary[9]
ensemble_ranger_detection_rate_mean <- mean(ensemble_ranger_detection_rate)
ensemble_ranger_detection_prevalence[i] <- ensemble_ranger_confusion_matrix_summary[10]
ensemble_ranger_detection_prevalence_mean <- mean(ensemble_ranger_detection_prevalence)
ensemble_ranger_classification_error[i] <- 1 - ensemble_ranger_holdout[i]
ensemble_ranger_classification_error_mean <- mean(ensemble_ranger_classification_error)
ensemble_ranger_F1_score[i] <- ensemble_ranger_confusion_matrix_summary[7]
ensemble_ranger_F1_score_mean <- mean(ensemble_ranger_F1_score[i])
ensemble_ranger_end <- Sys.time()
ensemble_ranger_duration[i] <- ensemble_ranger_end - ensemble_ranger_start
ensemble_ranger_duration_mean <- mean(ensemble_ranger_duration)
ensemble_ranger_duration_sd <- sd(ensemble_ranger_duration)
#### 18. Ensemble Random Forest ####
ensemble_rf_start <- Sys.time()
message("Working on Ensembles using Random Forest analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_train_rf_fit <- randomForest::randomForest(x = ensemble_train, y = ensemble_y_train)
}
if(set_seed == "N"){
ensemble_train_rf_fit <- randomForest::randomForest(x = ensemble_train, y = ensemble_y_train)
}
ensemble_rf_train_pred <- predict(ensemble_train_rf_fit, ensemble_train, type = "class")
ensemble_rf_train_table <- table(ensemble_rf_train_pred, ensemble_y_train)
ensemble_rf_train_accuracy[i] <- sum(diag(ensemble_rf_train_table)) / sum(ensemble_rf_train_table)
ensemble_rf_train_accuracy_mean <- mean(ensemble_rf_train_accuracy)
ensemble_rf_train_diag <- sum(diag(ensemble_rf_train_table))
ensemble_rf_train_mean <- mean(diag(ensemble_rf_train_table)) / mean(ensemble_rf_train_table)
ensemble_rf_train_sd <- sd(diag(ensemble_rf_train_table)) / sd(ensemble_rf_train_table)
sum_ensemble_train_rf <- sum(diag(ensemble_rf_train_table))
ensemble_rf_train_prop <- diag(prop.table(ensemble_rf_train_table, margin = 1))
ensemble_rf_test_pred <- predict(ensemble_train_rf_fit, ensemble_test, type = "class")
ensemble_rf_test_table <- table(ensemble_rf_test_pred, ensemble_y_test)
ensemble_rf_test_accuracy[i] <- sum(diag(ensemble_rf_test_table)) / sum(ensemble_rf_test_table)
ensemble_rf_test_accuracy_mean <- mean(ensemble_rf_test_accuracy)
ensemble_rf_test_diag <- sum(diag(ensemble_rf_test_table))
ensemble_rf_test_mean <- mean(diag(ensemble_rf_test_table)) / mean(ensemble_rf_test_table)
ensemble_rf_test_sd <- sd(diag(ensemble_rf_test_table)) / sd(ensemble_rf_test_table)
sum_ensemble_test_rf <- sum(diag(ensemble_rf_test_table))
ensemble_rf_test_prop <- diag(prop.table(ensemble_rf_test_table, margin = 1))
ensemble_rf_validation_pred <- predict(ensemble_train_rf_fit, ensemble_validation, type = "class")
ensemble_rf_validation_table <- table(ensemble_rf_validation_pred, ensemble_y_validation)
ensemble_rf_validation_accuracy[i] <- sum(diag(ensemble_rf_validation_table)) / sum(ensemble_rf_validation_table)
ensemble_rf_validation_accuracy_mean <- mean(ensemble_rf_validation_accuracy)
ensemble_rf_validation_diag <- sum(diag(ensemble_rf_validation_table))
ensemble_rf_validation_mean <- mean(diag(ensemble_rf_validation_table)) / mean(ensemble_rf_validation_table)
ensemble_rf_validation_sd <- sd(diag(ensemble_rf_validation_table)) / sd(ensemble_rf_validation_table)
sum_ensemble_validation_rf <- sum(diag(ensemble_rf_validation_table))
ensemble_rf_validation_prop <- diag(prop.table(ensemble_rf_validation_table, margin = 1))
ensemble_rf_holdout[i] <- mean(c(ensemble_rf_test_accuracy_mean, ensemble_rf_validation_accuracy_mean))
ensemble_rf_holdout_mean <- mean(ensemble_rf_holdout)
ensemble_rf_residuals[i] <- 1 - ensemble_rf_holdout[i]
ensemble_rf_residuals_mean <- mean(ensemble_rf_residuals)
ensemble_rf_holdout_sd <- sd(ensemble_rf_holdout)
ensemble_rf_holdout_vs_train[i] <- ensemble_rf_holdout_mean / ensemble_rf_train_accuracy_mean
ensemble_rf_holdout_vs_train_mean <- mean(ensemble_rf_holdout_vs_train)
ensemble_rf_holdout_vs_train_range <- range(ensemble_rf_holdout_vs_train)
ensemble_rf_holdout_vs_train_sd <- sd(ensemble_rf_holdout_vs_train)
ensemble_rf_table <- ensemble_rf_test_table + ensemble_rf_validation_table
ensemble_rf_table_total <- ensemble_rf_table_total + ensemble_rf_table
ensemble_rf_table_sum_diag <- sum(diag(ensemble_rf_table))
ensemble_rf_confusion_matrix <- caret::confusionMatrix(ensemble_rf_table_total)
ensemble_rf_confusion_matrix_summary <- colMeans(ensemble_rf_confusion_matrix$byClass, na.rm = TRUE)
ensemble_rf_true_positive_rate[i] <- ensemble_rf_confusion_matrix_summary[1]
ensemble_rf_true_positive_rate_mean <- mean(ensemble_rf_true_positive_rate[i])
ensemble_rf_true_negative_rate[i] <- ensemble_rf_confusion_matrix_summary[2]
ensemble_rf_true_negative_rate_mean <- mean(ensemble_rf_true_negative_rate)
ensemble_rf_false_negative_rate[i] <- 1 - ensemble_rf_true_positive_rate[i]
ensemble_rf_false_negative_rate_mean <- mean(ensemble_rf_false_positive_rate)
ensemble_rf_false_positive_rate[i] <- 1 - ensemble_rf_true_negative_rate[i]
ensemble_rf_false_positive_rate_mean <- mean(ensemble_rf_false_positive_rate)
ensemble_rf_positive_pred_value[i] <- ensemble_rf_confusion_matrix_summary[3]
ensemble_rf_positive_pred_value_mean <- mean(ensemble_rf_positive_pred_value)
ensemble_rf_negative_pred_value[i] <- ensemble_rf_confusion_matrix_summary[4]
ensemble_rf_negative_pred_value_mean <- mean(ensemble_rf_negative_pred_value)
ensemble_rf_prevalence[i] <- ensemble_rf_confusion_matrix_summary[8]
ensemble_rf_prevalence_mean <- mean(ensemble_rf_prevalence)
ensemble_rf_detection_rate[i] <- ensemble_rf_confusion_matrix_summary[9]
ensemble_rf_detection_rate_mean <- mean(ensemble_rf_detection_rate)
ensemble_rf_detection_prevalence[i] <- ensemble_rf_confusion_matrix_summary[10]
ensemble_rf_detection_prevalence_mean <- mean(ensemble_rf_detection_prevalence)
ensemble_rf_classification_error[i] <- 1 - ensemble_rf_holdout[i]
ensemble_rf_classification_error_mean <- mean(ensemble_rf_classification_error)
ensemble_rf_F1_score[i] <- ensemble_rf_confusion_matrix_summary[7]
ensemble_rf_F1_score_mean <- mean(ensemble_rf_F1_score[i])
ensemble_rf_end <- Sys.time()
ensemble_rf_duration[i] <- ensemble_rf_end - ensemble_rf_start
ensemble_rf_duration_mean <- mean(ensemble_rf_duration)
ensemble_rf_duration_sd <- sd(ensemble_rf_duration)
#### 19. Ensemble Support Vector Machines ####
ensemble_svm_start <- Sys.time()
message("Working on Ensembles using Support Vector Machines analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_svm_train_fit <- e1071::svm(ensemble_y_train ~ ., data = ensemble_train, kernel = "radial", gamma = 1, cost = 1)
}
if(set_seed == "N"){
ensemble_svm_train_fit <- e1071::svm(ensemble_y_train ~ ., data = ensemble_train, kernel = "radial", gamma = 1, cost = 1)
}
ensemble_svm_train_pred <- predict(ensemble_svm_train_fit, ensemble_train, type = "class")
ensemble_svm_train_table <- table(ensemble_svm_train_pred, ensemble_y_train)
ensemble_svm_train_accuracy[i] <- sum(diag(ensemble_svm_train_table)) / sum(ensemble_svm_train_table)
ensemble_svm_train_accuracy_mean <- mean(ensemble_svm_train_accuracy)
ensemble_svm_train_diag <- sum(diag(ensemble_svm_train_table))
ensemble_svm_train_mean <- mean(diag(ensemble_svm_train_table)) / mean(ensemble_svm_train_table)
ensemble_svm_train_sd <- sd(diag(ensemble_svm_train_table)) / sd(ensemble_svm_train_table)
sum_ensemble_train_svm <- sum(diag(ensemble_svm_train_table))
ensemble_svm_train_prop <- diag(prop.table(ensemble_svm_train_table, margin = 1))
ensemble_svm_test_fit <- e1071::svm(ensemble_y_train ~ ., data = ensemble_train, kernel = "radial", gamma = 1, cost = 1)
ensemble_svm_test_pred <- predict(ensemble_svm_test_fit, ensemble_test, type = "class")
ensemble_svm_test_table <- table(ensemble_svm_test_pred, ensemble_y_test)
ensemble_svm_test_accuracy[i] <- sum(diag(ensemble_svm_test_table)) / sum(ensemble_svm_test_table)
ensemble_svm_test_accuracy_mean <- mean(ensemble_svm_test_accuracy)
ensemble_svm_test_diag <- sum(diag(ensemble_svm_test_table))
ensemble_svm_test_mean <- mean(diag(ensemble_svm_test_table)) / mean(ensemble_svm_test_table)
ensemble_svm_test_sd <- sd(diag(ensemble_svm_test_table)) / sd(ensemble_svm_test_table)
sum_ensemble_test_svm <- sum(diag(ensemble_svm_test_table))
ensemble_svm_test_prop <- diag(prop.table(ensemble_svm_test_table, margin = 1))
ensemble_svm_validation_fit <- e1071::svm(ensemble_y_train ~ ., data = ensemble_train, kernel = "radial", gamma = 1, cost = 1)
ensemble_svm_validation_pred <- predict(ensemble_svm_validation_fit, ensemble_validation, type = "class")
ensemble_svm_validation_table <- table(ensemble_svm_validation_pred, ensemble_y_validation)
ensemble_svm_validation_accuracy[i] <- sum(diag(ensemble_svm_validation_table)) / sum(ensemble_svm_validation_table)
ensemble_svm_validation_accuracy_mean <- mean(ensemble_svm_validation_accuracy)
ensemble_svm_validation_diag <- sum(diag(ensemble_svm_validation_table))
ensemble_svm_validation_mean <- mean(diag(ensemble_svm_validation_table)) / mean(ensemble_svm_validation_table)
ensemble_svm_validation_sd <- sd(diag(ensemble_svm_validation_table)) / sd(ensemble_svm_validation_table)
sum_ensemble_validation_svm <- sum(diag(ensemble_svm_validation_table))
ensemble_svm_validation_prop <- diag(prop.table(ensemble_svm_validation_table, margin = 1))
ensemble_svm_holdout[i] <- mean(c(ensemble_svm_test_accuracy_mean, ensemble_svm_validation_accuracy_mean))
ensemble_svm_holdout_mean <- mean(ensemble_svm_holdout)
ensemble_svm_residuals[i] <- 1 - ensemble_svm_holdout[i]
ensemble_svm_residuals_mean <- mean(ensemble_svm_residuals)
ensemble_svm_holdout_sd <- sd(ensemble_svm_holdout)
ensemble_svm_holdout_vs_train[i] <- ensemble_svm_holdout_mean / ensemble_svm_train_accuracy_mean
ensemble_svm_holdout_vs_train_mean <- mean(ensemble_svm_holdout_vs_train)
ensemble_svm_holdout_vs_train_range <- range(ensemble_svm_holdout_vs_train)
ensemble_svm_holdout_vs_train_sd <- sd(ensemble_svm_holdout_vs_train)
ensemble_svm_table <- ensemble_svm_test_table + ensemble_svm_validation_table
ensemble_svm_table_total <- ensemble_svm_table_total + ensemble_svm_table
ensemble_svm_table_sum_diag <- sum(diag(ensemble_svm_table))
ensemble_svm_confusion_matrix <- caret::confusionMatrix(ensemble_svm_table_total)
ensemble_svm_confusion_matrix_summary <- colMeans(ensemble_svm_confusion_matrix$byClass, na.rm = TRUE)
ensemble_svm_true_positive_rate[i] <- ensemble_svm_confusion_matrix_summary[1]
ensemble_svm_true_positive_rate_mean <- mean(ensemble_svm_true_positive_rate[i])
ensemble_svm_true_negative_rate[i] <- ensemble_svm_confusion_matrix_summary[2]
ensemble_svm_true_negative_rate_mean <- mean(ensemble_svm_true_negative_rate)
ensemble_svm_false_negative_rate[i] <- 1 - ensemble_svm_true_positive_rate[i]
ensemble_svm_false_negative_rate_mean <- mean(ensemble_svm_false_positive_rate)
ensemble_svm_false_positive_rate[i] <- 1 - ensemble_svm_true_negative_rate[i]
ensemble_svm_false_positive_rate_mean <- mean(ensemble_svm_false_positive_rate)
ensemble_svm_positive_pred_value[i] <- ensemble_svm_confusion_matrix_summary[3]
ensemble_svm_positive_pred_value_mean <- mean(ensemble_svm_positive_pred_value)
ensemble_svm_negative_pred_value[i] <- ensemble_svm_confusion_matrix_summary[4]
ensemble_svm_negative_pred_value_mean <- mean(ensemble_svm_negative_pred_value)
ensemble_svm_prevalence[i] <- ensemble_svm_confusion_matrix_summary[8]
ensemble_svm_prevalence_mean <- mean(ensemble_svm_prevalence)
ensemble_svm_detection_rate[i] <- ensemble_svm_confusion_matrix_summary[9]
ensemble_svm_detection_rate_mean <- mean(ensemble_svm_detection_rate)
ensemble_svm_detection_prevalence[i] <- ensemble_svm_confusion_matrix_summary[10]
ensemble_svm_detection_prevalence_mean <- mean(ensemble_svm_detection_prevalence)
ensemble_svm_classification_error[i] <- 1 - ensemble_svm_holdout[i]
ensemble_svm_classification_error_mean <- mean(ensemble_svm_classification_error)
ensemble_svm_F1_score[i] <- ensemble_svm_confusion_matrix_summary[7]
ensemble_svm_F1_score_mean <- mean(ensemble_svm_F1_score[i])
ensemble_svm_end <- Sys.time()
ensemble_svm_duration[i] <- ensemble_svm_end - ensemble_svm_start
ensemble_svm_duration_mean <- mean(ensemble_svm_duration)
ensemble_svm_duration_sd <- sd(ensemble_svm_duration)
#### 20. Ensemble Trees ####
ensemble_tree_start <- Sys.time()
message("Working on Ensembles using Trees analysis")
if(set_seed == "Y"){
set.seed(seed = seed)
ensemble_tree_train_fit <- tree::tree(y ~ ., data = ensemble_train)
}
if(set_seed == "N"){
ensemble_tree_train_fit <- tree::tree(y ~ ., data = ensemble_train)
}
ensemble_tree_train_pred <- predict(ensemble_tree_train_fit, ensemble_train, type = "class")
ensemble_tree_train_table <- table(ensemble_tree_train_pred, ensemble_y_train)
ensemble_tree_train_accuracy[i] <- sum(diag(ensemble_tree_train_table)) / sum(ensemble_tree_train_table)
ensemble_tree_train_accuracy_mean <- mean(ensemble_tree_train_accuracy)
ensemble_tree_train_diag <- sum(diag(ensemble_tree_train_table))
ensemble_tree_train_mean <- mean(diag(ensemble_tree_train_table)) / mean(ensemble_tree_train_table)
ensemble_tree_train_sd <- sd(diag(ensemble_tree_train_table)) / sd(ensemble_tree_train_table)
sum_ensemble_train_tree <- sum(diag(ensemble_tree_train_table))
ensemble_tree_train_prop <- diag(prop.table(ensemble_tree_train_table, margin = 1))
ensemble_tree_test_pred <- predict(ensemble_tree_train_fit, ensemble_test, type = "class")
ensemble_tree_test_table <- table(ensemble_tree_test_pred, ensemble_y_test)
ensemble_tree_test_accuracy[i] <- sum(diag(ensemble_tree_test_table)) / sum(ensemble_tree_test_table)
ensemble_tree_test_accuracy_mean <- mean(ensemble_tree_test_accuracy)
ensemble_tree_test_diag <- sum(diag(ensemble_tree_test_table))
ensemble_tree_test_mean <- mean(diag(ensemble_tree_test_table)) / mean(ensemble_tree_test_table)
ensemble_tree_test_sd <- sd(diag(ensemble_tree_test_table)) / sd(ensemble_tree_test_table)
sum_ensemble_test_tree <- sum(diag(ensemble_tree_test_table))
ensemble_tree_test_prop <- diag(prop.table(ensemble_tree_test_table, margin = 1))
ensemble_tree_validation_pred <- predict(ensemble_tree_train_fit, ensemble_validation, type = "class")
ensemble_tree_validation_table <- table(ensemble_tree_validation_pred, ensemble_y_validation)
ensemble_tree_validation_accuracy[i] <- sum(diag(ensemble_tree_validation_table)) / sum(ensemble_tree_validation_table)
ensemble_tree_validation_accuracy_mean <- mean(ensemble_tree_validation_accuracy)
ensemble_tree_validation_diag <- sum(diag(ensemble_tree_validation_table))
ensemble_tree_validation_mean <- mean(diag(ensemble_tree_validation_table)) / mean(ensemble_tree_validation_table)
ensemble_tree_validation_sd <- sd(diag(ensemble_tree_validation_table)) / sd(ensemble_tree_validation_table)
sum_ensemble_validation_tree <- sum(diag(ensemble_tree_validation_table))
ensemble_tree_validation_prop <- diag(prop.table(ensemble_tree_validation_table, margin = 1))
ensemble_tree_holdout[i] <- mean(c(ensemble_tree_test_accuracy_mean, ensemble_tree_validation_accuracy_mean))
ensemble_tree_holdout_mean <- mean(ensemble_tree_holdout)
ensemble_tree_residuals[i] <- 1 - ensemble_tree_holdout[i]
ensemble_tree_residuals_mean <- mean(ensemble_tree_residuals)
ensemble_tree_holdout_sd <- sd(ensemble_tree_holdout)
ensemble_tree_holdout_vs_train[i] <- ensemble_tree_holdout_mean / ensemble_tree_train_accuracy_mean
ensemble_tree_holdout_vs_train_mean <- mean(ensemble_tree_holdout_vs_train)
ensemble_tree_holdout_vs_train_range <- range(ensemble_tree_holdout_vs_train)
ensemble_tree_holdout_vs_train_sd <- sd(ensemble_tree_holdout_vs_train)
ensemble_tree_table <- ensemble_tree_test_table + ensemble_tree_validation_table
ensemble_tree_table_total <- ensemble_tree_table_total + ensemble_tree_table
ensemble_tree_table_sum_diag <- sum(diag(ensemble_tree_table))
ensemble_tree_confusion_matrix <- caret::confusionMatrix(ensemble_tree_table_total)
ensemble_tree_confusion_matrix_summary <- colMeans(ensemble_tree_confusion_matrix$byClass, na.rm = TRUE)
ensemble_tree_true_positive_rate[i] <- ensemble_tree_confusion_matrix_summary[1]
ensemble_tree_true_positive_rate_mean <- mean(ensemble_tree_true_positive_rate[i])
ensemble_tree_true_negative_rate[i] <- ensemble_tree_confusion_matrix_summary[2]
ensemble_tree_true_negative_rate_mean <- mean(ensemble_tree_true_negative_rate)
ensemble_tree_false_negative_rate[i] <- 1 - ensemble_tree_true_positive_rate[i]
ensemble_tree_false_negative_rate_mean <- mean(ensemble_tree_false_positive_rate)
ensemble_tree_false_positive_rate[i] <- 1 - ensemble_tree_true_negative_rate[i]
ensemble_tree_false_positive_rate_mean <- mean(ensemble_tree_false_positive_rate)
ensemble_tree_positive_pred_value[i] <- ensemble_tree_confusion_matrix_summary[3]
ensemble_tree_positive_pred_value_mean <- mean(ensemble_tree_positive_pred_value)
ensemble_tree_negative_pred_value[i] <- ensemble_tree_confusion_matrix_summary[4]
ensemble_tree_negative_pred_value_mean <- mean(ensemble_tree_negative_pred_value)
ensemble_tree_prevalence[i] <- ensemble_tree_confusion_matrix_summary[8]
ensemble_tree_prevalence_mean <- mean(ensemble_tree_prevalence)
ensemble_tree_detection_rate[i] <- ensemble_tree_confusion_matrix_summary[9]
ensemble_tree_detection_rate_mean <- mean(ensemble_tree_detection_rate)
ensemble_tree_detection_prevalence[i] <- ensemble_tree_confusion_matrix_summary[10]
ensemble_tree_detection_prevalence_mean <- mean(ensemble_tree_detection_prevalence)
ensemble_tree_classification_error[i] <- 1 - ensemble_tree_holdout[i]
ensemble_tree_classification_error_mean <- mean(ensemble_tree_classification_error)
ensemble_tree_F1_score[i] <- ensemble_tree_confusion_matrix_summary[7]
ensemble_tree_F1_score_mean <- mean(ensemble_tree_F1_score[i])
ensemble_tree_end <- Sys.time()
ensemble_tree_duration[i] <- ensemble_tree_end - ensemble_tree_start
ensemble_tree_duration_mean <- mean(ensemble_tree_duration)
ensemble_tree_duration_sd <- sd(ensemble_tree_duration)
}
#### Ensembles end here ####
Results <- data.frame(
"Model" = c(
"Bagging", "Bagged Random Forest", "C50",
"Linear", "Naive Bayes",
"Partial Least Squares", "Penalized Discriminant Analysis", "Random Forest", "Ranger",
"RPart", "Support Vector Machines", "Trees",
"Ensemble Bagged Cart", "Ensemble Bagged Random Forest", "Ensemble C50",
"Ensemble Naive Bayes", "Ensemble Ranger", "Ensemble Random Forest", "Ensemble Support Vector Machines",
"Ensemble Trees"
),
"Mean_Holdout_Accuracy" = round(c(
bagging_holdout_mean, bag_rf_holdout_mean,
C50_holdout_mean, linear_holdout_mean,
n_bayes_holdout_mean, pls_holdout_mean, pda_holdout_mean, rf_holdout_mean, ranger_holdout_mean,
rpart_holdout_mean, svm_holdout_mean, tree_holdout_mean,
ensemble_bag_cart_holdout_mean, ensemble_bag_rf_holdout_mean, ensemble_C50_holdout_mean,
ensemble_n_bayes_holdout_mean, ensemble_ranger_holdout_mean, ensemble_rf_holdout_mean, ensemble_svm_holdout_mean,
ensemble_tree_holdout_mean
), 4),
"Accuracy_Holdout_Std.Dev" = round(c(
bagging_holdout_sd, bag_rf_holdout_sd,
C50_holdout_sd, linear_holdout_sd,
n_bayes_holdout_sd, pls_holdout_sd, pda_holdout_sd, rf_holdout_sd, ranger_holdout_sd,
rpart_holdout_sd, svm_holdout_sd, tree_holdout_sd,
ensemble_bag_cart_holdout_sd, ensemble_bag_rf_holdout_sd, ensemble_C50_holdout_sd,
ensemble_n_bayes_holdout_sd, ensemble_ranger_holdout_sd, ensemble_rf_holdout_sd, ensemble_svm_holdout_sd,
ensemble_tree_holdout_sd
), 4),
'Classification_Error_Mean' = round(c(
bagging_classification_error_mean, bag_rf_classification_error_mean,
C50_classification_error_mean, linear_classification_error_mean,
n_bayes_classification_error_mean, pls_classification_error_mean, pda_classification_error_mean, rf_classification_error_mean, ranger_classification_error_mean,
rpart_classification_error_mean, svm_classification_error_mean, tree_classification_error_mean,
ensemble_bag_cart_classification_error_mean, ensemble_bag_rf_classification_error_mean, ensemble_C50_classification_error_mean,
ensemble_n_bayes_classification_error_mean, ensemble_ranger_classification_error_mean, ensemble_rf_classification_error_mean, ensemble_svm_classification_error_mean,
ensemble_tree_classification_error_mean
), 4),
"Duration" = round(c(
bagging_duration_mean, bag_rf_duration_mean,
C50_duration_mean, linear_duration_mean,
n_bayes_duration_mean, pls_duration_mean, pda_duration_mean, rf_duration_mean, ranger_duration_mean,
rpart_duration_mean, svm_duration_mean, tree_duration_mean,
ensemble_bag_cart_duration_mean, ensemble_bag_rf_duration_mean, ensemble_C50_duration_mean,
ensemble_n_bayes_duration_mean, ensemble_ranger_duration_mean, ensemble_rf_duration_mean, ensemble_svm_duration_mean,
ensemble_tree_duration_mean
), 4),
"Duration_St_Dev" = round(c(
bagging_duration_sd, bag_rf_duration_sd,
C50_duration_sd, linear_duration_sd,
n_bayes_duration_sd, pls_duration_sd, pda_duration_sd, rf_duration_sd, ranger_duration_sd,
rpart_duration_sd, svm_duration_sd, tree_duration_sd,
ensemble_bag_cart_duration_sd, ensemble_bag_rf_duration_sd, ensemble_C50_duration_sd,
ensemble_n_bayes_duration_sd, ensemble_ranger_duration_sd, ensemble_rf_duration_sd, ensemble_svm_duration_sd,
ensemble_tree_duration_sd
), 4),
"True_Positive_Rate" = round(c(
bagging_true_positive_rate_mean, bag_rf_true_positive_rate_mean,
C50_true_positive_rate_mean, linear_true_positive_rate_mean,
n_bayes_true_positive_rate_mean, pls_true_positive_rate_mean, pda_true_positive_rate_mean, rf_true_positive_rate_mean, ranger_true_positive_rate_mean,
rpart_true_positive_rate_mean, svm_true_positive_rate_mean, tree_true_positive_rate_mean,
ensemble_bag_cart_true_positive_rate_mean, ensemble_bag_rf_true_positive_rate_mean, ensemble_C50_true_positive_rate_mean,
ensemble_n_bayes_true_positive_rate_mean, ensemble_ranger_true_positive_rate_mean, ensemble_rf_true_positive_rate_mean, ensemble_svm_true_positive_rate_mean,
ensemble_tree_true_positive_rate_mean
), 4),
"True_Negative_Rate" = round(c(
bagging_true_negative_rate_mean, bag_rf_true_negative_rate_mean,
C50_true_negative_rate_mean, linear_true_negative_rate_mean,
n_bayes_true_negative_rate_mean, pls_true_negative_rate_mean, pda_true_negative_rate_mean, rf_true_negative_rate_mean, ranger_true_negative_rate_mean,
rpart_true_negative_rate_mean, svm_true_negative_rate_mean, tree_true_negative_rate_mean,
ensemble_bag_cart_true_negative_rate_mean, ensemble_bag_rf_true_negative_rate_mean, ensemble_C50_true_negative_rate_mean,
ensemble_n_bayes_true_negative_rate_mean, ensemble_ranger_true_negative_rate_mean, ensemble_rf_true_negative_rate_mean, ensemble_svm_true_negative_rate_mean,
ensemble_tree_true_negative_rate_mean
), 4),
"False_Positive_Rate" = round(c(
bagging_false_positive_rate_mean, bag_rf_false_positive_rate_mean,
C50_false_positive_rate_mean, linear_false_positive_rate_mean,
n_bayes_false_positive_rate_mean, pls_false_positive_rate_mean, pda_false_positive_rate_mean, rf_false_positive_rate_mean, ranger_false_positive_rate_mean,
rpart_false_positive_rate_mean, svm_false_positive_rate_mean, tree_false_positive_rate_mean,
ensemble_bag_cart_false_positive_rate_mean, ensemble_bag_rf_false_positive_rate_mean, ensemble_C50_false_positive_rate_mean,
ensemble_n_bayes_false_positive_rate_mean, ensemble_ranger_false_positive_rate_mean, ensemble_rf_false_positive_rate_mean, ensemble_svm_false_positive_rate_mean,
ensemble_tree_false_positive_rate_mean
), 4),
"False_Negative_Rate" = round(c(
bagging_false_negative_rate_mean, bag_rf_false_negative_rate_mean,
C50_false_negative_rate_mean, linear_false_negative_rate_mean,
n_bayes_false_negative_rate_mean, pls_false_negative_rate_mean, pda_false_negative_rate_mean, rf_false_negative_rate_mean, ranger_false_negative_rate_mean,
rpart_false_negative_rate_mean, svm_false_negative_rate_mean, tree_false_negative_rate_mean,
ensemble_bag_cart_false_negative_rate_mean, ensemble_bag_rf_false_negative_rate_mean, ensemble_C50_false_negative_rate_mean,
ensemble_n_bayes_false_negative_rate_mean, ensemble_ranger_false_negative_rate_mean, ensemble_rf_false_negative_rate_mean, ensemble_svm_false_negative_rate_mean,
ensemble_tree_false_negative_rate_mean
), 4),
"Positive_Pred_Value" = round(c(
bagging_positive_pred_value_mean, bag_rf_positive_pred_value_mean,
C50_positive_pred_value_mean, linear_positive_pred_value_mean,
n_bayes_positive_pred_value_mean, pls_positive_pred_value_mean, pda_positive_pred_value_mean, rf_positive_pred_value_mean, ranger_positive_pred_value_mean,
rpart_positive_pred_value_mean, svm_positive_pred_value_mean, tree_positive_pred_value_mean,
ensemble_bag_cart_positive_pred_value_mean, ensemble_bag_rf_positive_pred_value_mean, ensemble_C50_positive_pred_value_mean,
ensemble_n_bayes_positive_pred_value_mean, ensemble_ranger_positive_pred_value_mean, ensemble_rf_positive_pred_value_mean, ensemble_svm_positive_pred_value_mean,
ensemble_tree_positive_pred_value_mean
), 4),
"Negative_Pred_Value" = round(c(
bagging_negative_pred_value_mean, bag_rf_negative_pred_value_mean,
C50_negative_pred_value_mean, linear_negative_pred_value_mean,
n_bayes_negative_pred_value_mean, pls_negative_pred_value_mean, pda_negative_pred_value_mean, rf_negative_pred_value_mean, ranger_negative_pred_value_mean,
rpart_negative_pred_value_mean, svm_negative_pred_value_mean, tree_negative_pred_value_mean,
ensemble_bag_cart_negative_pred_value_mean, ensemble_bag_rf_negative_pred_value_mean, ensemble_C50_negative_pred_value_mean,
ensemble_n_bayes_negative_pred_value_mean, ensemble_ranger_negative_pred_value_mean, ensemble_rf_negative_pred_value_mean, ensemble_svm_negative_pred_value_mean,
ensemble_tree_negative_pred_value_mean
), 4),
"Prevalence" = round(c(
bagging_prevalence_mean, bag_rf_prevalence_mean,
C50_prevalence_mean, linear_prevalence_mean,
n_bayes_prevalence_mean, pls_prevalence_mean, pda_prevalence_mean, rf_prevalence_mean, ranger_prevalence_mean,
rpart_prevalence_mean, svm_prevalence_mean, tree_prevalence_mean,
ensemble_bag_cart_prevalence_mean, ensemble_bag_rf_prevalence_mean, ensemble_C50_prevalence_mean,
ensemble_n_bayes_prevalence_mean, ensemble_ranger_prevalence_mean, ensemble_rf_prevalence_mean, ensemble_svm_prevalence_mean,
ensemble_tree_prevalence_mean
), 4),
"Detection_Rate" = round(c(
bagging_detection_rate_mean, bag_rf_detection_rate_mean,
C50_detection_rate_mean, linear_detection_rate_mean,
n_bayes_detection_rate_mean, pls_detection_rate_mean, pda_detection_rate_mean, rf_detection_rate_mean, ranger_detection_rate_mean,
rpart_detection_rate_mean, svm_detection_rate_mean, tree_detection_rate_mean,
ensemble_bag_cart_detection_rate_mean, ensemble_bag_rf_detection_rate_mean, ensemble_C50_detection_rate_mean,
ensemble_n_bayes_detection_rate_mean, ensemble_ranger_detection_rate_mean, ensemble_rf_detection_rate_mean, ensemble_svm_detection_rate_mean,
ensemble_tree_detection_rate_mean
), 4),
"Detection_Prevalence" = round(c(
bagging_detection_prevalence_mean, bag_rf_detection_prevalence_mean,
C50_detection_prevalence_mean, linear_detection_prevalence_mean,
n_bayes_detection_prevalence_mean, pls_detection_prevalence_mean, pda_detection_prevalence_mean, rf_detection_prevalence_mean, ranger_detection_prevalence_mean,
rpart_detection_prevalence_mean, svm_detection_prevalence_mean, tree_detection_prevalence_mean,
ensemble_bag_cart_detection_prevalence_mean, ensemble_bag_rf_detection_prevalence_mean, ensemble_C50_detection_prevalence_mean,
ensemble_n_bayes_detection_prevalence_mean, ensemble_ranger_detection_prevalence_mean, ensemble_rf_detection_prevalence_mean, ensemble_svm_detection_prevalence_mean,
ensemble_tree_detection_prevalence_mean
), 4),
"F1_Score" = round(c(
bagging_F1_score_mean, bag_rf_F1_score_mean,
C50_F1_score_mean, linear_F1_score_mean,
n_bayes_F1_score_mean, pls_F1_score_mean, pda_F1_score_mean, rf_F1_score_mean, ranger_F1_score_mean,
rpart_F1_score_mean, svm_F1_score_mean, tree_F1_score_mean,
ensemble_bag_cart_F1_score_mean, ensemble_bag_rf_F1_score_mean, ensemble_C50_F1_score_mean,
ensemble_n_bayes_F1_score_mean, ensemble_ranger_F1_score_mean, ensemble_rf_F1_score_mean, ensemble_svm_F1_score_mean,
ensemble_tree_F1_score_mean
), 4),
"Train_Accuracy" = round(c(
bagging_train_accuracy_mean, bag_rf_train_accuracy_mean,
C50_train_accuracy_mean, linear_train_accuracy_mean,
n_bayes_train_accuracy_mean, pls_train_accuracy_mean, pda_train_accuracy_mean, rf_train_accuracy_mean, ranger_train_accuracy_mean,
rpart_train_accuracy_mean, svm_train_accuracy_mean, tree_train_accuracy_mean,
ensemble_bag_cart_train_accuracy_mean, ensemble_bag_rf_train_accuracy_mean,
ensemble_C50_train_accuracy_mean, ensemble_n_bayes_train_accuracy_mean,
ensemble_ranger_train_accuracy_mean, ensemble_rf_train_accuracy_mean, ensemble_svm_train_accuracy_mean, ensemble_tree_train_accuracy_mean
), 4),
"Test_Accuracy" = round(c(
bagging_test_accuracy_mean, bag_rf_test_accuracy_mean,
C50_test_accuracy_mean, linear_test_accuracy_mean,
n_bayes_test_accuracy_mean, pls_test_accuracy_mean, pda_test_accuracy_mean, rf_test_accuracy_mean, ranger_test_accuracy_mean,
rpart_test_accuracy_mean, svm_test_accuracy_mean, tree_test_accuracy_mean,
ensemble_bag_cart_test_accuracy_mean, ensemble_bag_rf_test_accuracy_mean,
ensemble_C50_test_accuracy_mean, ensemble_n_bayes_test_accuracy_mean,
ensemble_ranger_test_accuracy_mean, ensemble_rf_test_accuracy_mean, ensemble_svm_test_accuracy_mean, ensemble_tree_test_accuracy_mean
), 4),
"Validation_Accuracy" = round(c(
bagging_validation_accuracy_mean, bag_rf_validation_accuracy_mean,
C50_validation_accuracy_mean,
linear_validation_accuracy_mean, n_bayes_validation_accuracy_mean, pls_validation_accuracy_mean,
pda_validation_accuracy_mean, rf_validation_accuracy_mean, ranger_validation_accuracy_mean, rpart_validation_accuracy_mean,
svm_validation_accuracy_mean, tree_validation_accuracy_mean,
ensemble_bag_cart_validation_accuracy_mean, ensemble_bag_rf_validation_accuracy_mean, ensemble_C50_validation_accuracy_mean,
ensemble_n_bayes_validation_accuracy_mean, ensemble_ranger_validation_accuracy_mean,
ensemble_rf_validation_accuracy_mean, ensemble_svm_validation_accuracy_mean, ensemble_tree_validation_accuracy_mean
), 4),
"Holdout_vs_train" = round(c(
bagging_holdout_vs_train_mean, bag_rf_holdout_vs_train_mean,
C50_holdout_vs_train_mean, linear_holdout_vs_train_mean,
n_bayes_holdout_vs_train_mean, pls_holdout_vs_train_mean, pda_holdout_vs_train_mean, rf_holdout_vs_train_mean, ranger_holdout_vs_train_mean,
rpart_holdout_vs_train_mean, svm_holdout_vs_train_mean, tree_holdout_vs_train_mean,
ensemble_bag_cart_holdout_vs_train_mean, ensemble_bag_rf_holdout_vs_train_mean, ensemble_C50_holdout_vs_train_mean,
ensemble_n_bayes_holdout_vs_train_mean, ensemble_ranger_holdout_vs_train_mean, ensemble_rf_holdout_vs_train_mean,
ensemble_svm_holdout_vs_train_mean, ensemble_tree_holdout_vs_train_mean
), 4),
"Holdout_vs_train_St_Dev" = round(c(
bagging_holdout_vs_train_sd, bag_rf_holdout_vs_train_sd,
C50_holdout_vs_train_sd, linear_holdout_vs_train_sd,
n_bayes_holdout_vs_train_sd, pls_holdout_vs_train_sd, pda_holdout_vs_train_sd, rf_holdout_vs_train_sd, ranger_holdout_vs_train_sd,
rpart_holdout_vs_train_sd, svm_holdout_vs_train_sd, tree_holdout_vs_train_sd,
ensemble_bag_cart_holdout_vs_train_sd, ensemble_bag_rf_holdout_vs_train_sd, ensemble_C50_holdout_vs_train_sd,
ensemble_n_bayes_holdout_vs_train_sd, ensemble_ranger_holdout_vs_train_sd, ensemble_rf_holdout_vs_train_sd,
ensemble_svm_holdout_vs_train_sd, ensemble_tree_holdout_vs_train_sd
), 4),
"Diagonal_Sum" = round(c(
bagging_table_sum_diag, bag_rf_table_sum_diag,
C50_table_sum_diag, linear_table_sum_diag,
n_bayes_table_sum_diag, pls_table_sum_diag, pda_table_sum_diag, rf_table_sum_diag, ranger_table_sum_diag,
rpart_table_sum_diag, svm_table_sum_diag, tree_table_sum_diag,
ensemble_bag_cart_table_sum_diag, ensemble_bag_rf_table_sum_diag, ensemble_C50_table_sum_diag,
ensemble_n_bayes_table_sum_diag, ensemble_ranger_table_sum_diag, ensemble_rf_table_sum_diag,
ensemble_svm_table_sum_diag, ensemble_tree_table_sum_diag
), 4)
)
Results <- Results %>% dplyr::arrange(dplyr::desc(Mean_Holdout_Accuracy))
Final_results <- reactable::reactable(Results,
searchable = TRUE, pagination = FALSE, wrap = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, rownames = TRUE, resizable = TRUE
) %>%
reactablefmtr::add_title("Classification analysis, accuracy, duration, holdout_vs_train, sum of diagonals")
summary_tables <- list(
"Bagging" = bagging_table_total, "Bagged Random Forest" = bag_rf_table_total, "C50" = C50_table_total,
"Linear" = linear_table_total, "Naive Bayes" = n_bayes_table_total,
"Partial Least Sqaures" = pls_table_total, "Penalized Discriminant Ananysis" = pda_table_total, "Random Forest" = rf_table_total,
"Ranger" = ranger_table_total, "RPart" = rpart_table_total, "Support Vector Machines" = svm_table_total,
"Trees" = tree_table_total,
"Ensemble Bagged Cart" = ensemble_bag_cart_table_total,
"Ensemble Bagged Random Forest" = ensemble_bag_rf_table_total, "Ensemble C50" = ensemble_C50_table_total, "Ensemble Naive Bayes" = ensemble_n_bayes_table_total,
"Ensemble Ranger" = ensemble_ranger_table_total, "Ensemble Random Forest" = ensemble_rf_table_total,
"Ensemble Support Vector Machines" = ensemble_svm_table_total, "Ensemble Trees" = ensemble_tree_table_total
)
accuracy_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_holdout, bag_rf_holdout, C50_holdout, linear_holdout,
n_bayes_holdout, pls_holdout, pda_holdout, rf_holdout, ranger_holdout, rpart_holdout, svm_holdout, tree_holdout,
ensemble_bag_cart_holdout, ensemble_bag_rf_holdout, ensemble_C50_holdout,
ensemble_n_bayes_holdout,
ensemble_ranger_holdout, ensemble_rf_holdout, ensemble_svm_holdout, ensemble_tree_holdout
),
"mean" = rep(c(
bagging_holdout_mean, bag_rf_holdout_mean, C50_holdout_mean,
linear_holdout_mean, n_bayes_holdout_mean, pls_holdout_mean,
pda_holdout_mean, rf_holdout_mean, ranger_holdout_mean, rpart_holdout_mean, svm_holdout_mean, tree_holdout_mean,
ensemble_bag_cart_holdout_mean, ensemble_bag_rf_holdout_mean, ensemble_C50_holdout_mean,
ensemble_n_bayes_holdout_mean, ensemble_ranger_holdout_mean, ensemble_rf_holdout_mean, ensemble_svm_holdout_mean,
ensemble_tree_holdout_mean
), each = numresamples)
)
accuracy_plot <- ggplot2::ggplot(data = accuracy_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Accuracy by model, higher is better, 1 is best, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Accuracy by model, higher is better, 1 is best. \n The horizontal line is the mean of the results, the red line is 1.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("accuracy_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("accuracy_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("accuracy_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("accuracy_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("accuracy_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("accuracy_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
accuracy_plot2 <- ggplot2::ggplot(data = accuracy_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Accuracy by model, higher is better, 1 is best. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Accuracy by model, higher is better, 1 is best. \n The horizontal line is the mean of the results, the red line is 1.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("accuracy_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("accuracy_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("accuracy_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("accuracy_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("accuracy_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("accuracy_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
residuals_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_residuals, bag_rf_residuals, C50_residuals, linear_residuals,
n_bayes_residuals, pls_residuals, pda_residuals, rf_residuals, ranger_residuals, rpart_residuals, svm_residuals, tree_residuals,
ensemble_bag_cart_residuals, ensemble_bag_rf_residuals, ensemble_C50_residuals,
ensemble_n_bayes_residuals,
ensemble_ranger_residuals, ensemble_rf_residuals, ensemble_svm_residuals, ensemble_tree_residuals
),
"mean" = rep(c(
bagging_residuals_mean, bag_rf_residuals_mean, C50_residuals_mean,
linear_residuals_mean, n_bayes_residuals_mean, pls_residuals_mean,
pda_residuals_mean, rf_residuals_mean, ranger_residuals_mean, rpart_residuals_mean, svm_residuals_mean, tree_residuals_mean,
ensemble_bag_cart_residuals_mean, ensemble_bag_rf_residuals_mean, ensemble_C50_residuals_mean,
ensemble_n_bayes_residuals_mean, ensemble_ranger_residuals_mean, ensemble_rf_residuals_mean, ensemble_svm_residuals_mean,
ensemble_tree_residuals_mean
), each = numresamples)
)
residuals_plot <- ggplot2::ggplot(data = residuals_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Residuals by model, lower is better, 0 is best, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 0.") +
ggplot2::labs(y = "Residuals by model, lower is better, 0 is best. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("residuals_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("residuals_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("residuals_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("residuals_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("residuals_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("residuals_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
residuals_plot2 <- ggplot2::ggplot(data = residuals_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Residuals by model, lower is better, 0 is best, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Residuals by model, lower is better, 0 is best. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("residuals_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("residuals_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("residuals_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("residuals_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("residuals_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("residuals_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
holdout_vs_train_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_holdout_vs_train, bag_rf_holdout_vs_train, C50_holdout_vs_train, linear_holdout_vs_train,
n_bayes_holdout_vs_train, pls_holdout_vs_train, pda_holdout_vs_train, rf_holdout_vs_train, ranger_holdout_vs_train, rpart_holdout_vs_train, svm_holdout_vs_train, tree_holdout_vs_train,
ensemble_bag_cart_holdout_vs_train, ensemble_bag_rf_holdout_vs_train, ensemble_C50_holdout_vs_train,
ensemble_n_bayes_holdout_vs_train,
ensemble_ranger_holdout_vs_train, ensemble_rf_holdout_vs_train, ensemble_svm_holdout_vs_train, ensemble_tree_holdout_vs_train
),
"mean" = rep(c(
bagging_holdout_vs_train_mean, bag_rf_holdout_vs_train_mean, C50_holdout_vs_train_mean,
linear_holdout_vs_train_mean, n_bayes_holdout_vs_train_mean, pls_holdout_vs_train_mean,
pda_holdout_vs_train_mean, rf_holdout_vs_train_mean, ranger_holdout_vs_train_mean, rpart_holdout_vs_train_mean, svm_holdout_vs_train_mean, tree_holdout_vs_train_mean,
ensemble_bag_cart_holdout_vs_train_mean, ensemble_bag_rf_holdout_vs_train_mean, ensemble_C50_holdout_vs_train_mean,
ensemble_n_bayes_holdout_vs_train_mean, ensemble_ranger_holdout_vs_train_mean, ensemble_rf_holdout_vs_train_mean, ensemble_svm_holdout_vs_train_mean,
ensemble_tree_holdout_vs_train_mean
), each = numresamples)
)
true_positive_rate_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_true_positive_rate, bag_rf_true_positive_rate, C50_true_positive_rate, linear_true_positive_rate,
n_bayes_true_positive_rate, pls_true_positive_rate, pda_true_positive_rate, rf_true_positive_rate, ranger_true_positive_rate, rpart_true_positive_rate, svm_true_positive_rate, tree_true_positive_rate,
ensemble_bag_cart_true_positive_rate, ensemble_bag_rf_true_positive_rate, ensemble_C50_true_positive_rate,
ensemble_n_bayes_true_positive_rate,
ensemble_ranger_true_positive_rate, ensemble_rf_true_positive_rate, ensemble_svm_true_positive_rate, ensemble_tree_true_positive_rate
),
"mean" = rep(c(
bagging_true_positive_rate_mean, bag_rf_true_positive_rate_mean, C50_true_positive_rate_mean,
linear_true_positive_rate_mean, n_bayes_true_positive_rate_mean, pls_true_positive_rate_mean,
pda_true_positive_rate_mean, rf_true_positive_rate_mean, ranger_true_positive_rate_mean, rpart_true_positive_rate_mean, svm_true_positive_rate_mean, tree_true_positive_rate_mean,
ensemble_bag_cart_true_positive_rate_mean, ensemble_bag_rf_true_positive_rate_mean, ensemble_C50_true_positive_rate_mean,
ensemble_n_bayes_true_positive_rate_mean, ensemble_ranger_true_positive_rate_mean, ensemble_rf_true_positive_rate_mean, ensemble_svm_true_positive_rate_mean,
ensemble_tree_true_positive_rate_mean
), each = numresamples)
)
true_positive_rate_plot <- ggplot2::ggplot(data = true_positive_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("True positive rate, closer to one is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "True positive rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("true_positive_rate_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("true_positive_rate_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("true_positive_rate_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("true_positive_rate_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("true_positive_rate_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("true_positive_rate_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
true_positive_rate_plot2 <- ggplot2::ggplot(data = true_positive_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("True positive rate, closer to one is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "True positive rate, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("true_positive_rate_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("true_positive_rate_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("true_positive_rate_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("true_positive_rate_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("true_positive_rate_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("true_positive_rate_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
true_negative_rate_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_true_negative_rate, bag_rf_true_negative_rate, C50_true_negative_rate, linear_true_negative_rate,
n_bayes_true_negative_rate, pls_true_negative_rate, pda_true_negative_rate, rf_true_negative_rate, ranger_true_negative_rate, rpart_true_negative_rate, svm_true_negative_rate, tree_true_negative_rate,
ensemble_bag_cart_true_negative_rate, ensemble_bag_rf_true_negative_rate, ensemble_C50_true_negative_rate,
ensemble_n_bayes_true_negative_rate,
ensemble_ranger_true_negative_rate, ensemble_rf_true_negative_rate, ensemble_svm_true_negative_rate, ensemble_tree_true_negative_rate
),
"mean" = rep(c(
bagging_true_negative_rate_mean, bag_rf_true_negative_rate_mean, C50_true_negative_rate_mean,
linear_true_negative_rate_mean, n_bayes_true_negative_rate_mean, pls_true_negative_rate_mean,
pda_true_negative_rate_mean, rf_true_negative_rate_mean, ranger_true_negative_rate_mean, rpart_true_negative_rate_mean, svm_true_negative_rate_mean, tree_true_negative_rate_mean,
ensemble_bag_cart_true_negative_rate_mean, ensemble_bag_rf_true_negative_rate_mean, ensemble_C50_true_negative_rate_mean,
ensemble_n_bayes_true_negative_rate_mean, ensemble_ranger_true_negative_rate_mean, ensemble_rf_true_negative_rate_mean, ensemble_svm_true_negative_rate_mean,
ensemble_tree_true_negative_rate_mean
), each = numresamples)
)
true_negative_rate_plot <- ggplot2::ggplot(data = true_negative_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("True negative rate, closer to one is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "True negative rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("true_negative_rate_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("true_negative_rate_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("true_negative_rate_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("true_negative_rate_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("true_negative_rate_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("true_negative_rate_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
true_negative_rate_plot2 <- ggplot2::ggplot(data = true_negative_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("True negative rate, closer to one is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "True negative rate, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("true_negative_rate_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("true_negative_rate_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("true_negative_rate_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("true_negative_rate_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("true_negative_rate_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("true_negative_rate_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
false_positive_rate_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_false_positive_rate, bag_rf_false_positive_rate, C50_false_positive_rate, linear_false_positive_rate,
n_bayes_false_positive_rate, pls_false_positive_rate, pda_false_positive_rate, rf_false_positive_rate, ranger_false_positive_rate, rpart_false_positive_rate, svm_false_positive_rate, tree_false_positive_rate,
ensemble_bag_cart_false_positive_rate, ensemble_bag_rf_false_positive_rate, ensemble_C50_false_positive_rate,
ensemble_n_bayes_false_positive_rate,
ensemble_ranger_false_positive_rate, ensemble_rf_false_positive_rate, ensemble_svm_false_positive_rate, ensemble_tree_false_positive_rate
),
"mean" = rep(c(
bagging_false_positive_rate_mean, bag_rf_false_positive_rate_mean, C50_false_positive_rate_mean,
linear_false_positive_rate_mean, n_bayes_false_positive_rate_mean, pls_false_positive_rate_mean,
pda_false_positive_rate_mean, rf_false_positive_rate_mean, ranger_false_positive_rate_mean, rpart_false_positive_rate_mean, svm_false_positive_rate_mean, tree_false_positive_rate_mean,
ensemble_bag_cart_false_positive_rate_mean, ensemble_bag_rf_false_positive_rate_mean, ensemble_C50_false_positive_rate_mean,
ensemble_n_bayes_false_positive_rate_mean, ensemble_ranger_false_positive_rate_mean, ensemble_rf_false_positive_rate_mean, ensemble_svm_false_positive_rate_mean,
ensemble_tree_false_positive_rate_mean
), each = numresamples)
)
false_positive_rate_plot <- ggplot2::ggplot(data = false_positive_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("False positive rate, closer to zero is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "False positive rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("false_positive_rate_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("false_positive_rate_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("false_positive_rate_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("false_positive_rate_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("false_positive_rate_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("false_positive_rate_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
false_positive_rate_plot2 <- ggplot2::ggplot(data = false_positive_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("False positive rate, closer to zero is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "False positive rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("false_positive_rate_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("false_positive_rate_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("false_positive_rate_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("false_positive_rate_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("false_positive_rate_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("false_positive_rate_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
false_negative_rate_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_false_negative_rate, bag_rf_false_negative_rate, C50_false_negative_rate, linear_false_negative_rate,
n_bayes_false_negative_rate, pls_false_negative_rate, pda_false_negative_rate, rf_false_negative_rate, ranger_false_negative_rate, rpart_false_negative_rate, svm_false_negative_rate, tree_false_negative_rate,
ensemble_bag_cart_false_negative_rate, ensemble_bag_rf_false_negative_rate, ensemble_C50_false_negative_rate,
ensemble_n_bayes_false_negative_rate,
ensemble_ranger_false_negative_rate, ensemble_rf_false_negative_rate, ensemble_svm_false_negative_rate, ensemble_tree_false_negative_rate
),
"mean" = rep(c(
bagging_false_negative_rate_mean, bag_rf_false_negative_rate_mean, C50_false_negative_rate_mean,
linear_false_negative_rate_mean, n_bayes_false_negative_rate_mean, pls_false_negative_rate_mean,
pda_false_negative_rate_mean, rf_false_negative_rate_mean, ranger_false_negative_rate_mean, rpart_false_negative_rate_mean, svm_false_negative_rate_mean, tree_false_negative_rate_mean,
ensemble_bag_cart_false_negative_rate_mean, ensemble_bag_rf_false_negative_rate_mean, ensemble_C50_false_negative_rate_mean,
ensemble_n_bayes_false_negative_rate_mean, ensemble_ranger_false_negative_rate_mean, ensemble_rf_false_negative_rate_mean, ensemble_svm_false_negative_rate_mean,
ensemble_tree_false_negative_rate_mean
), each = numresamples)
)
false_negative_rate_plot <- ggplot2::ggplot(data = false_negative_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("False negative rate, closer to zero is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "False negative rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("false_negative_rate_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("false_negative_rate_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("false_negative_rate_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("false_negative_rate_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("false_negative_rate_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("false_negative_rate_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
false_negative_rate_plot2 <- ggplot2::ggplot(data = false_negative_rate_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("False negative rate, closer to zero is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "False negative rate, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("false_negative_rate_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("false_negative_rate_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("false_negative_rate_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("false_negative_rate_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("false_negative_rate_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("false_negative_rate_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
positive_pred_value_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_positive_pred_value, bag_rf_positive_pred_value, C50_positive_pred_value, linear_positive_pred_value,
n_bayes_positive_pred_value, pls_positive_pred_value, pda_positive_pred_value, rf_positive_pred_value, ranger_positive_pred_value, rpart_positive_pred_value, svm_positive_pred_value, tree_positive_pred_value,
ensemble_bag_cart_positive_pred_value, ensemble_bag_rf_positive_pred_value, ensemble_C50_positive_pred_value,
ensemble_n_bayes_positive_pred_value,
ensemble_ranger_positive_pred_value, ensemble_rf_positive_pred_value, ensemble_svm_positive_pred_value, ensemble_tree_positive_pred_value
),
"mean" = rep(c(
bagging_positive_pred_value_mean, bag_rf_positive_pred_value_mean, C50_positive_pred_value_mean,
linear_positive_pred_value_mean, n_bayes_positive_pred_value_mean, pls_positive_pred_value_mean,
pda_positive_pred_value_mean, rf_positive_pred_value_mean, ranger_positive_pred_value_mean, rpart_positive_pred_value_mean, svm_positive_pred_value_mean, tree_positive_pred_value_mean,
ensemble_bag_cart_positive_pred_value_mean, ensemble_bag_rf_positive_pred_value_mean, ensemble_C50_positive_pred_value_mean,
ensemble_n_bayes_positive_pred_value_mean, ensemble_ranger_positive_pred_value_mean, ensemble_rf_positive_pred_value_mean, ensemble_svm_positive_pred_value_mean,
ensemble_tree_positive_pred_value_mean
), each = numresamples)
)
positive_pred_value_plot <- ggplot2::ggplot(data = positive_pred_value_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Positive Pred Value, closer to one is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Positive Pred Value, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("positive_pred_value_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("positive_pred_value_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("positive_pred_value_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("positive_pred_value_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("positive_pred_value_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("positive_pred_value_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
positive_pred_value_plot2 <- ggplot2::ggplot(data = positive_pred_value_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Positive Pred Value, closer to one is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Positive Pred Value, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("positive_pred_value_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("positive_pred_value_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("positive_pred_value_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("positive_pred_value_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("positive_pred_value_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("positive_pred_value_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
negative_pred_value_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_negative_pred_value, bag_rf_negative_pred_value, C50_negative_pred_value, linear_negative_pred_value,
n_bayes_negative_pred_value, pls_negative_pred_value, pda_negative_pred_value, rf_negative_pred_value, ranger_negative_pred_value, rpart_negative_pred_value, svm_negative_pred_value, tree_negative_pred_value,
ensemble_bag_cart_negative_pred_value, ensemble_bag_rf_negative_pred_value, ensemble_C50_negative_pred_value,
ensemble_n_bayes_negative_pred_value,
ensemble_ranger_negative_pred_value, ensemble_rf_negative_pred_value, ensemble_svm_negative_pred_value, ensemble_tree_negative_pred_value
),
"mean" = rep(c(
bagging_negative_pred_value_mean, bag_rf_negative_pred_value_mean, C50_negative_pred_value_mean,
linear_negative_pred_value_mean, n_bayes_negative_pred_value_mean, pls_negative_pred_value_mean,
pda_negative_pred_value_mean, rf_negative_pred_value_mean, ranger_negative_pred_value_mean, rpart_negative_pred_value_mean, svm_negative_pred_value_mean, tree_negative_pred_value_mean,
ensemble_bag_cart_negative_pred_value_mean, ensemble_bag_rf_negative_pred_value_mean, ensemble_C50_negative_pred_value_mean,
ensemble_n_bayes_negative_pred_value_mean, ensemble_ranger_negative_pred_value_mean, ensemble_rf_negative_pred_value_mean, ensemble_svm_negative_pred_value_mean,
ensemble_tree_negative_pred_value_mean
), each = numresamples)
)
negative_pred_value_plot <- ggplot2::ggplot(data = negative_pred_value_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Negative Pred Value, closer to one is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Negative Pred Value, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("negative_pred_value_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("negative_pred_value_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("negative_pred_value_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("negative_pred_value_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("negative_pred_value_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("negative_pred_value_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
negative_pred_value_plot2 <- ggplot2::ggplot(data = negative_pred_value_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Negative Pred Value, closer to one is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Negative Pred Value, closer to one is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("negative_pred_value_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("negative_pred_value_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("negative_pred_value_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("negative_pred_value_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("negative_pred_value_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("negative_pred_value_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
holdout_vs_train_plot <- ggplot2::ggplot(data = holdout_vs_train_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Holdout accuracy / train accuracy by model, closer to one is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Holdout accuracy / train accuracy by model, closer to one is better. \n The horizontal line is the mean of the results, the red line is 1.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("holdout_vs_train_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("holdout_vs_train_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("holdout_vs_train_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("holdout_vs_train_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("holdout_vs_train_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("holdout_vs_train_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
holdout_vs_train_plot2 <- ggplot2::ggplot(data = holdout_vs_train_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 1, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Holdout accuracy / train accuracy by model, closer to one is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Holdout accuracy / train accuracy by model, closer to one is better. \n The horizontal line is the mean of the results, the red line is 1.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("holdout_vs_train_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("holdout_vs_train_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("holdout_vs_train_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("holdout_vs_train_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("holdout_vs_train_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("holdout_vs_train_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
classification_error_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"data" = c(
bagging_classification_error, bag_rf_classification_error, C50_classification_error, linear_classification_error,
n_bayes_classification_error, pls_classification_error, pda_classification_error, rf_classification_error, ranger_classification_error, rpart_classification_error, svm_classification_error, tree_classification_error,
ensemble_bag_cart_classification_error, ensemble_bag_rf_classification_error, ensemble_C50_classification_error,
ensemble_n_bayes_classification_error,
ensemble_ranger_classification_error, ensemble_rf_classification_error, ensemble_svm_classification_error, ensemble_tree_classification_error
),
"mean" = rep(c(
bagging_classification_error_mean, bag_rf_classification_error_mean, C50_classification_error_mean,
linear_classification_error_mean, n_bayes_classification_error_mean, pls_classification_error_mean,
pda_classification_error_mean, rf_classification_error_mean, ranger_classification_error_mean, rpart_classification_error_mean, svm_classification_error_mean, tree_classification_error_mean,
ensemble_bag_cart_classification_error_mean, ensemble_bag_rf_classification_error_mean, ensemble_C50_classification_error_mean,
ensemble_n_bayes_classification_error_mean, ensemble_ranger_classification_error_mean, ensemble_rf_classification_error_mean, ensemble_svm_classification_error_mean,
ensemble_tree_classification_error_mean
), each = numresamples)
)
classification_error_plot <- ggplot2::ggplot(data = classification_error_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Classification error, closer to zero is better, fixed scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Classification error, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("classification_error_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("classification_error_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("classification_error_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("classification_error_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("classification_error_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("classification_error_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
classification_error_plot2 <- ggplot2::ggplot(data = classification_error_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_point(mapping = ggplot2::aes(x = count, y = data)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = mean)) +
ggplot2::geom_hline(ggplot2::aes(yintercept = 0, color = "red")) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Classification error, closer to zero is better, free scales. \n The black horizontal line is the mean of the results, the red horizontal line is 1.") +
ggplot2::labs(y = "Classification error, closer to zero is better. \n The horizontal line is the mean of the results, the red line is 0.") +
ggplot2::theme(legend.position = "none")
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("classification_error_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("classification_error_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("classification_error_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("classification_error_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("classification_error_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("classification_error_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
####################################################
###### Total Data visualizations start here ########
####################################################
total_data <- data.frame(
"count" = 1:numresamples,
"model" = c(
rep("Bagging", numresamples), rep("Bagged Random Forest", numresamples),
rep("C50", numresamples), rep("Linear", numresamples),
rep("Naive Bayes", numresamples), rep("Partial Least Squares", numresamples),
rep("Penalized Discriminant Analysis", numresamples), rep("Random Forest", numresamples), rep("Ranger", numresamples),
rep("RPart", numresamples), rep("Support Vector Machines", numresamples), rep("Trees", numresamples),
rep("Ensemble Bagged Cart", numresamples), rep("Ensemble Bagged Random Forest", numresamples),
rep("Ensemble C50", numresamples),
rep("Ensemble Naive Bayes", numresamples), rep("Ensemble Ranger", numresamples), rep("Ensemble Random Forest", numresamples),
rep("Ensemble Support Vector Machines", numresamples),
rep("Ensemble Trees", numresamples)
),
"train" = c(
bagging_train_accuracy, bag_rf_train_accuracy, C50_train_accuracy, linear_train_accuracy,
n_bayes_train_accuracy, pls_train_accuracy, pda_train_accuracy, rf_train_accuracy, ranger_train_accuracy, rpart_train_accuracy, svm_train_accuracy, tree_train_accuracy,
ensemble_bag_cart_train_accuracy, ensemble_bag_rf_train_accuracy, ensemble_C50_train_accuracy,
ensemble_n_bayes_train_accuracy,
ensemble_ranger_train_accuracy, ensemble_rf_train_accuracy, ensemble_svm_train_accuracy, ensemble_tree_train_accuracy
),
"test" = c(
bagging_test_accuracy, bag_rf_test_accuracy, C50_test_accuracy, linear_test_accuracy,
n_bayes_test_accuracy, pls_test_accuracy, pda_test_accuracy, rf_test_accuracy, ranger_test_accuracy, rpart_test_accuracy, svm_test_accuracy, tree_test_accuracy,
ensemble_bag_cart_test_accuracy, ensemble_bag_rf_test_accuracy, ensemble_C50_test_accuracy,
ensemble_n_bayes_test_accuracy,
ensemble_ranger_test_accuracy, ensemble_rf_test_accuracy, ensemble_svm_test_accuracy, ensemble_tree_test_accuracy
),
"validation" = c(
bagging_validation_accuracy, bag_rf_validation_accuracy, C50_validation_accuracy, linear_validation_accuracy,
n_bayes_validation_accuracy, pls_validation_accuracy, pda_validation_accuracy, rf_validation_accuracy, ranger_validation_accuracy, rpart_validation_accuracy, svm_validation_accuracy, tree_validation_accuracy,
ensemble_bag_cart_validation_accuracy, ensemble_bag_rf_validation_accuracy, ensemble_C50_validation_accuracy,
ensemble_n_bayes_validation_accuracy,
ensemble_ranger_validation_accuracy, ensemble_rf_validation_accuracy, ensemble_svm_validation_accuracy, ensemble_tree_validation_accuracy
),
"holdout" = c(
bagging_holdout, bag_rf_holdout, C50_holdout, linear_holdout,
n_bayes_holdout, pls_holdout, pda_holdout, rf_holdout, ranger_holdout, rpart_holdout, svm_holdout, tree_holdout,
ensemble_bag_cart_holdout, ensemble_bag_rf_holdout, ensemble_C50_holdout,
ensemble_n_bayes_holdout,
ensemble_ranger_holdout, ensemble_rf_holdout, ensemble_svm_holdout, ensemble_tree_holdout)
)
total_plot <- ggplot2::ggplot(data = total_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = aes(x = count, y = train, color = "train")) +
ggplot2::geom_point(mapping = aes(x = count, y = train)) +
ggplot2::geom_line(mapping = aes(x = count, y = holdout, color = "holdout")) +
ggplot2::geom_point(mapping = aes(x = count, y = holdout)) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "fixed") +
ggplot2::ggtitle("Accuracy data including train and holdout by model, fixed scales. \nAccuracy by model, higher is better, 1 is best.") +
ggplot2::labs(y = "Higher is better, 1 is best.") +
ggplot2::scale_color_manual(
name = "Total Results",
breaks = c("train", "holdout"),
values = c("train" = "red", "holdout" = "black")
)
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("total_plot.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("total_plot.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("total_plot.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("total_plot.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("total_plot.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("total_plot.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
total_plot2 <- ggplot2::ggplot(data = total_data, mapping = ggplot2::aes(x = count, y = data, color = model)) +
ggplot2::geom_line(mapping = aes(x = count, y = train, color = "train")) +
ggplot2::geom_point(mapping = aes(x = count, y = train)) +
ggplot2::geom_line(mapping = aes(x = count, y = holdout, color = "holdout")) +
ggplot2::geom_point(mapping = aes(x = count, y = holdout)) +
ggplot2::facet_wrap(~model, ncol = 5, scales = "free") +
ggplot2::ggtitle("Accuracy data including train and holdout by model, free scales. \nAccuracy by model, higher is better, 1 is best.") +
ggplot2::labs(y = "Higher is better, 1 is best.") +
ggplot2::scale_color_manual(
name = "Total Results",
breaks = c("train", "holdout"),
values = c("train" = "red", "holdout" = "black")
)
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("total_plot2.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("total_plot2.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("total_plot2.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("total_plot2.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("total_plot2.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("total_plot2.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
accuracy_barchart <- ggplot2::ggplot(Results, aes(x = reorder(Model, dplyr::desc(Mean_Holdout_Accuracy)), y = Mean_Holdout_Accuracy)) +
ggplot2::geom_col(width = 0.5)+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, hjust=1)) +
ggplot2::labs(x = "Model", y = "Mean Accuracy", title = "Model accuracy, closer to one is better, 1 standard deviation error bars") +
ggplot2::geom_text(aes(label = Mean_Holdout_Accuracy), vjust = -0.5, hjust = -0.5, angle = 90) +
ggplot2::geom_errorbar(aes(x = Model, ymin = Mean_Holdout_Accuracy - Accuracy_Holdout_Std.Dev,
ymax = Mean_Holdout_Accuracy + Accuracy_Holdout_Std.Dev))
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("accuracy_barchart.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("accuracy_barchart.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("accuracy_barchart.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("accuracy_barchart.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("accuracy_barchart.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("accuracy_barchart.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
holdout_vs_train_barchart <- ggplot2::ggplot(Results, aes(x = reorder(Model, dplyr::desc(Holdout_vs_train)), y = Holdout_vs_train)) +
ggplot2::geom_col(width = 0.5)+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, hjust=1)) +
ggplot2::labs(x = "Model", y = "Holdout accuracy / train accuracy", title = "Over or Under Fitting, closer to 1 is better, 1 standard deviation error bars") +
ggplot2::geom_text(aes(label = Holdout_vs_train), vjust = 0,hjust = -0.5, angle = 90) +
ggplot2::ylim(0, max(Results$Holdout_vs_train) +1.0) +
ggplot2::geom_errorbar(aes(x = Model, ymin = Holdout_vs_train - Holdout_vs_train_St_Dev, ymax = Holdout_vs_train + Holdout_vs_train_St_Dev))
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("holdout_vs_train_barchart.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("holdout_vs_train_barchart.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("holdout_vs_train_barchart.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("holdout_vs_train_barchart.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("holdout_vs_train_barchart.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("holdout_vs_train_barchart.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
duration_barchart <- ggplot2::ggplot(Results, aes(x = reorder(Model, Duration), y = Duration)) +
ggplot2::geom_col(width = 0.5)+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, hjust=1)) +
ggplot2::labs(x = "Model", y = "Duration", title = "Duration, shorter is better, 1 standard deviation error bars") +
ggplot2::geom_text(aes(label = Duration), vjust = 0,hjust = -0.5, angle = 90) +
#ggplot2::ylim(0, max(Results$Duration) + 1) +
ggplot2::geom_errorbar(aes(x = Model, ymin = Duration - Duration_St_Dev, ymax = Duration + Duration_St_Dev))
if(save_all_plots == "Y" && device == "eps"){
ggplot2::ggsave("duration_barchart.eps", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "jpeg"){
ggplot2::ggsave("duration_barchart.jpeg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "pdf"){
ggplot2::ggsave("duration_barchart.pdf", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "png"){
ggplot2::ggsave("duration_barchart.png", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "svg"){
ggplot2::ggsave("duration_barchart.svg", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if(save_all_plots == "Y" && device == "tiff"){
ggplot2::ggsave("duration_barchart.tiff", width = width, path = tempdir1, height = height, units = units, scale = scale, device = device, dpi = dpi)
}
if (predict_on_new_data == "Y") {
is_num <- sapply(new_data, is.integer)
new_data[ , is_num] <- as.data.frame(apply(new_data[, is_num], 2, as.numeric))
Bagged_Random_Forest <- predict(object = bag_rf_train_fit, newdata = new_data)
Bagging <- predict(object = bagging_train_fit, newdata = new_data)
C50 <- predict(object = C50_train_fit, newdata = new_data)
Linear <- predict(object = linear_train_fit, newdata = new_data)
Naive_Bayes <- predict(object = n_bayes_train_fit, newdata = new_data)
Partial_Least_Squares <- predict(object = pls_train_fit, newdata = new_data)
Penalized_Discriminant_Analysis <- predict(object = pda_train_fit, newdata = new_data)
Random_Forest <- predict(object = rf_train_fit, newdata = new_data)
Ranger <- predict(object = ranger_train_fit, newdata = new_data)
RPart <- predict(object = rpart_train_fit, newdata = new_data)
Support_Vector_Machines <- predict(object = svm_train_fit, newdata = new_data)
Trees <- predict(tree_train_fit, new_data, type = "class")
new_ensemble <- data.frame(
Bagged_Random_Forest,
Bagging,
C50,
Linear,
Naive_Bayes,
Partial_Least_Squares,
Penalized_Discriminant_Analysis,
Random_Forest,
Ranger,
RPart,
Support_Vector_Machines,
Trees
)
new_ensemble_row_numbers <- as.numeric(row.names(new_data))
new_ensemble$y <- new_data$y
new_ensemble_bagged_cart <- predict(object = ensemble_bag_cart_train_fit, newdata = new_ensemble)
new_ensemble_bag_rf <- predict(object = ensemble_bag_train_rf, newdata = new_ensemble)
new_ensemble_C50 <- predict(object = ensemble_C50_train_fit, newdata = new_ensemble)
new_ensemble_n_bayes <- predict(object = ensemble_n_bayes_train_fit, newdata = new_ensemble)
new_ensemble_rf <- predict(object = ensemble_train_rf_fit, newdata = new_ensemble)
new_ensemble_svm <- predict(object = ensemble_svm_train_fit, newdata = new_ensemble)
new_ensemble_trees <- predict(object = ensemble_tree_train_fit, newdata = new_ensemble)
New_Data_Results <- data.frame(
"True_Value" = new_data$y,
"Bagged_Random_Forest" = Bagged_Random_Forest,
"Bagging" = Bagging,
"C50" = C50,
"Linear" = Linear,
"Naive_Bayes" = Naive_Bayes,
"Partial_Least_Squares" = Partial_Least_Squares,
"Penalized_Discriminant_Analysis" = Penalized_Discriminant_Analysis,
"Random_Forest" = Random_Forest,
"Ranger" = Ranger,
"RPart" = RPart,
"Support_Vector_Machines" = Support_Vector_Machines,
"Trees" = Trees,
"Ensemble_Bagged_Cart" = new_ensemble_bagged_cart,
"Ensemble_Bagged_Random_Forest" = new_ensemble_bag_rf,
"Ensemble_C50" = new_ensemble_C50,
"Ensemble_Naive_Bayes" = new_ensemble_n_bayes,
"Ensemble_Random_Forest" = new_ensemble_rf,
"Ensemble_Support_Vector_Machines" = new_ensemble_svm
)
New_Data_Results <- t(New_Data_Results)
New_Data_Results <- reactable::reactable(New_Data_Results,
searchable = TRUE, pagination = FALSE, wrap = TRUE, rownames = TRUE, fullWidth = TRUE, filterable = TRUE, bordered = TRUE,
striped = TRUE, highlight = TRUE, resizable = TRUE
)
return(list(
"Final_Results" = Final_results, "Barcharts" = barchart, "Accuracy_Barchart" = accuracy_barchart, "holdout_vs_train_barchart" = holdout_vs_train_barchart,
"Duration_barchart" = duration_barchart, "Data summary" = data_summary, "Correlation_Matrix" = correlation_marix, 'VIF' = VIF, "Boxplots" = boxplots, "Histograms" = histograms, "Head of data" = head_data, "Head of Ensembles" = head_ensemble,
"Summary_Tables" = summary_tables, "Accuracy_Plot" = accuracy_plot, "Total_Plot" = total_plot, "holdout_vs_train_plot" = holdout_vs_train_plot, "New_Data_Results" = New_Data_Results
))
}
if (save_all_trained_models == "Y") {
fil <- tempfile("bagging_train_fit", fileext = ".RDS")
bagging_train_fit <- saveRDS(bagging_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("bag_rf_train_fit", fileext = ".RDS")
bag_rf_train_fit <- saveRDS(bag_rf_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("C50_train_fit", fileext = ".RDS")
C50_train_fit <- saveRDS(C50_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("linear_train_fit", fileext = ".RDS")
linear_train_fit <- saveRDS(linear_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("n_bayes_train_fit", fileext = ".RDS")
n_bayes_train_fit <- saveRDS(n_bayes_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("pls_train_fit", fileext = ".RDS")
pls_train_fit <- saveRDS(pls_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("pda_train_fit", fileext = ".RDS")
pda_train_fit <- saveRDS(pda_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("rf_train_fit", fileext = ".RDS")
rf_train_fit <- saveRDS(rf_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ranger_train_fit", fileext = ".RDS")
ranger_train_fit <- saveRDS(ranger_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("rpart_train_fit", fileext = ".RDS")
rpart_train_fit <- saveRDS(rpart_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("svm_train_fit", fileext = ".RDS")
svm_train_fit <- saveRDS(svm_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("tree_train_fit", fileext = ".RDS")
tree_train_fit <- saveRDS(tree_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_bag_cart_train_fit", fileext = ".RDS")
ensemble_bag_cart_train_fit <- saveRDS(ensemble_bag_cart_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_bag_train_rf", fileext = ".RDS")
ensemble_bag_train_rf <- saveRDS(ensemble_bag_train_rf, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_C50_train_fit", fileext = ".RDS")
ensemble_C50_train_fit <- saveRDS(ensemble_C50_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_n_bayes_train_fit", fileext = ".RDS")
ensemble_n_bayes_train_fit <- saveRDS(ensemble_n_bayes_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_ranger_train_fit", fileext = ".RDS")
ensemble_ranger_train_fit <- saveRDS(ensemble_ranger_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_train_rf_fit", fileext = ".RDS")
ensemble_train_rf_fit <- saveRDS(ensemble_train_rf_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_svm_train_fit", fileext = ".RDS")
ensemble_svm_train_fit <- saveRDS(ensemble_svm_train_fit, fil)
}
if (save_all_trained_models == "Y") {
fil <- tempfile("ensemble_train_train_fit", fileext = ".RDS")
ensemble_tree_train_fit <- saveRDS(ensemble_tree_train_fit, fil)
}
return(list(
'Final_results' = Final_results, 'Barchart' = barchart, 'Barchart2' = barchart2, "Accuracy_Barchart" = accuracy_barchart, "holdout_vs_train_barchart" = holdout_vs_train_barchart,
'True_positive_rate1' = true_positive_rate_plot, 'True_positive_rate2' = true_positive_rate_plot2,
'True_negative_rate1' = true_negative_rate_plot, 'True_negative_rate2' = true_negative_rate_plot2,
'False_positive_rate1' = false_positive_rate_plot, 'False_positive_rate2' = false_positive_rate_plot2,
'False_negative_rate1' = false_negative_rate_plot, 'False_negative_rate2' = false_negative_rate_plot2,
"Duration_barchart" = duration_barchart, 'Data_summary' = data_summary, 'Correlation_matrix' = correlation_marix, 'VIF' = VIF,
'Boxplots' = boxplots, 'Histograms' = histograms, 'Head_of_data' = head_df, 'Head_of_ensemble' = head_ensemble,
'Summary_tables' = summary_tables, 'Accuracy_plot' = accuracy_plot, 'Accuracy_plot2' = accuracy_plot2, 'Total_plot' = total_plot, "Total_plot2" = total_plot2,
'Classification_error1' = classification_error_plot, 'Classification_error2' = classification_error_plot2,
'Residuals' = residuals_plot, 'Residuals2' = residuals_plot2, "Holdout_vs_train_plot" = holdout_vs_train_plot
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.