We must load the packages Laurae, data.table, rmarkdown, xgboost, DT, formattable, matrixStats, lattice, R.utils, ggplot2, grid, and gridExtra before continuing.
library(Laurae)
library(data.table)
library(rmarkdown)
library(xgboost)
library(DT)
library(formattable)
library(matrixStats)
library(lattice)
library(R.utils)
library(ggplot2)
library(grid)
library(gridExtra)
We are going to print a lot, so we first raise R's printing limit and disable scientific notation.
previousLimit <- getOption("max.print")
previousScipen <- getOption("scipen")
options(max.print = 1e7)
options(scipen = 999)
my_data <- copy(data)
Normalization (normalize = `r normalize`)
The features can be normalized to the range [0, 1].
if (normalize) {
  for (i in 1:ncol(data)) {
    my_data[[i]] <- (my_data[[i]] - min(my_data[[i]], na.rm = TRUE)) / (max(my_data[[i]], na.rm = TRUE) - min(my_data[[i]], na.rm = TRUE))
  }
}
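As a side note, the same min-max scaling can be sketched with data.table::set(), which replaces each column by reference; the guard against constant columns is an added assumption (the loop above would produce NaN for a column whose minimum equals its maximum). This is only an illustrative variant, not part of the report itself.
# Minimal sketch (assumes my_data is the data.table prepared above):
for (j in seq_len(ncol(my_data))) {
  col_min <- min(my_data[[j]], na.rm = TRUE)
  col_max <- max(my_data[[j]], na.rm = TRUE)
  if (col_max > col_min) {
    # Replace the whole column by reference with its min-max scaled values
    set(my_data, j = j, value = (my_data[[j]] - col_min) / (col_max - col_min))
  }
}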
`r ifelse(classification, "Classification", "Regression")` Model
We are generating the `r ifelse(classification, "Classification", "Regression")` model per fold.
fitted_xgb <- list()
fitted_values <- list()
fitted_predicted <- list()
fitted_pre_importance <- list()
fitted_post_importance <- list()
StartTime <- timer()
CurrentTime <- StartTime
for (i in 1:length(folds)) {
  # Build the training DMatrix (rows outside the fold)
  data_temp <- DTsubsample(my_data, kept = folds[[i]], remove = TRUE, low_mem = FALSE, collect = 50, silent = TRUE)
  label_temp <- label[-folds[[i]]]
  data_temp1 <- xgb.DMatrix(data = DT2mat(data_temp, low_mem = FALSE, collect = 50, silent = TRUE), label = label_temp)
  # Build the out-of-fold DMatrix (rows inside the fold)
  data_temp <- DTsubsample(my_data, kept = folds[[i]], remove = FALSE, low_mem = FALSE, collect = 50, silent = TRUE)
  label_temp <- label[folds[[i]]]
  data_temp2 <- xgb.DMatrix(data = DT2mat(data_temp, low_mem = FALSE, collect = 50, silent = TRUE), label = label_temp)
  # Fit the model and store the out-of-fold labels and predictions
  fitted_xgb[[i]] <- report.xgb.helper(data_temp1, data_temp2, params)
  fitted_values[[i]] <- label_temp
  fitted_predicted[[i]] <- predict(fitted_xgb[[i]], data_temp2, ntreelimit = fitted_xgb[[i]]$best_iteration)
  if (importance) {
    fitted_pre_importance[[i]] <- xgb.importance(model = fitted_xgb[[i]], feature_names = colnames(my_data))
    if (unbiased) {
      # Refit with the best iteration only, then refresh the tree statistics for unbiased importance
      fitted_xgb[[i]] <- report.xgb.helper(data_temp1, data_temp2, params, fitted_xgb[[i]]$best_iteration)
      rparams <- modifyList(params, list(process_type = "update", updater = "refresh", refresh_leaf = FALSE))
      fitted_xgb[[i]] <- report.xgb.helper(data_temp1, data_temp2, rparams, fitted_xgb[[i]]$best_iteration, fitted_xgb[[i]])
      fitted_post_importance[[i]] <- xgb.importance(model = fitted_xgb[[i]], feature_names = colnames(my_data))
    }
  }
  rm(data_temp, label_temp, data_temp1, data_temp2)
  gc(verbose = FALSE)
  cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] Fitted the xgboost model on fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), " in ", sprintf("%07.03f", (timer() - CurrentTime) / 1000), "s. \n", sep = "")
  CurrentTime <- timer()
}
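The chunk above relies on report.xgb.helper, Laurae's internal wrapper. Based solely on how it is called here (training DMatrix, out-of-fold DMatrix, parameters, optional number of rounds, optional existing booster), a minimal sketch of the assumed behavior could look like the following; the function name, the 10000-round ceiling, and the early-stopping patience are illustrative assumptions, not the actual Laurae implementation.
# Hypothetical sketch of what the helper is assumed to do: train with a
# watchlist on the out-of-fold data, early-stop when no round count is given,
# and optionally continue from an existing booster (used for the refresh pass).
xgb_helper_sketch <- function(dtrain, dtest, params, nrounds = NULL, xgb_model = NULL) {
  xgb.train(params = params,
            data = dtrain,
            nrounds = if (is.null(nrounds)) 10000 else nrounds,         # illustrative ceiling
            watchlist = list(train = dtrain, test = dtest),
            early_stopping_rounds = if (is.null(nrounds)) 50 else NULL, # illustrative patience
            xgb_model = xgb_model,
            verbose = 0)
}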
`r ifelse(classification, "Classification", "Regression")` Statistics (out of fold)
We must gather all values first for `r ifelse(classification, paste0("classification, with a binary threshold of ", threshold), "regression")`.
# Allocate the per-fold metric vectors
if (!classification) {
  fitted_diff <- list()
  fitted_sqdiff <- list()
  r_pearson <- numeric(length(folds))
  r_spearman <- numeric(length(folds))
  r_squared <- numeric(length(folds))
  r_mae <- numeric(length(folds))
  r_mse <- numeric(length(folds))
  r_rmse <- numeric(length(folds))
  r_mape <- numeric(length(folds))
} else {
  r_auc <- numeric(length(folds))
  r_logloss <- numeric(length(folds))
  r_kappa <- numeric(length(folds))
  r_kappa_p <- numeric(length(folds))
  r_kappa_n <- numeric(length(folds))
  r_f1s <- numeric(length(folds))
  r_f1s_p <- numeric(length(folds))
  r_f1s_n <- numeric(length(folds))
  r_mcc <- numeric(length(folds))
  r_mcc_p <- numeric(length(folds))
  r_mcc_n <- numeric(length(folds))
  r_tpr <- numeric(length(folds))
  r_tpr_p <- numeric(length(folds))
  r_tpr_n <- numeric(length(folds))
  r_tnr <- numeric(length(folds))
  r_tnr_p <- numeric(length(folds))
  r_tnr_n <- numeric(length(folds))
  r_fpr <- numeric(length(folds))
  r_fpr_p <- numeric(length(folds))
  r_fpr_n <- numeric(length(folds))
  r_fnr <- numeric(length(folds))
  r_fnr_p <- numeric(length(folds))
  r_fnr_n <- numeric(length(folds))
}
# Compute the out-of-fold metrics per fold
if (!classification) {
  for (i in 1:length(folds)) {
    fitted_diff[[i]] <- abs(fitted_values[[i]] - fitted_predicted[[i]])
    fitted_sqdiff[[i]] <- fitted_diff[[i]] * fitted_diff[[i]]
    r_pearson[i] <- cor(data.frame(A = fitted_values[[i]], B = fitted_predicted[[i]]), method = "pearson")[1, 2]
    r_squared[i] <- r_pearson[i] * r_pearson[i]
    r_mae[i] <- mean(fitted_diff[[i]])
    r_mse[i] <- mean(fitted_sqdiff[[i]])
    r_rmse[i] <- sqrt(r_mse[i])
    r_mape[i] <- mean(fitted_diff[[i]] / fitted_values[[i]])
    gc(verbose = FALSE)
  }
} else {
  for (i in 1:length(folds)) {
    r_auc[[i]] <- FastROC(fitted_predicted[[i]], fitted_values[[i]])
    r_logloss[[i]] <- LogLoss(fitted_predicted[[i]], fitted_values[[i]])
    temp_vec <- get.max_kappa(fitted_predicted[[i]], fitted_values[[i]])
    r_kappa[[i]] <- temp_vec[1]
    r_kappa_p[[i]] <- temp_vec[2]
    r_kappa_n[[i]] <- prob.max_kappa(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_f1(fitted_predicted[[i]], fitted_values[[i]])
    r_f1s[[i]] <- temp_vec[1]
    r_f1s_p[[i]] <- temp_vec[2]
    r_f1s_n[[i]] <- prob.max_f1(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_mcc(fitted_predicted[[i]], fitted_values[[i]])
    r_mcc[[i]] <- temp_vec[1]
    r_mcc_p[[i]] <- temp_vec[2]
    r_mcc_n[[i]] <- prob.max_mcc(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_sensitivity(fitted_predicted[[i]], fitted_values[[i]])
    r_tpr[[i]] <- temp_vec[1]
    r_tpr_p[[i]] <- temp_vec[2]
    r_tpr_n[[i]] <- prob.max_sensitivity(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_specificity(fitted_predicted[[i]], fitted_values[[i]])
    r_tnr[[i]] <- temp_vec[1]
    r_tnr_p[[i]] <- temp_vec[2]
    r_tnr_n[[i]] <- prob.max_specificity(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_fallout(fitted_predicted[[i]], fitted_values[[i]])
    r_fpr[[i]] <- temp_vec[1]
    r_fpr_p[[i]] <- temp_vec[2]
    r_fpr_n[[i]] <- prob.max_fallout(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    temp_vec <- get.max_missrate(fitted_predicted[[i]], fitted_values[[i]])
    r_fnr[[i]] <- temp_vec[1]
    r_fnr_p[[i]] <- temp_vec[2]
    r_fnr_n[[i]] <- prob.max_missrate(fitted_predicted[[i]], fitted_values[[i]], thresh = threshold)
    gc(verbose = FALSE)
  }
}
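The get.max_* and prob.max_* pairs above come from Laurae; as a rough, hedged sketch of the idea (not their implementation), the "prob" value is the metric at a fixed threshold, while the "get.max" value sweeps candidate thresholds and returns the best metric together with the threshold that achieves it. MCC is used below as an example, with hypothetical function names.
# Illustrative sketch only: metric at a fixed threshold, and a threshold sweep.
mcc_at_threshold <- function(preds, labels, thresh) {
  pred_pos <- preds >= thresh
  tp <- as.numeric(sum(pred_pos & labels == 1))
  tn <- as.numeric(sum(!pred_pos & labels == 0))
  fp <- as.numeric(sum(pred_pos & labels == 0))
  fn <- as.numeric(sum(!pred_pos & labels == 1))
  denom <- sqrt(tp + fp) * sqrt(tp + fn) * sqrt(tn + fp) * sqrt(tn + fn)
  if (denom == 0) return(0)
  (tp * tn - fp * fn) / denom
}
sweep_max_mcc <- function(preds, labels) {
  thresholds <- sort(unique(preds))
  scores <- vapply(thresholds, function(t) mcc_at_threshold(preds, labels, t), numeric(1))
  c(best_mcc = max(scores), best_threshold = thresholds[which.max(scores)])
}
# Example usage on the first fold: sweep_max_mcc(fitted_predicted[[1]], fitted_values[[1]])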
Statistics (stats = `r stats`)
A pretty table is better than text to print the base statistics.
The values in the table are the most optimistic values you can get. They do not represent the best all-rounded model using a probability threshold of `r threshold`.
if (stats) {
  if (!classification) {
    stats_table <- data.table(Statistic = c("Pearson Correlation Coefficient (R)", "Coefficient of Determination (R^2)", "Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "Root Mean Squared Error (RMSE)", "Mean Absolute Percentage Error (MAPE)"),
                              Mean = c(mean(r_pearson), mean(r_squared), mean(r_mae), mean(r_mse), mean(r_rmse), mean(r_mape)),
                              SD = c(sd(r_pearson), sd(r_squared), sd(r_mae), sd(r_mse), sd(r_rmse), sd(r_mape)))
    formattable(stats_table)
  } else {
    stats_table <- data.table(Statistic = c("AUROC", "Log Loss", "Optimistic Kappa", "Regular Kappa", "Threshold Kappa", "Optimistic F1 Score", "Regular F1 Score", "Threshold F1 Score", "Optimistic MCC", "Regular MCC", "Threshold MCC", "Optimistic TPR", "Regular TPR", "Threshold TPR", "Optimistic TNR", "Regular TNR", "Threshold TNR", "Optimistic FPR", "Regular FPR", "Threshold FPR", "Optimistic FNR", "Regular FNR", "Threshold FNR"),
                              Mean = c(mean(r_auc), mean(r_logloss), mean(r_kappa), mean(r_kappa_n), mean(r_kappa_p), mean(r_f1s), mean(r_f1s_n), mean(r_f1s_p), mean(r_mcc), mean(r_mcc_n), mean(r_mcc_p), mean(r_tpr), mean(r_tpr_n), mean(r_tpr_p), mean(r_tnr), mean(r_tnr_n), mean(r_tnr_p), mean(r_fpr), mean(r_fpr_n), mean(r_fpr_p), mean(r_fnr), mean(r_fnr_n), mean(r_fnr_p)),
                              SD = c(sd(r_auc), sd(r_logloss), sd(r_kappa), sd(r_kappa_n), sd(r_kappa_p), sd(r_f1s), sd(r_f1s_n), sd(r_f1s_p), sd(r_mcc), sd(r_mcc_n), sd(r_mcc_p), sd(r_tpr), sd(r_tpr_n), sd(r_tpr_p), sd(r_tnr), sd(r_tnr_n), sd(r_tnr_p), sd(r_fpr), sd(r_fpr_n), sd(r_fpr_p), sd(r_fnr), sd(r_fnr_n), sd(r_fnr_p)))
    formattable(stats_table)
  }
}
Statistics per Fold (stats = `r stats`)
A pretty table is better than text to print the per-fold statistics.
if (stats) {
  if (!classification) {
    stats_table <- data.table(Folds = 1:length(folds), R = r_pearson, R2 = r_squared, MAE = r_mae, MSE = r_mse, RMSE = r_rmse, MAPE = r_mape)
    stats_probs <- NULL
    formattable(stats_table, list(R = color_bar("lightpink"), R2 = color_bar("pink"), MAE = color_bar("lightgreen"), MSE = color_bar("lightgrey"), RMSE = color_bar("lightblue"), MAPE = color_bar("cyan")))
  } else {
    stats_table <- data.table(Folds = 1:length(folds), AUROC = r_auc, LogLoss = r_logloss, bKappa = r_kappa, bF1_Score = r_f1s, bMCC = r_mcc, bTPR = r_tpr, bTNR = r_tnr, bFPR = r_fpr, bFNR = r_fnr, nKappa = r_kappa_n, nF1_Score = r_f1s_n, nMCC = r_mcc_n, nTPR = r_tpr_n, nTNR = r_tnr_n, nFPR = r_fpr_n, nFNR = r_fnr_n)
    formattable(stats_table[, 1:3, with = FALSE], list(AUROC = color_bar("lightpink"), LogLoss = color_bar("pink")))
  }
}
The "b" values are the most optimistic (best achievable) values; they do not represent the best all-rounded model, which is denoted by "n". Here are the best possible models, if applicable:
if (stats & classification) {
  formattable(stats_table[, c(1, 4:10), with = FALSE],
              list(bKappa = color_bar("lightgreen"), bF1_Score = color_bar("lightgreen"), bMCC = color_bar("lightgreen"), bTPR = color_bar("lightblue"), bTNR = color_bar("lightblue"), bFPR = color_bar("cyan"), bFNR = color_bar("cyan")))
}
These are the best all-rounded models (typical, normal models) for classification, using a probability threshold of `r threshold`, if applicable:
if (stats & classification) {
  formattable(stats_table[, c(1, 11:17), with = FALSE],
              list(nKappa = color_bar("lightgreen"), nF1_Score = color_bar("lightgreen"), nMCC = color_bar("lightgreen"), nTPR = color_bar("lightblue"), nTNR = color_bar("lightblue"), nFPR = color_bar("cyan"), nFNR = color_bar("cyan")))
}
For classification, the "p" values in the table are the probability thresholds to use to maximize the corresponding performance metric.
if (stats & classification) {
  stats_probs <- data.table(Folds = 1:length(folds), pKappa = r_kappa_p, pF1_Score = r_f1s_p, pMCC = r_mcc_p, pTPR = r_tpr_p, pTNR = r_tnr_p, pFPR = r_fpr_p, pFNR = r_fnr_p)
  formattable(stats_probs, list(pKappa = color_bar("lightgreen"), pF1_Score = color_bar("lightgreen"), pMCC = color_bar("lightgreen"), pTPR = color_bar("lightblue"), pTNR = color_bar("lightblue"), pFPR = color_bar("cyan"), pFNR = color_bar("cyan")))
}
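As a usage note, once a probability threshold has been chosen from these "p" values (for example, the mean MCC-maximizing threshold across folds), turning the out-of-fold probabilities into hard class labels is a one-line conversion; the object names below are illustrative and assume the classification branch above has run.
# Illustrative only: apply a chosen threshold to the out-of-fold probabilities.
chosen_threshold <- mean(r_mcc_p)
hard_labels <- lapply(fitted_predicted, function(p) as.integer(p >= chosen_threshold))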
As there are many metrics to look out for, plots are better for classification, if applicable:
if (stats & plots & classification) {
  grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {
    plots <- list(...)
    position <- match.arg(position)
    g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
    legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
    lheight <- sum(legend$height)
    lwidth <- sum(legend$width)
    gl <- lapply(plots, function(x) x + theme(legend.position = "none"))
    gl <- c(gl, ncol = ncol, nrow = nrow)
    combined <- switch(position,
                       "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)),
                       "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
    grid.newpage()
    grid.draw(combined)
  }
  stats_table_shadow <- copy(stats_table)
  stats_table_shadow[["Folds"]] <- as.factor(stats_table_shadow[["Folds"]])
  p01 <- ggplot(stats_table_shadow, aes(x = Folds, y = AUROC, fill = Folds, label = round(AUROC, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "AUROC", title = "AUROC per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p02 <- ggplot(stats_table_shadow, aes(x = Folds, y = LogLoss, fill = Folds, label = round(LogLoss, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Logloss", title = "Logloss per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p03 <- ggplot(stats_table_shadow, aes(x = Folds, y = bKappa, fill = Folds, label = round(bKappa, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic Kappa", title = "Optimistic Kappa per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p04 <- ggplot(stats_table_shadow, aes(x = Folds, y = nKappa, fill = Folds, label = round(nKappa, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal Kappa", title = "Normal Kappa per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p05 <- ggplot(stats_table_shadow, aes(x = Folds, y = bF1_Score, fill = Folds, label = round(bF1_Score, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic F1 Score", title = "Optimistic F1 Score per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p06 <- ggplot(stats_table_shadow, aes(x = Folds, y = nF1_Score, fill = Folds, label = round(nF1_Score, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal F1 Score", title = "Normal F1 Score per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p07 <- ggplot(stats_table_shadow, aes(x = Folds, y = bMCC, fill = Folds, label = round(bMCC, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic MCC", title = "Optimistic MCC per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p08 <- ggplot(stats_table_shadow, aes(x = Folds, y = nMCC, fill = Folds, label = round(nMCC, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal MCC", title = "Normal MCC per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p09 <- ggplot(stats_table_shadow, aes(x = Folds, y = bTPR, fill = Folds, label = round(bTPR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic TPR", title = "Optimistic TPR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p10 <- ggplot(stats_table_shadow, aes(x = Folds, y = nTPR, fill = Folds, label = round(nTPR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal TPR", title = "Normal TPR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p11 <- ggplot(stats_table_shadow, aes(x = Folds, y = bTNR, fill = Folds, label = round(bTNR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic TNR", title = "Optimistic TNR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p12 <- ggplot(stats_table_shadow, aes(x = Folds, y = nTNR, fill = Folds, label = round(nTNR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal TNR", title = "Normal TNR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p13 <- ggplot(stats_table_shadow, aes(x = Folds, y = bFPR, fill = Folds, label = round(bFPR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic FPR", title = "Optimistic FPR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p14 <- ggplot(stats_table_shadow, aes(x = Folds, y = nFPR, fill = Folds, label = round(nFPR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal FPR", title = "Normal FPR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p15 <- ggplot(stats_table_shadow, aes(x = Folds, y = bFNR, fill = Folds, label = round(bFNR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Optimistic FNR", title = "Optimistic FNR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p16 <- ggplot(stats_table_shadow, aes(x = Folds, y = nFNR, fill = Folds, label = round(nFNR, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Normal FNR", title = "Normal FNR per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  grid_arrange_shared_legend(p01, p02, p03, p04, p05, p06, p07, p08, p09, p10, p11, p12, p13, p14, p15, p16, nrow = 8, ncol = 2)
}
if (stats & plots & (!classification)) {
  grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {
    plots <- list(...)
    position <- match.arg(position)
    g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
    legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
    lheight <- sum(legend$height)
    lwidth <- sum(legend$width)
    gl <- lapply(plots, function(x) x + theme(legend.position = "none"))
    gl <- c(gl, ncol = ncol, nrow = nrow)
    combined <- switch(position,
                       "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)),
                       "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
    grid.newpage()
    grid.draw(combined)
  }
  stats_table_shadow <- copy(stats_table)
  stats_table_shadow[["Folds"]] <- as.factor(stats_table_shadow[["Folds"]])
  p01 <- ggplot(stats_table_shadow, aes(x = Folds, y = R, fill = Folds, label = round(R, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Pearson's R", title = "Pearson's R per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p02 <- ggplot(stats_table_shadow, aes(x = Folds, y = R2, fill = Folds, label = round(R2, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "R-Squared", title = "Pearson's R-Squared per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p03 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAE, fill = Folds, label = round(MAE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAE", title = "MAE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p04 <- ggplot(stats_table_shadow, aes(x = Folds, y = MSE, fill = Folds, label = round(MSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MSE", title = "MSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p05 <- ggplot(stats_table_shadow, aes(x = Folds, y = RMSE, fill = Folds, label = round(RMSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "RMSE", title = "RMSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p06 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAPE, fill = Folds, label = round(MAPE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAPE", title = "MAPE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  grid_arrange_shared_legend(p01, p02, p03, p04, p05, p06, nrow = 3, ncol = 2)
}
Feature Importance (importance = `r importance`, unbiased = `r unbiased`)
When the feature importance is unbiased, the model has its overfitting trees pruned. In addition, the gain/cover values are computed using the out-of-fold data, unlike the default behavior of computing them on the training data (when unbiased == FALSE). The column names (when unbiased == TRUE) are of the form XYZ, where X is B (biased) or U (unbiased; absent for Freq), Y is Gain, Cover, or Freq, and Z is M (mean) or SD (standard deviation) across folds.
The table is sortable to help the user. Negative unbiased values denote negative overfitting (bad), while positive unbiased values denote positive overfitting (good). The unbiased value printed is the unbiased value minus the biased value, which is why a positive value denotes a higher out-of-fold gain.
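For context, here is a minimal standalone sketch of the refresh pass that the model-fitting chunk earlier applies through report.xgb.helper: existing trees are kept, but their gain/cover statistics are recomputed on the supplied data, and leaf values are left untouched when refresh_leaf = FALSE. The objects dout (an out-of-fold xgb.DMatrix, per the description above) and booster (an already fitted model assumed to contain exactly booster$best_iteration trees) are placeholders for illustration.
# Hedged sketch of the unbiased-importance mechanism (not the report's own chunk):
refresh_params <- modifyList(params, list(process_type = "update", updater = "refresh", refresh_leaf = FALSE))
refreshed <- xgb.train(params = refresh_params,
                       data = dout,                         # placeholder: out-of-fold DMatrix
                       nrounds = booster$best_iteration,    # must not exceed the number of trees
                       xgb_model = booster,                 # placeholder: fitted booster
                       verbose = 0)
unbiased_importance <- xgb.importance(model = refreshed, feature_names = colnames(my_data))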
if (importance) {
  if (!unbiased) {
    fitted_importance <- fitted_pre_importance[[1]]
    for (i in 2:length(folds)) {
      fitted_importance <- DTrbind(fitted_importance, fitted_pre_importance[[i]])
    }
    fitted_importance <- fitted_importance[, list(Gain_Mean = mean(Gain), Gain_SD = sd(Gain), Cover_Mean = mean(Cover), Cover_SD = sd(Cover), Freq_Mean = mean(Frequency), Freq_SD = sd(Frequency)), by = Feature]
    datatable(fitted_importance, filter = "top", class = "cell-border stripe", options = list(pageLength = 10, lengthMenu = c(5, 10, 15, 20, 25, 50, 100, 500))) %>%
      formatStyle("Gain_Mean", background = styleColorBar(range(fitted_importance$Gain_Mean, na.rm = TRUE, finite = TRUE), "lightgreen"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("Cover_Mean", background = styleColorBar(range(fitted_importance$Cover_Mean, na.rm = TRUE, finite = TRUE), "lightblue"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("Freq_Mean", background = styleColorBar(range(fitted_importance$Freq_Mean, na.rm = TRUE, finite = TRUE), "lightgrey"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("Gain_SD", background = styleColorBar(range(fitted_importance$Gain_SD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("Cover_SD", background = styleColorBar(range(fitted_importance$Cover_SD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("Freq_SD", background = styleColorBar(range(fitted_importance$Freq_SD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatPercentage(columns = c("Gain_Mean", "Gain_SD", "Cover_Mean", "Cover_SD", "Freq_Mean", "Freq_SD"), digits = 6)
  } else {
    fitted_importance1 <- fitted_pre_importance[[1]]
    fitted_importance2 <- fitted_post_importance[[1]]
    for (i in 2:length(folds)) {
      fitted_importance1 <- DTrbind(fitted_importance1, fitted_pre_importance[[i]])
      fitted_importance2 <- DTrbind(fitted_importance2, fitted_post_importance[[i]])
    }
    fitted_importance1 <- fitted_importance1[, list(BGainM = mean(Gain), BGainSD = sd(Gain), BCoverM = mean(Cover), BCoverSD = sd(Cover), FreqM = mean(Frequency), FreqSD = sd(Frequency)), by = Feature]
    fitted_importance2 <- fitted_importance2[, list(UGainM = mean(Gain), UGainSD = sd(Gain), UCoverM = mean(Cover), UCoverSD = sd(Cover)), by = Feature]
    fitted_importance <- merge(fitted_importance2, fitted_importance1, by = "Feature", sort = FALSE)
    setcolorder(fitted_importance, c(1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 11))
    fitted_importance[["UGainM"]] <- fitted_importance[["UGainM"]] - fitted_importance[["BGainM"]]
    fitted_importance[["UCoverM"]] <- fitted_importance[["UCoverM"]] - fitted_importance[["BCoverM"]]
    datatable(fitted_importance, filter = "top", class = "cell-border stripe", options = list(pageLength = 10, lengthMenu = c(5, 10, 15, 20, 25, 50, 100, 500))) %>%
      formatStyle("BGainM", background = styleColorBar(range(fitted_importance$BGainM, na.rm = TRUE, finite = TRUE), "lightgreen"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("UGainM", background = styleColorBar(range(fitted_importance$UGainM, na.rm = TRUE, finite = TRUE), "lightgreen"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("BCoverM", background = styleColorBar(range(fitted_importance$BCoverM, na.rm = TRUE, finite = TRUE), "lightblue"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("UCoverM", background = styleColorBar(range(fitted_importance$UCoverM, na.rm = TRUE, finite = TRUE), "lightblue"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("FreqM", background = styleColorBar(range(fitted_importance$FreqM, na.rm = TRUE, finite = TRUE), "lightgrey"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("BGainSD", background = styleColorBar(range(fitted_importance$BGainSD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("UGainSD", background = styleColorBar(range(fitted_importance$UGainSD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("BCoverSD", background = styleColorBar(range(fitted_importance$BCoverSD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("UCoverSD", background = styleColorBar(range(fitted_importance$UCoverSD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatStyle("FreqSD", background = styleColorBar(range(fitted_importance$FreqSD, na.rm = TRUE, finite = TRUE), "pink"), backgroundSize = '100% 90%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center') %>%
      formatPercentage(columns = c("BGainM", "UGainM", "BGainSD", "UGainSD", "BCoverM", "UCoverM", "BCoverSD", "UCoverSD", "FreqM", "FreqSD"), digits = 6)
  }
}
Plots (plots = `r plots`)
On binary classes, plotting the predictions against the labels does not make much sense.
if (plots & (!classification)) {
  fitted_in <- numeric(0)
  fitted_out <- numeric(0)
  folded <- numeric(0)
  for (i in 1:length(folds)) {
    fitted_in <- c(fitted_in, fitted_values[[i]])
    fitted_out <- c(fitted_out, fitted_predicted[[i]])
    folded <- c(folded, rep(i, length(folds[[i]])))
  }
  print(xyplot(fitted_out ~ fitted_in, group = folded, data = data.frame(Folds = as.factor(folded), Fitted = fitted_in, Predicted = fitted_out), auto.key = list(space = "right"), main = "Cross-Validated xgboost fitted values vs predicted values", xlab = "Fitted Values", ylab = "Predicted Values"))
}
if (plots & (!classification)) {
  for (i in 1:length(folds)) {
    plot(x = fitted_values[[i]], y = fitted_predicted[[i]], main = paste0("Cross-Validated (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ") xgboost fitted values vs predicted values"), xlab = "Fitted Values", ylab = "Predicted Values")
  }
}
For classification, we can plot each metric against the probability threshold to understand how the model behaves across the classification metrics.
if (plots & classification) {
  grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {
    plots <- list(...)
    position <- match.arg(position)
    g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
    legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
    lheight <- sum(legend$height)
    lwidth <- sum(legend$width)
    gl <- lapply(plots, function(x) x + theme(legend.position = "none"))
    gl <- c(gl, ncol = ncol, nrow = nrow)
    combined <- switch(position,
                       "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)),
                       "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
    grid.newpage()
    grid.draw(combined)
  }
  f1s_evolution <- list()
  f1s_combo <- data.table(y_prob = numeric(0), f1s = numeric(0), Folds = numeric(0))
  mcc_evolution <- list()
  mcc_combo <- data.table(y_prob = numeric(0), mcc = numeric(0), Folds = numeric(0))
  sens_evolution <- list()
  sens_combo <- data.table(y_prob = numeric(0), sens = numeric(0), Folds = numeric(0))
  spec_evolution <- list()
  spec_combo <- data.table(y_prob = numeric(0), spec = numeric(0), Folds = numeric(0))
  fall_evolution <- list()
  fall_combo <- data.table(y_prob = numeric(0), fall = numeric(0), Folds = numeric(0))
  miss_evolution <- list()
  miss_combo <- data.table(y_prob = numeric(0), miss = numeric(0), Folds = numeric(0))
  par(mfrow = c(3, 2))
  for (i in 1:length(folds)) {
    f1s_evolution[[i]] <- plotting.max_f1(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    f1s_combo <- rbind(f1s_combo, cbind(f1s_evolution[[i]], Folds = rep(i, nrow(f1s_evolution[[i]]))))
    mcc_evolution[[i]] <- plotting.max_mcc(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    mcc_combo <- rbind(mcc_combo, cbind(mcc_evolution[[i]], Folds = rep(i, nrow(mcc_evolution[[i]]))))
    sens_evolution[[i]] <- plotting.max_sensitivity(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    sens_combo <- rbind(sens_combo, cbind(sens_evolution[[i]], Folds = rep(i, nrow(sens_evolution[[i]]))))
    spec_evolution[[i]] <- plotting.max_specificity(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    spec_combo <- rbind(spec_combo, cbind(spec_evolution[[i]], Folds = rep(i, nrow(spec_evolution[[i]]))))
    fall_evolution[[i]] <- plotting.max_fallout(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    fall_combo <- rbind(fall_combo, cbind(fall_evolution[[i]], Folds = rep(i, nrow(fall_evolution[[i]]))))
    miss_evolution[[i]] <- plotting.max_missrate(fitted_predicted[[i]], fitted_values[[i]], plots = FALSE)
    miss_combo <- rbind(miss_combo, cbind(miss_evolution[[i]], Folds = rep(i, nrow(miss_evolution[[i]]))))
  }
  f1s_combo[["Folds"]] <- as.factor(f1s_combo[["Folds"]])
  mcc_combo[["Folds"]] <- as.factor(mcc_combo[["Folds"]])
  sens_combo[["Folds"]] <- as.factor(sens_combo[["Folds"]])
  spec_combo[["Folds"]] <- as.factor(spec_combo[["Folds"]])
  fall_combo[["Folds"]] <- as.factor(fall_combo[["Folds"]])
  miss_combo[["Folds"]] <- as.factor(miss_combo[["Folds"]])
  p1 <- ggplot(data = f1s_combo, aes(x = y_prob, y = f1s, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "F1 Score", title = "Cross-Validated F1 Score vs Probability") + theme_bw()
  p2 <- ggplot(data = mcc_combo, aes(x = y_prob, y = mcc, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "MCC", title = "Cross-Validated MCC vs Probability") + theme_bw()
  p3 <- ggplot(data = sens_combo, aes(x = y_prob, y = sens, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "TPR", title = "Cross-Validated TPR vs Probability") + theme_bw()
  p4 <- ggplot(data = spec_combo, aes(x = y_prob, y = spec, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "TNR", title = "Cross-Validated TNR vs Probability") + theme_bw()
  p5 <- ggplot(data = fall_combo, aes(x = y_prob, y = fall, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "FPR", title = "Cross-Validated FPR vs Probability") + theme_bw()
  p6 <- ggplot(data = miss_combo, aes(x = y_prob, y = miss, group = Folds, colour = Folds)) + geom_path() + labs(x = "Probability", y = "FNR", title = "Cross-Validated FNR vs Probability") + theme_bw()
  grid_arrange_shared_legend(p1, p2, p3, p4, p5, p6, nrow = 3, ncol = 2)
  for (i in 1:length(folds)) {
    plot(x = f1s_evolution[[i]]$y_prob, y = f1s_evolution[[i]]$f1s, main = paste0("F1 Score (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "F1 Score", type = plot_type)
    plot(x = mcc_evolution[[i]]$y_prob, y = mcc_evolution[[i]]$mcc, main = paste0("MCC (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "Matthews Correlation Coefficient", type = plot_type)
    plot(x = sens_evolution[[i]]$y_prob, y = sens_evolution[[i]]$sens, main = paste0("TPR (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "True Positive Rate", type = plot_type)
    plot(x = spec_evolution[[i]]$y_prob, y = spec_evolution[[i]]$spec, main = paste0("TNR (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "True Negative Rate", type = plot_type)
    plot(x = fall_evolution[[i]]$y_prob, y = fall_evolution[[i]]$fall, main = paste0("FPR (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "False Positive Rate", type = plot_type)
    plot(x = miss_evolution[[i]]$y_prob, y = miss_evolution[[i]]$miss, main = paste0("FNR (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ")"), xlab = "Predicted Value", ylab = "False Negative Rate", type = plot_type)
  }
}
We can now reset the printing options and finish.
options(max.print = previousLimit)
options(scipen = previousScipen)