We must load the package Laurae
, data.table
, rmarkdown
, RcppArmadillo
, DT
, formattable
, matrixStats
, lattice
, R.utils
, ggplot2
, grid
, and gridExtra
before continuing.
library(Laurae) library(data.table) library(rmarkdown) library(RcppArmadillo) library(DT) library(formattable) library(matrixStats) library(lattice) library(R.utils) library(ggplot2) library(grid) library(gridExtra)
We are going to print a lot. Therefore, we must go over the limitations of R.
previousLimit <- getOption("max.print") previousScipen <- getOption("scipen") options(max.print = 1e7) options(scipen = 999) my_data <- copy(data)
r normalize
)The features can be normalized to the range [0, 1].
if (normalize) { for (i in 1:ncol(data)) { my_data[[i]] <- (my_data[[i]] - min(my_data[[i]], na.rm = TRUE)) / (max(my_data[[i]], na.rm = TRUE) - min(my_data[[i]], na.rm = TRUE)) } }
r cleaning
)The linear model we are using is not supporting missing values. We are replacing all NAs by 0.
if (cleaning) { my_data <- DTfillNA(my_data, value = 0) }
r deficiency
)We can check for rank deficiency using kappa. The higher the value, the higher the rank deficiency.
The kappa value is r ifelse(deficiency, kappa(my_data), "not computed")
.
We are generating the regression model per fold.
fitted_lm <- list() StartTime <- timer() CurrentTime <- StartTime for (i in 1:length(folds)) { data_temp <- DTsubsample(my_data, kept = folds[[i]], remove = TRUE, low_mem = FALSE, collect = 50, silent = TRUE) label_temp <- label[-folds[[i]]] fitted_lm[[i]] <- fastLm(X = data_temp, y = label_temp) gc(verbose = FALSE) cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] Fitted the regression model on fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), " in ", sprintf("%07.03f", (timer() - CurrentTime) / 1000), "s. \n", sep = "") CurrentTime <- timer() }
We must gather all values first.
fitted_values <- list() fitted_predicted <- list() fitted_diff <- list() fitted_sqdiff <- list() fitted_coefficients <- data.table(Feature = 1:ncol(my_data)) r_pearson <- numeric(length(folds)) r_spearman <- numeric(length(folds)) r_squared <- numeric(length(folds)) r_mae <- numeric(length(folds)) r_mse <- numeric(length(folds)) r_rmse <- numeric(length(folds)) r_mape <- numeric(length(folds)) for (i in 1:length(folds)) { fitted_values[[i]] <- label[folds[[i]]] fitted_predicted[[i]] <- as.numeric(as.matrix(my_data[folds[[i]], ]) %*% fitted_lm[[i]]$coefficients) fitted_diff[[i]] <- abs(fitted_values[[i]] - fitted_predicted[[i]]) fitted_sqdiff[[i]] <- fitted_diff[[i]] * fitted_diff[[i]] fitted_coefficients <- fitted_coefficients[, (paste0("Fold_", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i))) := as.numeric(fitted_lm[[i]]$coefficients)] r_pearson[i] <- cor(data.frame(A = fitted_values[[i]], B = fitted_predicted[[i]]), method = "pearson")[1, 2] r_spearman[i] <- cor(data.frame(A = fitted_values[[i]], B = fitted_predicted[[i]]), method = "spearman")[1, 2] r_squared[i] <- r_pearson[i] * r_pearson[i] r_mae[i] <- mean(fitted_diff[[i]]) r_mse[i] <- mean(fitted_sqdiff[[i]]) r_rmse[i] <- sqrt(r_mse[i]) r_mape[i] <- mean(fitted_diff[[i]] / fitted_values[[i]]) gc(verbose = FALSE) } fitted_coefficients <- fitted_coefficients[, Feature := NULL] fitted_coefficients2 <- data.table(Feature = colnames(my_data), Mean = rowMeans(as.matrix(fitted_coefficients)), SD = rowSds(as.matrix(fitted_coefficients))) fitted_coefficients <- DTcbind(fitted_coefficients2, fitted_coefficients)
r stats
)A pretty table is better than text to print the base statistics.
if (stats) { stats_table <- data.table(Statistic = c("Pearson Correlation Coefficient (R)", "Coefficient of Determination (R^2)", "Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "Root Mean Squared Error (RMSE)", "Mean Average Percentage Error (MAPE)"), Mean = c(mean(r_pearson), mean(r_squared), mean(r_mae), mean(r_mse), mean(r_rmse), mean(r_mape)), SD = c(sd(r_pearson), sd(r_squared), sd(r_mae), sd(r_mse), sd(r_rmse), sd(r_mape))) formattable(stats_table) }
r stats
)A pretty table is better than text to print the base statistics.
if (stats) { stats_table <- data.table(Folds = 1:length(folds), R = r_pearson, R2 = r_squared, MAE = r_mae, MSE = r_mse, RMSE = r_rmse, MAPE = r_mape) formattable(stats_table, list(R = color_bar("lightpink"), R2 = color_bar("pink"), MAE = color_bar("lightgreen"), MSE = color_bar("lightgrey"), RMSE = color_bar("lightblue"), MAPE = color_bar("cyan"))) }
We can plot pie charts if applicable.
if (stats & plots) { grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) { plots <- list(...) position <- match.arg(position) g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]] lheight <- sum(legend$height) lwidth <- sum(legend$width) gl <- lapply(plots, function(x) x + theme(legend.position="none")) gl <- c(gl, ncol = ncol, nrow = nrow) combined <- switch(position, "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)), "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth))) grid.newpage() grid.draw(combined) } stats_table_shadow <- copy(stats_table) stats_table_shadow[["Folds"]] <- as.factor(stats_table_shadow[["Folds"]]) p01 <- ggplot(stats_table_shadow, aes(x = Folds, y = R, fill = Folds, label = round(R, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Pearson's R", title = "Pearson's R per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() p02 <- ggplot(stats_table_shadow, aes(x = Folds, y = R2, fill = Folds, label = round(R2, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "R-Squared", title = "Pearson's R-Squared per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() p03 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAE, fill = Folds, label = round(MAE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAE", title = "MAE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() p04 <- ggplot(stats_table_shadow, aes(x = Folds, y = MSE, fill = Folds, label = round(MSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MSE", title = "MSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() p05 <- ggplot(stats_table_shadow, aes(x = Folds, y = RMSE, fill = Folds, label = round(RMSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "RMSE", title = "RMSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() p06 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAPE, fill = Folds, label = round(MAPE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAPE", title = "MAPE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw() grid_arrange_shared_legend(p01, p02, p03, p04, p05, p06, nrow = 3, ncol = 2) }
r coefficients
)We can get the multiplicative coefficient assigned to each feature per fold.
if (coefficients) { as.datatable(formattable(fitted_coefficients, c(list(Mean = color_tile("lightpink", "lightblue"), SD = color_tile("lightgreen", "lightpink"), (area(col = colnames(fitted_coefficients)[3:ncol(fitted_coefficients)]) ~ color_tile("lightpink", "lightblue")))))) %>% formatRound(1:ncol(my_data), digits = 5) }
r plots
)if (plots) { fitted_in <- numeric(0) fitted_out <- numeric(0) folded <- numeric(0) for (i in 1:length(folds)) { fitted_in <- c(fitted_in, fitted_values[[i]]) fitted_out <- c(fitted_out, fitted_predicted[[i]]) folded <- c(folded, rep(i, length(folds[[i]]))) } print(xyplot(fitted_out ~ fitted_in, group = folded, data = data.frame(Folds = as.factor(folded), Fitted = fitted_in, Predicted = fitted_out), auto.key = list(space = "right"), main = "Cross-Validated Linear Regression fitted values vs predicted values", xlab = "Fitted Values", ylab = "Predicted Values")) }
if (plots) { for (i in 1:length(folds)) { plot(x = fitted_values[[i]], y = fitted_predicted[[i]], main = paste0("Cross-Validated (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ") Linear Regression fitted values vs predicted values"), xlab = "Fitted Values", ylab = "Predicted Values") } }
r adv_stats
)options(scipen = 6) if (adv_stats) { for (i in 1:length(folds)) { cat(" \nFold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ". \n", sep = "") print(summary(fitted_lm[[i]])) } }
We can reset the printing options and leave away.
options(max.print = previousLimit) options(scipen = previousScipen)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.