Preparing Data

Loading necessary packages

We must load the package Laurae, data.table, rmarkdown, RcppArmadillo, DT, formattable, matrixStats, lattice, R.utils, ggplot2, grid, and gridExtra before continuing.

library(Laurae)
library(data.table)
library(rmarkdown)
library(RcppArmadillo)
library(DT)
library(formattable)
library(matrixStats)
library(lattice)
library(R.utils)
library(ggplot2)
library(grid)
library(gridExtra)

Print-A-Lot

We are going to print a lot. Therefore, we must go over the limitations of R.

previousLimit <- getOption("max.print")
previousScipen <- getOption("scipen")
options(max.print = 1e7)
options(scipen = 999)
my_data <- copy(data)

Data Normalization (normalize = r normalize)

The features can be normalized to the range [0, 1].

if (normalize) {
  for (i in 1:ncol(data)) {
    my_data[[i]] <- (my_data[[i]] - min(my_data[[i]], na.rm = TRUE)) / (max(my_data[[i]], na.rm = TRUE) - min(my_data[[i]], na.rm = TRUE))
  }
}

Data Cleaning (cleaning = r cleaning)

The linear model we are using is not supporting missing values. We are replacing all NAs by 0.

if (cleaning) {
  my_data <- DTfillNA(my_data, value = 0)
}

Rank Deficiency Check (deficiency = r deficiency)

We can check for rank deficiency using kappa. The higher the value, the higher the rank deficiency.

The kappa value is r ifelse(deficiency, kappa(my_data), "not computed").

Creating the Regression Model

We are generating the regression model per fold.

fitted_lm <- list()
StartTime <- timer()
CurrentTime <- StartTime
for (i in 1:length(folds)) {
  data_temp <- DTsubsample(my_data, kept = folds[[i]], remove = TRUE, low_mem = FALSE, collect = 50, silent = TRUE)
  label_temp <- label[-folds[[i]]]
  fitted_lm[[i]] <- fastLm(X = data_temp, y = label_temp)
  gc(verbose = FALSE)
  cat("[", format(Sys.time(), "%a %b %d %Y %X"), "] Fitted the regression model on fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), " in ", sprintf("%07.03f", (timer() - CurrentTime) / 1000), "s.  \n", sep = "")
  CurrentTime <- timer()
}

Aggregated Regression Statistics (out of fold)

We must gather all values first.

fitted_values <- list()
fitted_predicted <- list()
fitted_diff <- list()
fitted_sqdiff <- list()
fitted_coefficients <- data.table(Feature = 1:ncol(my_data))
r_pearson <- numeric(length(folds))
r_spearman <- numeric(length(folds))
r_squared <- numeric(length(folds))
r_mae <- numeric(length(folds))
r_mse <- numeric(length(folds))
r_rmse <- numeric(length(folds))
r_mape <- numeric(length(folds))
for (i in 1:length(folds)) {
  fitted_values[[i]] <- label[folds[[i]]]
  fitted_predicted[[i]] <- as.numeric(as.matrix(my_data[folds[[i]], ]) %*% fitted_lm[[i]]$coefficients)
  fitted_diff[[i]] <- abs(fitted_values[[i]] - fitted_predicted[[i]])
  fitted_sqdiff[[i]] <- fitted_diff[[i]] * fitted_diff[[i]]
  fitted_coefficients <- fitted_coefficients[, (paste0("Fold_", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i))) := as.numeric(fitted_lm[[i]]$coefficients)]
  r_pearson[i] <- cor(data.frame(A = fitted_values[[i]], B = fitted_predicted[[i]]), method = "pearson")[1, 2]
  r_spearman[i] <- cor(data.frame(A = fitted_values[[i]], B = fitted_predicted[[i]]), method = "spearman")[1, 2]
  r_squared[i] <- r_pearson[i] * r_pearson[i]
  r_mae[i] <- mean(fitted_diff[[i]])
  r_mse[i] <- mean(fitted_sqdiff[[i]])
  r_rmse[i] <- sqrt(r_mse[i])
  r_mape[i] <- mean(fitted_diff[[i]] / fitted_values[[i]])
  gc(verbose = FALSE)
}
fitted_coefficients <- fitted_coefficients[, Feature := NULL]
fitted_coefficients2 <- data.table(Feature = colnames(my_data), Mean = rowMeans(as.matrix(fitted_coefficients)), SD = rowSds(as.matrix(fitted_coefficients)))
fitted_coefficients <- DTcbind(fitted_coefficients2, fitted_coefficients)

Base Statistics, global (stats = r stats)

A pretty table is better than text to print the base statistics.

if (stats) {
  stats_table <- data.table(Statistic = c("Pearson Correlation Coefficient (R)", "Coefficient of Determination (R^2)", "Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "Root Mean Squared Error (RMSE)", "Mean Average Percentage Error (MAPE)"), Mean = c(mean(r_pearson), mean(r_squared), mean(r_mae), mean(r_mse), mean(r_rmse), mean(r_mape)), SD = c(sd(r_pearson), sd(r_squared), sd(r_mae), sd(r_mse), sd(r_rmse), sd(r_mape)))
  formattable(stats_table)
}

Base Statistics (per fold) (stats = r stats)

A pretty table is better than text to print the base statistics.

if (stats) {
  stats_table <- data.table(Folds = 1:length(folds), R = r_pearson, R2 = r_squared, MAE = r_mae, MSE = r_mse, RMSE = r_rmse, MAPE = r_mape)
  formattable(stats_table, list(R = color_bar("lightpink"), R2 = color_bar("pink"), MAE = color_bar("lightgreen"), MSE = color_bar("lightgrey"), RMSE = color_bar("lightblue"), MAPE = color_bar("cyan")))
}

We can plot pie charts if applicable.

if (stats & plots) {

  grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {

    plots <- list(...)
    position <- match.arg(position)
    g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
    legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
    lheight <- sum(legend$height)
    lwidth <- sum(legend$width)
    gl <- lapply(plots, function(x) x + theme(legend.position="none"))
    gl <- c(gl, ncol = ncol, nrow = nrow)

    combined <- switch(position,
                       "bottom" = arrangeGrob(do.call(arrangeGrob, gl),
                                              legend,
                                              ncol = 1,
                                              heights = unit.c(unit(1, "npc") - lheight, lheight)),
                       "right" = arrangeGrob(do.call(arrangeGrob, gl),
                                             legend,
                                             ncol = 2,
                                             widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
    grid.newpage()
    grid.draw(combined)

  }

  stats_table_shadow <- copy(stats_table)
  stats_table_shadow[["Folds"]] <- as.factor(stats_table_shadow[["Folds"]])

  p01 <- ggplot(stats_table_shadow, aes(x = Folds, y = R, fill = Folds, label = round(R, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "Pearson's R", title = "Pearson's R per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p02 <- ggplot(stats_table_shadow, aes(x = Folds, y = R2, fill = Folds, label = round(R2, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "R-Squared", title = "Pearson's R-Squared per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p03 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAE, fill = Folds, label = round(MAE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAE", title = "MAE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p04 <- ggplot(stats_table_shadow, aes(x = Folds, y = MSE, fill = Folds, label = round(MSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MSE", title = "MSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p05 <- ggplot(stats_table_shadow, aes(x = Folds, y = RMSE, fill = Folds, label = round(RMSE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "RMSE", title = "RMSE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  p06 <- ggplot(stats_table_shadow, aes(x = Folds, y = MAPE, fill = Folds, label = round(MAPE, digits = 5))) + geom_bar(stat = "identity") + coord_polar(theta = "x", direction = 1) + labs(x = "Fold", y = "MAPE", title = "MAPE per fold") + geom_text(position = position_stack(vjust = 0.5)) + theme_bw()
  grid_arrange_shared_legend(p01, p02, p03, p04, p05, p06, nrow = 3, ncol = 2)

}

Coefficients on Features (coefficients = r coefficients)

We can get the multiplicative coefficient assigned to each feature per fold.

if (coefficients) {
  as.datatable(formattable(fitted_coefficients, c(list(Mean = color_tile("lightpink", "lightblue"), SD = color_tile("lightgreen", "lightpink"), (area(col = colnames(fitted_coefficients)[3:ncol(fitted_coefficients)]) ~ color_tile("lightpink", "lightblue")))))) %>% formatRound(1:ncol(my_data), digits = 5)
}

Plotting Statistics (plots = r plots)

if (plots) {
  fitted_in <- numeric(0)
  fitted_out <- numeric(0)
  folded <- numeric(0)
  for (i in 1:length(folds)) {
    fitted_in <- c(fitted_in, fitted_values[[i]])
    fitted_out <- c(fitted_out, fitted_predicted[[i]])
    folded <- c(folded, rep(i, length(folds[[i]])))
  }
  print(xyplot(fitted_out ~ fitted_in, group = folded, data = data.frame(Folds = as.factor(folded), Fitted = fitted_in, Predicted = fitted_out), auto.key = list(space = "right"), main = "Cross-Validated Linear Regression fitted values vs predicted values", xlab = "Fitted Values", ylab = "Predicted Values"))
}
if (plots) {
  for (i in 1:length(folds)) {
    plot(x = fitted_values[[i]], y = fitted_predicted[[i]], main = paste0("Cross-Validated (fold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ") Linear Regression fitted values vs predicted values"), xlab = "Fitted Values", ylab = "Predicted Values")
  }
}

In-depth Statistics (adv_stats = r adv_stats)

options(scipen = 6)
if (adv_stats) {
  for (i in 1:length(folds)) {
    cat("  \nFold ", sprintf(paste0("%0", floor(log10(length(folds))) + 1, "d"), i), ".  \n", sep = "")
    print(summary(fitted_lm[[i]]))
  }
}

We can reset the printing options and leave away.

options(max.print = previousLimit)
options(scipen = previousScipen)


Laurae2/Laurae documentation built on May 8, 2019, 7:59 p.m.