```{r setup}
knitr::opts_chunk$set(echo = TRUE)

# init libraries
library(glmnet)
library(ggpubr)
library(knitr)
library(dplyr)
library(ggplot2)
library(neuralnet)
library(tidyverse)
library(GGally)
library(corrplot)

# document init
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%",
  fig.height = 3
)

# ggplot theme tweaks (these are ggplot2 options, not knitr chunk options)
theme_update(text = element_text(size = 5),
             axis.text.x = element_text(angle = 90, hjust = 1))

# session setup (Windows-only memory limit; working directory for the data files)
memory.limit(20000000)
setwd("D:/cfrasier/work/R")
```
```{r}
# set the column names for the data: the label column (DN) and the feature columns
col_names <- c("DN", "Phase", "Emission", "Incidence", "LEmission", "LIncidence",
               "Lat", "Long", "SunAz", "CraftAz")

# read the data in from the text file and apply the column names
data <- read.table("../data/band1unit7.dat",
                   stringsAsFactors = FALSE,
                   header = FALSE,
                   strip.white = TRUE,
                   sep = "",
                   col.names = col_names)

# make sure the data is a data.frame
data <- as.data.frame(data)

# filter out incidence angles of 70 degrees or greater
data <- data[data$Incidence < 70, ]

# take the cosines of the emission, incidence, local emission, and local incidence angles
# (note: cos() expects radians, so if these columns are stored in degrees they should be
# converted with angle * pi / 180 first)
data[, 3] <- cos(data[, 3])
data[, 4] <- cos(data[, 4])
data[, 5] <- cos(data[, 5])
data[, 6] <- cos(data[, 6])
```
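Before modeling, it can help to confirm that the cleaned data looks reasonable. The short sketch below is an addition, not part of the original pipeline; it simply inspects the filtered data frame and plots the pairwise correlations using the corrplot package loaded above.

```{r}
# optional sanity check (not part of the original pipeline): inspect the cleaned data
# and look at pairwise correlations between DN and the geometry columns
str(data)
summary(data$DN)
corrplot(cor(data), method = "circle")
```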
Ridge regression is an extension of ordinary linear regression: it is linear regression with an L2 regularization penalty added to the loss. Each variable gets a beta coefficient, which acts as a weight and determines how strongly that variable affects the predicted outcome. Under the L2 penalty the betas are shrunk toward zero and can become very small, or even negative, but they never reach exactly zero. Lasso regression instead uses an L1 penalty, which can shrink a coefficient all the way to zero, so some variable weights are removed from the model entirely. This has a major impact on models that have been overtrained, because the algorithm effectively ignores variables that are just "noise" and relies on the most strongly correlated variables for prediction.
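To make the difference concrete, the two penalized objectives can be written side by side (these are the standard textbook forms, not something fitted to the data above): ridge penalizes the sum of squared coefficients, lasso the sum of their absolute values.

$$
\hat{\beta}^{\text{ridge}} = \arg\min_{\beta} \sum_{i=1}^{n} \left( y_i - x_i^{\top}\beta \right)^2 + \lambda \sum_{j=1}^{p} \beta_j^2,
\qquad
\hat{\beta}^{\text{lasso}} = \arg\min_{\beta} \sum_{i=1}^{n} \left( y_i - x_i^{\top}\beta \right)^2 + \lambda \sum_{j=1}^{p} \lvert \beta_j \rvert
$$

The `glmnet` package blends these with its `alpha` argument: `alpha = 0` gives ridge, `alpha = 1` gives lasso, and intermediate values (such as the 0.75 used below) give an elastic-net mixture of the two penalties.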
Lasso regression is particularly focused on preventing overfitting. The key is the L1 penalty in its loss function, which drives the coefficients of weakly contributing features exactly to zero, so any feature whose weight falls low enough is completely ignored by the fitted model.
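The sketch below illustrates this behavior on a small synthetic data set (the names and values here are made up for illustration and are unrelated to the photometric data): with the same penalty strength, the ridge fit keeps small nonzero weights on the noise predictors, while the lasso fit typically sets them exactly to zero.

```{r}
# synthetic illustration of L2 vs. L1 shrinkage (not the photometric data)
set.seed(1)
n <- 200; p <- 6
X_demo <- matrix(rnorm(n * p), n, p)
# only the first two predictors carry signal; the other four are pure noise
y_demo <- 2 * X_demo[, 1] - 1.5 * X_demo[, 2] + rnorm(n, sd = 0.5)

ridge_fit <- glmnet(X_demo, y_demo, alpha = 0, lambda = 0.5)  # alpha = 0 -> ridge (L2)
lasso_fit <- glmnet(X_demo, y_demo, alpha = 1, lambda = 0.5)  # alpha = 1 -> lasso (L1)

# ridge shrinks the noise coefficients toward zero; lasso usually zeros them out entirely
cbind(ridge = as.numeric(coef(ridge_fit)),
      lasso = as.numeric(coef(lasso_fit)))
```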
```{r}
# prep data: DN is the response, the remaining columns are the predictors
y.vec <- as.double(data[, 1])
X.mat <- as.matrix(data[, -1])
lambda_seq <- 10^seq(3, -3, by = -.125)
# elastic-net mixing parameter (alpha = 1 is pure lasso, alpha = 0 is ridge)
alpha <- 0.75
colnames(X.mat) <- NULL

# split the data into test and train sets
set.seed(150)
# train on 80% of the data, test on the remaining 20%
train <- sample(1:nrow(X.mat), size = (nrow(X.mat) * 0.80))
test <- (-train)

# run 20-fold cross validation of the lasso regression to select the best lambda
cv_output <- cv.glmnet(X.mat[train, ], y.vec[train],
                       nfolds = 20, alpha = alpha, lambda = lambda_seq)

# identify the best lambda
best_lam <- cv_output$lambda.min

# rebuild the model with the best lambda value identified
lasso_best <- glmnet(X.mat[train, ], y.vec[train], alpha = alpha, lambda = best_lam)

# calculate predictions on the held-out test set
pred <- predict(lasso_best, s = best_lam, newx = X.mat[test, ])

# create the output data frame and rename the columns
final <- data.frame(y.vec[test], pred)
colnames(final) <- c("Actual", "Predicted")

# calculate the R^2 of the fit
RSS <- sum((final$Predicted - final$Actual)^2)
TSS <- sum((final$Actual - mean(final$Actual))^2)
R_SQ <- 1 - RSS / TSS
R_SQ

# calculate the prediction error
lasso_RMSE <- sqrt(sum((final$Predicted - final$Actual)^2) / nrow(final))
lasso_MSE <- sum((final$Predicted - final$Actual)^2) / nrow(final)

# display the errors with 10 digits
round(lasso_RMSE, digits = 10)
round(lasso_MSE, digits = 10)

# prep plot data.frames with the results
actualData <- data.frame(c(seq(1, nrow(final))), final$Actual)
predictedData <- data.frame(c(seq(1, nrow(final))), final$Predicted)
cols <- c("Observation", "Point")
colnames(predictedData) <- cols
colnames(actualData) <- cols

# plot predictions (blue) against actual values (red)
p2 <- ggplot() +
  geom_point(data = actualData, aes(x = Observation, y = Point), color = "red") +
  geom_point(data = predictedData, aes(x = Observation, y = Point), color = "blue") +
  xlab("Observations") +
  ylab("DN Value 0-0.035") +
  labs(title = "Predicted vs. Actual", subtitle = "Blue vs Red") +
  ylim(c(min(actualData$Point), max(actualData$Point)))
plot(p2)

# check lengths of the prediction and actual vectors
length(actualData$Observation)
length(predictedData$Observation)
```
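As a quick check on the lambda selection (this diagnostic is an addition, not part of the original analysis), the `cv.glmnet` object can be plotted to show the mean cross-validated error across the lambda sequence, and its `lambda.min` and `lambda.1se` fields give the error-minimizing and more conservative choices respectively.

```{r}
# diagnostic sketch: visualize the cross-validation curve used to pick lambda
plot(cv_output)
cv_output$lambda.min   # lambda with the lowest mean CV error
cv_output$lambda.1se   # largest lambda within one standard error of the minimum
```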
These results are the most promising so far. The basic single-layer neural network overfits the training set enough to produce roughly 5% error on its predictions, and the multi-layer network overtrains so badly that its error can hardly be minimized into accurate predictions at all. By comparison, this linear model using lasso regression trains very well relative to the other models tried so far.
The reason this is a better model than the neural network is that a neural network adjusts the weights and biases connected to every input node and uses all of them to produce a prediction. Every input value is therefore taken into account no matter how small its weight becomes, which in turn can cause discrepancies when there are hidden linear relationships among the input variables that the network does not discover on its own.
Lasso regression uses shrinkage, pulling the coefficient estimates toward a central point (zero in the standard formulation). This encourages simple, sparse models, meaning only the variables deemed "useful" actually contribute to the prediction. We can see this idea in action by viewing the model's coefficients with the `coef(lasso_best)` command, which prints each variable's coefficient and shows which variables the model is "ignoring" (those with a coefficient of 0).
```{r}
# display which variables are actually being used in the model
coef(lasso_best)
```
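For convenience, the kept and dropped predictors can also be pulled out programmatically. This is a small added sketch (the `kept`/`dropped` names are just illustrative), built on the same `coef(lasso_best)` output shown above.

```{r}
# split the coefficient vector into predictors the model kept vs. those shrunk to zero
beta <- as.matrix(coef(lasso_best))
kept <- rownames(beta)[beta[, 1] != 0]
dropped <- rownames(beta)[beta[, 1] == 0]
kept
dropped
```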
```{r}
# run 30-fold cross validation of the lasso regression to select the best lambda
cv_output <- cv.glmnet(X.mat[train, ], y.vec[train],
                       nfolds = 30, alpha = alpha, lambda = lambda_seq)

# identify the best lambda
best_lam <- cv_output$lambda.min

# rebuild the model with the best lambda value identified
lasso_best <- glmnet(X.mat[train, ], y.vec[train], alpha = alpha, lambda = best_lam)

# calculate predictions on the held-out test set
pred <- predict(lasso_best, s = best_lam, newx = X.mat[test, ])

# create the output data frame and rename the columns
final <- data.frame(y.vec[test], pred)
colnames(final) <- c("Actual", "Predicted")

# calculate the R^2 of the fit
RSS <- sum((final$Predicted - final$Actual)^2)
TSS <- sum((final$Actual - mean(final$Actual))^2)
R_SQ <- 1 - RSS / TSS
R_SQ

# calculate the prediction error
lasso_RMSE <- sqrt(sum((final$Predicted - final$Actual)^2) / nrow(final))
lasso_MSE <- sum((final$Predicted - final$Actual)^2) / nrow(final)

# display the errors with 10 digits
round(lasso_RMSE, digits = 10)
round(lasso_MSE, digits = 10)

# prep plot data.frames with the results
actualData <- data.frame(c(seq(1, nrow(final))), final$Actual)
predictedData <- data.frame(c(seq(1, nrow(final))), final$Predicted)
cols <- c("Observation", "Point")
colnames(predictedData) <- cols
colnames(actualData) <- cols

# plot predictions (blue) against actual values (red)
p2 <- ggplot() +
  geom_point(data = actualData, aes(x = Observation, y = Point), color = "red") +
  geom_point(data = predictedData, aes(x = Observation, y = Point), color = "blue") +
  xlab("Observations") +
  ylab("DN Value 0-0.035") +
  labs(title = "Predicted vs. Actual", subtitle = "Blue vs Red") +
  ylim(c(min(actualData$Point), max(actualData$Point)))
plot(p2)
```
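Since the only difference from the previous run is the number of cross-validation folds, a small comparison loop (an added sketch, not part of the original analysis) can make the effect of `nfolds` on the selected lambda and the cross-validated error explicit:

```{r}
# compare how the choice of nfolds affects lambda selection and CV error,
# using the same training split, alpha, and lambda sequence defined above
set.seed(150)
fold_counts <- c(10, 20, 30)
cv_summary <- sapply(fold_counts, function(k) {
  cv_k <- cv.glmnet(X.mat[train, ], y.vec[train],
                    nfolds = k, alpha = alpha, lambda = lambda_seq)
  c(nfolds = k, lambda_min = cv_k$lambda.min, min_cv_error = min(cv_k$cvm))
})
t(cv_summary)
```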