# project package
library(lunarLearningAlgorithms)

# supporting libraries
library(glmnet)
library(ggpubr)
library(knitr)
library(dplyr)
library(ggplot2)
library(neuralnet)
library(tidyverse)
library(GGally)
library(corrplot)

# knitr chunk options for the vignette
knitr::opts_chunk$set(
  collapse   = TRUE,
  comment    = "#>",
  out.width  = "100%",
  fig.height = 3
)

# shrink plot text and rotate x-axis labels for every ggplot in the vignette
theme_update(
  text        = element_text(size = 5),
  axis.text.x = element_text(angle = 90, hjust = 1)
)

# local session setup ( Windows-only memory limit and working directory )
memory.limit(20000000)
setwd("D:/cfrasier/work/R")
Neural networks are on the rise in the supervised learning branch of machine learning. They play an especially large role in image recognition software and have even found use in predicting stock prices. We want to use these concepts to help us predict DN values from photometric data.
We assume an approximately linear relationship between some of the ISIS variables, such as Incidence, Emission, and Phase, and the DN value.
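As an illustration only, that assumption amounts to a model of roughly the following form, where $i$, $e$, and $g$ are the incidence, emission, and phase angles; the coefficients $\beta$ are hypothetical placeholders, not fitted values:

$$DN = \beta_0 + \beta_1 \cos(i) + \beta_2 \cos(e) + \beta_3 g + \epsilon$$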
The data that we will use to train and test the accuracy of our model is read in from the ASCII ISIS output, which was converted to UTF-8 by laz@usgs.gov.
# set column names for the data features and the label column ( DN )
col_names <- c("DN", "Phase", "Emission", "Incidence", "LEmission", "LIncidence",
               "Lat", "Long", "SunAz", "CraftAz")

# read data in from the text file and set column names
data <- read.table("../data/unit4band4.dat",
                   stringsAsFactors = FALSE,
                   header = FALSE,
                   strip.white = TRUE,
                   sep = "",
                   col.names = col_names)

# convert to a matrix
data <- as.matrix(data)

# take the cosines of the emission, incidence, local emission, and local incidence angles
data[,3] <- cos(data[,3])
data[,4] <- cos(data[,4])
data[,5] <- cos(data[,5])
data[,6] <- cos(data[,6])
knitr::kable(head(data))
A single-layer neural network is good at uncovering hidden relationships inside a large data set. This model is trained as a generalized linear regression fitted with gradient descent, which assumes that some linear model can be constructed from the input variables.
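As a sketch of what the training loop does (the notation here is illustrative, not taken from the package internals), gradient descent repeatedly nudges the weight vector $w$ against the gradient of the mean squared error over the $n$ training rows, with step size $\eta$:

$$L(w) = \frac{1}{2n}\sum_{i=1}^{n}\left(f_w(x_i) - y_i\right)^2, \qquad w \leftarrow w - \eta\,\nabla_w L(w)$$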
# get the data labels ( DN ) from the input data and convert the vector to doubles
y.vec <- as.double(data[,1])

# build the feature matrix by excluding the DN column
X.mat <- matrix(as.double(data[,-1]), nrow = nrow(data), ncol = ncol(data) - 1)

# remove colnames from the feature matrix for computation
colnames(X.mat) <- NULL

# create a fold vector that assigns each observation to one of n.folds folds
n.folds <- 20
fold.vec <- sample(1:n.folds, length(y.vec), replace = TRUE)

# show structure of fold.vec
str(fold.vec)

# initialize the cross-validation parameters for the network
max.iterations <- 1000L
step.size <- 0.002
n.hidden.units <- 10L

# train the network using n folds of train/test splits
resultCV <- NNetEarlyStoppingCV(X.mat, y.vec, fold.vec,
                                max.iterations, step.size, n.hidden.units, n.folds)

# build 2-column data frames with the iteration number in one column and the
# mean loss values in the other ( ggplot requires class == data.frame )
dataVal   <- data.frame(seq(1, max.iterations), resultCV$mean.validation.loss.vec)
dataTrain <- data.frame(seq(1, max.iterations), resultCV$mean.train.loss.vec)

# set the two column names for the chart
cols <- c("Iteration", "Loss")
colnames(dataVal)   <- cols
colnames(dataTrain) <- cols

# plot the mean loss values of each fold across all iterations
p <- ggplot() +
  geom_line(data = dataVal,   aes(x = Iteration, y = Loss), color = "blue") +
  geom_line(data = dataTrain, aes(x = Iteration, y = Loss), color = "red") +
  xlab("1 to Max Iterations") +
  ylab("Mean Loss") +
  labs(title = "Validation vs. Train", subtitle = "Blue vs Red")
plot(p)
This next model is built using the `neuralnet` R package. The problem with this approach is that training can be very difficult and convergence is not always guaranteed.
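When training fails to converge, `neuralnet()` exposes a couple of controls worth knowing about. The sketch below is illustrative only: the toy data frame and the parameter values are assumptions, not values used elsewhere in this vignette.

```r
# Minimal sketch of neuralnet()'s convergence controls on synthetic data.
# The toy data and the threshold/stepmax values here are illustrative assumptions.
library(neuralnet)

toy <- data.frame(x1 = runif(200), x2 = runif(200))
toy$y <- 0.3 * toy$x1 + 0.7 * toy$x2

fit <- neuralnet(
  y ~ x1 + x2,
  data = toy,
  hidden = 3,
  threshold = 0.01,   # stop once the error gradient falls below this value
  stepmax = 1e6,      # allow more training steps before giving up ( default 1e5 )
  linear.output = TRUE
)
```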
# TODO: write a simple loop that varies the number of training observations to minimize the error
# NN using neuralnet

# re-initialize the data variable as a data.frame for processing in a new format
data <- as.data.frame(matrix(as.double(data[,]), nrow = nrow(data), ncol = ncol(data)))
colnames(data) <- col_names

# create a scaled matrix the same size as the data matrix by copying it
x.scaled.mat <- data

# min-max scale each column using that column's minimum and range ( including the DN )
# Note: this means predictions must be scaled back up afterwards
for ( i in 1:ncol(data) ) {
  x.scaled.mat[,i] <- (data[,i] - min(data[,i])) / (max(data[,i]) - min(data[,i]))
}
knitr::kable(head(x.scaled.mat))
# train with 2000 observations
is.train <- sample(1:nrow(x.scaled.mat), 2000)

# get the train data matrix
x.train.scaled <- x.scaled.mat[is.train,]

# get the test data, which is everything not used for training
x.test.scaled <- x.scaled.mat[-is.train,]

# train a network with 2 hidden layers ( 8 units then 6 units ) on a linear function of all variables
nn <- neuralnet( DN ~ Phase + Emission + Incidence + LEmission + LIncidence +
                   Lat + Long + SunAz + CraftAz,
                 data = x.train.scaled, hidden = c(8,6), linear.output = TRUE )

# plot the best run of the NN
plot(nn, rep = 'best')
Using this structure we can make predictions on the test data by running the model forward on the test observations. The predictions then need to be scaled back up using the minimum and maximum of the original data matrix.
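For reference, here is a minimal sketch of undoing that min-max scaling, assuming the column was scaled with the full-data minimum and maximum as in the loop above; the helper name `unscale` is just an illustration, not part of the package.

```r
# invert min-max scaling: map a value in [0, 1] back onto the original column's range
# ( `unscale` is an illustrative helper, not part of the package )
unscale <- function(scaled, original) {
  scaled * (max(original) - min(original)) + min(original)
}

# e.g. unscale(output$net.result, data$DN), where `output` is computed in the next chunk
```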
# compute the predictions after training is complete
output <- compute(nn, x.test.scaled[,-1])

# scale the predictions back up to the original DN range and collect the actual DN values
predictions <- as.data.frame(output$net.result * (max(data[,1]) - min(data[,1])) + min(data[,1]))
actuals <- as.data.frame(data[-is.train, 1])

# Mean Squared Error
MSE <- sum( (predictions - actuals)^2 ) / nrow(predictions)
MSE

# Root Mean Squared Error
RMSE <- sqrt( sum( (predictions - actuals)^2 ) / nrow(predictions) )
RMSE

# prep the data as 2-column data.frames with a count of observations and the predictions / actuals
actualData    <- data.frame( seq(1, nrow(actuals)), actuals )
predictedData <- data.frame( seq(1, nrow(predictions)), predictions )

# name the columns
cols <- c("Predictions", "Point")
colnames(predictedData) <- cols
colnames(actualData) <- cols

# plot predicted against actual DN values
p2 <- ggplot() +
  geom_point(data = predictedData, aes(x = Predictions, y = Point), color = "blue") +
  geom_point(data = actualData,    aes(x = Predictions, y = Point), color = "red") +
  xlab("Observations") +
  ylab("DN Value 0-0.4") +
  ylim(c(0, 0.4)) +
  labs(title = "Predicted vs Actual", subtitle = "Blue vs Red")
plot(p2)
These results again indicate overfitting of the training data, which is not promising, so I decided to check the correlation of each variable separately.
By checking the correlation matrix in this way we can see which variables have the most effect on the DN value individually. This will help us spot any linear relationship that may be hiding in the data.
# correlation tests
correlation <- cor( data )

# plot all correlations
corrplot( correlation, type = "upper", order = "hclust" )
# using LEmission ( A4 ) and LIncidence ( A5 ) because they are the most highly correlated with DN
ggscatter(as.data.frame(data), x = "LEmission", y = "DN",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "A4 Observation", ylab = "DN ( 0 - 1 )")

ggscatter(as.data.frame(data), x = "LIncidence", y = "DN",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "A5 Observation", ylab = "DN ( 0 - 1 )")
What I found was that there is no significant correlation between any single variable and the DN value, which is somewhat expected considering that Minnaert's function requires more than one variable. If there were a linear relationship to draw from a single variable, Minnaert would have drawn it.
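For reference, one common statement of Minnaert's photometric function ties reflectance to two angles at once, which is why a single-variable fit is unlikely to work; the symbols here follow the usual convention and are not taken from this data set:

$$\frac{I}{F} = B_0 \, \mu_0^{\,k} \, \mu^{\,k-1}, \qquad \mu_0 = \cos(i), \quad \mu = \cos(e)$$

where $i$ and $e$ are the incidence and emission angles, $B_0$ is a brightness constant, and $k$ is the Minnaert limb-darkening exponent.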
I did notice some small correlations between a few of the variables.
# plot the ggplot correlation graph
ggpairs( as.data.frame(data), title = "Scatterplot Matrix of Data Features" )
# seed the random number generator
set.seed( 0411020 )

# run the NN with three hidden layers ( 12, 6, and 3 units ) to try to isolate the main variables
n <- neuralnet( DN ~ Phase + Emission + Incidence + LEmission + LIncidence +
                  Lat + Long + SunAz + CraftAz,
                data = x.train.scaled, hidden = c(12,6,3) )

plot(n, rep = 'best')
Neural networks have proven inaccurate for predicting lunar data thus far. The networks overfit the training data, which forces the weights to become too specific to generalize to new observations. We need a way to prevent overfitting.
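One standard way to limit overfitting is to penalize large weights. The chunk below is a hedged sketch only: it fits an L2-penalized (ridge) linear model with a cross-validated penalty using the glmnet package loaded at the top of this vignette; the `alpha` choice and the reuse of the scaled train/test splits are assumptions for illustration, not part of this package's workflow.

```r
# Illustrative sketch: ridge regression ( L2 penalty ) with a cross-validated
# penalty strength as a regularized baseline. The alpha value and the reuse of
# x.train.scaled / x.test.scaled are assumptions for this example only.
cv.fit <- cv.glmnet(
  x = as.matrix(x.train.scaled[,-1]),   # all features except DN
  y = x.train.scaled$DN,                # scaled DN labels
  alpha = 0                             # alpha = 0 selects the ridge penalty
)

# predict scaled DN on the held-out observations using the best penalty found
ridge.pred <- predict(cv.fit, newx = as.matrix(x.test.scaled[,-1]), s = "lambda.min")
```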