# Syntax for Vignette Snippets ( TEMPORARY )
#------------------
# footnotes^[A footnote here.]
# latex equation $Y = X\beta + \epsilon$
# > "This is a direct quote"
# ([hyperlink](https://github.com/ChaddFrasier/lunarLearningAlgorithms))
#--------------------
# init project
library(lunarLearningAlgorithms)

# init libraries
library(glmnet)
library(ggpubr)
library(knitr)
library(dplyr)
library(ggplot2)
library(neuralnet)
library(tidyverse)
library(GGally)
library(corrplot)

# document init
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
knitr::opts_chunk$set(out.width = "100%",
                      text = element_text(size=5),
                      axis.text.x = element_text(angle=90, hjust=1),
                      fig.height = 3,
                      memory.limit(20000000),
                      setwd("D:/cfrasier/work/R"))

Introduction

Neural Networks are making a rise in the supervised learning field of machine learning and plays an esspecially large roll in Image recognition software, and it has even found use in predicting stock prices. We want to use these concepts to help us predict DN values based on the photometric data.

Assuming a linear relationships between some variables of ISIS data such as Incidence, Emissions and Phase and the DN value.

Data Set

The data that we will be using the train and test the accuracy of our model. Is read in from the ascii ISIS output and converted to UTF-8 by laz@usgs.gov

# set column names for the data features and label column ( DN )
col_names <- c("DN", "Phase", "Emission", "Incidence", "LEmission", "LIncidence", "Lat", "Long", "SunAz", "CraftAz")
# Read Data in from text file and set column names
data = read.table( "../data/unit4band4.dat", stringsAsFactors=FALSE, 
                      header=FALSE, strip.white=TRUE, sep="", col.names=col_names)
# convert to a matrix
data <- as.matrix(data)

# calculate the cosines of the emission, incidence, local emission, and local incidence
data[,3] = cos(data[,3])
data[,4] = cos(data[,4])
data[,5] = cos(data[,5])
data[,6] = cos(data[,6])

Data Table 1

Unscaled Raw Data

knitr::kable(head(data))

Single Layer Neural Network in R

A Single layer neural network is very good at determining hidden relationships inside of a large data set. This model is done using general linear regression using gradient decent. This method assumes that some linear model can be created between all the variables of the input.

# get the data labels ( DN ) from the input data and convert the vector to a double
y.vec = as.double(data[,1])

# get the data matrix by returning a matrix excluding the DN row
X.mat = matrix( as.double(data[,-1]), nrow = nrow(data), ncol = (ncol(data)-1) )

# remove colnames from data matrix for computation
colnames(X.mat) <- NULL

# creates a fold vector with nfolds
n.folds = 20
fold.vec = sample( rep(1:n.folds), length(y.vec), TRUE )
# show structure of fold.vec
str(fold.vec)

# init the NN Cross validation information
max.iterations = 1000L
step.size = 0.002
n.hidden.units = 10L

# train the network using n folds of train/test splits
resultCV = NNetEarlyStoppingCV( X.mat, y.vec, fold.vec, max.iterations, step.size, n.hidden.units, n.folds )

# Prepair the loss vector data to be displayed by ggplot ( requires* class == data.frame )
df = tbl_df( data.frame(c(seq(1,max.iterations), resultCV$mean.validation.loss.vec, resultCV$mean.train.loss.vec)) )

# creates 2x2 data frames with the nuber of iterations as one column and the loss values as the other
dataVal = data.frame( c(seq(1,max.iterations)),resultCV$mean.validation.loss.vec)
dataTrain = data.frame( c(seq(1,max.iterations)),resultCV$mean.train.loss.vec)

# Set the 2 Col names for the chart
cols = c("Iteration", "Loss")
colnames(dataVal) = cols
colnames(dataTrain) = cols

# create plot of the loss values of each fold through the all iterations
p <- ggplot() +
  geom_line(data=dataVal, aes(x = Iteration ,y = Loss), color="blue") +
  geom_line(data=dataTrain, aes(x = Iteration, y = Loss) , color="red") +
  xlab("1 to Max Iterations") +
  ylab("Mean Loss") +
  labs(title = "Validation vs. Train",
       subtitle = "Blue vs Red")
plot(p)

Artificial Neural Network (ANN)

This is done using the R package for neural networks. The problem with using this is that training it can be very difficult and convergence is not always guaranteed.

# TODO: write a simple loop here to use the train data to loop through and minimize the error by changing the amount of training observations
# NN using neuralnet r
# Initialize the data variable for processing in a new format
data <- as.data.frame( matrix(as.double(data[,]), nrow = nrow(data), ncol = ncol(data)) )
colnames(data) <- col_names

# create a scaled matrix the same size as the data matrix by just copying it
x.scaled.mat <- data

# scale each row using each row min and the distance of each data column ( including the DN )
# Note: this means predictions must be scaled back up
for ( i in 1:ncol(data) )
{
  x.scaled.mat[,i] <- (data[,i]-min(data[,i]))/(max(data[,i])-min(data[,i]))
}

Data Table 2

Scaled Data Table

knitr::kable(head(x.scaled.mat))

ANN Figure

# training with X observations
is.train <- sample( 1:nrow(x.scaled.mat), 2000 )

# get the train and test data matrices
x.train.scaled <- x.scaled.mat[is.train,]

# get the test data which is the opposite of the train
x.test.scaled <- x.scaled.mat[-is.train,]

# use the ANN function with 2 layers one with 8 and another with 4 to train using a linear function of all variables
nn <- neuralnet( DN ~ Phase + Emission + Incidence + LEmission + LIncidence + Lat + Long + SunAz + CraftAz,
                data=x.train.scaled, hidden=c(8,6), linear.output=TRUE )

# plot the best run of the NN
plot(nn, rep='best')

ANN Predictions

Using this structure we can make predictions on the test data by running a computation with the model and the test data. The predictions will then need to be scaled back up using the original data matrix.

# compute the predictions after the training is completed
output <- compute(nn, x.test.scaled[,-1])
predictions <- as.data.frame(output$net.result * (data[-is.train, 1]-min(data[-is.train, 1]))/(max(data[-is.train,1])-min(data[-is.train,1])))
actuals <- as.data.frame(data[-is.train, 1])

# Mean Square Loss
MSE <- sum( (predictions-actuals)^2 ) / nrow(predictions)
MSE

# Root Mean Square Loss
RMSE <- sqrt( sum( (predictions-actuals)^2 ) ) / nrow(predictions)
RMSE

# prep data as 2x2 data.frame with a count of observations and the guesses / actuals
actualData = data.frame( c( seq(1, nrow(actuals)) ), actuals)
predictedData = data.frame( c( seq(1,nrow(predictions)) ) , predictions)

# name the columns
cols = c("Predictions", "Point")
colnames(predictedData) = cols
colnames(actualData) = cols

# plot
p2 <- ggplot() +
  geom_point(data=predictedData, aes(x = Predictions ,y = Point), color="blue") +
  geom_point(data=actualData, aes(x = Predictions ,y = Point), color="red") +
  xlab("Observations") +
  ylab("DN Value 0-0.4") + 
  ylim(c(0,0.4)) + 
  labs(title = "Predicted vs Actual",
       subtitle = "Blue vs Red")
plot(p2)

These results are indicating over fitting of the training data again, not promising, so I decided to check the correlation between each variable seperatly.

Correlations

By checking the correlation matrix in this way we can see which variables have the most effect on the DN value individually. This will help us see any linear relationship that can be hidding in the data.

  # correlation tests
  correlation <- cor( data )
  # plot all correlations
  corrplot( correlation, type="upper", order="hclust" )
  # Using A5 and A4 because they are the highest correlated
  ggscatter(as.data.frame(data), x = "LEmission", y = "DN", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "A4 Observation", ylab = "DN ( 0 - 1 )")
  ggscatter(as.data.frame(data), x = "LIncidence", y = "DN", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "A5 Observation", ylab = "DN ( 0 - 1 )")

What I found was that there is not significant correlation between any single variable and the DN value which is some what expected considering that Minnaert's function requires more than 1 variable. Obviously if there was a linear relationship to draw from a single variable he would have drawn it.

I did notice that there is some small correlations between a few of the variables.

Different Correlation

# plot ggplot correlation graph
ggpairs( as.data.frame(data), title = "Scatterplot Matrix of Data Features" )

Simple ANN w/ Limited Input Features

# seed the randoms
set.seed( 0411020 )

# run the NN trying to find the 2 main variables that need to be used ( 2 hidden units in the last layer )
n <- neuralnet( DN ~ Phase + Emission + Incidence + LEmission + LIncidence + Lat + Long + SunAz + CraftAz,
           data = x.train.scaled, hidden = c(12,6,3) )
plot(n, rep='best')

Neural Network Conclusion ( for now )

Neural networks have proven inaccurate for predicting lunar data thus far. This is due to the fact that the neural networks over fit the training data which then forces the weights in the network to be too specific for a generalized prediction. We need a way to prevent over-fitting.



ChaddFrasier/lunarLearningAlgorithms documentation built on May 17, 2020, 6:50 p.m.