library(knitr)
library(planetLearn)
library(glmnet)
library(ggplot2)

# document init
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
# configuration for all code blocks
knitr::opts_chunk$set(out.width = "100%",
                      fig.height = 3)

# ggplot2 theme options for the figures below
# (these are ggplot2 settings, not knitr chunk options, so they are collected
#  in a theme object and added to each plot)
plot_theme <- theme(text = element_text(size = 5),
                    axis.text.x = element_text(angle = 90, hjust = 1))

# raise the memory ceiling for the large pixel table (Windows only)
memory.limit(50000000)
setwd("D:/cfrasier/work/R/")

All Variables

load("../../data/lunarData.rda")

# filter out observations with incidence angles greater than 70 degrees
data <- lunarData[lunarData$Incidence <= 70,]

# compute the scale (standard deviation) of each feature, excluding the DN response in column 1
feature.scales <- attr(scale(data[, -1]), "scaled:scale")

# filter out all features with 0 variance; they carry no information for the fit
if (any(feature.scales == 0)) {
  data <- cbind(DN = data[, 1], data[, -1][, feature.scales != 0])
}

# show the dataset
knitr::kable( data[1:5,], caption = "LROC WAC Pixel Data ( band 1 unit 7 )" )
# prep data
y.vec <- as.double(data[,1])
X.mat <- as.matrix(data[,-1])
lambda_seq <- 10^seq(4, -4, by = -.015)
colnames(X.mat) <- NULL
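
Since alpha = 0 below selects the ridge penalty in glmnet, lambda_seq is the grid of shrinkage strengths that cross-validation will search; a quick sanity check of the grid (base R only, nothing assumed beyond the line above):

length(lambda_seq)   # number of candidate penalties
range(lambda_seq)    # spans roughly 1e-4 to 1e4 on a log scale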

# Splitting the data into test and train
set.seed(150)
# train on 70 % of the data; the remaining 30 % is held out for testing
train = sample( 1:nrow(X.mat), size = (nrow(X.mat) * 0.70) )
test = (-train)
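
Negative indexing is what makes test = (-train) work: indexing X.mat with -train selects every row that was not sampled into the training set. A quick check of the resulting proportions:

length(train) / nrow(X.mat)         # ~0.70 of the rows are used for training
nrow(X.mat[test, ]) / nrow(X.mat)   # the remaining ~0.30 form the test set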

Test Error

cv_output <- cv.glmnet(X.mat[train,], y.vec[train], nfolds = 50, 
                         alpha = 0, lambda = lambda_seq)
plot(cv_output)
optimal.lambda <- cv_output$lambda.min
fit <- cv_output$glmnet.fit
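
Besides lambda.min, cv.glmnet also reports lambda.1se, the largest penalty whose cross-validated error is within one standard error of the minimum; a more conservative alternative if a smoother, more regularized fit is preferred:

# one-standard-error rule: heavier shrinkage at a small cost in CV error
conservative.lambda <- cv_output$lambda.1se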

y_predicted <- predict(fit, s = optimal.lambda, newx = X.mat[test,])

# Sum of Squares Total and Error
sst <- sum((y.vec[test] - mean(y.vec[test]))^2)
sse <- sum((y_predicted - y.vec[test])^2)

# R squared
rsq <- 1 - sse / sst
rsq
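
R squared alone hides the scale of the errors, so the same held-out residuals can also be summarized as RMSE and MAE in DN units (base R only):

rmse <- sqrt(mean((y_predicted - y.vec[test])^2))
mae  <- mean(abs(y_predicted - y.vec[test]))
c(RMSE = rmse, MAE = mae)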
# create the output data matrix and rename
outputData <- data.frame( y.vec[test], y_predicted )
colnames(outputData) <- c("Actual", "Predicted")
# plot only the held-out predictions (faster than plotting every cross-validation run)
actualData <- data.frame(Observation = seq_len(nrow(outputData)), DN = outputData$Actual)
predictedData <- data.frame(Observation = seq_len(nrow(outputData)), DN = outputData$Predicted)

# create and plot predicted vs actual values
p2 <- ggplot() +
  geom_point(data = actualData, aes(x = Observation, y = DN), color = "red") +
  geom_point(data = predictedData, aes(x = Observation, y = DN), color = "blue") +
  xlab("Observations") +
  ylab("DN Value") +
  labs(title = "Predicted vs. Actual",
       subtitle = "Blue (predicted) vs Red (actual)") +
  ylim(c(min(actualData$DN), max(actualData$DN))) +
  plot_theme
plot(p2)
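
Another view of the same result is a direct predicted-versus-actual scatter with the 1:1 line; a minimal sketch reusing the outputData frame built above:

p3 <- ggplot(outputData, aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue", alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  xlab("Actual DN") +
  ylab("Predicted DN") +
  labs(title = "Predicted vs. Actual DN") +
  plot_theme
plot(p3)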

Minnaert Variables
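
The code in this section currently reuses the full design matrix. If the intent is to restrict the fit to the Minnaert photometric terms, the table could be subset by column name before the same workflow runs; a minimal sketch, where the column names other than Incidence are assumptions and would need to match the actual lunarData columns:

# hypothetical Minnaert-related columns; only Incidence is confirmed above
minnaert.cols <- intersect(c("Incidence", "Emission"), colnames(data))
minnaert.data <- cbind(DN = data[, 1], data[, minnaert.cols, drop = FALSE])

The remaining steps (splitting, cv.glmnet, plotting) would then use minnaert.data in place of data.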

# prep data
y.vec <- as.double(data[,1])
X.mat <- as.matrix(data[,-1])
lambda_seq <- 10^seq(4, -4, by = -.015)
colnames(X.mat) <- NULL

# Splitting the data into test and train
set.seed(150)
# train on 70 % of the data; the remaining 30 % is held out for testing
train = sample( 1:nrow(X.mat), size = (nrow(X.mat) * 0.7) )
test = (-train)

Test Error

cv_output <- cv.glmnet(X.mat[train,], y.vec[train], nfolds = 50, 
                         alpha = 0, lambda = lambda_seq)
plot(cv_output)
optimal.lambda <- cv_output$lambda.min
fit <- cv_output$glmnet.fit

y_predicted <- predict(fit, s = optimal.lambda, newx = X.mat[test,])

# Sum of Squares Total and Error
sst <- sum((y.vec[test] - mean(y.vec[test]))^2)
sse <- sum((y_predicted - y.vec[test])^2)

# R squared
rsq <- 1 - sse / sst
rsq
# create the output data matrix and rename
outputData <- data.frame( y.vec[test], y_predicted )
colnames(outputData) <- c("Actual", "Predicted")
# plot only the held-out predictions (faster than plotting every cross-validation run)
actualData <- data.frame(Observation = seq_len(nrow(outputData)), DN = outputData$Actual)
predictedData <- data.frame(Observation = seq_len(nrow(outputData)), DN = outputData$Predicted)

# create and plot predicted vs actual values
p2 <- ggplot() +
  geom_point(data = actualData, aes(x = Observation, y = DN), color = "red") +
  geom_point(data = predictedData, aes(x = Observation, y = DN), color = "blue") +
  xlab("Observations") +
  ylab("DN Value") +
  labs(title = "Predicted vs. Actual",
       subtitle = "Blue (predicted) vs Red (actual)") +
  ylim(c(min(actualData$DN), max(actualData$DN))) +
  plot_theme
plot(p2)

