# Install some utility functions and the plotly.Spec() plotting function from GitHub
sourceFunctionURL <- function (URL) {
" # For more functionality, see gitAFile() in the rgit package ( https://github.com/John-R-Wallace-NOAA/rgit ) which includes gitPush() and git() "
require(httr)
File.ASCII <- tempfile()
on.exit(file.remove(File.ASCII))
getTMP <- httr::GET(URL)
write(paste(readLines(textConnection(httr::content(getTMP))), collapse = "\n"), File.ASCII)
source(File.ASCII)}
sourceFunctionURL("https://raw.githubusercontent.com/John-R-Wallace-NOAA/JRWToolBox/master/R/lib.R")
sourceFunctionURL("https://raw.githubusercontent.com/John-R-Wallace-NOAA/FishNIRS/master/R/plotly.Spec.R")
# Set Path
PATH <- "W:/ALL_USR/JRW/SIDT/2019 Hake/"
setwd(PATH) # set working directory to folder containing spectral files
getwd()
# ================== Resample() all the wave lengths using prospectr::resample ===============================
base::load(file = 'ldf.RData')
ldf_int <- matrix(data = NA, nrow = length(ldf), ncol = length(ldf[[N]]$wavenumbers)) #make empty matrix for loop
for (j in 1:length(ldf)){
bar(j, length(ldf))
ldf_int[j,] <- prospectr::resample(X = ldf[[j]][[2]], wav = ldf[[j]]$wavenumbers, new.wav = ldf[[N]]$wavenumbers)
}
colnames(ldf_int) <- ldf[[N]]$wavenumbers
dat_spc <- as.data.frame(ldf_int)
dim(dat_spc)
[1] 2849 1112
dat_spc[1:20, 1:10]
# Add file names back in, could add other variables here as well (age etc.)
# metadat <- sapply(rdat,'[[', 1)
# (filenames <- unlist(metadat[2,]))[1:10]
# ================= ADD Hake_OPUS.RData and compare ================================
load('Hake_OPUS.RData')
Hake_OPUS[1:5, ]
names(Hake_OPUS)[2] <- "filenames"
Hake_OPUS[1:5, ]
base::load(file = "W:/ALL_USR/JRW/SIDT/2019 Hake/hake_all_2019.6.20 6 Oct 2022.RData") # If needed
preJoinNumCol <- ncol(hake_all_2019.6.20)
(hake_all_2019.6.20 <- left_join(hake_all_2019.6.20, Hake_OPUS[, 2:4], by = "filenames"))[1:5, c(1:3, 1112:1120, 1155:(preJoinNumCol + ncol(Hake_OPUS[, 2:4]) - 1))]
for( i in 1:nrow(Hake_OPUS)) # No match for row 714. ID = 1239
cat("\n", i, hake_all_2019.6.20$filenames[grep(Hake_OPUS[i,2], hake_all_2019.6.20$filenames)])
grep('PACIFIC_HAKE_BMS201906206D_1239_OD1.0', hake_all_2019.6.20$filenames)
integer(0)
Table(is.na(hake_all_2019.6.20$Age_OPUS))
FALSE TRUE
854 1995
# With missing row:
855/(855 + 1995)
[1] 0.3
# is.na(hake_all_2019.6.20$Age_OPUS) == TRUE appears to be calibration (70%) and FALSE is test (validation) set (30%)
# ========================== Analysis =========================================================
lib(data.table)
lib(mdatools)
lib(dplyr)
lib(hyperSpec) # http://hyperspec.r-forge.r-project.org
lib(prospectr)
lib(e1071)
lib(rpart)
lib(vegan)
# install and load simplerspec
# install.packages("remotes")
# remotes::install_github("philipp-baumann/simplerspec")
# library(simplerspec)
lib("philipp-baumann/simplerspec") # https://github.com/philipp-baumann/simplerspec
test <- plotly.Spec(hake_all_2019.6.20, 100)
Glm <- glm(Age ~ poly(Value, 4) + poly(Band, 5), data = test)
Age_Hat <- round(predict(Glm))
Age_Hat[Age_Hat < 1] <- 1
Table(test$Age, round(Age_Hat))
test <- plotly.Spec(hake_all_2019.6.20, 300, plot = FALSE)
Glm <- glm(Age ~ poly(Value, 4) + poly(Band, 5), data = test[test$Band <= 8000, ])
Age_Hat <- round(predict(Glm))
Age_Hat[Age_Hat < 1] <- 1
Table(round(Age_Hat), test[test$Band <= 8000, ]$Age)
classAgreement(Table(round(Age_Hat), test[test$Band <= 8000, ]$Age))
sum(abs(round(Age_Hat) - test[test$Band <= 8000, ]$Age))
Glm <- glm(Age ~ poly(Value, 8):poly(Band, 8), data = test[test$Band <= 8000, ])
Age_Hat <- round(predict(Glm))
Age_Hat[Age_Hat < 1] <- 1
Table(round(Age_Hat), test[test$Band <= 8000, ]$Age)
classAgreement(Table(round(Age_Hat), test[test$Band <= 8000, ]$Age))
sum(abs(round(Age_Hat) - test[test$Band <= 8000, ]$Age))
plot(WaveLengths, hake_all_2019.6.20[1, 2:1113], type = 'l')
# matplot(WaveLengths, t(hake_all_2019.6.20[1:3, 2:1113]), type = 'l')
# Plot scans changing colors by age
Cols <- rainbow(22)
dev.new()
pie(rep(1, length(Cols)), col = Cols)
# Only plot a random sample - including group A
N <- 100
dev.new(height = 8, width = 14)
plot(WaveLengths, hake_all_2019.6.20[1, 2:1113], type = 'l', ylab = 'Reflectance', col = Cols[hake_all_2019.6.20$Age[1]],
ylim = c(floor(min(hake_all_2019.6.20[, 2:1113])*100)/100, ceiling(max(hake_all_2019.6.20[, 2:1113])*100)/100), lwd = 0.5)
for (i in sample(2:nrow(hake_all_2019.6.20), N)) {
lines(WaveLengths, hake_all_2019.6.20[i, 2:1113], col = Cols[hake_all_2019.6.20$Age[i]], lwd = 0.5)
# cat(paste0('Row = ', i, "\n"))
# ask()
}
# Only plot a random sample
N <- 100
dev.new(height = 8, width = 14)
plot(WaveLengths, hake_all_2019.6.20[1, 2:1113], type = 'l', ylab = 'Reflectance', col = Cols[hake_all_2019.6.20$Age[1]],
ylim = c(floor(min(hake_all_2019.6.20[, 2:1113])*100)/100, ceiling(max(hake_all_2019.6.20[, 2:1113])*100)/100))
for (i in sample(2:nrow(hake_all_2019.6.20), N)) {
lines(WaveLengths, hake_all_2019.6.20[i, 2:1113] - ifelse(grepl('6A', hake_all_2019.6.20$filenames[i], 1000, 0), col = Cols[hake_all_2019.6.20$Age[i]])
cat(paste0('Row = ', i, "\n"))
ask()
}
# Plot all the scans by age
dev.new(height = 8, width = 14)
plot(WaveLengths, hake_all_2019.6.20[1, 2:1113], type = 'l', ylab = 'Reflectance', col = Cols[hake_all_2019.6.20$Age[1]],
ylim = c(floor(min(hake_all_2019.6.20[, 2:1113])*100)/100, ceiling(max(hake_all_2019.6.20[, 2:1113])*100)/100))
for (i in 2:nrow(hake_all_2019.6.20))
lines(WaveLengths, hake_all_2019.6.20[i, 2:1113], col = Cols[hake_all_2019.6.20$Age[i]])
# Two outliers are seen in the plot - need only the 8,000 freq. less than 0.4 reflectance
dim(hake_all_2019.6.20)
hake_all_2019.6.20 <- hake_all_2019.6.20[hake_all_2019.6.20['8000'] < 0.38, ]
dim(hake_all_2019.6.20)
# Plot all the scans by age - no extreme outlies now
dev.new(height = 8, width = 14)
plot(WaveLengths, hake_all_2019.6.20[1, 2:1113], type = 'l', ylab = 'Reflectance', col = Cols[hake_all_2019.6.20$Age[1]],
ylim = c(floor(min(hake_all_2019.6.20[, 2:1113])*100)/100, ceiling(max(hake_all_2019.6.20[, 2:1113])*100)/100))
for (i in 2:nrow(hake_all_2019.6.20))
lines(WaveLengths, hake_all_2019.6.20[i, 2:1113], col = Cols[hake_all_2019.6.20$Age[i]])
# Models
set.seed(c(777, 747)[2])
index <- 1:nrow(hake_all_2019.6.20)
testindex <- sample(index, trunc(length(index)/2))
(AgeColNum <- grep('Age', names(hake_all_2019.6.20))[1])
# Columns <- c(2:1113, AgeColNum) # All Wavelengths
Columns <- c((562:1112) + 1, AgeColNum) # Only Wavelengths between 3,600 and 8,000
testset <- hake_all_2019.6.20[testindex, Columns]
trainset <- hake_all_2019.6.20[-testindex, Columns]
# svm using e1071 package
svm.model <- e1071::svm(Age ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[, -grep('Age', names(testset))])
Table(NIRS_SVM_AGE = round(svm.pred), TMA = testset[, grep('Age', names(testset))])
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = testset[, grep('Age', names(testset))]))
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = testset[, grep('Age', names(testset))]), match.names = TRUE)
sum(abs(round(svm.pred)- testset[, grep('Age', names(testset))]))
# rpart
rpart.model <- rpart(Age ~ ., data = trainset)
rpart.pred <- predict(rpart.model, testset[, -grep('Age', names(testset))])
Table(NIRS_RPART_AGE = round(rpart.pred), TMA = testset[, grep('Age', names(testset))])
e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = testset[, grep('Age', names(testset))]))
e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = testset[, grep('Age', names(testset))]), match.names = TRUE)
sum(abs(round(rpart.pred) - testset[, grep('Age', names(testset))]))
# Hack in OPUS age for comparison - a look at limited data from OPUS model by Brenna, Alica, Beverely, or Irina???????????
names(hake_all_2019.6.20)[grep('Age', names(hake_all_2019.6.20))]
# [1] "Age" "Age_BB" "Age_OPUS"
(AgeColNum_OPUS <- grep('Age', names(hake_all_2019.6.20))[3])
names(hake_all_2019.6.20)[AgeColNum_OPUS]
Columns <- c((562:1112) + 1, AgeColNum, AgeColNum_OPUS)
testset_OPUS <- hake_all_2019.6.20[testindex, Columns]
dim(testset_OPUS)
# testset_OPUS <- testset_OPUS[is.finite(testset_OPUS[, ncol(testset_OPUS)]), ]
# dim(testset_OPUS)
trainset_OPUS <- hake_all_2019.6.20[-testindex, Columns]
dim(trainset_OPUS)
# trainset_OPUS <- trainset_OPUS[is.finite(trainset_OPUS[, ncol(trainset_OPUS)]), ]
# dim(trainset_OPUS)
# OPUS_Age vs TMA_Age
Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
TMA_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[1]]))
classAgreement(Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
TMA_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[1]])))
classAgreement(Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
TMA_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[1]])), match.names = TRUE)
sum(abs(round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]) -
round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[1]])))
# OPUS_Age vs NIRS_SVM_Age
Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
NIRS_SVM_Age = round(svm.pred[!is.na(testset_OPUS[, ncol(testset_OPUS)])]))
classAgreement(Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
NIRS_SVM_Age = round(svm.pred[!is.na(testset_OPUS[, ncol(testset_OPUS)])])))
classAgreement(Table(OPUS_Age = round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]),
NIRS_SVM_Age = round(svm.pred[!is.na(testset_OPUS[, ncol(testset_OPUS)])])), match.names = TRUE)
sum(abs(round(testset_OPUS[!is.na(testset_OPUS[, ncol(testset_OPUS)]), grep('Age', names(testset_OPUS))[2]]) -
round(svm.pred[!is.na(testset_OPUS[, ncol(testset_OPUS)])])))
# =============== iPLSR - following Jordan =========================
base::load("W:\\ALL_USR\\JRW\\SIDT\\2019 Hake\\hake_all_2019.6.20 6 Oct 2022.RData")
source("W:\\ALL_USR\\JRW\\SIDT\\2019 Hake\\plotly.Spec.R")
Hake_Spectra_2019 <- hake_all_2019.6.20[, (562:1112) + 1] #spectra matrix
Hake_Age_2019 <- as.numeric(hake_all_2019.6.20$Age) #Vector of Ages
Hake_Age_2019.fac <- factor(Hake_Age_2019)
################################
###Quick view data in plotly ###
################################
hake_all_2019.6.20$ID <- as.numeric(subStr(hake_all_2019.6.20$shortName, "_", elements = 2))
hake_all_2019.6.20 <- sort.f(hake_all_2019.6.20, "ID")
hake_all_2019.6.20[1:4, c(1:2, 1160:1164)]
# # filenames 12488 GroupNum Age_BB Age_OPUS shortName ID
# # 1 PACIFIC_HAKE_BMS201906206A_1_OD1.0 0.2087611 1 1 0.9209 HAKE_1 1
# # 2 PACIFIC_HAKE_BMS201906206A_2_OD1.0 0.1907212 1 1 0.1541 HAKE_2 2
# # 3 PACIFIC_HAKE_BMS201906206A_3_OD1.0 0.1880061 1 NA NA HAKE_3 3
# plotly.Spec(hake_all_2019.6.20, WaveRange = c(0, Inf))
plotly.Spec(hake_all_2019.6.20)
plotly.Spec(hake_all_2019.6.20, 500)
plotly.Spec(hake_all_2019.6.20, NULL, reverse = TRUE)
plotly.Spec(rbind(hake_all_2019.6.20[hake_all_2019.6.20$ID <= 150, ], hake_all_2019.6.20[hake_all_2019.6.20$ID > 200 & hake_all_2019.6.20$ID < 300, ]), NULL)
plotly.Spec(hake_all_2019.6.20[hake_all_2019.6.20$ID <= 150, ], NULL)
plotly.Spec(hake_all_2019.6.20[hake_all_2019.6.20$ID > 150, ], NULL)
# plotly.Spec(hake_all_2019.6.20[hake_all_2019.6.20$ID >= 151 & hake_all_2019.6.20$ID < 700, ], NULL)
cbind(140:150, hake_all_2019.6.20[140:150, 'ID' ]) # 2 Oties missing below 150 ID
plotly.Spec(hake_all_2019.6.20, N_Samp = 150, randomAfterSampNum = 148)
plotly.Spec(hake_all_2019.6.20, N_Samp = 0, randomAfterSampNum = 148)
dev.new()
plot(hake_all_2019.6.20$ID, hake_all_2019.6.20$Age)
dev.new()
plot(hake_all_2019.6.20$ID, hake_all_2019.6.20$Oto_Weight)
dev.new()
plot(hake_all_2019.6.20$ID, hake_all_2019.6.20$fork_length)
plot(hake_all_2019.6.20$latitude, hake_all_2019.6.20$Age)
"Oto_Weight" "OtolithSide" "Processed.by" "Shipped.Date" "CatchDate"
[37] "sex_determination" "fork_length"
plot(jitter(hake_all_2019.6.20$Age), jitter(hake_all_2019.6.20$latitude, 100), ylim = c(31, 48.5))
points(jitter(hake_all_2019.6.20$Age[hake_all_2019.6.20$ID <= 100]), jitter(hake_all_2019.6.20$latitude[hake_all_2019.6.20$ID <= 150], 100), col = 'red', pch = 16)
points(jitter(hake_all_2019.6.20$Age[hake_all_2019.6.20$ID %in% c(15, 113, 122)]), hake_all_2019.6.20$latitude[hake_all_2019.6.20$ID %in% c(15, 113, 122)], col = 'green', pch = 16)
dev.new()
# imap(latrange = c(26, 49), longrange = c(-128, -113), zoom = FALSE)
imap(latrange = c(34, 36), longrange = c(-122, -120), zoom = FALSE)
TF <- hake_all_2019.6.20$ID > 150
points(jitter(hake_all_2019.6.20$longitude[TF], 200), jitter(hake_all_2019.6.20$latitude[TF], 20))
TF <- hake_all_2019.6.20$ID <= 150
points(jitter(hake_all_2019.6.20$longitude[TF], 1), jitter(hake_all_2019.6.20$latitude[TF], 1), col = 'red')
TF <- hake_all_2019.6.20$ID %in% c(15, 113)
points(hake_all_2019.6.20$longitude[TF], hake_all_2019.6.20$latitude[TF], col = 'green', pch =16) # 2 year olds in green with last peak < 5,000
TF <- hake_all_2019.6.20$ID %in% 122
points(hake_all_2019.6.20$longitude[TF], hake_all_2019.6.20$latitude[TF], col = 'dodger blue', pch =16) # 1 year olds in Dodger blue with last peak < 5,000
TF <- hake_all_2019.6.20$ID %in% c(6, 12, 20)
set.seed(c(747, 787)[2])
points(jitter(hake_all_2019.6.20$longitude[TF], 0.05), jitter(hake_all_2019.6.20$latitude[TF], 0.05), col = 'cyan', pch = 16) # 2 year olds in red with lat peak > 5,000
# Otie weight
dev.new()
TF <- hake_all_2019.6.20$ID > 150
plot(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$Oto_Weight[TF], xlim = c(-5, 3000), na.rm = TRUE)
TF <- hake_all_2019.6.20$ID <= 150
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$Oto_Weight[TF], col = 'red')
TF <- hake_all_2019.6.20$ID %in% c(15, 113)
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$Oto_Weight[TF], col = 'green', pch =16) # 2 year olds in green
cbind(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$Oto_Weight[TF]) # Missing otie weight on #113
TF <- hake_all_2019.6.20$ID %in% 122
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$Oto_Weight[TF], col = 'dodger blue', pch =16) #1 year olds in Dodger blue
# Fork length
dev.new()
TF <- hake_all_2019.6.20$ID > 150
plot(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$fork_length[TF], ylim = c(16.5, 73), xlim = c(-5, 3000), na.rm = TRUE)
TF <- hake_all_2019.6.20$ID <= 150
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$fork_length[TF], col = 'red')
TF <- hake_all_2019.6.20$ID %in% c(15, 113)
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$fork_length[TF], col = 'green', pch =16) # 2 year olds in green
cbind(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$fork_length[TF]) # Missing otie weight on #113
TF <- hake_all_2019.6.20$ID %in% 122
points(hake_all_2019.6.20$ID[TF], hake_all_2019.6.20$fork_length[TF], col = 'dodger blue', pch =16) #1 year olds in Dodger blue
# Find names of all metadata columns
names(hake_all_2019.6.20[, -(2:(sum(!is.na(as.numeric(names(hake_all_2019.6.20)))) + 1))])
Table(hake_all_2019.6.20$broken) # 98 oties are broken
hake_all_2019.6.20[hake_all_2019.6.20$broken == 1, "ID"]
Table(hake_all_2019.6.20$crystallized)
hake_all_2019.6.20[hake_all_2019.6.20$crystallized == 1, "ID"]
hake_all_2019.6.20[hake_all_2019.6.20$ID %in% c(595, 1083), -(2:(sum(!is.na(as.numeric(names(hake_all_2019.6.20)))) + 1))]
plotly.Spec(hake_all_2019.6.20, nrow(hake_all_2019.6.20))
###################################################################################################################
### Perform Savitzky-Golay 1st derivative with 17 point window smoothing 2rd order polynomial fit and visualize ###
###################################################################################################################
### NOTE ### If there are multiple years of data, all subsequent transformations should be applied to the whole data set, then re-subset
Hake_Spectra_2019.sg <- data.frame(prospectr::savitzkyGolay(Hake_Spectra_2019, p = 2, w = 15, m = 1))
Hake_Spectra_2019.Age.sg <- data.frame(Age = Hake_Age_2019, prospectr::savitzkyGolay(Hake_Spectra_2019, p = 2, w = 15, m = 1))
# add Age back to the spectra matrix
# pca <- stats::prcomp(Hake_Spectra_2019.sg, scale = TRUE)
# summary(pca)
# Hake_Spectra_2019.scores <- data.frame(pca$x)
#
# PCA <- data.frame(Hake_Age_2019.fac, Hake_Spectra_2019.scores)
#
# s.class(PCA[,2:3],
# fac = Hake_Age_2019.fac, # color by groups
# col = c("black", "dark red", "red","dark orange", "green", "dark green", "blue", "dark blue", "purple", "brown"),
# grid = FALSE,
# label = levels(Hake_Age_2019.fac),
# cellipse = 2, #2 standard deviations (95% conf interval)
# clabel = 1.5,
# cpoint = 1
# )
#
# PCA and CCA, vegan package from : https://stackoverflow.com/questions/13936051/adding-ellipses-to-a-principal-component-analysis-pca-plot
# --- pca (rda) and cca (same result) ---
prin_comp.pca <- vegan::rda(Hake_Spectra_2019, scale = TRUE)
dev.new()
plot(prin_comp.pca)
prin_comp.pca.sg <- vegan::rda(Hake_Spectra_2019.sg, scale = TRUE)
pca_scores.sg <- vegan::scores(prin_comp.pca.sg)
dev.new()
plot(prin_comp.pca.sg)
dev.new()
plot(prin_comp.pca.sg, xlim=c(-21, 5), ylim=c(-8, 8))
dev.new()
plot(pca_scores.sg$sites[,1], pca_scores.sg$sites[,2], pch=21, bg = rainbow(30), xlim=c(-3, 3), ylim=c(-6, 2))
arrows(0,0,pca_scores.sg$species[,1],pca_scores.sg$species[,2],lwd=1,length=0.2)
Hake_Spectra_2019 <- hake_all_2019.6.20[, (562:1112) + 1] # Spectra matrix
Hake_Meta_2019 <- hake_all_2019.6.20[, -((1:1112) + 1)] # Meta data
Hake_Meta_2019$Lat.1.degree <- round(Hake_Meta_2019$latitude)
names(Hake_Meta_2019)
[1] "filenames" "vessel_code" "cruise_number" "date_collected" "collection_year" "region"
[7] "latitude" "longitude" "sequence" "length" "weight" "sex"
[13] "read_age" "test_age" "final_age" "readability" "unscannable" "broken"
[19] "crystallized" "other_problem" "percent_affected" "instrument_name" "comments" "misc_data"
[25] "gear_depth" "bottom_depth" "Cruise" "Barcode" "Age" "Species"
[31] "SampleYear" "Oto_Weight" "OtolithSide" "Processed.by" "Shipped.Date" "CatchDate"
[37] "sex_determination" "fork_length" "Notes" "Survey" "Haul" "TD_Time"
[43] "TD_Lat" "TD_Lon" "Avg_Bot_Depth_m" "Avg_Gear_Depth_m" "Group" "GroupNum"
[49] "Age_BB" "Age_OPUS" "shortName" "Lat.1.degree"
# Hake.2019.cca <- cca(Hake_Spectra_2019 ~ Age, data=Hake_Meta_2019)
# Hake.2019.cca <- cca(Hake_Spectra_2019 ~ Age + fork_length, data=Hake_Meta_2019)
Hake_Meta_2019$Avg_Bot_Depth_m <- as.numeric(Hake_Meta_2019$Avg_Bot_Depth_m)
TF <- !(is.na(Hake_Meta_2019$Age) | is.na(Hake_Meta_2019$fork_length) | is.na(Hake_Meta_2019$Oto_Weight) | is.na(Hake_Meta_2019$sex) |
is.na(Hake_Meta_2019$Lat.1.degree) | is.na(Hake_Meta_2019$weight) | is.na(Hake_Meta_2019$Avg_Bot_Depth_m))
sum(TF)
Hake.2019.cca <- cca(Hake_Spectra_2019[TF, ] ~ Age + Lat.1.degree + fork_length + Oto_Weight + sex + weight + Avg_Bot_Depth_m,
data=Hake_Meta_2019[TF, c('Age', 'fork_length', 'Oto_Weight', 'sex', 'Lat.1.degree', 'weight', 'Avg_Bot_Depth_m')])
# Hake.2019.cca
summary(Hake.2019.cca)
dev.new()
plotCCA(Hake.2019.cca, Col = 'green')
dev.new()
plotCCA(Hake.2019.cca, display = c("sp", "wa", "cn", "lc", "bp", "reg")[c(3)], Col = 'green')
dev.new()
plotCCA(Hake.2019.cca, display = c("sp", "wa", "cn", "lc", "bp", "reg")[c(1:4)], Col = 'dodger blue')
dev.new()
par(mfrow = c(3,2))
plotCCA(Hake.2019.cca, c(1, 2))
plotCCA(Hake.2019.cca, c(1, 3))
plotCCA(Hake.2019.cca, c(1, 4))
plotCCA(Hake.2019.cca, c(2, 3))
plotCCA(Hake.2019.cca, c(2, 4))
plotCCA(Hake.2019.cca, c(3, 4))
dev.new()
plot(Hake.2019.cca, display = c("sp", "wa", "cn", "lc", "bp", "reg")[c(3)])
plotCCA(Hake.2019.cca, Col = 'green')
Hake.2019.cca <- cca(Hake_Spectra_2019[TF, ] ~ Age + sex,
data=Hake_Meta_2019[TF, c('Age', 'fork_length', 'Oto_Weight', 'sex', 'Lat.1.degree', 'weight')])
summary(Hake.2019.cca)
Hake.2019.cca <- cca(Hake_Spectra_2019[TF, ] ~ sex + Lat.1.degree + fork_length + Oto_Weight + Age + weight,
data=Hake_Meta_2019[TF, c('Age', 'fork_length', 'Oto_Weight', 'sex', 'Lat.1.degree', 'weight')])
summary(Hake.2019.cca)
# ------------------------------------------------------------------------------------------------------------------------
# calculate proportional variance for each PC
PoV <- pca$sdev^2/sum(pca$sdev^2)
#Screen Plot of proportional variances
dev.new()
par(mar =c(5,5,1,1))
plot(PoV[1:20], ylab = "Variance Explained", xlab = "Principle Components")
lines(PoV[1:20])
####################################################
### iPLS algorithm in mdatools ###
####################################################
Hake_Spectra_2019.iPLS.F <- mdatools::ipls(Hake_Spectra_2019.sg, Hake_Age_2019, glob.ncomp = 10, center = T, scale = T, cv = 100,
int.ncomp = 10, int.num = 10, ncomp.selcrit = "min", method = "forward", silent = F)
save(Hake_Spectra_2019.iPLS.F, file = 'Hake_Spectra_2019.iPLS.F 13 Oct 2022.RData')
summary(Hake_Spectra_2019.iPLS.F)
dev.new()
plot(Hake_Spectra_2019.iPLS.F)
dev.new()
plot(Hake_Spectra_2019.iPLS.F, main = NULL)
# plot predictions before and after selection
dev.new()
par(mfrow = c(2, 1))
mdatools::plotPredictions(Hake_Spectra_2019.iPLS.F$gm)
mdatools::plotPredictions(Hake_Spectra_2019.iPLS.F$om)
dev.new()
mdatools::plotRMSE(Hake_Spectra_2019.iPLS.F)
# RMSE before and after selection
dev.new()
par(mfrow = c(2, 1))
mdatools::plotRMSE(Hake_Spectra_2019.iPLS.F$gm, ylim = c(-0.2, 3))
mdatools::plotRMSE(Hake_Spectra_2019.iPLS.F$om, ylim = c(-0.2, 3))
# plot the newly selected spectra regions
Hake_Spectra_2019.iPLS.vars <- Hake_Spectra_2019.iPLS.F$var.selected
(p <- length(Hake_Spectra_2019.iPLS.vars))
Hake_Spectra_2019.sg.iPLS <- data.frame(Hake_Spectra_2019.sg[, Hake_Spectra_2019.iPLS.vars])
Hake_Spectra_2019.Age.sg.iPLS <- data.frame(Age = Hake_Age_2019, Hake_Spectra_2019.sg.iPLS)
dim(Hake_Spectra_2019.Age.sg.iPLS)
#Split by age - Age.0.avg, Age.1.avg, ...
for (i in 0:17) {
Subset <- subset(Hake_Spectra_2019.Age.sg.iPLS, Hake_Age_2019 == i)
if(i == 0) cat("\n")
cat("Dim of", paste0('Age', i), " = ", dim(Subset), "\n")
assign(paste0('Age', i, '.avg'), apply(Subset, 2, mean)) # Character or numeric works in subset()
cat("Length of", paste0('Age', i, '.avg'), " = ", length(eval(parse(text = paste0('Age', i, '.avg')))), "\n\n")
}
### rbind, transpose and plot the averaged spectra matrix
age.avg <- NULL
for ( i in 0:17)
age.avg <- rbind(age.avg, eval(parse(text = paste0('Age', i, '.avg'))))
dim(age.avg)
age.avg[1:3, 1:5]
age.avg[1:3, (ncol(age.avg) - 3):ncol(age.avg)]
p + 1 # vars plus age
t.age.avg <- data.frame(t(age.avg[, -1])) # remove the age column so there is only an xmatrix
dim(t.age.avg)
t.age.avg[1:3, 1:5]
#plot the transformed spectra
dev.new(width = 14, height = 8)
par(mar=c(5,5,1,1))
plot(t.age.avg$X10, xlab = "Wavenumber Range", ylab = "Absorbance", cex.lab = 1, type = 'n',
ylim = c(min(t.age.avg, na.rm = TRUE) - abs(min(t.age.avg, na.rm = TRUE) * 0.025), max(t.age.avg, na.rm = TRUE) + abs(max(t.age.avg, na.rm = TRUE) * 0.025)))
Cols <- rainbow(22)
for ( i in 0:17)
lines(t.age.avg[[paste0('X', i)]], col = Cols[i + 1], lwd = 2)
legend("topleft", legend = paste(0:17, "Years"), col = c('red', 'blue', Cols[1:18]), lty=1, lwd = 2,cex=.8)
#########################################
### Conduct PLSr on iPLSr selected data ###
#########################################
PLSr <- mdatools::pls(Hake_Spectra_2019.sg.iPLS, Hake_Age_2019, ncomp = 10, center = T, scale = F, cv = 100,
method = "simpls", alpha = 0.05, ncomp.selcrit = "min")
summary(PLSr)
dev.new()
plot(PLSr)
set.seed(c(777, 747)[2])
index <- 1:nrow(Hake_Spectra_2019.sg.iPLS)
testindex <- sample(index, trunc(length(index)/3))
x.testset <- Hake_Spectra_2019.sg.iPLS[testindex, ]
x.trainset <- Hake_Spectra_2019.sg.iPLS[-testindex, ]
y.test <- Hake_Age_2019[testindex]
y.train <- Hake_Age_2019[-testindex]
# PLSr <- mdatools::pls(x.trainset, y.train, ncomp = 10, center = T, scale = F, cv = 100,
# method = "simpls", alpha = 0.05, ncomp.selcrit = "min", x.test = x.testset, y.test = y.test)
PLSr <- mdatools::pls(x.trainset, y.train, ncomp = 10, center = T, scale = F, cv = 100,
method = "simpls", alpha = 0.05, ncomp.selcrit = "min")
summary(PLSr)
dev.new()
plot(PLSr)
###########################
### Plot the PLSr results ###
###########################
# pull the prediction values of age from the PLSr object
compNum <- length(PLSr$res$cal$slope) # Number of selected components
Predicted_Age <- data.frame(PLSr$cvres$y.pred[ , , 1])[, compNum]
# Reference Ages
Reference_Age <- PLSr$cvres$y.ref # Equals y.train
# Plot Predicted vs Reference Ages
(Slope <- PLSr$res$cal$slope[length(PLSr$res$cal$slope)]) # Slope from PLSr
dev.new()
par(mfrow = c(2,1))
# Decimal Predicted age
# dev.new(height = 8, width = 15)
par(mar = c(5,5,1,1))
plot(Reference_Age, Predicted_Age, ylim = c(0,12), xlim = c(0,12), col = "dark blue")
abline(0, 1, lwd = 2)
abline(0, Slope, col = "dark blue", lwd = 2)
# Integer Predicted age with reference age jitter
# dev.new(height = 8, width = 15)
par(mar = c(5,5,1,1))
plot(jitter(Reference_Age), round(Predicted_Age), ylim = c(0,12), xlim = c(0,12), col = "dark blue")
abline(0, 1, lwd = 2)
abline(0, Slope, col = "dark blue", lwd = 2)
summary(lm(Predicted_Age ~ Reference_Age))$r.squared
summary(lm(round(Predicted_Age) ~ Reference_Age))$r.squared
Table(NIRS_PLSr_AGE = round(Predicted_Age), TMA = Reference_Age)
e1071::classAgreement(Table(NIRS_PLSr_AGE = round(Predicted_Age), TMA = Reference_Age)) # $diag 0.1677193
e1071::classAgreement(Table(NIRS_PLSr_AGE = round(Predicted_Age), TMA = Reference_Age), match.names = TRUE) # diag 0.5859649
sum(abs(Reference_Age - round(Predicted_Age)))
(Results <- data.frame(Reference_Age, Predicted_Age))[1:10, ]
# Store results in an excel.csv file
write.csv(Results, file ="10_smoothing_iPLSR_Res.csv", row.names = FALSE)
# ---------- Support Vector Machine and RPART --------------------
lib(caret)
lib(gmodels)
setwd("W:/ALL_USR/JRW/SIDT/2019 Hake")
base::load(file = "hake_all_2019.6.20 6 Oct 2022.RData")
index <- 1:nrow(hake_all_2019.6.20)
# set.seed(c(777, 747)[2])
# testindex <- sample(index, trunc(length(index)/3))
# Use Hake_OPUS test and train sets
# is.na(hake_all_2019.6.20$Age_OPUS) == TRUE appears to be calibration (70%) and FALSE is test (validation) set (30%)
testindex <- index[!is.na(hake_all_2019.6.20$Age_OPUS)]
length(testindex)/length(index)
(AgeColNum <- grep('Age', names(hake_all_2019.6.20))[1])
# Columns <- 2:1113 # All Wavelengths
Columns.set <- (562:1112) + 1 # Only Wavelengths between 3,600 and 8,000
x.testset <- hake_all_2019.6.20[testindex, Columns.set]
x.trainset <- hake_all_2019.6.20[-testindex, Columns.set]
y.test <- hake_all_2019.6.20[testindex, AgeColNum]
y.train <- hake_all_2019.6.20[-testindex, AgeColNum]
# ## In PLS the "test set" creates the model????????????????????????????????????????
# #
# ## svm using e1071 package
# #svm.model <- e1071::svm(y.test ~ ., data = x.testset, cost = 100, gamma = 1)
# #svm.model
# #svm.pred <- predict(svm.model, x.trainset)
# #Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.train)
# #e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.train)) # $diag 0.3855337
# #e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.train) , match.names = TRUE) # $diag 0.3855337
# #sum(abs(round(svm.pred)- y.train)) # 1432
# #
# #
# ## rpart
# #rpart.model <- rpart::rpart(y.test ~ ., data = x.testset)
# #rpart.model
# #rpart.pred <- predict(rpart.model, x.trainset)
# #Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.train)
# #e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.train)) # $diag 0.4424157
# #e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.train) , match.names = TRUE) # $diag 0.3735955
# #sum(abs(round(rpart.pred)- y.train)) # 1434
confusionMatrix(
# svm using e1071 package
svm.model <- e1071::svm(y.train ~ ., data = x.trainset, cost = 100, gamma = 1)
svm.model
svm.pred <- predict(svm.model, x.testset)
Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.test)
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.test)) # $diag 0.3855337
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.test) , match.names = TRUE) # $diag 0.3855337
caret::confusionMatrix(rbind(Table(NIRS_SVM_AGE = round(svm.pred), TMA = y.test), "14" = rep(0, 16), "15" = rep(0, 16), "17" = rep(0, 16)))
gmodels::CrossTable(round(svm.pred), y.test)
sum(abs(round(svm.pred)- y.test)) # 1432
# Hake_OPUS
sum(is.na(hake_all_2019.6.20$Age_OPUS[testindex])) # 0
len(hake_all_2019.6.20$Age_OPUS[testindex])
len(y.test)
Table(NIRS_OPUS_AGE = round(hake_all_2019.6.20$Age_OPUS[testindex]), TMA = y.test)
e1071::classAgreement(Table(NIRS_OPUS_AGE = round(hake_all_2019.6.20$Age_OPUS[testindex]), TMA = y.test))
e1071::classAgreement(Table(NIRS_OPUS_AGE = round(hake_all_2019.6.20$Age_OPUS[testindex]), TMA = y.test) , match.names = TRUE)
NIRS_OPUS_AGE <- round(hake_all_2019.6.20$Age_OPUS[testindex])
NIRS_OPUS_AGE[NIRS_OPUS_AGE < 1] <- 1
Table(NIRS_OPUS_AGE = NIRS_OPUS_AGE, TMA = y.test)
caret::confusionMatrix(rbind(Table(NIRS_OPUS_AGE = NIRS_OPUS_AGE, TMA = y.test), "17" = rep(0, 16)))
sum(abs(NIRS_OPUS_AGE - y.test))
svm.pred.train <- predict(svm.model)
Table(NIRS_SVM_AGE = round(svm.pred.train), TMA = y.train)
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred.train), TMA = y.train))
e1071::classAgreement(Table(NIRS_SVM_AGE = round(svm.pred.train), TMA = y.train), match.names = TRUE)
sum(abs(round(svm.pred.train)- y.train))
# rpart
rpart.model <- rpart::rpart(y.train ~ ., data = x.trainset)
rpart.model
rpart.pred <- predict(rpart.model, x.testset)
Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.test)
e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.test)) # $diag 0.4424157
e1071::classAgreement(Table(NIRS_RPART_AGE = round(rpart.pred), TMA = y.test) , match.names = TRUE) # $diag 0.3735955
sum(abs(round(rpart.pred)- y.test)) # 1434
# ================================ PLS using plantspec package - TOO OLD!!!!!!!!!!!!!=====================================================
# Select the component of interest (i.e., Age)
TMA_Age <- hake_all_2019.6.20$Age
# Tried this first........
# Optimize the preprocessing for spectra and spectral subsetting to emphasize
# the important spectra features related to predicting N. This can be very time
# consuming.
N_opt <- optimizePLS(component = TMA_Age,
spectra = hake_all_2019.6.20[, (562:1112) + 1],
training_set = !testindex)
# Fit our PLS regression using the optimal setting from our optimization.
N_cal <- calibrate(component = TMA_Age,
spectra = hake_all_2019.6.20[, (562:1112) + 1],
optimal_params = N_opt,
optimal_model = 1, # In N_opt, use the best model
validation = "testset",
training_set = testindex)
# -------------------------------------
# Select a training set for model fitting (i.e., FALSE values for test set)
# - use "!" because subdivideDataset() returns TRUE for test set selections.
# - use "type = 'validation'" so both training and test data are representative.
training_set_MDKS <- !(subdivideDataset(spectra = hake_all_2019.6.20[, (562:1112) + 1],
component = TMA_Age,
method = "MDKS",
p = 0.5, # 10% for this example
type = "validation"))
# Optimize the preprocessing for spectra and spectral subsetting to emphasize
# the important spectra features related to predicting N. This can be very time
# consuming.
N_opt <- optimizePLS(component = TMA_Age,
spectra = hake_all_2019.6.20[, (562:1112) + 1],
training_set = training_set_MDKS)
# Fit our PLS regression using the optimal setting from our optimization.
N_cal <- calibrate(component = TMA_Age,
spectra = hake_all_2019.6.20[, (562:1112) + 1],
optimal_params = N_opt,
optimal_model = 1, # In N_opt, use the best model
validation = "testset",
training_set = training_set_MDKS)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.