Nothing
## ----setup, include = FALSE---------------------------------------------------
library(formatR)
knitr::opts_chunk$set(
collapse = TRUE, eval.after = "fig.cap"
)
## ----classdiagram, echo = FALSE, out.width = '20%', fig.align = 'right'-------
knitr::include_graphics("logo.jpg")
## ----eval = TRUE--------------------------------------------------------------
citation(package = "resemble")
## ----libraries, tidy = TRUE, message = FALSE----------------------------------
library(resemble)
library(prospectr)
library(magrittr)
## ---- tidy = FALSE, message = FALSE, results = 'hide'-------------------------
data(NIRsoil)
dim(NIRsoil)
str(NIRsoil)
## ----NIRsoil, tidy = FALSE, message = FALSE-----------------------------------
# obtain a numeric vector of the wavelengths at which spectra is recorded
wavs <- NIRsoil$spc %>% colnames() %>% as.numeric()
# pre-process the spectra:
# - resample it to a resolution of 6 nm
# - use first order derivative
new_res <- 5
poly_order <- 1
window <- 5
diff_order <- 1
NIRsoil$spc_p <- NIRsoil$spc %>%
resample(wav = wavs, new.wav = seq(min(wavs), max(wavs), by = new_res)) %>%
savitzkyGolay(p = poly_order, w = window, m = diff_order)
## ----plotspectra, fig.cap = "Raw spectral absorbance data (top) and first derivative of the absorbance spectra (bottom).", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 7, fig.height = 7, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")
par(mfrow = c(2, 1), mar = c(4, 4, 1, 4))
new_wavs <- as.matrix(as.numeric(colnames(NIRsoil$spc_p)))
plot(range(wavs), range(NIRsoil$spc), col = NA,
xlab = "",
ylab = "Absorbance")
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "#EFBF4780")
grid(lty = 1, col = "#E47C4E80")
matlines(x = wavs, y = t(NIRsoil$spc),
lty = 1, col = "#5177A133")
plot(range(new_wavs), range(NIRsoil$spc_p), col = NA,
xlab = "Wavelengths, nm",
ylab = "1st derivative")
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "#EFBF4780")
grid(lty = 1, col = "#E47C4E80")
matlines(x = new_wavs, y = t(NIRsoil$spc_p),
lty = 1, col = "#5177A133")
par(old_par)
## ----eval = FALSE-------------------------------------------------------------
# new_wavs <- as.matrix(as.numeric(colnames(NIRsoil$spc_p)))
#
# matplot(x = wavs, y = t(NIRsoil$spc),
# xlab = "Wavelengths, nm",
# ylab = "Absorbance",
# type = "l", lty = 1, col = "#5177A133")
#
# matplot(x = new_wavs, y = t(NIRsoil$spc_p),
# xlab = "Wavelengths, nm",
# ylab = "1st derivative",
# type = "l", lty = 1, col = "#5177A133")
## -----------------------------------------------------------------------------
# training dataset
training <- NIRsoil[NIRsoil$train == 1, ]
# testing dataset
testing <- NIRsoil[NIRsoil$train == 0, ]
## ---- results = 'hide'--------------------------------------------------------
# principal component (pc) analysis with the default
# method (singular value decomposition)
pca_tr <- ortho_projection(Xr = training$spc_p, method = "pca")
pca_tr
## ----plotpcsvariance, fig.cap = "Individual contribution to the explained variance for each component (left) and cumulative variance explained by the principal components (right).", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 7, fig.height = 3, fig.retina = 0.85----
plot(pca_tr, col = "#D42B08CC")
## ---- results = 'hide'--------------------------------------------------------
# principal component (pc) analysis with the default
# NIPALS algorithm
pca_nipals_tr <- ortho_projection(Xr = training$spc_p,
method = "pca.nipals")
pca_nipals_tr
## ---- results = 'hide', eval = FALSE------------------------------------------
# # Partial Least Squares decomposition using
# # Total carbon as side information
# # (this might take some seconds)
# pls_tr <- ortho_projection(Xr = training$spc_p,
# Yr = training$Ciso,
# method = "pls")
# pls_tr
## ---- results = 'hide', eval = FALSE------------------------------------------
# # This retains components that alone explain at least 5% of the original
# # variation in training$spc_p
# var_sel <- list(method = "var", value = 0.05)
# pca_tr_minvar5 <- ortho_projection(Xr = training$spc_p,
# method = "pca",
# pc_selection = var_sel)
#
# pca_tr_minvar5
## ---- results = 'hide', eval = FALSE------------------------------------------
# # This retains components that together explain at least 90% of the original
# # variation in training$spc_p
# cumvar_sel <- list(method = "cumvar", value = 0.90)
#
# pca_tr_cumvar90 <- ortho_projection(Xr = training$spc_p,
# method = "pca",
# pc_selection = cumvar_sel)
#
# pca_tr_cumvar90
## ---- results = 'hide'--------------------------------------------------------
# This uses optimal component selection
# variation in training$spc_p
optimal_sel <- list(method = "opc", value = 40)
pca_tr_opc <- ortho_projection(Xr = training$spc_p,
Yr = training$Ciso,
method = "pca",
pc_selection = optimal_sel)
pca_tr_opc
## ----pcrmsd, fig.cap = "Root mean squared difference between the samples and their corresponding nearest neighbors (for Total Carbon as side finormation) found by using dissimilarity matrices computed with different number of PCs.", fig.id = "plot_pcs_opc", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 5, fig.height = 4, fig.retina = 0.85----
plot(pca_tr_opc, col = "#FF1A00CC")
## ----rmsdscatter, fig.cap = paste("Comparison between each sample and its corresponding nearest neighbor (in terms of Total Carbon) when ", pca_tr_opc$n_components, "are used for dissimilarity matrix computations."), fig.id = "plot_pcs_opc2", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 4, fig.height = 4, fig.retina = 0.85----
# compute the dissimilarity matrix using all the retained scores
pc_diss <- f_diss(pca_tr_opc$scores, diss_method = "mahalanobis")
# get the nearest neighbor for each sample
nearest_n <- apply(pc_diss, MARGIN = 1, FUN = function(x) order(x)[2])
# compute the RMSD
rmsd <- sqrt(mean((training$Ciso - training$Ciso[nearest_n])^2, na.rm = TRUE))
rmsd
# the RSMD for all the components is already available in
# ...$opc_evaluation
pca_tr_opc$opc_evaluation[pca_tr_opc$n_components, , drop = FALSE]
plot(training$Ciso[nearest_n],
training$Ciso,
ylab = "Ciso of the nearest neighbor, %", xlab = "Ciso, %",
col = "#D19C17CC", pch = 16)
grid()
## ---- results = 'hide'--------------------------------------------------------
# This uses manual component selection
manual_sel <- list(method = "manual", value = 9)
# PC
pca_tr_manual <- ortho_projection(Xr = training$spc_p,
method = "pca",
pc_selection = manual_sel)
pca_tr_manual
# PLS
pls_tr_manual <- ortho_projection(Xr = training$spc_p,
Yr = training$Ciso,
method = "pls",
pc_selection = manual_sel)
pls_tr_manual
## ---- results = 'hide'--------------------------------------------------------
optimal_sel <- list(method = "opc", value = 40)
# PLS
pls_tr_opc <- ortho_projection(Xr = training$spc_p,
Yr = training$Ciso,
method = "pls",
pc_selection = optimal_sel,
scale = TRUE)
# the pls projection matrix
pls_tr_opc$projection_mat
pls_projected <- predict(pls_tr_opc, newdata = testing$spc_p)
# PC
pca_tr_opc <- ortho_projection(Xr = training$spc_p,
Yr = training$Ciso,
method = "pca",
pc_selection = optimal_sel,
scale = TRUE)
# the pca projection matrix
t(pca_tr_opc$X_loadings)
pca_projected <- predict(pca_tr_opc, newdata = testing$spc_p)
## ---- results = 'hide', eval = FALSE------------------------------------------
# optimal_sel <- list(method = "opc", value = 40)
# pca_tr_ts <- ortho_projection(Xr = training$spc_p,
# Xu = testing$spc_p,
# Yr = training$Ciso,
# method = "pca",
# pc_selection = optimal_sel,
# scale = TRUE)
# plot(pca_tr_ts)
## ---- results = 'hide', eval = FALSE------------------------------------------
# optimal_sel <- list(method = "opc", value = 40)
# pls_tr_ts <- ortho_projection(Xr = training$spc_p,
# Xu = testing$spc_p,
# Yr = training$Ciso,
# method = "pls",
# pc_selection = optimal_sel,
# scale = TRUE)
#
# # the same PLS projection model can be obtained with:
# pls_tr_ts2 <- ortho_projection(Xr = training$spc_p[!is.na(training$Ciso),],
# Yr = training$Ciso[!is.na(training$Ciso)],
# method = "pls",
# pc_selection = optimal_sel,
# scale = TRUE)
#
# identical(pls_tr_ts$projection_mat, pls_tr_ts2$projection_mat)
## ---- results = 'hide', eval = FALSE------------------------------------------
# optimal_sel <- list(method = "opc", value = 40)
# pls_multi_yr <- ortho_projection(Xr = training$spc_p,
# Xu = testing$spc_p,
# Yr = training[, c("Ciso", "Nt", "CEC")],
# method = "pls",
# pc_selection = optimal_sel,
# scale = TRUE)
# plot(pls_multi_yr)
## ---- results = 'hide', eval = FALSE------------------------------------------
# pls_multi_yr$opc_evaluation
## ---- results = 'hide', eval = FALSE------------------------------------------
# # for PC dissimilarity using the default settings
# pcd <- dissimilarity(Xr = training$spc_p,
# diss_method = "pca")
# dim(pcd$dissimilarity)
#
# # for PC dissimilarity using the optimized component selection method
# pcd2 <- dissimilarity(Xr = training$spc_p,
# diss_method = "pca.nipals",
# Yr = training$Ciso,
# pc_selection = list("opc", 20),
# return_projection = TRUE)
# dim(pcd2$dissimilarity)
# pcd2$dissimilarity
# pcd2$projection # the projection used to compute the dissimilarity matrix
#
# # for PLS dissimilarity
# plsd <- dissimilarity(Xr = training$spc_p,
# diss_method = "pls",
# Yr = training$Ciso,
# pc_selection = list("opc", 20),
# return_projection = TRUE)
# dim(plsd$dissimilarity)
# plsd$dissimilarity
# plsd$projection # the projection used to compute the dissimilarity matrix
## ---- results = 'hide', eval = TRUE-------------------------------------------
# For PC dissimilarity using the optimized component selection method
pcd_tr_ts <- dissimilarity(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pca.nipals",
Yr = training$Ciso,
pc_selection = list("opc", 20))
dim(pcd_tr_ts$dissimilarity)
# For PLS dissimilarity
plsd_tr_ts <- dissimilarity(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pls",
Yr = training$Ciso,
pc_selection = list("opc", 20))
dim(plsd_tr_ts$dissimilarity)
## ----localdiss, eval = TRUE---------------------------------------------------
# for localized PC dissimilarity using the optimized component selection method
# set the number of neighbors to retain
knn <- 200
local_pcd_tr_ts <- dissimilarity(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pca",
Yr = training$Ciso,
pc_selection = list("opc", 20),
.local = TRUE,
pre_k = knn)
dim(local_pcd_tr_ts$dissimilarity)
# For PLS dissimilarity
local_plsd_tr_ts <- dissimilarity(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pls",
Yr = training$Ciso,
pc_selection = list("opc", 20),
.local = TRUE,
pre_k = knn)
dim(local_plsd_tr_ts$dissimilarity)
# check the dissimilarity scores between the first two
# observations in the testing dataset and the first 10
# observations in the training dataset
local_plsd_tr_ts$dissimilarity[1:10, 1:2]
## ---- results = 'hide', eval = FALSE------------------------------------------
# cd_tr <- dissimilarity(Xr = training$spc_p, diss_method = "cor")
# dim(cd_tr$dissimilarity)
# cd_tr$dissimilarity
## ---- results = 'hide', eval = TRUE-------------------------------------------
cd_tr_ts <- dissimilarity(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "cor")
dim(cd_tr_ts$dissimilarity)
cd_tr_ts$dissimilarity
## ---- results = 'hide', eval = FALSE------------------------------------------
# # a moving window correlation dissimilarity between training and testing
# # using a window size of 19 spectral data points (equivalent to 95 nm)
# cd_mw <- dissimilarity(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "cor",
# ws = 19)
# cd_mw$dissimilarity
## ---- results = 'hide', eval = FALSE------------------------------------------
# # compute the dissimilarity between all the training observations
# ed <- dissimilarity(Xr = training$spc_p, diss_method = "euclid")
# ed$dissimilarity
## ---- results = 'hide', eval = FALSE------------------------------------------
# # compute the dissimilarity between all the training observations
# pre_time_resemble <- proc.time()
# ed_resemble <- dissimilarity(Xr = training$spc_p, diss_method = "euclid")
# post_time_resemble <- proc.time()
# post_time_resemble - pre_time_resemble
#
# pre_time_stats <- proc.time()
# ed_stats <- dist(training$spc_p, method = "euclid")
# post_time_stats <- proc.time()
# post_time_stats - pre_time_stats
#
# # scale the results of dist() based on the number of input columns
# ed_stats_tr <- sqrt((as.matrix(ed_stats)^2)/ncol(training$spc_p))
# ed_stats_tr[1:2, 1:3]
#
# # compare resemble and R stats results of Euclidean distances
# ed_resemble$dissimilarity[1:2, 1:3]
## ---- results = 'hide', eval = FALSE------------------------------------------
# # compute the dissimilarity between the training and testing observations
# ed_tr_ts <- dissimilarity(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "euclid")
## ---- results = 'hide', eval = FALSE------------------------------------------
# # compute the dissimilarity between the training and testing observations
# cosine_tr_ts <- dissimilarity(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "cosine")
# dim(cosine_tr_ts$dissimilarity)
# cosine_tr_ts$dissimilarity
## ---- results = 'hide', eval = FALSE------------------------------------------
# sid_tr_ts <- dissimilarity(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "sid")
# dim(sid_tr_ts$dissimilarity)
# sid_tr_ts$dissimilarity
## ---- results = 'hide', eval = TRUE-------------------------------------------
# PC dissimilarity with default settings (variance-based
# of components)
pcad <- dissimilarity(training$spc_p, diss_method = "pca", scale = TRUE)
# PLS dissimilarity with default settings (variance-based
# of components)
plsd <- dissimilarity(training$spc_p, diss_method = "pls", Yr = training$Ciso,
scale = TRUE)
# PC dissimilarity with optimal selection of components
opc_sel <- list("opc", 30)
o_pcad <- dissimilarity(training$spc_p,
diss_method = "pca",
Yr = training$Ciso,
pc_selection = opc_sel,
scale = TRUE)
# PLS dissimilarity with optimal selection of components
o_plsd <- dissimilarity(training$spc_p,
diss_method = "pls",
Yr = training$Ciso,
pc_selection = opc_sel,
scale = TRUE)
# Correlation dissimilarity
cd <- dissimilarity(training$spc_p, diss_method = "cor", scale = TRUE)
# Moving window correlation dissimilarity
mcd <- dissimilarity(training$spc_p, diss_method = "cor", ws = 51, scale = TRUE)
# Euclidean dissimilarity
ed <- dissimilarity(training$spc_p, diss_method = "euclid", scale = TRUE)
# Cosine dissimilarity
cosd <- dissimilarity(training$spc_p, diss_method = "cosine", scale = TRUE)
# Spectral information divergence/dissimilarity
sinfd <- dissimilarity(training$spc_p, diss_method = "sid", scale = TRUE)
## ---- results = 'hide', eval = TRUE-------------------------------------------
Ciso <- as.matrix(training$Ciso)
ev <- NULL
ev[["pcad"]] <- sim_eval(pcad$dissimilarity, side_info = Ciso)
ev[["plsd"]] <- sim_eval(plsd$dissimilarity, side_info = Ciso)
ev[["o_pcad"]] <- sim_eval(o_pcad$dissimilarity, side_info = Ciso)
ev[["o_plsd"]] <- sim_eval(o_plsd$dissimilarity, side_info = Ciso)
ev[["cd"]] <- sim_eval(cd$dissimilarity, side_info = Ciso)
ev[["mcd"]] <- sim_eval(mcd$dissimilarity, side_info = Ciso)
ev[["ed"]] <- sim_eval(ed$dissimilarity, side_info = Ciso)
ev[["cosd"]] <- sim_eval(cosd$dissimilarity, side_info = Ciso)
ev[["sinfd"]] <- sim_eval(sinfd$dissimilarity, side_info = Ciso)
## ----fcaption, results = 'hide', eval = TRUE, echo = FALSE--------------------
fig_cap <- paste("Comparison between observations and their corresponding",
"nearest neighbor (1-NN)","observation in terms of Total Carbon (Ciso). The 1-NNs",
"are retrieved with the",
"following dissimilarity metrics:",
"pcad: PC dissimilarity with default settings (variance-based of components);",
"plsd: PLS dissimilarity with default settings (variance-based of components);",
"o-pcad: PC dissimilarity with optimal selection of components;",
"o-plsd: PLS dissimilarity with optimal selection of components;",
"cd: Correlation dissimilarity;",
"mcd: Moving window correlation dissimilarity;",
"ed: Euclidean dissimilarity;",
"sinfd: Spectral information divergence.", collapse = "")
## ----results = 'hide', eval = TRUE, echo = FALSE------------------------------
comparisons <- lapply(names(ev),
FUN = function(x, label) {
irmsd <- round(x[[label]]$eval[1], 2)
ir <- round(x[[label]]$eval[2], 2)
if (label %in% c("o_pcad", "o_plsd")) {
label <- paste0("**", label, "**")
irmsd <- paste0("**", irmsd, "**")
ir <- paste0("**", ir, "**")
}
data.frame(Measure = label,
RMSD = irmsd,
r = ir)
},
x = ev)
comparisons
## ----eval = FALSE, echo = TRUE------------------------------------------------
# comparisons <- lapply(names(ev),
# FUN = function(x, label) {
# irmsd <- x[[label]]$eval[1]
# ir <- x[[label]]$eval[2]
# data.frame(Measure = label,
# RMSD = irmsd,
# r = ir)
# },
# x = ev)
# comparisons
## ----tcomparisons, eval = TRUE, echo = FALSE----------------------------------
knitr::kable(do.call("rbind", comparisons),
caption = paste("Root mean squared difference (RMSD)",
"and correlation coefficients",
"for between the observations and their",
"corrresponding closest observations",
"retrieved with the different dissimilarity
methods."),
format = "simple", digits = 2, align = "l", padding = 2)
## ----eval = FALSE-------------------------------------------------------------
# old_par <- par("mfrow")
# par(mfrow = c(3, 3))
# p <- sapply(names(ev),
# FUN = function(x, label, labs = c("Ciso (1-NN), %", "Ciso, %")) {
# xy <- x[[label]]$first_nn[,2:1]
# plot(xy, xlab = labs[1], ylab = labs[2], col = "red")
# title(label)
# grid()
# abline(0, 1)
#
# },
# x = ev)
# par(old_par)
## ----pcomparisons, fig.cap = paste(fig_cap), fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 8, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")
par(mfrow = c(3, 3), pch = 16, mar = c(4,4,4,4))
my_cols <- c("#750E3380",
"#C3BC6180",
"#FFE64F80",
"#EFBF4780",
"#E47C4E80",
"#F1A07380",
"#A1CFC480",
"#6B8EB580",
"#5177A180")
names(my_cols) <- sample(names(ev))
p <- sapply(names(ev),
FUN = function(x,
label,
labs = c("Ciso (1-NN), %", "Ciso, %"),
cols) {
xy <- x[[label]]$first_nn[,2:1]
plot(xy, xlab = labs[1], ylab = labs[2], col = cols[label])
title(label)
grid(col= "#80808080", lty = 1)
abline(0, 1, col = "#FF1A0080")
},
x = ev,
cols = my_cols)
par(old_par)
## ---- results = 'hide', eval = TRUE-------------------------------------------
knn_pc <- search_neighbors(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pca.nipals",
k = 50)
# matrix of neighbors
knn_pc$neighbors
# matrix of neighbor distances (dissimilarity scores)
knn_pc$neighbors_diss
# the index (in the training set) of the first two closest neighbors found in
# training for the first observation in testing:
knn_pc$neighbors[1:2, 1, drop = FALSE]
# the distances of the two closest neighbors found in
# training for the first observation in testing:
knn_pc$neighbors_diss[1:2, 1, drop = FALSE]
# the indices in training that fall in any of the
# neighborhoods of testing
knn_pc$unique_neighbors
## ---- results = 'hide', eval = FALSE------------------------------------------
# # using PC dissimilarity with optimal selection of components
# knn_opc <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "pca.nipals",
# Yr = training$Ciso,
# k = 50,
# pc_selection = list("opc", 20),
# scale = TRUE)
#
# # using PLS dissimilarity with optimal selection of components
# knn_pls <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "pls",
# Yr = training$Ciso,
# k = 50,
# pc_selection = list("opc", 20),
# scale = TRUE)
#
# # using correlation dissimilarity
# knn_c <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "cor",
# k = 50, scale = TRUE)
#
# # using moving window correlation dissimilarity
# knn_mwc <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "cor",
# k = 50,
# ws = 51, scale = TRUE)
## ---- results = 'hide', eval = FALSE------------------------------------------
# # using localized PC dissimilarity with optimal selection of components
# knn_local_opc <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "pca.nipals",
# Yr = training$Ciso,
# k = 50,
# pc_selection = list("opc", 20),
# scale = TRUE,
# .local = TRUE,
# pre_k = 250)
#
# # using localized PLS dissimilarity with optimal selection of components
# knn_local_opc <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "pls",
# Yr = training$Ciso,
# k = 50,
# pc_selection = list("opc", 20),
# scale = TRUE,
# .local = TRUE,
# pre_k = 250)
## ---- results = 'hide', eval = TRUE-------------------------------------------
# a dissimilarity threshold
d_th <- 1
# the minimum number of observations required in each neighborhood
k_min <- 20
# the maximum number of observations allowed in each neighborhood
k_max <- 300
dnn_pc <- search_neighbors(Xr = training$spc_p,
Xu = testing$spc_p,
diss_method = "pca.nipals",
k_diss = d_th,
k_range = c(k_min, k_max),
scale = TRUE)
# matrix of neighbors. The minimum number of indices is 20 (given by k_min)
# and the maximum number of indices is 300 (given by k_max).
# NAs indicate "not a neighbor"
dnn_pc$neighbors
# this reports how many neighbors were found for each observation in
# testing using the input distance threshold (column n_k) and how
# many were finally selected (column final_n_k)
dnn_pc$k_diss_info
# matrix of neighbor distances
dnn_pc$neighbors_diss
# the indices in training that fall in any of the
# neighborhoods of testing
dnn_pc$unique_neighbors
## ---- knnhist, fig.cap = "Histogram of the original neighborhood sizes", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 5, fig.height = 4, fig.retina = 0.80----
hist(dnn_pc$k_diss_info$final_n_k,
breaks = k_min,
xlab = "Final neighborhood size",
main = "", col = "#EFBF47CC")
## ---- results = 'hide', eval = FALSE------------------------------------------
# # the indices of the observations that we want to "invite" to every neighborhood
# forced_guests <- c(1, 5, 8, 9)
#
# # using PC dissimilarity with optimal selection of components
# knn_spiked <- search_neighbors(Xr = training$spc_p,
# Xu = testing$spc_p,
# diss_method = "pca.nipals",
# Yr = training$Ciso,
# k = 50,
# spike = forced_guests,
# pc_selection = list("opc", 20))
#
# # check the first 8 neighbors found in training for the
# # first 2 observations in testing
# knn_spiked$neighbors[1:8, 1:2]
## ----mblgif, fig.cap = "Example of the main steps in memory-based learning for predicting a response variable in five different observations based on set of p-dimensional space.", echo = FALSE, out.width = '65%', fig.align = 'center', fig.retina = 0.85----
knitr::include_graphics("MBL.gif")
## ---- eval = TRUE-------------------------------------------------------------
# creates an object with instructions to build PLS models
my_plsr <- local_fit_pls(pls_c = 15)
my_plsr
# creates an object with instructions to build WAPLS models
my_waplsr <- local_fit_wapls(min_pls_c = 3, max_pls_c = 20)
my_waplsr
# creates an object with instructions to build GPR models
my_gpr <- local_fit_gpr()
my_gpr
## ---- eval = TRUE-------------------------------------------------------------
# create an object with instructions to conduct both validation types
# "NNv" "local_cv"
two_val_control <- mbl_control(validation_type = c("NNv", "local_cv"),
number = 10,
p = 0.75)
## ---- results = 'hide', eval = TRUE-------------------------------------------
# define the dissimilarity method
my_diss <- "cor"
# define the neighborhood sizes to test
my_ks <- seq(80, 200, by = 40)
# define how to use the dissimilarity information (ignore it)
ignore_diss <- "none"
# define the regression method to be used at each neighborhood
my_waplsr <- local_fit_wapls(min_pls_c = 3, max_pls_c = 20)
# for the moment use only "NNv" validation (it will be faster)
nnv_val_control <- mbl_control(validation_type = "NNv")
# predict Total Carbon
# (remove missing values)
local_ciso <- mbl(
Xr = training$spc_p[!is.na(training$Ciso),],
Yr = training$Ciso[!is.na(training$Ciso)],
Xu = testing$spc_p,
k = my_ks,
method = my_waplsr,
diss_method = my_diss,
diss_usage = ignore_diss,
control = nnv_val_control,
scale = TRUE
)
## ----localresultsciso, fig.cap = "MBL results for Total Carbon predictions using the LOCAL algorithm. NNv: nearest-neighbor cross-validation.", fig.align = "center", fig.width = 7.5, fig.height = 4, fig.retina = 0.85----
plot(local_ciso, main = "")
local_ciso
## ---- results = 'hide', eval = TRUE, echo = FALSE-----------------------------
bestk <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bestk <- local_ciso$validation_results$nearest_neighbor_validation$k[bestk]
## ---- results = 'hide', eval = TRUE, fig.show = 'hide'-----------------------
bki <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bk <- local_ciso$validation_results$nearest_neighbor_validation$k[bki]
# all the prediction results are stored in:
local_ciso$results
# the get_predictions function makes easier to retrieve the
# predictions from the previous object
ciso_hat <- as.matrix(get_predictions(local_ciso))[, bki]
## ---- eval = F----------------------------------------------------------------
# # Plot predicted vs reference
# plot(ciso_hat, testing$Ciso,
# xlim = c(0, 14),
# ylim = c(0, 14),
# xlab = "Predicted Total Carbon, %",
# ylab = "Total Carbon, %",
# main = "LOCAL using argument k")
# grid()
# abline(0, 1, col = "red")
## ---- eval = TRUE-------------------------------------------------------------
# prediction RMSE:
sqrt(mean((ciso_hat - testing$Ciso)^2, na.rm = TRUE))
# squared R
cor(ciso_hat, testing$Ciso, use = "complete.obs")^2
## ---- results = 'hide', eval = TRUE, fig.show = 'hide'-----------------------
# create a vector of dissimilarity thresholds to evaluate
# since the correlation dissimilarity will be used
# these thresholds need to be > 0 and <= 1
dths <- seq(0.025, 0.3, by = 0.025)
# indicate the minimum and maximum sizes allowed for the neighborhood
k_min <- 30
k_max <- 200
local_ciso_diss <- mbl(
Xr = training$spc_p[!is.na(training$Ciso),],
Yr = training$Ciso[!is.na(training$Ciso)],
Xu = testing$spc_p,
k_diss = dths,
k_range = c(k_min, k_max),
method = my_waplsr,
diss_method = my_diss,
diss_usage = ignore_diss,
control = nnv_val_control,
scale = TRUE
)
## ---- eval = FALSE------------------------------------------------------------
# plot(local_ciso_diss)
## ---- results = 'hide', eval = TRUE, echo = FALSE-----------------------------
bestd <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bestd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k[bestd]
## ---- eval = TRUE-------------------------------------------------------------
local_ciso_diss
## ---- results = 'hide', eval = TRUE, fig.show = 'hide'-----------------------
# best distance threshold
bdi <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k[bdi]
# predictions for the best distance
ciso_diss_hat <- as.matrix(get_predictions(local_ciso_diss))[, bdi]
## ---- eval = FALSE------------------------------------------------------------
# # Plot predicted vs reference
# plot(ciso_diss_hat, testing$Ciso,
# xlim = c(0, 14),
# ylim = c(0, 14),
# xlab = "Predicted Total Carbon, %",
# ylab = "Total Carbon, %",
# main = "LOCAL using argument k_diss")
# grid()
# abline(0, 1, col = "red")
## ----addexamples, eval = TRUE, echo = FALSE-----------------------------------
abbr <- c("`local_cec`",
"`pc_pred_cec`",
"`pls_pred_cec`",
"`local_gpr_cec`")
diss <- c("Correlation",
"optimized PC",
"optimized PLS",
"optimized PC")
d_usage <- c("None",
"Source of predictors",
"None",
"Source of predictors")
reg_m <- c("Weighted average PLS",
"Weighted average PLS",
"Weighted average PLS",
"Gaussian process")
my_tab <- as.data.frame(cbind(abbr, diss, d_usage, reg_m))
colnames(my_tab) <- c("Abreviation",
"Dissimilarity method",
"Dissimilarity usage",
"Local regression")
knitr::kable(my_tab,
caption = paste("Basic description of the different MBL",
"configurations in the examples to predict",
"Cation Exhange Capacity (CEC)."),
format = "simple", align = "l", padding = 2)
## ---- results = 'hide', eval = TRUE, fig.show = 'hide'-----------------------
# Lets define some methods:
my_wapls <- local_fit_wapls(2, 25)
k_min_max <- c(80, 200)
# use the LOCAL algorithm
# specific thresholds for cor dissimilarity
dth_cor <- seq(0.01, 0.3, by = 0.03)
local_cec <- mbl(
Xr = training$spc_p[!is.na(training$CEC),],
Yr = training$CEC[!is.na(training$CEC)],
Xu = testing$spc_p,
k_diss = dth_cor,
k_range = k_min_max,
method = my_wapls,
diss_method = "cor",
diss_usage = "none",
control = nnv_val_control,
scale = TRUE
)
# use one where pca dissmilarity is used and the dissimilarity matrix
# is used as source of additional predictors
# lets define first a an appropriate vector of dissimilarity thresholds
# for the PC dissimilarity method
dth_pc <- seq(0.05, 1, by = 0.1)
pc_pred_cec <- mbl(
Xr = training$spc_p[!is.na(training$CEC),],
Yr = training$CEC[!is.na(training$CEC)],
Xu = testing$spc_p,
k_diss = dth_pc,
k_range = k_min_max,
method = my_wapls,
diss_method = "pca",
diss_usage = "predictors",
control = nnv_val_control,
scale = TRUE
)
# use one where PLS dissmilarity is used and the dissimilarity matrix
# is used as source of additional predictors
pls_pred_cec <- mbl(
Xr = training$spc_p[!is.na(training$CEC),],
Yr = training$CEC[!is.na(training$CEC)],
Xu = testing$spc_p,
Yu = testing$CEC,
k_diss = dth_pc,
k_range = k_min_max,
method = my_wapls,
diss_method = "pls",
diss_usage = "none",
control = nnv_val_control,
scale = TRUE
)
# use one where Gaussian process regression and pca dissmilarity are used
# and the dissimilarity matrix is used as source of additional predictors
local_gpr_cec <- mbl(
Xr = training$spc_p[!is.na(training$CEC),],
Yr = training$CEC[!is.na(training$CEC)],
Xu = testing$spc_p,
k_diss = dth_pc,
k_range = k_min_max,
method = local_fit_gpr(),
diss_method = "pca",
diss_usage = "predictors",
control = nnv_val_control,
scale = TRUE
)
## ---- eval = TRUE, fig.show = 'hide'-----------------------------------------
# get the indices of the best results according to
# nearest neighbor validation statistics
c_val_name <- "validation_results"
c_nn_val_name <- "nearest_neighbor_validation"
bi_local <- which.min(local_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pc_pred <- which.min(pc_pred_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pls_pred <- which.min(pls_pred_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_local_gpr <- which.min(local_gpr_cec[[c_val_name]][[c_nn_val_name]]$rmse)
preds <- cbind(get_predictions(local_cec)[, ..bi_local],
get_predictions(pc_pred_cec)[, ..bi_pc_pred],
get_predictions(pls_pred_cec)[, ..bi_pls_pred],
get_predictions(local_gpr_cec)[, ..bi_local_gpr])
colnames(preds) <- c("local_cec",
"pc_pred_cec",
"pls_pred_cec",
"local_gpr_cec")
preds <- as.matrix(preds)
# R2s
cor(testing$CEC, preds, use = "complete.obs")^2
#RMSEs
colMeans((preds - testing$CEC)^2, na.rm = TRUE)^0.5
## ---- eval = FALSE, fig.show = 'hide'----------------------------------------
# old_par <- par("mfrow", "mar")
#
# par(mfrow = c(2, 2))
# plot(testing$CEC, preds[, 2],
# xlab = "Predicted CEC, meq/100g",
# ylab = "CEC, meq/100g", main = colnames(preds)[2])
# abline(0, 1, col = "red")
#
# plot(testing$CEC, preds[, 3],
# xlab = "Predicted CEC, meq/100g",
# ylab = "CEC, meq/100g", main = colnames(preds)[3])
# abline(0, 1, col = "red")
#
# plot(testing$CEC, preds[, 4],
# xlab = "Predicted CEC, meq/100g",
# ylab = "CEC, meq/100g", main = colnames(preds)[4])
# abline(0, 1, col = "red")
# par(old_par)
## ----mblcomparisons, fig.cap = "CEC prediction results for the different MBL configurations tested" , fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 8, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")
par(mfrow = c(2, 2), pch = 16, mar = c(4,4,4,4))
my_cols <- c("#D42B08CC",
"#750E3380",
"#EFBF4780",
"#5177A180")
names(my_cols) <- colnames(preds)
# R2s
r2s <- round(cor(testing$CEC, preds, use = "complete.obs")^2, 2)
#RMSEs
rmses <- round(colMeans((preds - testing$CEC)^2, na.rm = TRUE)^0.5,2)
names(r2s) <- colnames(preds)
names(rmses) <- colnames(preds)
p <- sapply(colnames(preds),
FUN = function(y,
yhats,
label,
labs = c("Predicted CEC, meq/100g", "CEC, meq/100g"),
rsq,
rmse,
cols) {
plot(x = yhats[,label],
y = y,
ylim = range(y, na.rm = T), xlim = range(y, na.rm = T),
xlab = labs[1], ylab = labs[2],
col = cols[label])
title(label)
title(paste0("\n\n\n RMSE: ", rmse[label], "; Rsq: ", rsq[label]), cex.main=1)
grid(col= "#80808080", lty = 1)
abline(0, 1, col = "#FF1A0080")
},
yhats = preds,
y = testing$CEC,
cols = my_cols,
rsq = r2s,
rmse = rmses)
par(old_par)
## ---- eval = FALSE------------------------------------------------------------
# # use Yu argument to validate the predictions
# pc_pred_nt_yu <- mbl(
# Xr = training$spc_p[!is.na(training$Nt),],
# Yr = training$Nt[!is.na(training$Nt)],
# Xu = testing$spc_p,
# Yu = testing$Nt,
# k = seq(40, 100, by = 10),
# diss_usage = "none",
# control = mbl_control(validation_type = "NNv"),
# scale = TRUE
# )
#
# pc_pred_nt_yu
## ---- eval = FALSE------------------------------------------------------------
# # Running the mbl function using multiple cores
#
# # Execute with two cores, if available, ...
# n_cores <- 2
#
# # ... if not then go with 1 core
# if (parallel::detectCores() < 2) {
# n_cores <- 1
# }
#
# # Set the number of cores
# library(doParallel)
# clust <- makeCluster(n_cores)
# registerDoParallel(clust)
#
# # Alternatively:
# # library(doSNOW)
# # clust <- makeCluster(n_cores, type = "SOCK")
# # registerDoSNOW(clust)
# # getDoParWorkers()
#
# pc_pred_nt <- mbl(
# Xr = training$spc_p[!is.na(training$Nt),],
# Yr = training$Nt[!is.na(training$Nt)],
# Xu = testing$spc_p,
# k = seq(40, 100, by = 10),
# diss_usage = "none",
# control = mbl_control(validation_type = "NNv"),
# scale = TRUE
# )
#
# # go back to sequential processing
# registerDoSEQ()
# try(stopCluster(clust))
#
# pc_pred_nt
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.