inst/doc/resemble.R

## ----setup, include = FALSE---------------------------------------------------
library(formatR)
knitr::opts_chunk$set(
  collapse = TRUE, eval.after = "fig.cap"
)

## ----classdiagram, echo = FALSE, out.width = '20%', fig.align = 'right'-------
knitr::include_graphics("logo.jpg")

## ----eval = TRUE--------------------------------------------------------------
citation(package = "resemble")

## ----libraries, tidy = TRUE, message = FALSE----------------------------------
library(resemble)
library(prospectr)
library(magrittr)

## ---- tidy = FALSE, message = FALSE, results = 'hide'-------------------------
data(NIRsoil)
dim(NIRsoil)
str(NIRsoil)

## ----NIRsoil, tidy = FALSE, message = FALSE-----------------------------------
# obtain a numeric vector of the wavelengths at which the spectra are recorded
wavs <- NIRsoil$spc %>% colnames() %>% as.numeric()

# pre-process the spectra:
# - resample to a resolution of 5 nm
# - apply a first-order Savitzky-Golay derivative
new_res <- 5
poly_order <- 1
window <- 5
diff_order <- 1

NIRsoil$spc_p <- NIRsoil$spc %>% 
  resample(wav = wavs, new.wav = seq(min(wavs), max(wavs), by = new_res)) %>% 
  savitzkyGolay(p = poly_order, w = window, m = diff_order)
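
# a quick illustrative check (not part of the processing itself): the
# pre-processed matrix should have fewer columns than the raw spectra and
# its column names should be the new wavelengths
dim(NIRsoil$spc)
dim(NIRsoil$spc_p)
range(as.numeric(colnames(NIRsoil$spc_p)))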

## ----plotspectra, fig.cap = "Raw spectral absorbance data (top) and first derivative of the absorbance spectra (bottom).",  fig.cap.style = "Image Caption", fig.align = "center", fig.width = 7, fig.height = 7, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")
par(mfrow = c(2, 1), mar = c(4, 4, 1, 4))

new_wavs <- as.matrix(as.numeric(colnames(NIRsoil$spc_p)))
plot(range(wavs), range(NIRsoil$spc), col = NA,
     xlab = "",
     ylab = "Absorbance")
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "#EFBF4780")
grid(lty = 1, col = "#E47C4E80")
matlines(x = wavs, y = t(NIRsoil$spc), 
         lty = 1, col = "#5177A133")

plot(range(new_wavs), range(NIRsoil$spc_p), col = NA,
     xlab = "Wavelengths, nm",
     ylab = "1st derivative")
rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "#EFBF4780")
grid(lty = 1, col = "#E47C4E80")
matlines(x = new_wavs, y = t(NIRsoil$spc_p), 
        lty = 1, col = "#5177A133")
par(old_par)

## ----eval = FALSE-------------------------------------------------------------
#  new_wavs <- as.matrix(as.numeric(colnames(NIRsoil$spc_p)))
#  
#  matplot(x = wavs, y = t(NIRsoil$spc),
#          xlab = "Wavelengths, nm",
#          ylab = "Absorbance",
#          type = "l", lty = 1, col = "#5177A133")
#  
#  matplot(x = new_wavs, y = t(NIRsoil$spc_p),
#          xlab = "Wavelengths, nm",
#          ylab = "1st derivative",
#          type = "l", lty = 1, col = "#5177A133")

## -----------------------------------------------------------------------------
# training dataset
training  <- NIRsoil[NIRsoil$train == 1, ]
# testing dataset
testing  <- NIRsoil[NIRsoil$train == 0, ]

## ---- results = 'hide'--------------------------------------------------------
# principal component (pc) analysis with the default 
# method (singular value decomposition) 
pca_tr <- ortho_projection(Xr = training$spc_p, method = "pca")

pca_tr
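
# a brief look at what ortho_projection returns (illustrative): among other
# elements, the list contains the scores that are later used for
# dissimilarity computations
names(pca_tr)
dim(pca_tr$scores)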

## ----plotpcsvariance, fig.cap = "Individual contribution to the explained variance for each component (left) and cumulative variance explained by the principal components (right).",  fig.cap.style = "Image Caption", fig.align = "center", fig.width = 7, fig.height = 3, fig.retina = 0.85----
plot(pca_tr, col = "#D42B08CC")

## ---- results = 'hide'--------------------------------------------------------
# principal component (pc) analysis with the 
# NIPALS algorithm
pca_nipals_tr <- ortho_projection(Xr = training$spc_p,
                                  method = "pca.nipals")

pca_nipals_tr

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # Partial Least Squares decomposition using
#  # Total carbon as side information
#  # (this might take some seconds)
#  pls_tr <- ortho_projection(Xr = training$spc_p,
#                             Yr = training$Ciso,
#                             method = "pls")
#  pls_tr

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # This retains components that alone explain at least 5% of the original
#  # variation in training$spc_p
#  var_sel <-  list(method = "var", value = 0.05)
#  pca_tr_minvar5 <- ortho_projection(Xr = training$spc_p,
#                                     method = "pca",
#                                     pc_selection = var_sel)
#  
#  pca_tr_minvar5

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # This retains components that together explain at least 90% of the original
#  # variation in training$spc_p
#  cumvar_sel <-  list(method = "cumvar", value = 0.90)
#  
#  pca_tr_cumvar90 <- ortho_projection(Xr = training$spc_p,
#                                      method = "pca",
#                                      pc_selection = cumvar_sel)
#  
#  pca_tr_cumvar90

## ---- results = 'hide'--------------------------------------------------------
# This uses the optimized component selection method ("opc")
# to decide how many components to retain
optimal_sel <-  list(method = "opc", value = 40)
pca_tr_opc <- ortho_projection(Xr = training$spc_p,
                               Yr = training$Ciso,
                               method = "pca", 
                               pc_selection = optimal_sel)
pca_tr_opc

## ----pcrmsd, fig.cap = "Root mean squared difference between the samples and their corresponding nearest neighbors (for Total Carbon as side information) found by using dissimilarity matrices computed with different numbers of PCs.", fig.id = "plot_pcs_opc", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 5, fig.height = 4, fig.retina = 0.85----
plot(pca_tr_opc, col = "#FF1A00CC")

## ----rmsdscatter, fig.cap = paste("Comparison between each sample and its corresponding nearest neighbor (in terms of Total Carbon) when", pca_tr_opc$n_components, "components are used for dissimilarity matrix computations."), fig.id = "plot_pcs_opc2", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 4, fig.height = 4, fig.retina = 0.85----
# compute the dissimilarity matrix using all the retained scores
pc_diss <- f_diss(pca_tr_opc$scores, diss_method = "mahalanobis")
# get the nearest neighbor for each sample
nearest_n <- apply(pc_diss, MARGIN = 1, FUN = function(x) order(x)[2])
# compute the RMSD
rmsd <- sqrt(mean((training$Ciso - training$Ciso[nearest_n])^2, na.rm = TRUE))
rmsd
# the RMSD for all the components is already available in 
# ...$opc_evaluation
pca_tr_opc$opc_evaluation[pca_tr_opc$n_components, , drop = FALSE]
plot(training$Ciso[nearest_n], 
     training$Ciso, 
     xlab = "Ciso of the nearest neighbor, %", ylab = "Ciso, %",
     col = "#D19C17CC", pch = 16)
grid()

## ---- results = 'hide'--------------------------------------------------------
# This uses manual component selection 
manual_sel <-  list(method = "manual", value = 9)
# PC
pca_tr_manual <- ortho_projection(Xr = training$spc_p,
                                  method = "pca", 
                                  pc_selection = manual_sel)
pca_tr_manual

# PLS
pls_tr_manual <- ortho_projection(Xr = training$spc_p,
                                  Yr = training$Ciso,
                                  method = "pls", 
                                  pc_selection = manual_sel)
pls_tr_manual

## ---- results = 'hide'--------------------------------------------------------
optimal_sel <-  list(method = "opc", value = 40)
# PLS
pls_tr_opc <- ortho_projection(Xr = training$spc_p,
                               Yr = training$Ciso,
                               method = "pls", 
                               pc_selection = optimal_sel,
                               scale = TRUE)
# the pls projection matrix
pls_tr_opc$projection_mat

pls_projected <- predict(pls_tr_opc, newdata = testing$spc_p)

# PC
pca_tr_opc <- ortho_projection(Xr = training$spc_p,
                               Yr = training$Ciso,
                               method = "pca", 
                               pc_selection = optimal_sel,
                               scale = TRUE)
# the pca projection matrix
t(pca_tr_opc$X_loadings)

pca_projected <- predict(pca_tr_opc, newdata = testing$spc_p)
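
# illustrative check: the projected testing observations have one row per
# observation and one column per retained component
dim(pls_projected)
dim(pca_projected)
pca_tr_opc$n_components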

## ---- results = 'hide', eval = FALSE------------------------------------------
#  optimal_sel <-  list(method = "opc", value = 40)
#  pca_tr_ts <- ortho_projection(Xr = training$spc_p,
#                                Xu = testing$spc_p,
#                                Yr = training$Ciso,
#                                method = "pca",
#                                pc_selection = optimal_sel,
#                                scale = TRUE)
#  plot(pca_tr_ts)

## ---- results = 'hide', eval = FALSE------------------------------------------
#  optimal_sel <-  list(method = "opc", value = 40)
#  pls_tr_ts <- ortho_projection(Xr = training$spc_p,
#                                Xu = testing$spc_p,
#                                Yr = training$Ciso,
#                                method = "pls",
#                                pc_selection = optimal_sel,
#                                scale = TRUE)
#  
#  # the same PLS projection model can be obtained with:
#  pls_tr_ts2 <- ortho_projection(Xr = training$spc_p[!is.na(training$Ciso),],
#                                 Yr = training$Ciso[!is.na(training$Ciso)],
#                                 method = "pls",
#                                 pc_selection = optimal_sel,
#                                 scale = TRUE)
#  
#  identical(pls_tr_ts$projection_mat, pls_tr_ts2$projection_mat)

## ---- results = 'hide', eval = FALSE------------------------------------------
#  optimal_sel <-  list(method = "opc", value = 40)
#  pls_multi_yr <- ortho_projection(Xr = training$spc_p,
#                                   Xu = testing$spc_p,
#                                   Yr = training[, c("Ciso", "Nt", "CEC")],
#                                   method = "pls",
#                                   pc_selection = optimal_sel,
#                                   scale = TRUE)
#  plot(pls_multi_yr)

## ---- results = 'hide', eval = FALSE------------------------------------------
#  pls_multi_yr$opc_evaluation

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # for PC dissimilarity using the default settings
#  pcd <- dissimilarity(Xr = training$spc_p,
#                       diss_method = "pca")
#  dim(pcd$dissimilarity)
#  
#  # for PC dissimilarity using the optimized component selection method
#  pcd2 <- dissimilarity(Xr = training$spc_p,
#                        diss_method = "pca.nipals",
#                        Yr = training$Ciso,
#                        pc_selection = list("opc", 20),
#                        return_projection = TRUE)
#  dim(pcd2$dissimilarity)
#  pcd2$dissimilarity
#  pcd2$projection # the projection used to compute the dissimilarity matrix
#  
#  # for PLS dissimilarity
#  plsd <- dissimilarity(Xr = training$spc_p,
#                        diss_method = "pls",
#                        Yr = training$Ciso,
#                        pc_selection = list("opc", 20),
#                        return_projection = TRUE)
#  dim(plsd$dissimilarity)
#  plsd$dissimilarity
#  plsd$projection # the projection used to compute the dissimilarity matrix

## ---- results = 'hide', eval = TRUE-------------------------------------------
# For PC dissimilarity using the optimized component selection method
pcd_tr_ts <- dissimilarity(Xr = training$spc_p,
                           Xu = testing$spc_p,
                           diss_method = "pca.nipals",
                           Yr = training$Ciso,
                           pc_selection = list("opc", 20))
dim(pcd_tr_ts$dissimilarity)

# For PLS dissimilarity
plsd_tr_ts <- dissimilarity(Xr = training$spc_p,
                            Xu = testing$spc_p,
                            diss_method = "pls",
                            Yr = training$Ciso,
                            pc_selection = list("opc", 20))
dim(plsd_tr_ts$dissimilarity)
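
# usage sketch: since rows correspond to training and columns to testing,
# the closest training observation to the first testing observation can be
# retrieved directly from the dissimilarity matrix
which.min(pcd_tr_ts$dissimilarity[, 1])
min(pcd_tr_ts$dissimilarity[, 1])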

## ----localdiss, eval = TRUE---------------------------------------------------
# for localized PC dissimilarity using the optimized component selection method
# set the number of neighbors to retain
knn <- 200
local_pcd_tr_ts <- dissimilarity(Xr = training$spc_p,
                                 Xu = testing$spc_p,
                                 diss_method = "pca",
                                 Yr = training$Ciso,
                                 pc_selection = list("opc", 20),
                                 .local = TRUE, 
                                 pre_k = knn)
dim(local_pcd_tr_ts$dissimilarity)

# For PLS dissimilarity
local_plsd_tr_ts <- dissimilarity(Xr = training$spc_p,
                                  Xu = testing$spc_p,
                                  diss_method = "pls",
                                  Yr = training$Ciso,
                                  pc_selection = list("opc", 20),
                                  .local = TRUE, 
                                  pre_k = knn)
dim(local_plsd_tr_ts$dissimilarity)

# check the dissimilarity scores between the first two 
# observations in the testing dataset and the first 10 
# observations in the training dataset
local_plsd_tr_ts$dissimilarity[1:10, 1:2]


## ---- results = 'hide', eval = FALSE------------------------------------------
#  cd_tr <- dissimilarity(Xr = training$spc_p, diss_method = "cor")
#  dim(cd_tr$dissimilarity)
#  cd_tr$dissimilarity

## ---- results = 'hide', eval = TRUE-------------------------------------------
cd_tr_ts <- dissimilarity(Xr = training$spc_p,
                          Xu = testing$spc_p,
                          diss_method = "cor")
dim(cd_tr_ts$dissimilarity)
cd_tr_ts$dissimilarity

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # a moving window correlation dissimilarity between training and testing
#  # using a window size of 19 spectral data points (equivalent to 95 nm)
#  cd_mw <- dissimilarity(Xr = training$spc_p,
#                         Xu = testing$spc_p,
#                         diss_method = "cor",
#                         ws = 19)
#  cd_mw$dissimilarity

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # compute the dissimilarity between all the training observations
#  ed <- dissimilarity(Xr = training$spc_p, diss_method = "euclid")
#  ed$dissimilarity

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # compute the dissimilarity between all the training observations
#  pre_time_resemble <- proc.time()
#  ed_resemble <- dissimilarity(Xr = training$spc_p, diss_method = "euclid")
#  post_time_resemble <- proc.time()
#  post_time_resemble - pre_time_resemble
#  
#  pre_time_stats <- proc.time()
#  ed_stats <- dist(training$spc_p, method = "euclid")
#  post_time_stats <- proc.time()
#  post_time_stats - pre_time_stats
#  
#  # scale the results of dist() based on the number of input columns
#  ed_stats_tr <- sqrt((as.matrix(ed_stats)^2)/ncol(training$spc_p))
#  ed_stats_tr[1:2, 1:3]
#  
#  # compare resemble and R stats results of Euclidean distances
#  ed_resemble$dissimilarity[1:2, 1:3]

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # compute the dissimilarity between the training and testing observations
#  ed_tr_ts <- dissimilarity(Xr = training$spc_p,
#                            Xu = testing$spc_p,
#                            diss_method = "euclid")

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # compute the dissimilarity between the training and testing observations
#  cosine_tr_ts <- dissimilarity(Xr = training$spc_p,
#                                Xu = testing$spc_p,
#                                diss_method = "cosine")
#  dim(cosine_tr_ts$dissimilarity)
#  cosine_tr_ts$dissimilarity

## ---- results = 'hide', eval = FALSE------------------------------------------
#  sid_tr_ts <- dissimilarity(Xr = training$spc_p,
#                             Xu = testing$spc_p,
#                             diss_method = "sid")
#  dim(sid_tr_ts$dissimilarity)
#  sid_tr_ts$dissimilarity

## ---- results = 'hide', eval = TRUE-------------------------------------------
# PC dissimilarity with default settings (variance-based 
# selection of components)
pcad <- dissimilarity(training$spc_p, diss_method = "pca", scale = TRUE)

# PLS dissimilarity with default settings (variance-based 
# selection of components)
plsd <- dissimilarity(training$spc_p, diss_method = "pls", Yr = training$Ciso,
                      scale = TRUE)

# PC dissimilarity with optimal selection of components
opc_sel <- list("opc", 30)
o_pcad <- dissimilarity(training$spc_p,
                        diss_method = "pca",
                        Yr = training$Ciso,
                        pc_selection = opc_sel, 
                        scale = TRUE)

# PLS dissimilarity with optimal selection of components
o_plsd <- dissimilarity(training$spc_p,
                        diss_method = "pls",
                        Yr = training$Ciso,
                        pc_selection = opc_sel, 
                        scale = TRUE)

# Correlation dissimilarity 
cd <- dissimilarity(training$spc_p, diss_method = "cor", scale = TRUE)

# Moving window correlation dissimilarity 
mcd <- dissimilarity(training$spc_p, diss_method = "cor", ws = 51, scale = TRUE)

# Euclidean dissimilarity 
ed <- dissimilarity(training$spc_p, diss_method = "euclid", scale = TRUE)

# Cosine dissimilarity 
cosd <- dissimilarity(training$spc_p, diss_method = "cosine", scale = TRUE)

# Spectral information divergence/dissimilarity 
sinfd <- dissimilarity(training$spc_p, diss_method = "sid", scale = TRUE)

## ---- results = 'hide', eval = TRUE-------------------------------------------
Ciso <- as.matrix(training$Ciso)
ev <- NULL
ev[["pcad"]] <- sim_eval(pcad$dissimilarity, side_info = Ciso)
ev[["plsd"]] <- sim_eval(plsd$dissimilarity, side_info = Ciso)
ev[["o_pcad"]] <- sim_eval(o_pcad$dissimilarity, side_info = Ciso)
ev[["o_plsd"]] <- sim_eval(o_plsd$dissimilarity, side_info = Ciso)
ev[["cd"]] <- sim_eval(cd$dissimilarity, side_info = Ciso)
ev[["mcd"]] <- sim_eval(mcd$dissimilarity, side_info = Ciso)
ev[["ed"]] <- sim_eval(ed$dissimilarity, side_info = Ciso)
ev[["cosd"]] <- sim_eval(cosd$dissimilarity, side_info = Ciso)
ev[["sinfd"]] <- sim_eval(sinfd$dissimilarity, side_info = Ciso)

## ----fcaption, results = 'hide', eval = TRUE, echo = FALSE--------------------
fig_cap <- paste("Comparison between observations and their corresponding",
      "nearest neighbor (1-NN)","observation in terms of Total Carbon (Ciso). The 1-NNs",
      "are retrieved with the", 
      "following dissimilarity metrics:",
      "pcad: PC dissimilarity with default settings (variance-based of components);",
      "plsd: PLS dissimilarity with default settings (variance-based of components);",
      "o-pcad: PC dissimilarity with optimal selection of components;",  
      "o-plsd: PLS dissimilarity with optimal selection of components;",  
      "cd: Correlation dissimilarity;",  
      "mcd: Moving window correlation dissimilarity;",  
      "ed: Euclidean dissimilarity;",  
      "sinfd: Spectral information divergence.", collapse = "")

## ----results = 'hide', eval = TRUE, echo = FALSE------------------------------
comparisons <- lapply(names(ev), 
                      FUN = function(x, label) {
                        irmsd <- round(x[[label]]$eval[1], 2)
                        ir <- round(x[[label]]$eval[2], 2)
                        if (label %in% c("o_pcad", "o_plsd")) {
                          label <- paste0("**", label, "**")
                          irmsd <- paste0("**", irmsd, "**")
                          ir <- paste0("**", ir, "**")
                        }
                        
                        data.frame(Measure = label, 
                                   RMSD = irmsd, 
                                   r =  ir)
                      },
                      x = ev)
comparisons

## ----eval = FALSE, echo = TRUE------------------------------------------------
#  comparisons <- lapply(names(ev),
#                        FUN = function(x, label) {
#                          irmsd <- x[[label]]$eval[1]
#                          ir <- x[[label]]$eval[2]
#                          data.frame(Measure = label,
#                                     RMSD = irmsd,
#                                     r =  ir)
#                        },
#                        x = ev)
#  comparisons

## ----tcomparisons, eval = TRUE, echo = FALSE----------------------------------
knitr::kable(do.call("rbind", comparisons), 
             caption = paste("Root mean squared difference (RMSD)",
                             "and correlation coefficients",
                             "for between the observations and their", 
                             "corrresponding closest observations", 
                             "retrieved with the different dissimilarity 
                             methods."), 
             format = "simple", digits = 2, align = "l", padding = 2)

## ----eval = FALSE-------------------------------------------------------------
#  old_par <- par("mfrow")
#  par(mfrow = c(3, 3))
#  p <- sapply(names(ev),
#              FUN = function(x, label, labs = c("Ciso (1-NN), %", "Ciso, %")) {
#                xy <- x[[label]]$first_nn[,2:1]
#                plot(xy, xlab = labs[1], ylab = labs[2], col = "red")
#                title(label)
#                grid()
#                abline(0, 1)
#  
#              },
#              x = ev)
#  par(old_par)

## ----pcomparisons, fig.cap = paste(fig_cap), fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 8, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")

par(mfrow = c(3, 3), pch = 16, mar = c(4,4,4,4))
my_cols <- c("#750E3380", 
             "#C3BC6180", 
             "#FFE64F80", 
             "#EFBF4780", 
             "#E47C4E80", 
             "#F1A07380", 
             "#A1CFC480", 
             "#6B8EB580", 
             "#5177A180")
names(my_cols) <- sample(names(ev))
p <- sapply(names(ev), 
            FUN = function(x, 
                           label, 
                           labs = c("Ciso (1-NN), %", "Ciso, %"),
                           cols) {
              xy <- x[[label]]$first_nn[,2:1]
              plot(xy, xlab = labs[1], ylab = labs[2], col = cols[label])
              title(label)
              grid(col= "#80808080", lty = 1)
              abline(0, 1, col = "#FF1A0080")
            },
            x = ev,
            cols = my_cols)
par(old_par)

## ---- results = 'hide', eval = TRUE-------------------------------------------
knn_pc <- search_neighbors(Xr = training$spc_p, 
                           Xu = testing$spc_p, 
                           diss_method = "pca.nipals",
                           k = 50)

# matrix of neighbors
knn_pc$neighbors

# matrix of neighbor distances (dissimilarity scores)
knn_pc$neighbors_diss

# the index (in the training set) of the first two closest neighbors found in 
# training for the first observation in testing:
knn_pc$neighbors[1:2, 1, drop = FALSE]

# the distances of the two closest neighbors found in 
# training for the first observation in testing:
knn_pc$neighbors_diss[1:2, 1, drop = FALSE]

# the indices in training that fall in any of the 
# neighborhoods of testing
knn_pc$unique_neighbors
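
# usage sketch: retrieve the training spectra that form the neighborhood of
# the first observation in testing, and count how many unique training
# observations are used across all neighborhoods
first_neighborhood <- training$spc_p[knn_pc$neighbors[, 1], ]
dim(first_neighborhood)
length(knn_pc$unique_neighbors)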

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # using PC dissimilarity with optimal selection of components
#  knn_opc <- search_neighbors(Xr = training$spc_p,
#                              Xu = testing$spc_p,
#                              diss_method = "pca.nipals",
#                              Yr = training$Ciso,
#                              k = 50,
#                              pc_selection = list("opc", 20),
#                              scale = TRUE)
#  
#  # using PLS dissimilarity with optimal selection of components
#  knn_pls <- search_neighbors(Xr = training$spc_p,
#                              Xu = testing$spc_p,
#                              diss_method = "pls",
#                              Yr = training$Ciso,
#                              k = 50,
#                              pc_selection = list("opc", 20),
#                              scale = TRUE)
#  
#  # using correlation dissimilarity
#  knn_c <- search_neighbors(Xr = training$spc_p,
#                            Xu = testing$spc_p,
#                            diss_method = "cor",
#                            k = 50, scale = TRUE)
#  
#  # using moving window correlation dissimilarity
#  knn_mwc <- search_neighbors(Xr = training$spc_p,
#                              Xu = testing$spc_p,
#                              diss_method = "cor",
#                              k = 50,
#                              ws = 51, scale = TRUE)

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # using localized PC dissimilarity with optimal selection of components
#  knn_local_opc <- search_neighbors(Xr = training$spc_p,
#                                    Xu = testing$spc_p,
#                                    diss_method = "pca.nipals",
#                                    Yr = training$Ciso,
#                                    k = 50,
#                                    pc_selection = list("opc", 20),
#                                    scale = TRUE,
#                                    .local = TRUE,
#                                    pre_k = 250)
#  
#  # using localized PLS dissimilarity with optimal selection of components
#  knn_local_opc <- search_neighbors(Xr = training$spc_p,
#                                    Xu = testing$spc_p,
#                                    diss_method = "pls",
#                                    Yr = training$Ciso,
#                                    k = 50,
#                                    pc_selection = list("opc", 20),
#                                    scale = TRUE,
#                                    .local = TRUE,
#                                    pre_k = 250)

## ---- results = 'hide', eval = TRUE-------------------------------------------
# a dissimilarity threshold
d_th <- 1

# the minimum number of observations required in each neighborhood
k_min <- 20

# the maximum number of observations allowed in each neighborhood
k_max <- 300

dnn_pc <- search_neighbors(Xr = training$spc_p, 
                           Xu = testing$spc_p, 
                           diss_method = "pca.nipals",
                           k_diss = d_th,
                           k_range = c(k_min, k_max), 
                           scale = TRUE)

# matrix of neighbors. The minimum number of indices is 20 (given by k_min)
# and the maximum number of indices is 300 (given by k_max).
# NAs indicate "not a neighbor"
dnn_pc$neighbors

# this reports how many neighbors were found for each observation in 
# testing using the input distance threshold (column n_k) and how 
# many were finally selected (column final_n_k)
dnn_pc$k_diss_info

# matrix of neighbor distances
dnn_pc$neighbors_diss

# the indices in training that fall in any of the 
# neighborhoods of testing
dnn_pc$unique_neighbors
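
# a quick summary of the final neighborhood sizes (illustration); all values
# lie between k_min and k_max defined above
summary(dnn_pc$k_diss_info$final_n_k)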

## ---- knnhist, fig.cap = "Histogram of the final neighborhood sizes", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 5, fig.height = 4, fig.retina = 0.80----
hist(dnn_pc$k_diss_info$final_n_k, 
     breaks = k_min,
     xlab = "Final neighborhood size", 
     main = "", col = "#EFBF47CC")

## ---- results = 'hide', eval = FALSE------------------------------------------
#  # the indices of the observations that we want to "invite" to every neighborhood
#  forced_guests <- c(1, 5, 8, 9)
#  
#  # using PC dissimilarity with optimal selection of components
#  knn_spiked <- search_neighbors(Xr = training$spc_p,
#                                 Xu = testing$spc_p,
#                                 diss_method = "pca.nipals",
#                                 Yr = training$Ciso,
#                                 k = 50,
#                                 spike = forced_guests,
#                                 pc_selection = list("opc", 20))
#  
#  # check the first 8 neighbors found in training for the
#  # first 2 observations in testing
#  knn_spiked$neighbors[1:8, 1:2]
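#  
#  # usage sketch: check whether the forced ("spiked") indices appear in the
#  # neighborhood of the first observation in testing
#  forced_guests %in% knn_spiked$neighbors[, 1]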

## ----mblgif, fig.cap =  "Example of the main steps in memory-based learning for predicting a response variable for five different observations based on a set of reference observations in a p-dimensional predictor space.", echo = FALSE, out.width = '65%', fig.align = 'center', fig.retina = 0.85----
knitr::include_graphics("MBL.gif")

## ---- eval = TRUE-------------------------------------------------------------
# creates an object with instructions to build PLS models
my_plsr <- local_fit_pls(pls_c = 15)
my_plsr

# creates an object with instructions to build WAPLS models
my_waplsr <- local_fit_wapls(min_pls_c = 3, max_pls_c = 20)
my_waplsr
  
# creates an object with instructions to build GPR models
my_gpr <- local_fit_gpr()
my_gpr

## ---- eval = TRUE-------------------------------------------------------------
# create an object with instructions to conduct both validation types: 
# "NNv" and "local_cv"
two_val_control <- mbl_control(validation_type = c("NNv", "local_cv"),
                               number = 10,
                               p = 0.75)

## ---- results = 'hide', eval = TRUE-------------------------------------------
# define the dissimilarity method 
my_diss <- "cor"

# define the neighborhood sizes to test
my_ks <- seq(80, 200, by = 40)
  
# define how to use the dissimilarity information (ignore it)
ignore_diss <- "none"

# define the regression method to be used at each neighborhood 
my_waplsr <- local_fit_wapls(min_pls_c = 3, max_pls_c = 20)

# for the moment use only "NNv" validation (it will be faster)
nnv_val_control <- mbl_control(validation_type = "NNv")
  
# predict Total Carbon
# (remove missing values)
local_ciso <- mbl(
  Xr = training$spc_p[!is.na(training$Ciso),],
  Yr = training$Ciso[!is.na(training$Ciso)],
  Xu = testing$spc_p,
  k = my_ks,
  method = my_waplsr,
  diss_method = my_diss,
  diss_usage = ignore_diss,
  control = nnv_val_control,
  scale = TRUE
)

## ----localresultsciso, fig.cap = "MBL results for Total Carbon predictions using the LOCAL algorithm. NNv: nearest neighbor validation.", fig.align = "center", fig.width = 7.5, fig.height = 4, fig.retina = 0.85----
plot(local_ciso, main = "")
local_ciso 

## ---- results = 'hide', eval = TRUE, echo = FALSE-----------------------------
bestk <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bestk <- local_ciso$validation_results$nearest_neighbor_validation$k[bestk]

## ---- results = 'hide', eval = TRUE,  fig.show = 'hide'-----------------------
bki <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bk <- local_ciso$validation_results$nearest_neighbor_validation$k[bki]

# all the prediction results are stored in:
local_ciso$results

# the get_predictions function makes it easier to retrieve the
# predictions from the previous object
ciso_hat <- as.matrix(get_predictions(local_ciso))[, bki]
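
# illustrative check: get_predictions returns one column of predictions per
# evaluated value of k (rows correspond to the testing observations)
dim(as.matrix(get_predictions(local_ciso)))
length(ciso_hat)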

## ---- eval = F----------------------------------------------------------------
#  # Plot predicted vs reference
#  plot(ciso_hat, testing$Ciso,
#       xlim = c(0, 14),
#       ylim = c(0, 14),
#       xlab = "Predicted Total Carbon, %",
#       ylab = "Total Carbon, %",
#       main = "LOCAL using argument k")
#  grid()
#  abline(0, 1, col = "red")

## ---- eval = TRUE-------------------------------------------------------------
# prediction RMSE:
sqrt(mean((ciso_hat - testing$Ciso)^2, na.rm = TRUE))

# squared correlation coefficient (R2)
cor(ciso_hat, testing$Ciso, use = "complete.obs")^2

## ---- results = 'hide', eval = TRUE,  fig.show = 'hide'-----------------------
# create a vector of dissimilarity thresholds to evaluate
# since the correlation dissimilarity will be used
# these thresholds need to be > 0 and <= 1 
dths <- seq(0.025, 0.3, by = 0.025)

# indicate the minimum and maximum sizes allowed for the neighborhood
k_min <- 30 
k_max <- 200 

local_ciso_diss <- mbl(
  Xr = training$spc_p[!is.na(training$Ciso),],
  Yr = training$Ciso[!is.na(training$Ciso)],
  Xu = testing$spc_p,
  k_diss = dths,
  k_range = c(k_min, k_max),
  method = my_waplsr,
  diss_method = my_diss,
  diss_usage = ignore_diss,
  control = nnv_val_control,
  scale = TRUE
)

## ---- eval = FALSE------------------------------------------------------------
#  plot(local_ciso_diss)

## ---- results = 'hide', eval = TRUE, echo = FALSE-----------------------------
bestd <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bestd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k[bestd]

## ---- eval = TRUE-------------------------------------------------------------
local_ciso_diss

## ---- results = 'hide', eval = TRUE,  fig.show = 'hide'-----------------------
# best distance threshold
bdi <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k[bdi]

# predictions for the best distance
ciso_diss_hat <- as.matrix(get_predictions(local_ciso_diss))[, bdi]

## ---- eval = FALSE------------------------------------------------------------
#  # Plot predicted vs reference
#  plot(ciso_diss_hat, testing$Ciso,
#       xlim = c(0, 14),
#       ylim = c(0, 14),
#       xlab = "Predicted Total Carbon, %",
#       ylab = "Total Carbon, %",
#       main = "LOCAL using argument k_diss")
#  grid()
#  abline(0, 1, col = "red")

## ----addexamples, eval = TRUE, echo = FALSE-----------------------------------
abbr <- c("`local_cec`",
          "`pc_pred_cec`",
          "`pls_pred_cec`",
          "`local_gpr_cec`")

diss <- c("Correlation",
          "optimized PC",
          "optimized PLS",
          "optimized PC")

d_usage <- c("None",
             "Source of predictors",
             "None",
             "Source of predictors")

reg_m <- c("Weighted average PLS",
           "Weighted average PLS",
           "Weighted average PLS",
           "Gaussian process")

my_tab <-  as.data.frame(cbind(abbr, diss, d_usage, reg_m))
colnames(my_tab) <- c("Abbreviation",
                      "Dissimilarity method",
                      "Dissimilarity usage",
                      "Local regression")

knitr::kable(my_tab, 
             caption = paste("Basic description of the different MBL", 
                             "configurations in the examples to predict",
                             "Cation Exhange Capacity (CEC)."), 
             format = "simple", align = "l", padding = 2)

## ---- results = 'hide', eval = TRUE,  fig.show = 'hide'-----------------------
# Let's define some methods:
my_wapls <- local_fit_wapls(2, 25)
k_min_max <- c(80, 200)

# use the LOCAL algorithm with 
# specific thresholds for the correlation ("cor") dissimilarity
dth_cor <- seq(0.01, 0.3, by = 0.03)
local_cec <- mbl(
  Xr = training$spc_p[!is.na(training$CEC),],
  Yr = training$CEC[!is.na(training$CEC)],
  Xu = testing$spc_p,
  k_diss = dth_cor, 
  k_range = k_min_max,
  method = my_wapls,
  diss_method = "cor",
  diss_usage = "none",
  control = nnv_val_control,
  scale = TRUE
)

# use a configuration where PCA dissimilarity is used and the dissimilarity 
# matrix is used as a source of additional predictors.
# Let's first define an appropriate vector of dissimilarity thresholds 
# for the PC dissimilarity method
dth_pc <- seq(0.05, 1, by = 0.1)
pc_pred_cec <- mbl(
  Xr = training$spc_p[!is.na(training$CEC),],
  Yr = training$CEC[!is.na(training$CEC)],
  Xu = testing$spc_p,
  k_diss = dth_pc,
  k_range = k_min_max,
  method = my_wapls,
  diss_method = "pca",
  diss_usage = "predictors",
  control = nnv_val_control,
  scale = TRUE
)

# use a configuration where PLS dissimilarity is used and the dissimilarity 
# information is not used as a source of additional predictors (diss_usage = "none")
pls_pred_cec <- mbl(
  Xr = training$spc_p[!is.na(training$CEC),],
  Yr = training$CEC[!is.na(training$CEC)],
  Xu = testing$spc_p,
  Yu = testing$CEC,
  k_diss = dth_pc,
  k_range = k_min_max,
  method = my_wapls,
  diss_method = "pls",
  diss_usage = "none",
  control = nnv_val_control,
  scale = TRUE
)

# use a configuration where Gaussian process regression and PCA dissimilarity 
# are used, and the dissimilarity matrix is used as a source of additional predictors
local_gpr_cec <- mbl(
  Xr = training$spc_p[!is.na(training$CEC),],
  Yr = training$CEC[!is.na(training$CEC)],
  Xu = testing$spc_p,
  k_diss = dth_pc,
  k_range = k_min_max,
  method = local_fit_gpr(),
  diss_method = "pca",
  diss_usage = "predictors",
  control = nnv_val_control,
  scale = TRUE
)

## ---- eval = TRUE,  fig.show = 'hide'-----------------------------------------
# get the indices of the best results according to 
# nearest neighbor validation statistics
c_val_name <- "validation_results"
c_nn_val_name <- "nearest_neighbor_validation"

bi_local <- which.min(local_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pc_pred <- which.min(pc_pred_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pls_pred <- which.min(pls_pred_cec[[c_val_name]][[c_nn_val_name]]$rmse)
bi_local_gpr  <- which.min(local_gpr_cec[[c_val_name]][[c_nn_val_name]]$rmse)

preds <- cbind(get_predictions(local_cec)[, ..bi_local],
               get_predictions(pc_pred_cec)[, ..bi_pc_pred],
               get_predictions(pls_pred_cec)[, ..bi_pls_pred],
               get_predictions(local_gpr_cec)[, ..bi_local_gpr])

colnames(preds) <- c("local_cec", 
                     "pc_pred_cec", 
                     "pls_pred_cec", 
                     "local_gpr_cec")
preds <- as.matrix(preds)

# R2s
cor(testing$CEC, preds, use = "complete.obs")^2

# RMSEs
colMeans((preds - testing$CEC)^2, na.rm = TRUE)^0.5

## ---- eval = FALSE,  fig.show = 'hide'----------------------------------------
#  old_par <- par("mfrow", "mar")
#  
#  par(mfrow = c(2, 2))
#  plot(preds[, 2], testing$CEC,
#       xlab = "Predicted CEC, meq/100g",
#       ylab = "CEC, meq/100g", main = colnames(preds)[2])
#  abline(0, 1, col = "red")
#  
#  plot(preds[, 3], testing$CEC,
#       xlab = "Predicted CEC, meq/100g",
#       ylab = "CEC, meq/100g", main = colnames(preds)[3])
#  abline(0, 1, col = "red")
#  
#  plot(preds[, 4], testing$CEC,
#       xlab = "Predicted CEC, meq/100g",
#       ylab = "CEC, meq/100g", main = colnames(preds)[4])
#  abline(0, 1, col = "red")
#  par(old_par)

## ----mblcomparisons, fig.cap = "CEC prediction results for the different MBL configurations tested" , fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 8, echo = FALSE, fig.retina = 0.85----
old_par <- par("mfrow", "mar")

par(mfrow = c(2, 2), pch = 16, mar = c(4,4,4,4))
my_cols <- c("#D42B08CC",
             "#750E3380",
             "#EFBF4780", 
             "#5177A180")
names(my_cols) <- colnames(preds)

# R2s
r2s <- round(cor(testing$CEC, preds, use = "complete.obs")^2, 2)

# RMSEs
rmses <- round(colMeans((preds - testing$CEC)^2, na.rm = TRUE)^0.5, 2)

names(r2s) <- colnames(preds)
names(rmses) <- colnames(preds)

p <- sapply(colnames(preds), 
            FUN = function(y,
                           yhats,
                           label, 
                           labs = c("Predicted CEC, meq/100g", "CEC, meq/100g"),
                           rsq,
                           rmse,
                           cols) {
              plot(x = yhats[,label], 
                   y = y, 
                   ylim = range(y, na.rm = T), xlim = range(y, na.rm = T),
                   xlab = labs[1], ylab = labs[2], 
                   col = cols[label])
              title(label)
              title(paste0("\n\n\n RMSE: ", rmse[label], "; Rsq: ", rsq[label]), cex.main=1)
              grid(col= "#80808080", lty = 1)
              abline(0, 1, col = "#FF1A0080")
            },
            yhats = preds,
            y = testing$CEC,
            cols = my_cols,
            rsq = r2s,
            rmse = rmses)
par(old_par)

## ---- eval = FALSE------------------------------------------------------------
#  # use Yu argument to validate the predictions
#  pc_pred_nt_yu <- mbl(
#    Xr = training$spc_p[!is.na(training$Nt),],
#    Yr = training$Nt[!is.na(training$Nt)],
#    Xu = testing$spc_p,
#    Yu = testing$Nt,
#    k = seq(40, 100, by = 10),
#    diss_usage = "none",
#    control = mbl_control(validation_type = "NNv"),
#    scale = TRUE
#  )
#  
#  pc_pred_nt_yu

## ---- eval = FALSE------------------------------------------------------------
#  # Running the mbl function using multiple cores
#  
#  # Execute with two cores, if available, ...
#  n_cores <- 2
#  
#  # ... if not then go with 1 core
#  if (parallel::detectCores() < 2) {
#    n_cores <- 1
#  }
#  
#  # Set the number of cores
#  library(doParallel)
#  clust <- makeCluster(n_cores)
#  registerDoParallel(clust)
#  
#  # Alternatively:
#  # library(doSNOW)
#  # clust <- makeCluster(n_cores, type = "SOCK")
#  # registerDoSNOW(clust)
#  # getDoParWorkers()
#  
#  pc_pred_nt <- mbl(
#    Xr = training$spc_p[!is.na(training$Nt),],
#    Yr = training$Nt[!is.na(training$Nt)],
#    Xu = testing$spc_p,
#    k = seq(40, 100, by = 10),
#    diss_usage = "none",
#    control = mbl_control(validation_type = "NNv"),
#    scale = TRUE
#  )
#  
#  # go back to sequential processing
#  registerDoSEQ()
#  try(stopCluster(clust))
#  
#  pc_pred_nt
