Nothing
## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)
Sys.setenv(OMP_NUM_THREADS = 2)
## -----------------------------------------------------------------------------
#| message: false
library(prospectr)
data(NIRsoil)
# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
detrend(NIRsoil$spc, wav = as.numeric(colnames(NIRsoil$spc))),
m = 1, p = 1, w = 7
)
# Missing values in the response are allowed
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]
test_x <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y <- NIRsoil$Ciso[NIRsoil$train == 0]
md <- dissimilarity(
test_x,
apply(test_x, 2, median),
diss_method = diss_correlation(ws = 101, center = T, scale = T)
)
threshold <- 0.4
## -----------------------------------------------------------------------------
#| label: fig-diss-threshold
#| fig-cap: "Left: Dissimilarity of the test spectra to the median test spectrum. Samples above the threshold (dashed line and red) were excluded from the target set as a simple attempt to reduce spectral heterogeneity and potential non-linearities in the spectra-response relationship. Right: Spectra of the samples above (red, to be excluded) and below (blue, to be kept) the dissimilarity threshold."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 2, 1))
plot(
seq_along(md$dissimilarity),
md$dissimilarity,
ylim = c(0, 1),
col = "steelblue",
pch = 16,
cex = 1.5,
xlab = "Sample index",
ylab = "Dissimilarity to median spectrum"
)
points(
which(md$dissimilarity >= threshold),
md$dissimilarity[md$dissimilarity >= threshold],
col = "tomato",
pch = 16,
cex = 1.5
)
abline(h = threshold, col = "tomato", lty = 2, lwd = 2)
grid(lty = 1)
matplot(
as.numeric(colnames(train_x)),
t(test_x[md$dissimilarity >= threshold, ]),
col = "tomato",
ylab = "Preprocessed spectra",
xlab = "Wavelengths, nm",
lty = 1,
type = "l"
)
grid(lty = 1)
matlines(
as.numeric(colnames(train_x)),
t(test_x[md$dissimilarity < threshold, ]),
col = "steelblue",
lty = 1,
type = "l"
)
par(op)
## -----------------------------------------------------------------------------
keep <- md$dissimilarity < threshold
cat("Number of samples retained:", sum(keep))
test_x_test <- test_x[keep, ]
test_y_test <- test_y[keep]
# sample a very small subset to guide the search,
# and use the rest for testing the final model
set.seed(1124)
ref_ind <- sample(sum(keep), 8)
test_x_ref <- test_x_test[ref_ind, ]
test_y_ref <- test_y_test[ref_ind]
test_x_test <- test_x_test[-ref_ind, ]
test_y_test <- test_y_test[-ref_ind]
## -----------------------------------------------------------------------------
set.seed(1124)
pls_model <- model(
Xr = rbind(
train_x[!is.na(train_y), ],
test_x_ref[!is.na(test_y_ref), ]
),
Yr = c(
train_y[!is.na(train_y)],
test_y_ref[!is.na(test_y_ref)]
),
fit_method = fit_pls(ncomp = 15, method = "simpls", scale = TRUE),
control = model_control(validation_type = "lgo", number = 10)
)
best_ncomp <- which(pls_model$cv_results$optimal)
pred <- predict(pls_model, test_x_test, ncomp = best_ncomp)
## -----------------------------------------------------------------------------
reg_metrics <- function(obs, pred, na.rm = TRUE) {
if (na.rm) {
ok <- complete.cases(obs, pred)
obs <- obs[ok]
pred <- pred[ok]
}
c(RMSE = sqrt(mean((obs - pred)^2)), R2 = cor(obs, pred)^2)
}
## -----------------------------------------------------------------------------
reg_metrics(test_y_test, pred)
## -----------------------------------------------------------------------------
my_control <- gesearch_control(
retain_by = "probability",
tune = FALSE,
number = 10L,
p = 0.75
)
## -----------------------------------------------------------------------------
#| eval: true
#| echo: false
gs <- gesearch(
Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
Xu = test_x_ref, # the available target domain spectra
Yu = test_y_ref, # the available target domain response (accepts NAs)
k = 50, b = 100, retain = 0.95,
target_size = 120,
fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
optimization = c("reconstruction", "similarity", "response"),
control = my_control,
seed = 1124,
verbose = FALSE
)
## -----------------------------------------------------------------------------
#| eval: false
# gs <- gesearch(
# Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
# Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
# Xu = test_x_ref, # the available target domain spectra
# Yu = test_y_ref, # the available target domain response (accepts NAs)
# k = 50, b = 100, retain = 0.95,
# target_size = 120,
# fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
# optimization = c("reconstruction", "similarity", "response"),
# control = my_control,
# seed = 1124
# )
## -----------------------------------------------------------------------------
#| eval: false
# plot(gs)
## -----------------------------------------------------------------------------
#| label: fig-evolution-gs
#| echo: false
#| fig-cap: "Evolution of the three weakness metrics (response, reconstruction and dissimilarity) used in the search."
#| fig-align: "center"
#| fig-width: 8.5
#| fig-height: 5.5
plot(gs, main = NA)
## -----------------------------------------------------------------------------
# the internal leave-group out CV results for the data found
gs$validation_results[[1]]$results$train
## -----------------------------------------------------------------------------
# the CV on the target samples that drove the search (note that this is not a proper test set, as these samples were used to guide the search, but it can provide some insight into the performance of the final model on the target domain)
gs$validation_results[[1]]$results$test
## -----------------------------------------------------------------------------
best_ncomp <- which.min(gs$validation_results[[1]]$results$train$rmse_cv)
## -----------------------------------------------------------------------------
pred_gs <- predict(gs, test_x_test)[[1]]
reg_metrics(test_y_test, pred_gs[, best_ncomp])
## -----------------------------------------------------------------------------
#| label: fig-pred-comparison
#| fig-cap: "Comparison of observed versus predicted total carbon values. Left: model fitted using all available training samples. Right: model fitted using the subset of samples selected by gesearch()."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1))
rng <- range(pred, pred_gs, test_y_test, na.rm = TRUE)
plot(
pred, test_y_test,
xlim = rng,
ylim = rng,
xlab = "Predicted Total Carbon, %",
ylab = "Observed Total Carbon, %",
main = "Model with all training \nsamples",
cex = 1.5,
pch = 16,
col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")
plot(
pred_gs[, best_ncomp], test_y_test,
xlim = rng,
ylim = rng,
xlab = "Predicted Total Carbon, %",
ylab = "Observed Total Carbon, %",
main = "Model with gesearch-selected \nsamples",
cex = 1.5,
pch = 16,
col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")
par(op)
## -----------------------------------------------------------------------------
#| eval: true
#| echo: false
gs_nr <- gesearch(
Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
Xu = test_x_ref, # the available target domain spectra
Yu = NULL, # NO available target domain response (accepts NAs)
k = 50, b = 80, retain = 0.98,
target_size = 160,
fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
optimization = c("reconstruction", "similarity"),
control = my_control,
seed = 1124,
verbose = FALSE
)
## -----------------------------------------------------------------------------
#| eval: false
# gs_nr <- gesearch(
# Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
# Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
# Xu = test_x_ref, # the available target domain spectra
# Yu = NULL, # NO available target domain response (accepts NAs)
# k = 50, b = 80, retain = 0.98,
# target_size = 160,
# fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
# optimization = c("reconstruction", "similarity"),
# control = my_control,
# seed = 1124
# )
## -----------------------------------------------------------------------------
best_ncomp_nr <- which.min(gs_nr$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr
pred_gs_nr <- predict(gs_nr, test_x_test)[[1]]
reg_metrics(test_y_test, pred_gs_nr[, best_ncomp_nr])
## -----------------------------------------------------------------------------
#| eval: true
#| echo: false
gs_nr2 <- gesearch(
Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
Xu = test_x_ref, # the available target domain spectra
Yu = NULL, # NO available target domain response (accepts NAs)
Yu_lims = c(0, 5), # the response range in the target domain
k = 50, b = 80, retain = 0.98,
target_size = 160,
fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
optimization = c("reconstruction", "similarity", "range"),
control = my_control,
seed = 1124,
verbose = FALSE
)
## -----------------------------------------------------------------------------
#| eval: false
# gs_nr2 <- gesearch(
# Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
# Yr = train_y[!is.na(train_y)], # the response of the reference "set/library"
# Xu = test_x_ref, # the available target domain spectra
# Yu = NULL, # NO available target domain response (accepts NAs)
# Yu_lims = c(0, 5), # the response range in the target domain
# k = 50, b = 80, retain = 0.98,
# target_size = 160,
# fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
# optimization = c("reconstruction", "similarity", "range"),
# control = my_control,
# seed = 1124
# )
## -----------------------------------------------------------------------------
best_ncomp_nr2 <- which.min(gs_nr2$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr2
pred_gs_nr2 <- predict(gs_nr2, test_x_test)[[1]]
reg_metrics(test_y_test, pred_gs_nr2[, best_ncomp_nr2])
## -----------------------------------------------------------------------------
#| label: parallel-gesearch
#| eval: false
# # Running gesearch() using multiple cores
#
# # Execute with two cores, if available, ...
# n_cores <- 2
#
# # ... if not then go with 1 core
# if (parallel::detectCores() < 2) {
# n_cores <- 1
# }
#
# # Set the number of cores
# library(doParallel)
# clust <- makeCluster(n_cores)
# registerDoParallel(clust)
#
# # Alternatively:
# # library(doSNOW)
# # clust <- makeCluster(n_cores, type = "SOCK")
# # registerDoSNOW(clust)
# # getDoParWorkers()
#
# my_control <- gesearch_control(
# retain_by = "probability",
# tune = FALSE,
# number = 10L,
# p = 0.75
# )
#
# gs_p <- gesearch(
# Xr = train_x[!is.na(train_y), ],
# Yr = train_y[!is.na(train_y)],
# Xu = test_x_ref,
# Yu = test_y_ref,
# k = 50,
# b = 100,
# retain = 0.98,
# target_size = 80,
# fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
# optimization = c("reconstruction", "similarity", "response"),
# control = my_control,
# seed = 1124,
# pchunks = 2L
# )
#
# # Go back to sequential processing
# registerDoSEQ()
# try(stopCluster(clust))
#
# gs_p
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.