evolutionary-subset-search.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| message: false
library(prospectr)
data(NIRsoil)

# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = as.numeric(colnames(NIRsoil$spc))),
  m = 1, p = 1, w = 7
)

# Missing values in the response are allowed
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]
test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]

md <- dissimilarity(
  test_x,
  apply(test_x, 2, median),
  diss_method = diss_correlation(ws = 101, center = T, scale = T)
)

threshold <- 0.4


## -----------------------------------------------------------------------------
#| label: fig-diss-threshold
#| fig-cap: "Left: Dissimilarity of the test spectra to the median test spectrum. Samples above the threshold (dashed line and red) were excluded from the target set as a simple attempt to reduce spectral heterogeneity and potential non-linearities in the spectra-response relationship. Right: Spectra of the samples above (red, to be excluded) and below (blue, to be kept) the dissimilarity threshold."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 2, 1))

plot(
  seq_along(md$dissimilarity),
  md$dissimilarity, 
  ylim = c(0, 1), 
  col = "steelblue",
  pch = 16,
  cex = 1.5,
  xlab = "Sample index", 
  ylab = "Dissimilarity to median spectrum"
)
points(
  which(md$dissimilarity >= threshold),
  md$dissimilarity[md$dissimilarity >= threshold],
  col = "tomato",
  pch = 16,
  cex = 1.5
)
abline(h = threshold, col = "tomato", lty = 2, lwd = 2)
grid(lty = 1)

matplot(
  as.numeric(colnames(train_x)),
  t(test_x[md$dissimilarity >= threshold, ]),
  col = "tomato",
  ylab = "Preprocessed spectra",
  xlab = "Wavelengths, nm",
  lty = 1,
  type = "l"
)
grid(lty = 1)
matlines(
  as.numeric(colnames(train_x)),
  t(test_x[md$dissimilarity < threshold, ]),
  col = "steelblue",
  lty = 1,
  type = "l"
)

par(op)


## -----------------------------------------------------------------------------
keep <- md$dissimilarity < threshold

cat("Number of samples retained:", sum(keep))

test_x_test <- test_x[keep, ]
test_y_test <- test_y[keep]

# sample a very small subset to guide the search, 
# and use the rest for testing the final model
set.seed(1124)
ref_ind <- sample(sum(keep), 8)
test_x_ref <- test_x_test[ref_ind, ]
test_y_ref <- test_y_test[ref_ind]

test_x_test <- test_x_test[-ref_ind, ]
test_y_test <- test_y_test[-ref_ind]


## -----------------------------------------------------------------------------
set.seed(1124)

pls_model <- model(
  Xr = rbind(
    train_x[!is.na(train_y), ],
    test_x_ref[!is.na(test_y_ref), ]
  ), 
  Yr = c(
    train_y[!is.na(train_y)],
    test_y_ref[!is.na(test_y_ref)]
  ),
  fit_method = fit_pls(ncomp = 15, method = "simpls", scale = TRUE),
  control = model_control(validation_type = "lgo", number = 10)
)

best_ncomp <- which(pls_model$cv_results$optimal)

pred <- predict(pls_model, test_x_test, ncomp = best_ncomp)


## -----------------------------------------------------------------------------
reg_metrics <- function(obs, pred, na.rm = TRUE) {
  if (na.rm) {
    ok <- complete.cases(obs, pred)
    obs <- obs[ok]
    pred <- pred[ok]
  }
  c(RMSE = sqrt(mean((obs - pred)^2)), R2 = cor(obs, pred)^2)
}


## -----------------------------------------------------------------------------
reg_metrics(test_y_test, pred)


## -----------------------------------------------------------------------------
my_control <- gesearch_control(
  retain_by = "probability",
  tune = FALSE,
  number = 10L,
  p = 0.75
)


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = test_y_ref, # the available target domain response (accepts NAs)
  k = 50, b = 100, retain = 0.95,
  target_size = 120,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity", "response"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = test_y_ref, # the available target domain response (accepts NAs)
#   k = 50, b = 100, retain = 0.95,
#   target_size = 120,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "response"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
#| eval: false
# plot(gs)


## -----------------------------------------------------------------------------
#| label: fig-evolution-gs
#| echo: false 
#| fig-cap: "Evolution of the three weakness metrics (response, reconstruction and dissimilarity) used in the search."
#| fig-align: "center"
#| fig-width: 8.5
#| fig-height: 5.5
plot(gs, main = NA)


## -----------------------------------------------------------------------------
# the internal leave-group out CV results for the data found
gs$validation_results[[1]]$results$train


## -----------------------------------------------------------------------------
# the CV on the target samples that drove the search (note that this is not a proper test set, as these samples were used to guide the search, but it can provide some insight into the performance of the final model on the target domain)
gs$validation_results[[1]]$results$test


## -----------------------------------------------------------------------------
best_ncomp <- which.min(gs$validation_results[[1]]$results$train$rmse_cv)


## -----------------------------------------------------------------------------
pred_gs <- predict(gs, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs[, best_ncomp])


## -----------------------------------------------------------------------------
#| label: fig-pred-comparison
#| fig-cap: "Comparison of observed versus predicted total carbon values. Left: model fitted using all available training samples. Right: model fitted using the subset of samples selected by gesearch()."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1))

rng <- range(pred, pred_gs, test_y_test, na.rm = TRUE)

plot(
  pred, test_y_test,
  xlim = rng,
  ylim = rng,
  xlab = "Predicted Total Carbon, %",
  ylab = "Observed Total Carbon, %",
  main = "Model with all training \nsamples",
  cex = 1.5,
  pch = 16,
  col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")

plot(
  pred_gs[, best_ncomp], test_y_test,
  xlim = rng,
  ylim = rng,
  xlab = "Predicted Total Carbon, %",
  ylab = "Observed Total Carbon, %",
  main = "Model with gesearch-selected \nsamples",
  cex = 1.5,
  pch = 16,
  col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")

par(op)


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs_nr <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = NULL, # NO available target domain response (accepts NAs)
  k = 50, b = 80, retain = 0.98,
  target_size = 160,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs_nr <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = NULL, # NO available target domain response (accepts NAs)
#   k = 50, b = 80, retain = 0.98,
#   target_size = 160,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
best_ncomp_nr <- which.min(gs_nr$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr
pred_gs_nr <- predict(gs_nr, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs_nr[, best_ncomp_nr])


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs_nr2 <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = NULL, # NO available target domain response (accepts NAs)
  Yu_lims = c(0, 5), # the response range in the target domain
  k = 50, b = 80, retain = 0.98,
  target_size = 160,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity", "range"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs_nr2 <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = NULL, # NO available target domain response (accepts NAs)
#   Yu_lims = c(0, 5), # the response range in the target domain
#   k = 50, b = 80, retain = 0.98,
#   target_size = 160,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "range"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
best_ncomp_nr2 <- which.min(gs_nr2$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr2
pred_gs_nr2 <- predict(gs_nr2, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs_nr2[, best_ncomp_nr2])


## -----------------------------------------------------------------------------
#| label: parallel-gesearch
#| eval: false
# # Running gesearch() using multiple cores
# 
# # Execute with two cores, if available, ...
# n_cores <- 2
# 
# # ... if not then go with 1 core
# if (parallel::detectCores() < 2) {
#   n_cores <- 1
# }
# 
# # Set the number of cores
# library(doParallel)
# clust <- makeCluster(n_cores)
# registerDoParallel(clust)
# 
# # Alternatively:
# # library(doSNOW)
# # clust <- makeCluster(n_cores, type = "SOCK")
# # registerDoSNOW(clust)
# # getDoParWorkers()
# 
# my_control <- gesearch_control(
#   retain_by = "probability",
#   tune = FALSE,
#   number = 10L,
#   p = 0.75
# )
# 
# gs_p <- gesearch(
#   Xr = train_x[!is.na(train_y), ],
#   Yr = train_y[!is.na(train_y)],
#   Xu = test_x_ref,
#   Yu = test_y_ref,
#   k = 50,
#   b = 100,
#   retain = 0.98,
#   target_size = 80,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "response"),
#   control = my_control,
#   seed = 1124,
#   pchunks = 2L
# )
# 
# # Go back to sequential processing
# registerDoSEQ()
# try(stopCluster(clust))
# 
# gs_p
Any scripts or data that you put into this service are public.
resemble documentation built on April 21, 2026, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/evolutionary-subset-search.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/evolutionary-subset-search.R In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/evolutionary-subset-search.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics