building-a-library-of-models-with-liblex.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)
library(prospectr)
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| label: data-prep
#| message: false
library(resemble)
library(prospectr)

data(NIRsoil)

# Wavelengths
wavs <- as.numeric(colnames(NIRsoil$spc))

# Preprocess: detrend + first derivative
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = wavs),
  m = 1, p = 1, w = 7
)

# Split into training and test sets
# Note: missing values in the response are allowed in liblex
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]
test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]

cat("Training set:", nrow(train_x), "observations\n")
cat("Test set:", nrow(test_x), "observations\n")


## -----------------------------------------------------------------------------
#| label: neighbors-example1
# Fixed neighborhood sizes to evaluate
neighbors_k(k = seq(40, 120, by = 20))


## -----------------------------------------------------------------------------
#| label: neighbors-example2
# Threshold-based selection with bounds
neighbors_diss(
  threshold = seq(0.05, 0.5, length.out = 10), 
  k_min = 40, 
  k_max = 150
)


## -----------------------------------------------------------------------------
#| label: diss-example
# Moving-window correlation dissimilarity (recommended for spectra)
diss_correlation(ws = 37, scale = TRUE)

# PCA-based Mahalanobis distance with optimized components
diss_pca()


## -----------------------------------------------------------------------------
#| label: fit-method-example
# waPLS with modified PLS algorithm and scaling
fit_wapls(
  min_ncomp = 3, 
  max_ncomp = 15, 
  scale = TRUE, 
  method = "mpls"
)


## -----------------------------------------------------------------------------
#| label: control-example
#| eval: false
# # Build library with hyperparameter tuning
# liblex_control(mode = "build", tune = TRUE)


## -----------------------------------------------------------------------------
#| eval: false
# ciso_lib_k <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 80, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_k


## -----------------------------------------------------------------------------
#| label: liblex-fixed-k
#| cache: true
#| echo: false
#| message: false
#| warning: false
ciso_lib_k <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_k(seq(40, 80, by = 20)),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_k


## -----------------------------------------------------------------------------
#| label: fig-liblex
#| fig-cap: "Top: Best model obtained for each neighborhood size (based on the RMSE). Bottom: Centroids of the neighborhoods, usled later in prediction to selet the models to be used."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 6.5
plot(ciso_lib_k)


## -----------------------------------------------------------------------------
#| label: liblex-threshold
#| cache: true
#| echo: false
ciso_lib_thr <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_diss(
    threshold = seq(0.05, 0.5, length.out = 5), 
    k_min = 40, 
    k_max = 150
  ),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_thr


## -----------------------------------------------------------------------------
#| cache: true
#| eval: false
# ciso_lib_thr <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_diss(
#     threshold = seq(0.05, 0.5, length.out = 5),
#     k_min = 40,
#     k_max = 150
#   ),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_thr


## -----------------------------------------------------------------------------
#| label: fig-centroids
#| fig-cap: "Neighborhood centroids (blue) compared to test spectra (red)"
#| fig-width: 8
#| fig-height: 5
#| fig-align: center
wavs_pr <- as.numeric(colnames(ciso_lib_k$scaling$local_x_center))

matplot(
  wavs_pr,
  t(test_x),
  col = rgb(1, 0, 0, 0.3),
  lty = 1,
  type = "l",
  xlab = "Wavelength (nm)",
  ylab = "First derivative detrended absorbance"
)

matlines(
  wavs_pr,
  t(ciso_lib_k$scaling$local_x_center),
  col = rgb(0, 0, 1, 0.3),
  lty = 1
)

grid(lty = 1)

legend(
  "topright",
  legend = c("Samples to predict", "Neighborhood centroids"),
  col = c(rgb(1, 0, 0, 0.8), rgb(0, 0, 1, 0.8)),
  lty = 1,
  lwd = 2,
  bty = "n"
)


## -----------------------------------------------------------------------------
#| label: fig-regression-coefficients
#| fig-cap: "Regression coefficients (top) and intercept distribution (bottom) of the local experts"
#| fig-width: 8
#| fig-height: 7
#| fig-align: center

par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))

# Regression coefficients across wavelengths
models <- ciso_lib_k$coefficients

matplot(
  x = wavs_pr,
  y = t(models$B),
  type = "l",
  lty = 1,
  col = rgb(0.23, 0.51, 0.96, 0.15),
  xlab = "Wavelength (nm)",
  ylab = "Regression coefficient",
  main = paste0("Regression coefficients (", nrow(models$B), " experts)")
)

# Add mean coefficient profile
lines(wavs_pr, colMeans(models$B), col = "red", lwd = 2)
grid(lty = 1)
legend(
  "topright",
  legend = c("Individual experts", "Mean"),
  col = c(rgb(0.23, 0.51, 0.96, 0.6), "red"),
  lty = 1,
  lwd = c(1, 2),
  bty = "n"
)

# Distribution of intercepts
plot(
  density(models$B0, na.rm = TRUE),
  col = rgb(0.23, 0.51, 0.96),
  lwd = 2,
  xlab = "Intercept value",
  ylab = "Density",
  main = "Distribution of intercepts"
)
polygon(
  density(models$B0, na.rm = TRUE),
  col = rgb(0.23, 0.51, 0.96, 0.2),
  border = NA
)
abline(v = mean(models$B0, na.rm = TRUE), col = "red", lty = 2, lwd = 2)
grid(lty = 1)
legend(
  "topright",
  legend = c("Density", "Mean"),
  col = c(rgb(0.23, 0.51, 0.96), "red"),
  lty = c(1, 2),
  lwd = 2,
  bty = "n"
)

par(mfrow = c(1, 1))


## -----------------------------------------------------------------------------
#| label: kmeans-anchors
# Select 350 representative anchors using k-means sampling
# on the first 20 principal components
set.seed(1124)
kms <- naes(
  train_x, 
  k = 350, 
  pc = 20, 
  iter.max = 100,
  .center = TRUE, 
  .scale = TRUE
)

anchor_km <- kms$model

cat("Selected", length(anchor_km), "anchors via k-means\n")


## -----------------------------------------------------------------------------
#| label: liblex-anchored
#| cache: true
#| echo: false
ciso_lib_anchored <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_k(seq(40, 80, by = 20)),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  anchor_indices = anchor_km,
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_anchored


## -----------------------------------------------------------------------------
#| cache: true
#| eval: false
# ciso_lib_anchored <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 80, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   anchor_indices = anchor_km,
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_anchored


## -----------------------------------------------------------------------------
# Number of experts in the anchored library
n_experts <- nrow(ciso_lib_anchored$coefficients$B)
cat("Number of experts in the anchored library:", n_experts, "\n")


## -----------------------------------------------------------------------------
#| label: fig-centroids-anchored
#| fig-cap: "Neighborhood centroids from anchored library compared to test spectra"
#| fig-width: 8
#| fig-height: 5
#| fig-align: center

wavs_pr <- as.numeric(colnames(ciso_lib_anchored$scaling$local_x_center))
n_experts <- nrow(ciso_lib_anchored$scaling$local_x_center)
n_test <- nrow(test_x)

# Plot test spectra first (background)
matplot(
  wavs_pr,
  t(test_x),
  col = rgb(0.8, 0.2, 0.2, 0.2),
  lty = 1,
  type = "l",
  xlab = "Wavelength (nm)",
  ylab = "First derivative detrended absorbance",
  main = paste0("Coverage: ", n_experts, " experts vs ", n_test, " test samples")
)

# Overlay centroids
matlines(
  wavs_pr,
  t(ciso_lib_anchored$scaling$local_x_center),
  col = rgb(0.23, 0.51, 0.96, 0.3),
  lty = 1
)
grid(lty = 1)

legend(
  "topright",
  legend = c(
    paste0("Test samples (n = ", n_test, ")"),
    paste0("Centroids (n = ", n_experts, ")")
  ),
  col = c(
    rgb(0.8, 0.2, 0.2, 0.5),
    rgb(0.23, 0.51, 0.96, 0.5)
  ),
  lty = 1,
  lwd = c(1, 1, 2, 2),
  bty = "n"
)


## -----------------------------------------------------------------------------
#| label: predict-basic
ciso_pred <- predict(ciso_lib_k, newdata = test_x, verbose = FALSE)

# Prediction output structure
names(ciso_pred)

# Main predictions with uncertainty
head(ciso_pred$predictions)


## -----------------------------------------------------------------------------
#| label: weighting-options
#| eval: false
# # Gaussian kernel (default)
# predict(ciso_lib_k, newdata = test_x, weighting = "gaussian")
# 
# # Tricubic kernel (similar to LOCAL algorithm)
# predict(ciso_lib_k, newdata = test_x, weighting = "tricube")
# 
# # Equal weights
# predict(ciso_lib_k, newdata = test_x, weighting = "none")


## -----------------------------------------------------------------------------
#| label: enforce-indices
#| eval: false
# # Always include specific experts in predictions
# predict(ciso_lib_k, newdata = test_x, enforce_indices = c(1, 5, 10))


## -----------------------------------------------------------------------------
#| label: evaluation
pred_values <- ciso_pred$predictions$pred
pred_sd <- ciso_pred$predictions$pred_sd

# Performance metrics (excluding missing values)
complete <- !is.na(test_y)
r2 <- cor(pred_values[complete], test_y[complete])^2
rmse <- sqrt(mean((pred_values[complete] - test_y[complete])^2))

cat("R²:  ", round(r2, 3), "\n")
cat("RMSE:", round(rmse, 3), "\n")


## -----------------------------------------------------------------------------
#| label: uncertainty-filter
# Filter predictions by uncertainty threshold
unc_threshold <- quantile(pred_sd[complete], 0.75)
reliable <- complete & (pred_sd < unc_threshold)

r2_filtered <- cor(pred_values[reliable], test_y[reliable])^2
rmse_filtered <- sqrt(mean((pred_values[reliable] - test_y[reliable])^2))

cat("After filtering high-uncertainty predictions:\n")
cat("  Retained:", sum(reliable), "/", sum(complete), "observations\n")
cat("  R²:      ", round(r2_filtered, 3), "\n")
cat("  RMSE:    ", round(rmse_filtered, 3), "\n")


## -----------------------------------------------------------------------------
#| label: fig-validation
#| fig-cap: "Predicted versus observed values"
#| fig-width: 6
#| fig-height: 5
#| fig-align: center
lims <- range(pred_values[complete], test_y[complete], na.rm = TRUE)

plot(
  pred_values[complete],
  test_y[complete],
  pch = 16,
  col = rgb(0, 0, 0, 0.5),
  xlab = "Predicted",
  ylab = "Observed",
  xlim = lims,
  ylim = lims
)

abline(0, 1, col = "red", lwd = 2)
grid(lty = 1)


## -----------------------------------------------------------------------------
#| label: parallel-example
#| eval: false
# library(doParallel)
# 
# # Register parallel backend
# n_cores <- min(parallel::detectCores() - 1, 4)
# cl <- makeCluster(n_cores)
# registerDoParallel(cl)
# 
# # Build library with parallel processing
# ciso_lib_parallel <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 120, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(
#     tune = TRUE,
#     allow_parallel = TRUE,
#     chunk_size = 10
#   )
# )
# 
# # Clean up
# stopCluster(cl)
# registerDoSEQ()
Any scripts or data that you put into this service are public.
resemble documentation built on April 21, 2026, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/building-a-library-of-models-with-liblex.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/building-a-library-of-models-with-liblex.R In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/building-a-library-of-models-with-liblex.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics