estimating-dissimilarity-between-spectra.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

## -----------------------------------------------------------------------------
#| echo: false
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)


## -----------------------------------------------------------------------------
diss_correlation()


## -----------------------------------------------------------------------------
diss_correlation(center = TRUE, scale = TRUE)


## -----------------------------------------------------------------------------
#| message: false
library(resemble)
library(prospectr)

# obtain a numeric vector of the wavelengths at which spectra is recorded 
wavs <- as.numeric(colnames(NIRsoil$spc))

# pre-process the spectra:
# - use detrend
# - use first order derivative
diff_order <- 1
poly_order <- 1
window <- 7

# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = wavs),
  m = diff_order, p = poly_order, w = window
)
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]

test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]


## -----------------------------------------------------------------------------
#| label: pca-diss
#| eval: TRUE

# Default: variance-based component selection (ncomp_by_var(0.01))
d_pca <- dissimilarity(train_x, diss_method = diss_pca())

# With OPC-based component selection (requires Yr)
d_pca_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pca(ncomp = ncomp_by_opc(30))
)

# Between training and test sets
d_pca_tr_ts <- dissimilarity(
 train_x,
 Xu = test_x,
 Yr = train_y,
 diss_method = diss_pca(
   ncomp = ncomp_by_opc(30),
   return_projection = TRUE
 )
)

d_pca_tr_ts


## -----------------------------------------------------------------------------
# Example of the first 5 rows and columns of the dissimilarity 
# matrix between training and test sets
# Rows: training observations; 
# Columns: test observations
d_pca_tr_ts$dissimilarity[1:5, 1:5]


## -----------------------------------------------------------------------------
d_pca_tr_ts$projection


## ----pls-diss, eval = FALSE---------------------------------------------------
# # Default: OPC-based component selection
# d_pls <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   Yr = train_y,
#   diss_method = diss_pls()
# )
# 
# # Fixed number of components
# d_pls_fixed <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   Yr = train_y,
#   diss_method = diss_pls(ncomp = 15)
# )


## ----cor-diss, eval = FALSE---------------------------------------------------
# # Standard correlation dissimilarity
# d_cor <- dissimilarity(train_x, diss_method = diss_correlation())
# d_cor


## -----------------------------------------------------------------------------
correlation_baser <- 0.5 * (1 - cor(t(train_x)))
corrrelation_resemble <- dissimilarity(
  train_x, diss_method = diss_correlation(center = FALSE)
)
# the maximum discreepancy between the two implementations should 
# be very close to zero
max(abs(corrrelation_resemble$dissimilarity - correlation_baser))


## -----------------------------------------------------------------------------
#| eval: false
# # Compare computational speed: resemble vs base R correlation dissimilarity
# n_iter <- 50
# 
# # Transpose once before timing (fair comparison)
# train_x_t <- t(train_x)
# 
# # resemble implementation
# time_resemble <- system.time({
#   for (i in seq_len(n_iter)) {
#     cor_resemble <- dissimilarity(train_x, diss_method = diss_correlation(center = FALSE))
#   }
# })
# 
# # base R implementation
# time_base <- system.time({
#   for (i in seq_len(n_iter)) {
#     cor_base <- 0.5 * (1 - cor(train_x_t))
#   }
# })
# 
# # Results
# data.frame(
#   method = c("resemble", "base R"),
#   elapsed_sec = c(time_resemble["elapsed"], time_base["elapsed"]),
#   per_call_ms = c(time_resemble["elapsed"], time_base["elapsed"]) / n_iter * 1000
# )


## ----cor-diss-mw, eval = FALSE------------------------------------------------
# # Moving window correlation (window size must be odd)
# d_cor_mw <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   diss_method = diss_correlation(ws = 51)
# )
# d_cor_mw


## ----omp-threads-use, eval = FALSE--------------------------------------------
# # Use 4 threads
# Sys.setenv(OMP_NUM_THREADS = 4)
# 
# # Or use all available cores
# Sys.setenv(OMP_NUM_THREADS = parallel::detectCores())


## ----omp-threads-restrict, eval = FALSE---------------------------------------
# # Limit OpenMP threads
# Sys.setenv(OMP_NUM_THREADS = 1)
# 
# # If using OpenBLAS
# Sys.setenv(OPENBLAS_NUM_THREADS = 1)
# 
# # If using MKL
# Sys.setenv(MKL_NUM_THREADS = 1)


## ----blas-threads, eval = FALSE-----------------------------------------------
# RhpcBLASctl::blas_set_num_threads(1)
# RhpcBLASctl::omp_set_num_threads(1)


## ----euclid-diss, eval = FALSE------------------------------------------------
# d_euclid <- dissimilarity(train_x, Xu = test_x, diss_method = diss_euclidean())
# d_euclid


## -----------------------------------------------------------------------------
ed_resemble <- dissimilarity(train_x, diss_method = diss_euclidean(center = FALSE))
ed_baser <- as.matrix(dist(train_x, method = "euclidean"))

# scaling the base R Euclidean distance by the number of variables:
ed_baser_scaled <- sqrt((ed_baser^2) / ncol(train_x))

# differences between the two implementations
max(abs(ed_baser_scaled - ed_resemble$dissimilarity))


## -----------------------------------------------------------------------------
#| label: euclid-speed
#| eval: false
# # Compare computational speed: resemble vs base R
# n_iter <- 50
# 
# # resemble implementation
# time_resemble <- system.time({
#   for (i in seq_len(n_iter)) {
#     ed_resemble <- dissimilarity(train_x, diss_method = diss_euclidean(center = FALSE))
#   }
# })
# 
# # base R implementation
# time_base <- system.time({
#   for (i in seq_len(n_iter)) {
#     ed_base <- as.matrix(dist(train_x, method = "euclidean"))
#   }
# })
# 
# # Results
# data.frame(
#   method = c("resemble", "base R"),
#   elapsed_sec = c(time_resemble["elapsed"], time_base["elapsed"]),
#   per_call_ms = c(time_resemble["elapsed"], time_base["elapsed"]) / n_iter * 1000
# )


## ----cosine-diss, eval = FALSE------------------------------------------------
# d_cosine <- dissimilarity(train_x, Xu = test_x, diss_method = diss_cosine())
# d_cosine


## -----------------------------------------------------------------------------
# Compute dissimilarity matrices using different methods
d_pca_var <- dissimilarity(train_x, diss_method = diss_pca(scale = TRUE))

d_pca_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pca(ncomp = ncomp_by_opc(30), scale = TRUE)
)

d_pls_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pls(ncomp = ncomp_by_opc(), scale = TRUE)
)

d_cor <- dissimilarity(
  train_x,
  diss_method = diss_correlation(ws = 51, scale = TRUE)
)

d_euclid <- dissimilarity(
  train_x,
  diss_method = diss_euclidean(scale = TRUE)
)

d_cosine <- dissimilarity(
  train_x,
  diss_method = diss_cosine(scale = TRUE)
)


## -----------------------------------------------------------------------------
# Evaluate each method using Ciso as side information
side_info <- as.matrix(train_y)

eval_results <- list(
  "PCA (var)" = diss_evaluate(
    d_pca_var$dissimilarity, side_info
  ),
  "PCA (opc)" = diss_evaluate(
    d_pca_opc$dissimilarity, side_info
  ),
  "PLS (opc)" = diss_evaluate(
    d_pls_opc$dissimilarity, side_info
  ),
  correlation = diss_evaluate(
    d_cor$dissimilarity, side_info
  ),
  Euclidean = diss_evaluate(
    d_euclid$dissimilarity, side_info
  ),
  cosine = diss_evaluate(
    d_cosine$dissimilarity, side_info
  )
)

# Extract RMSD and correlation for each method
comparison <- do.call(rbind, lapply(names(eval_results), function(nm) {
  data.frame(
    method = nm,
    rmsd   = eval_results[[nm]]$eval[, "rmsd"],
    r      = eval_results[[nm]]$eval[, "r"]
  )
}))
comparison


## -----------------------------------------------------------------------------
#| label: diss-eval-plot
#| fig-cap: "Comparison of dissimilarity methods based on first nearest-neighbor evaluation using `Ciso` as side information. In each panel, the x-axis shows the `Ciso` value of the nearest neighbor identified from the spectral dissimilarity matrix, and the y-axis shows the observed `Ciso` value of the corresponding sample. The dashed line represents perfect agreement (y = x)."
#| fig-align: "center"
#| fig-width: 8
#| fig-height: 6
#| echo: true
#| fig-show: hold
blue  <- "#3B82F6"
amber <- "#F59E0B"
slate_grid <- "#33415540"

old_par <- par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))

for (nm in names(eval_results)) {
  xy <- eval_results[[nm]]$first_nn
  plot(
    xy[, 2], xy[, 1],
    xlab = "Ciso (1-NN)",
    ylab = "Ciso",
    col = adjustcolor(blue, alpha.f = 0.5),
    pch = 16,
    main = nm
  )
  grid(lty = 1, col = slate_grid)
  abline(0, 1, col = amber, lwd = 2)
}

par(old_par)

Any scripts or data that you put into this service are public.

resemble documentation built on April 21, 2026, 1:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/estimating-dissimilarity-between-spectra.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/estimating-dissimilarity-between-spectra.R In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

inst/doc/estimating-dissimilarity-between-spectra.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics