inst/doc/estimating-dissimilarity-between-spectra.R

## -----------------------------------------------------------------------------
#| echo: false
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)


## -----------------------------------------------------------------------------
diss_correlation()


## -----------------------------------------------------------------------------
diss_correlation(center = TRUE, scale = TRUE)


## -----------------------------------------------------------------------------
#| message: false
library(resemble)
library(prospectr)

# obtain a numeric vector of the wavelengths at which spectra is recorded 
wavs <- as.numeric(colnames(NIRsoil$spc))

# pre-process the spectra:
# - use detrend
# - use first order derivative
diff_order <- 1
poly_order <- 1
window <- 7

# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = wavs),
  m = diff_order, p = poly_order, w = window
)
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]

test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]


## -----------------------------------------------------------------------------
#| label: pca-diss
#| eval: TRUE

# Default: variance-based component selection (ncomp_by_var(0.01))
d_pca <- dissimilarity(train_x, diss_method = diss_pca())

# With OPC-based component selection (requires Yr)
d_pca_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pca(ncomp = ncomp_by_opc(30))
)

# Between training and test sets
d_pca_tr_ts <- dissimilarity(
 train_x,
 Xu = test_x,
 Yr = train_y,
 diss_method = diss_pca(
   ncomp = ncomp_by_opc(30),
   return_projection = TRUE
 )
)

d_pca_tr_ts


## -----------------------------------------------------------------------------
# Example of the first 5 rows and columns of the dissimilarity 
# matrix between training and test sets
# Rows: training observations; 
# Columns: test observations
d_pca_tr_ts$dissimilarity[1:5, 1:5]


## -----------------------------------------------------------------------------
d_pca_tr_ts$projection


## ----pls-diss, eval = FALSE---------------------------------------------------
# # Default: OPC-based component selection
# d_pls <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   Yr = train_y,
#   diss_method = diss_pls()
# )
# 
# # Fixed number of components
# d_pls_fixed <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   Yr = train_y,
#   diss_method = diss_pls(ncomp = 15)
# )


## ----cor-diss, eval = FALSE---------------------------------------------------
# # Standard correlation dissimilarity
# d_cor <- dissimilarity(train_x, diss_method = diss_correlation())
# d_cor


## -----------------------------------------------------------------------------
correlation_baser <- 0.5 * (1 - cor(t(train_x)))
corrrelation_resemble <- dissimilarity(
  train_x, diss_method = diss_correlation(center = FALSE)
)
# the maximum discreepancy between the two implementations should 
# be very close to zero
max(abs(corrrelation_resemble$dissimilarity - correlation_baser))


## -----------------------------------------------------------------------------
#| eval: false
# # Compare computational speed: resemble vs base R correlation dissimilarity
# n_iter <- 50
# 
# # Transpose once before timing (fair comparison)
# train_x_t <- t(train_x)
# 
# # resemble implementation
# time_resemble <- system.time({
#   for (i in seq_len(n_iter)) {
#     cor_resemble <- dissimilarity(train_x, diss_method = diss_correlation(center = FALSE))
#   }
# })
# 
# # base R implementation
# time_base <- system.time({
#   for (i in seq_len(n_iter)) {
#     cor_base <- 0.5 * (1 - cor(train_x_t))
#   }
# })
# 
# # Results
# data.frame(
#   method = c("resemble", "base R"),
#   elapsed_sec = c(time_resemble["elapsed"], time_base["elapsed"]),
#   per_call_ms = c(time_resemble["elapsed"], time_base["elapsed"]) / n_iter * 1000
# )


## ----cor-diss-mw, eval = FALSE------------------------------------------------
# # Moving window correlation (window size must be odd)
# d_cor_mw <- dissimilarity(
#   train_x,
#   Xu = test_x,
#   diss_method = diss_correlation(ws = 51)
# )
# d_cor_mw


## ----omp-threads-use, eval = FALSE--------------------------------------------
# # Use 4 threads
# Sys.setenv(OMP_NUM_THREADS = 4)
# 
# # Or use all available cores
# Sys.setenv(OMP_NUM_THREADS = parallel::detectCores())


## ----omp-threads-restrict, eval = FALSE---------------------------------------
# # Limit OpenMP threads
# Sys.setenv(OMP_NUM_THREADS = 1)
# 
# # If using OpenBLAS
# Sys.setenv(OPENBLAS_NUM_THREADS = 1)
# 
# # If using MKL
# Sys.setenv(MKL_NUM_THREADS = 1)


## ----blas-threads, eval = FALSE-----------------------------------------------
# RhpcBLASctl::blas_set_num_threads(1)
# RhpcBLASctl::omp_set_num_threads(1)


## ----euclid-diss, eval = FALSE------------------------------------------------
# d_euclid <- dissimilarity(train_x, Xu = test_x, diss_method = diss_euclidean())
# d_euclid


## -----------------------------------------------------------------------------
ed_resemble <- dissimilarity(train_x, diss_method = diss_euclidean(center = FALSE))
ed_baser <- as.matrix(dist(train_x, method = "euclidean"))

# scaling the base R Euclidean distance by the number of variables:
ed_baser_scaled <- sqrt((ed_baser^2) / ncol(train_x))

# differences between the two implementations
max(abs(ed_baser_scaled - ed_resemble$dissimilarity))


## -----------------------------------------------------------------------------
#| label: euclid-speed
#| eval: false
# # Compare computational speed: resemble vs base R
# n_iter <- 50
# 
# # resemble implementation
# time_resemble <- system.time({
#   for (i in seq_len(n_iter)) {
#     ed_resemble <- dissimilarity(train_x, diss_method = diss_euclidean(center = FALSE))
#   }
# })
# 
# # base R implementation
# time_base <- system.time({
#   for (i in seq_len(n_iter)) {
#     ed_base <- as.matrix(dist(train_x, method = "euclidean"))
#   }
# })
# 
# # Results
# data.frame(
#   method = c("resemble", "base R"),
#   elapsed_sec = c(time_resemble["elapsed"], time_base["elapsed"]),
#   per_call_ms = c(time_resemble["elapsed"], time_base["elapsed"]) / n_iter * 1000
# )


## ----cosine-diss, eval = FALSE------------------------------------------------
# d_cosine <- dissimilarity(train_x, Xu = test_x, diss_method = diss_cosine())
# d_cosine


## -----------------------------------------------------------------------------
# Compute dissimilarity matrices using different methods
d_pca_var <- dissimilarity(train_x, diss_method = diss_pca(scale = TRUE))

d_pca_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pca(ncomp = ncomp_by_opc(30), scale = TRUE)
)

d_pls_opc <- dissimilarity(
  train_x,
  Yr = train_y,
  diss_method = diss_pls(ncomp = ncomp_by_opc(), scale = TRUE)
)

d_cor <- dissimilarity(
  train_x,
  diss_method = diss_correlation(ws = 51, scale = TRUE)
)

d_euclid <- dissimilarity(
  train_x,
  diss_method = diss_euclidean(scale = TRUE)
)

d_cosine <- dissimilarity(
  train_x,
  diss_method = diss_cosine(scale = TRUE)
)


## -----------------------------------------------------------------------------
# Evaluate each method using Ciso as side information
side_info <- as.matrix(train_y)

eval_results <- list(
  "PCA (var)" = diss_evaluate(
    d_pca_var$dissimilarity, side_info
  ),
  "PCA (opc)" = diss_evaluate(
    d_pca_opc$dissimilarity, side_info
  ),
  "PLS (opc)" = diss_evaluate(
    d_pls_opc$dissimilarity, side_info
  ),
  correlation = diss_evaluate(
    d_cor$dissimilarity, side_info
  ),
  Euclidean = diss_evaluate(
    d_euclid$dissimilarity, side_info
  ),
  cosine = diss_evaluate(
    d_cosine$dissimilarity, side_info
  )
)

# Extract RMSD and correlation for each method
comparison <- do.call(rbind, lapply(names(eval_results), function(nm) {
  data.frame(
    method = nm,
    rmsd   = eval_results[[nm]]$eval[, "rmsd"],
    r      = eval_results[[nm]]$eval[, "r"]
  )
}))
comparison


## -----------------------------------------------------------------------------
#| label: diss-eval-plot
#| fig-cap: "Comparison of dissimilarity methods based on first nearest-neighbor evaluation using `Ciso` as side information. In each panel, the x-axis shows the `Ciso` value of the nearest neighbor identified from the spectral dissimilarity matrix, and the y-axis shows the observed `Ciso` value of the corresponding sample. The dashed line represents perfect agreement (y = x)."
#| fig-align: "center"
#| fig-width: 8
#| fig-height: 6
#| echo: true
#| fig-show: hold
blue  <- "#3B82F6"
amber <- "#F59E0B"
slate_grid <- "#33415540"

old_par <- par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))

for (nm in names(eval_results)) {
  xy <- eval_results[[nm]]$first_nn
  plot(
    xy[, 2], xy[, 1],
    xlab = "Ciso (1-NN)",
    ylab = "Ciso",
    col = adjustcolor(blue, alpha.f = 0.5),
    pch = 16,
    main = nm
  )
  grid(lty = 1, col = slate_grid)
  abline(0, 1, col = amber, lwd = 2)
}

par(old_par)

Try the resemble package in your browser

Any scripts or data that you put into this service are public.

resemble documentation built on April 21, 2026, 1:07 a.m.