Nothing
#' @title Evaluate dissimilarity matrices
#'
#' @description
#' \loadmathjax
#' `r lifecycle::badge("stable")`
#'
#' Evaluates a dissimilarity matrix by comparing each observation to its
#' nearest neighbor based on side information. For continuous variables,
#' RMSD and correlation are computed; for categorical variables, the kappa
#' index is used.
#'
#' @usage
#' diss_evaluate(diss, side_info)
#'
#' @param diss A symmetric dissimilarity matrix. Alternatively, a vector
#' containing the lower triangle values (as returned by \code{\link[stats]{dist}}).
#' @param side_info A matrix of side information corresponding to the
#' observations. Can be numeric (one or more columns) or character (single
#' column for categorical data).
#'
#' @details
#' This function assesses whether a dissimilarity matrix captures meaningful
#' structure by examining the side information of nearest neighbor pairs
#' (Ramirez-Lopez et al., 2013). If observations that are similar in the
#' dissimilarity space also have similar side information values, the
#' dissimilarity is considered effective.
#'
#' For numeric \code{side_info}, the root mean square of differences (RMSD)
#' between each observation and its nearest neighbor is computed:
#'
#' \mjdeqn{j(i) = NN(x_i, X^{\{-i\}})}{j(i) = NN(x_i, X^{-i})}
#' \mjdeqn{RMSD = \sqrt{\frac{1}{m} \sum_{i=1}^{m} (y_i - y_{j(i)})^2}}{RMSD = sqrt(1/m sum (y_i - y_j(i))^2)}
#'
#' where \mjeqn{NN(x_i, X^{-i})}{NN(x_i, X^{-i})} returns the index of the
#' nearest neighbor of observation \mjeqn{i}{i} (excluding itself),
#' \mjeqn{y_i}{y_i} is the side information value for observation \mjeqn{i}{i},
#' and \mjeqn{m}{m} is the number of observations.
#'
#' For categorical \code{side_info}, the kappa index is computed:
#'
#' \mjdeqn{\kappa = \frac{p_o - p_e}{1 - p_e}}{kappa = (p_o - p_e) / (1 - p_e)}
#'
#' where \mjeqn{p_o}{p_o} is the observed agreement and \mjeqn{p_e}{p_e} is the
#' agreement expected by chance.
#'
#' @return A list with the following components:
#' \describe{
#' \item{eval}{For numeric side information: a matrix with columns
#' \code{rmsd} and \code{r} (correlation). For categorical: a matrix
#' with column \code{kappa}.}
#' \item{global_eval}{If multiple numeric side information variables are
#' provided, summary statistics across variables.}
#' \item{first_nn}{A matrix with the original side information and the
#' side information of each observation's nearest neighbor.}
#' }
#'
#' @seealso \code{\link{dissimilarity}}, \code{\link{ncomp_by_opc}}
#'
#' @references
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M.,
#' Scholten, T. 2013a. The spectrum-based learner: A new local approach for
#' modeling soil vis-NIR spectra of complex datasets. Geoderma 195-196,
#' 268-279.
#'
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Viscarra Rossel, R., Dematte,
#' J.A.M., Scholten, T. 2013b. Distance and similarity-search metrics for use
#' with soil vis-NIR spectra. Geoderma 199, 43-53.
#'
#' @author
#' \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez}
#'
#' @examples
#' \donttest{
#' library(prospectr)
#' data(NIRsoil)
#'
#' sg <- savitzkyGolay(NIRsoil$spc, p = 3, w = 11, m = 0)
#' NIRsoil$spc <- sg
#'
#' Yr <- NIRsoil$Nt[as.logical(NIRsoil$train)]
#' Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ]
#'
#' # Compute PCA-based dissimilarity
#' d <- dissimilarity(Xr, diss_method = diss_pca(ncomp = 8))
#'
#' # Evaluate using side information
#' ev <- diss_evaluate(d$dissimilarity, side_info = as.matrix(Yr))
#' ev$eval
#'
#' # Evaluate with multiple side information variables
#' Yr_2 <- NIRsoil$CEC[as.logical(NIRsoil$train)]
#' ev_2 <- diss_evaluate(d$dissimilarity, side_info = cbind(Yr, Yr_2))
#' ev_2$eval
#' ev_2$global_eval
#' }
#'
#' @export
diss_evaluate <- function(diss, side_info) {
# ---------------------------------------------------------------------------
# Validate side_info
# ---------------------------------------------------------------------------
if (!is.matrix(side_info)) {
stop("'side_info' must be a matrix.", call. = FALSE)
}
if (is.character(side_info)) {
if (ncol(side_info) > 1L) {
stop(
"For categorical 'side_info', only one column is allowed.",
call. = FALSE
)
}
side_info <- as.factor(side_info)
if (nlevels(side_info) < 2L) {
stop(
"'side_info' must have at least two categories.",
call. = FALSE
)
}
get_eval <- .eval_categorical
} else {
get_eval <- .eval_continuous
}
if (any(colSums(!is.na(side_info)) < 4L)) {
stop(
"Each 'side_info' variable must have at least 4 non-missing values.",
call. = FALSE
)
}
# ---------------------------------------------------------------------------
# Validate diss
# ---------------------------------------------------------------------------
if (!is.vector(diss) && !is.matrix(diss)) {
stop("'diss' must be a matrix or a vector.", call. = FALSE)
}
if (is.vector(diss)) {
expected_len <- (nrow(side_info)^2 - nrow(side_info)) / 2
if (length(diss) != expected_len) {
stop(
"Length of 'diss' does not match a lower triangular matrix ",
"for ", nrow(side_info), " observations.",
call. = FALSE
)
}
find_nearest <- which_min_vector
} else {
if (nrow(diss) != ncol(diss)) {
stop("'diss' matrix must be square.", call. = FALSE)
}
if (nrow(diss) != nrow(side_info)) {
stop(
"Number of rows in 'diss' must match 'side_info'.",
call. = FALSE
)
}
find_nearest <- which_min
}
# ---------------------------------------------------------------------------
# Find nearest neighbors and evaluate
# ---------------------------------------------------------------------------
nearest_idx <- find_nearest(diss)
cnms <- colnames(side_info)
if (is.null(cnms)) {
cnms <- paste0("side_info_", seq_len(ncol(side_info)))
}
eval_result <- get_eval(y = side_info, indices_closest = nearest_idx)
rownames(eval_result) <- cnms
# ---------------------------------------------------------------------------
# Build output
# ---------------------------------------------------------------------------
first_nn <- side_info[nearest_idx, , drop = FALSE]
colnames(first_nn) <- paste0("nn_", cnms)
if (ncol(side_info) > 1L) {
sds <- get_column_sds(side_info[complete.cases(side_info), , drop = FALSE])
global_eval <- data.frame(
mean_standardized_rmsd = mean(eval_result[, "rmsd"] / sds),
mean_r = mean(eval_result[, "r"])
)
result <- list(
eval = eval_result,
global_eval = global_eval,
first_nn = cbind(side_info, first_nn)
)
} else {
result <- list(
eval = eval_result,
first_nn = cbind(side_info, first_nn)
)
}
result
}
#' @rdname diss_evaluate
#' @param d Deprecated. Use \code{diss} in \code{diss_evaluate()} instead.
#' @export
sim_eval <- function(d, side_info) {
.Deprecated("diss_evaluate")
diss_evaluate(diss = d, side_info = side_info)
}
# =============================================================================
# Internal helpers
# =============================================================================
#' @keywords internal
.eval_categorical <- function(y, indices_closest) {
tab <- as.matrix(table(y[!is.na(y)], y[indices_closest][!is.na(y)]))
total <- sum(tab)
tab <- tab / total
p <- rowSums(tab) %*% t(colSums(tab))
pra <- sum(diag(tab))
pre <- sum(diag(p))
kappa <- t((pra - pre) / (1 - pre))
colnames(kappa) <- "kappa"
kappa
}
#' @keywords internal
#' @importFrom stats cor
.eval_continuous <- function(y, indices_closest) {
eval_one <- function(col_idx) {
obs <- y[, col_idx]
nn <- y[indices_closest, col_idx]
rmsd <- sqrt(mean((nn - obs)^2, na.rm = TRUE))
if (sd(nn, na.rm = TRUE) == 0 || sd(obs, na.rm = TRUE) == 0) {
r <- NA_real_
} else {
r <- cor(nn, obs, use = "complete.obs")
}
c(rmsd = rmsd, r = r)
}
result <- vapply(
seq_len(ncol(y)),
eval_one,
FUN.VALUE = numeric(2)
)
t(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.