Nothing
#' @title A function for searching in a given reference set the neighbors of
#' another given set of observations (search_neighbors)
#' @description
#' \loadmathjax
#' This function searches in a reference set the neighbors of the observations
#' provided in another set.
#' @usage
#' search_neighbors(Xr, Xu, diss_method = c("pca", "pca.nipals", "pls", "mpls",
#' "cor", "euclid", "cosine", "sid"),
#' Yr = NULL, k, k_diss, k_range, spike = NULL,
#' pc_selection = list("var", 0.01),
#' return_projection = FALSE, return_dissimilarity = FALSE,
#' ws = NULL,
#' center = TRUE, scale = FALSE,
#' documentation = character(), ...)
#'
#' @param Xr a matrix of reference (spectral) observations where the neighbor
#' search is to be conducted. See details.
#' @param Xu an optional matrix of (spectral) observations for which its
#' neighbors are to be searched in \code{Xr}. Default is \code{NULL}. See details.
#' @param diss_method a character string indicating the spectral dissimilarity metric
#' to be used in the selection of the nearest neighbors of each observation.
#' \itemize{
#' \item{\code{"pca"}: Mahalanobis distance
#' computed on the matrix of scores of a Principal Component (PC)
#' projection of \code{Xr} (and \code{Xu} if supplied).
#' PC projection is done using the singular value decomposition (SVD)
#' algorithm. See \code{\link{ortho_diss}} function.}
#' \item{\code{"pca.nipals"}: Mahalanobis distance
#' computed on the matrix of scores of a Principal Component (PC)
#' projection of \code{Xr} (and \code{Xu} if supplied).
#' PC projection is done using the
#' non-linear iterative partial least squares (niapls) algorithm.
#' See \code{\link{ortho_diss}} function.}
#' \item{\code{"pls"}: Mahalanobis distance
#' computed on the matrix of scores of a partial least squares projection
#' of \code{Xr} (and \code{Xu} if supplied). In this case, \code{Yr}
#' is always required. See \code{\link{ortho_diss}} function.}
#' \item{\code{"mpls"}: Mahalanobis distance
#' computed on the matrix of scores of a modified partial least squares
#' projection (Shenk and Westerhaus, 1991; Westerhaus, 2014)
#' of \code{Xr} (and \code{Xu} if provided). In this case, \code{Yr} is
#' always required. See \code{\link{ortho_diss}} function.}
#' \item{\code{"cor"}: correlation coefficient
#' between observations. See \code{\link{cor_diss}} function.}
#' \item{\code{"euclid"}: Euclidean distance
#' between observations. See \code{\link{f_diss}} function.}
#' \item{\code{"cosine"}: Cosine distance
#' between observations. See \code{\link{f_diss}} function.}
#' \item{\code{"sid"}: spectral information divergence between observations.
#' See \code{\link{sid}} function.}
#' }
#' @param Yr a numeric matrix of `n` observations used as side information of
#' \code{Xr} for the \code{\link{ortho_diss}} methods (i.e. \code{pca},
#' \code{pca.nipals} or \code{pls}). It is required when:
#' \itemize{
#' \item{\code{diss_method = "pls"}}
#' \item{\code{diss_method = "pca"} with \code{"opc"} used as the method
#' in the \code{pc_selection} argument. See [ortho_diss()].}
#' }
#' @param k an integer value indicating the k-nearest neighbors of each
#' observation in \code{Xu} that must be selected from \code{Xr}.
#' @param k_diss an integer value indicating a dissimilarity treshold.
#' For each observation in \code{Xu}, its nearest neighbors in \code{Xr}
#' are selected as those for which their dissimilarity to \code{Xu} is below
#' this \code{k_diss} threshold. This treshold depends on the corresponding
#' dissimilarity metric specified in \code{diss_method}. Either \code{k} or
#' \code{k_diss} must be specified.
#' @param k_range an integer vector of length 2 which specifies the minimum
#' (first value) and the maximum (second value) number of neighbors to be
#' retained when the \code{k_diss} is given.
#' @param spike a vector of integers (with positive and/or negative values)
#' indicating what observations in \code{Xr}
#' (and \code{Yr}) must be forced into or avoided in the neighborhoods.
#' @param pc_selection a list of length 2 to be passed onto the
#' \code{\link{ortho_diss}} methods. It is required if the method selected in
#' \code{diss_method} is any of \code{"pca"}, \code{"pca.nipals"} or
#' \code{"pls"}. This argument is used for
#' optimizing the number of components (principal components or pls factors)
#' to be retained. This list must contain two elements in the following order:
#' \code{method} (a character indicating the method for selecting the number of
#' components) and \code{value} (a numerical value that complements the selected
#' method). The methods available are:
#' \itemize{
#' \item{\code{"opc"}: optimized principal component selection based on
#' Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components
#' (of set of observations) is the one for which its distance matrix
#' minimizes the differences between the \code{Yr} value of each
#' observation and the \code{Yr} value of its closest observation. In this
#' case \code{value} must be a value (larger than 0 and below the
#' minimum dimension of \code{Xr} or \code{Xr} and \code{Xu} combined)
#' indicating the maximum number of principal components to be tested.
#' See the \code{\link{ortho_projection}} function for more details.}
#' \item{\code{"cumvar"}: selection of the principal components based
#' on a given cumulative amount of explained variance. In this case,
#' \code{value} must be a value (larger than 0 and below or equal to 1)
#' indicating the minimum amount of cumulative variance that the
#' combination of retained components should explain.}
#' \item{\code{"var"}: selection of the principal components based
#' on a given amount of explained variance. In this case,
#' \code{value} must be a value (larger than 0 and below or equal to 1)
#' indicating the minimum amount of variance that a single component
#' should explain in order to be retained.}
#' \item{\code{"manual"}: for manually specifying a fix number of
#' principal components. In this case, \code{value} must be a value
#' (larger than 0 and below the
#' minimum dimension of \code{Xr} or \code{Xr} and \code{Xu} combined)
#' indicating the minimum amount of variance that a component should
#' explain in order to be retained.}
#' }
#' The default is \code{list(method = "var", value = 0.01)}.
#'
#' Optionally, the \code{pc_selection} argument admits \code{"opc"} or
#' \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character
#' string. In such a case the default \code{"value"} when either \code{"opc"} or
#' \code{"manual"} are used is 40. When \code{"cumvar"} is used the default
#' \code{"value"} is set to 0.99 and when \code{"var"} is used, the default
#' \code{"value"} is set to 0.01.
#' @param return_projection a logical indicating if the projection(s) must be
#' returned. Projections are used if the \code{\link{ortho_diss}} methods are
#' called (i.e. \code{method = "pca"}, \code{method = "pca.nipals"} or
#' \code{method = "pls"}).
#' @param return_dissimilarity a logical indicating if the dissimilarity matrix
#' used for neighbor search must be returned.
#' @param ws an odd integer value which specifies the window size, when
#' \code{diss_method = cor} (\code{\link{cor_diss}} method) for moving correlation
#' dissimilarity. If \code{ws = NULL} (default), then the window size will be
#' equal to the number of variables (columns), i.e. instead moving correlation,
#' the normal correlation will be used. See \code{\link{cor_diss}} function.
#' @param center a logical indicating if the \code{Xr} and \code{Xu} matrices
#' must be centered. If \code{Xu} is provided the data is centered around the
#' mean of the pooled \code{Xr} and \code{Xu} matrices (\mjeqn{Xr \cup Xu}{Xr U Xu}). For
#' dissimilarity computations based on \code{diss_method = pls}, the data is always
#' centered.
#' @param scale a logical indicating if the \code{Xr} and \code{Xu} matrices
#' must be scaled. If \code{Xu} is provided the data is scaled based
#' on the standard deviation of the the pooled \code{Xr} and \code{Xu} matrices
#' (\mjeqn{Xr \cup Xu}{Xr U Xu}). If \code{center = TRUE}, scaling is applied after
#' centering.
#' @param documentation an optional character string that can be used to
#' describe anything related to the \code{mbl} call (e.g. description of the
#' input data). Default: \code{character()}. NOTE: his is an experimental
#' argument.
#' @param ... further arguments to be passed to the \code{\link{dissimilarity}}
#' function. See details.
#' @details
#' This function may be specially useful when the reference set (\code{Xr}) is
#' very large. In some cases the number of observations in the reference set
#' can be reduced by removing irrelevant observations (i.e. observations that are not
#' neighbors of a particular target set). For example, this fucntion can be
#' used to reduce the size of the reference set before before running the
#' \code{\link{mbl}} function.
#'
#' This function uses the \code{\link{dissimilarity}} fucntion to compute the
#' dissimilarities between \code{Xr} and \code{Xu}. Arguments to
#' \code{\link{dissimilarity}} as well as further arguments to the functions
#' used inside \code{\link{dissimilarity}} (i.e. \code{\link{ortho_diss}}
#' \code{\link{cor_diss}} \code{\link{f_diss}} \code{\link{sid}}) can be passed to
#' those functions as additional arguments (i.e. \code{...}).
#' @return a \code{list} containing the following elements:
#' \itemize{
#' \item{\code{neighbors_diss}: a matrix of the \code{Xr} dissimilarity scores
#' corresponding to the neighbors of each \code{Xr} observation (or \code{Xu}
#' observation, in case \code{Xu} was supplied).
#' The neighbor dissimilarity scores are organized by columns and are sorted
#' in ascending order.}
#' \item{\code{neighbors}: a matrix of the \code{Xr} indices corresponding to
#' the neighbors of each observation in \code{Xu}. The neighbor indices are
#' organized by columns and are sorted in ascending order by their
#' dissimilarity score.}
#' \item{\code{unique_neighbors}: a vector of the indices in \code{Xr}
#' identified as neighbors of any observation in \code{Xr} (or in \code{Xu},
#' in case it was supplied). This is obtained by
#' converting the \code{neighbors} matrix into a vector and applying the
#' \code{\link[base]{unique}} function.}
#' \item{\code{k_diss_info}: a \code{data.table} that is returned only if the
#' \code{k_diss} argument was used. It comprises three columns, the first one
#' (\code{Xr_index} or \code{Xu_index}) indicates the index of the observations
#' in \code{Xr} (or in \code{Xu}, in case it was suppplied),
#' the second column (\code{n_k}) indicates the number of neighbors found in
#' \code{Xr} and the third column (\code{final_n_k}) indicates the final number
#' of neighbors selected bounded by \code{k_range}.
#' argument.}
#' \item{\code{dissimilarity}: If \code{return_dissimilarity = TRUE} the
#' dissimilarity object used (as computed by the \code{\link{dissimilarity}}
#' function.}
#' \item{\code{projection}: an \code{ortho_projection} object. Only output if
#' \code{return_projection = TRUE} and if \code{diss_method = "pca"},
#' \code{diss_method = "pca.nipals"} or \code{diss_method = "pls"}. \cr
#' This object contains the projection used to compute
#' the dissimilarity matrix. In case of local dissimilarity matrices,
#' the projection corresponds to the global projection used to select the
#' neighborhoods. (see \code{\link{ortho_diss}} function for further
#' details).}
#' }
#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez}.
#' @details
#' If no matrix is passed to \code{Xu}, the neighbor search is conducted for the
#' observations in \code{Xr} that are found whiting that matrix. If a matrix is
#' passed to \code{Xu}, the neighbors of \code{Xu} are searched in the \code{Xr}
#' matrix.
#' @references
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M.,
#' Scholten, T. 2013a. The spectrum-based learner: A new local approach for
#' modeling soil vis-NIR spectra of complex data sets. Geoderma 195-196, 268-279.
#'
#' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Viscarra Rossel, R.,
#' Dematte, J. A. M., Scholten, T. 2013b. Distance and similarity-search
#' metrics for use with soil vis-NIR spectra. Geoderma 199, 43-53.
#' @seealso \code{\link{dissimilarity}} \code{\link{ortho_diss}}
#' \code{\link{cor_diss}} \code{\link{f_diss}} \code{\link{sid}}
#' \code{\link{mbl}}
#' @examples
#' \donttest{
#' library(prospectr)
#'
#' data(NIRsoil)
#'
#' Xu <- NIRsoil$spc[!as.logical(NIRsoil$train), ]
#' Yu <- NIRsoil$CEC[!as.logical(NIRsoil$train)]
#' Yr <- NIRsoil$CEC[as.logical(NIRsoil$train)]
#' Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ]
#'
#' Xu <- Xu[!is.na(Yu), ]
#' Yu <- Yu[!is.na(Yu)]
#'
#' Xr <- Xr[!is.na(Yr), ]
#' Yr <- Yr[!is.na(Yr)]
#'
#' # Identify the neighbor observations using the correlation dissimilarity and
#' # default parameters
#' # (In this example all the observations in Xr belong at least to the
#' # first 100 neighbors of one observation in Xu)
#' ex1 <- search_neighbors(
#' Xr = Xr, Xu = Xu,
#' diss_method = "cor",
#' k = 40
#' )
#'
#' # Identify the neighbor observations using principal component (PC)
#' # and partial least squares (PLS) dissimilarities, and using the "opc"
#' # approach for selecting the number of components
#' ex2 <- search_neighbors(
#' Xr = Xr, Xu = Xu,
#' diss_method = "pca",
#' Yr = Yr, k = 50,
#' pc_selection = list("opc", 40),
#' scale = TRUE
#' )
#'
#' # Observations that do not belong to any neighborhood
#' seq(1, nrow(Xr))[!seq(1, nrow(Xr)) %in% ex2$unique_neighbors]
#'
#' ex3 <- search_neighbors(
#' Xr = Xr, Xu = Xu,
#' diss_method = "pls",
#' Yr = Yr, k = 50,
#' pc_selection = list("opc", 40),
#' scale = TRUE
#' )
#' # Observations that do not belong to any neighborhood
#' seq(1, nrow(Xr))[!seq(1, nrow(Xr)) %in% ex3$unique_neighbors]
#'
#' # Identify the neighbor observations using local PC dissimialrities
#' # Here, 150 neighbors are used to compute a local dissimilarity matrix
#' # and then this matrix is used to select 50 neighbors
#' ex4 <- search_neighbors(
#' Xr = Xr, Xu = Xu,
#' diss_method = "pls",
#' Yr = Yr, k = 50,
#' pc_selection = list("opc", 40),
#' scale = TRUE,
#' .local = TRUE,
#' pre_k = 150
#' )
#' }
#' @export
######################################################################
# resemble
# Copyright (C) 2014 Leonardo Ramirez-Lopez and Antoine Stevens
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
######################################################################
## History:
## 09.03.2014 Leo In the doc was specified that multi-threading is
## not working for mac
## 25.05.2020 Leo - function renamed from neigCleaning to search_neighbors
## - scaled renamed to scale
## - pcMethod and cores are deprecated
search_neighbors <- function(Xr, Xu = NULL, diss_method = c(
"pca",
"pca.nipals",
"pls",
"mpls",
"cor",
"euclid",
"cosine",
"sid"
),
Yr = NULL,
k, k_diss, k_range,
spike = NULL,
pc_selection = list("var", 0.01),
return_projection = FALSE,
return_dissimilarity = FALSE,
ws = NULL,
center = TRUE, scale = FALSE,
documentation = character(), ...) {
# Sanity checks
match.arg(diss_method, c(
"pca",
"pca.nipals",
"pls",
"mpls",
"cor",
"euclid",
"cosine",
"sid"
))
if (missing(k)) {
k <- NULL
}
if (missing(k_diss)) {
k_diss <- NULL
}
if (missing(k_range)) {
k_range <- NULL
}
if (!is.logical(center)) {
stop("'center' argument must be logical")
}
if (!is.logical(scale)) {
stop("'scale' argument must be logical")
}
if (diss_method == "cor") {
if (!is.null(ws)) {
if (ws < 3 | ws > (ncol(Xr) - 1) | length(ws) != 1 | (ws %% 2) == 0) {
stop(paste(
"ws must be an odd value between 3 and", ncol(Xr) - 1,
"(ncol(Xr) - 1)"
))
}
}
}
if (!is.null(k) & !is.null(k_diss)) {
# if k and k_diss are not called here, errors are thrown during checks
k
k_diss
stop("Only one of k or k_diss can be specified")
}
if (is.null(k) & is.null(k_diss)) {
stop("Either k or k_diss must be specified")
}
if (!is.null(k)) {
k <- as.integer(k)
if (k < 1) {
stop("k must be an integer larger than 0")
}
if (k > nrow(Xr)) {
stop(paste0(
"The number of nearest neighbors cannot exceed the number ",
"of reference observations (nrow(Xr))"
))
}
kk <- k
}
if (!is.null(k_diss)) {
# if k_diss is not called here, errors are thrown during checks
k_diss
if (is.null(k_range)) {
# if k_range is not called here, errors are thrown during checks
k_range
stop("if the k_diss argument is used, k_range must be specified")
}
if (length(k_range) != 2 | !is.numeric(k_range) | diff(k_range) < 0) {
stop(paste0(
"k_range must be a vector (of length 2) which specifies the ",
"minimum (first value) and the maximum (second value) number ",
"of neighbors"
))
}
k_min <- as.integer(k_range[1])
k_max <- as.integer(k_range[2])
if (k_min < 1) {
stop("Minimum number of nearest neighbors must be larger than 0")
}
if (k_max > nrow(Xr)) {
stop(paste0(
"Maximum number of nearest neighbors cannot exceed the ",
"number of reference observations"
))
}
kk <- k_max
}
input_dots <- list(...)
if (".local" %in% names(input_dots)) {
if (isTRUE(input_dots$.local)) {
if (!"pre_k" %in% names(input_dots)) {
stop(paste0(
"When .local = TRUE (passed to the ortho_diss method), the ",
"'pre_k' argument must be specified"
))
}
if (input_dots$pre_k < kk) {
stop(paste0(
"pre_k must be larger than ",
ifelse(is.null(k), "max(k_range)", "k")
))
}
}
}
if (!is.null(spike)) {
if (!is.vector(spike)) {
stop("spike must be a vector of integers")
}
if (any(!spike %% 1 == 0)) {
stop("spike must be a vector of integers")
}
if (length(spike) >= nrow(Xr)) {
stop("The lebgth of spike cannot be larger or equal to the number of rows of Xr")
}
if (max(spike) > nrow(Xr)) {
stop("Argument spike contains indices subscript out of bounds of Xr")
}
if (!is.null(k)) {
if (min(k) <= length(spike)) {
stop("values for k must be larger than length(spike)")
}
}
if (!is.null(k_diss)) {
if (min(k_range) <= length(spike)) {
stop("values for k_range must be larger than length(spike)")
}
}
spike <- sort(unique(as.integer(spike)))
}
dsm <- dissimilarity(
Xr = Xr,
Xu = Xu,
diss_method = diss_method,
Yr = Yr,
pc_selection = pc_selection,
return_projection = return_projection,
ws = ws,
center = center,
scale = scale,
...
)
skip_first <- ifelse(is.null(Xu), TRUE, FALSE)
if (!is.null(spike)) {
spike_hold <- spike[spike > 0]
spike_rm <- -spike[spike < 0]
spike_problem <- length(intersect(spike_hold, spike_rm)) > 0
if (spike_problem) {
stop("spike contains contradictory information, some indices to keep and to avoid are the same")
}
}
results <- diss_to_neighbors(dsm$dissimilarity,
k = k, k_diss = k_diss, k_range = k_range,
spike = spike,
return_dissimilarity = return_dissimilarity,
skip_first = skip_first
)
mprefix <- ifelse(skip_first, "Xr", "Xu")
colnames(results$neighbors) <- paste0(mprefix, 1:ncol(results$neighbors))
if (return_projection & diss_method %in% c("pca", "pca.nipals", "pls")) {
results$projection <- dsm$projection
}
if ("gh" %in% names(input_dots)) {
results$gh <- dsm$gh
}
results
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.