Nothing
# This file is part of the deadwood package for R.
# ############################################################################ #
# #
# Copyleft (C) 2020-2026, Marek Gagolewski <https://www.gagolewski.com> #
# #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU Affero General Public License #
# Version 3, 19 November 2007, published by the Free Software Foundation. #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU Affero General Public License Version 3 for more details. #
# You should have received a copy of the License along with this program. #
# If this is not the case, refer to <https://www.gnu.org/licenses/>. #
# #
# ############################################################################ #
#' @title Euclidean and Mutual Reachability Minimum Spanning Trees
#'
#' @description
#' A Euclidean minimum spanning tree (MST) provides a computationally
#' convenient representation of a dataset: the \eqn{n} points are connected
#' via \eqn{n-1} shortest segments. Provided that the dataset
#' has been appropriately preprocessed so that the distances between the
#' points are informative, an MST can be applied in outlier detection,
#' clustering, density estimation, dimensionality reduction, and many other
#' topological data analysis tasks.
#'
#'
#' @details
#' If \code{d} is a matrix and the Euclidean distance is requested
#' (the default), then the MST is computed via a call to
#' \code{\link[quitefastmst]{mst_euclid}} from \pkg{quitefastmst}.
#' It is efficient in low-dimensional spaces. Otherwise, a general-purpose
#' implementation of the Jarník (Prim/Dijkstra)-like \eqn{O(n^2)}-time algorithm
#' is called.
#'
#' If \eqn{M>0}, then the minimum spanning tree is computed with respect to
#' a mutual reachability distance (Campello et al., 2013):
#' \eqn{d_M(i,j)=\max(d(i,j), c_M(i), c_M(j))}, where \eqn{d(i,j)} is
#' an ordinary distance and \eqn{c_M(i)} is the core distance given by
#' \eqn{d(i,k)} with \eqn{k} being \eqn{i}'s \eqn{M}-th nearest neighbour
#' (not including self, unlike in Campello et al., 2013).
#' This pulls outliers away from their neighbours.
#'
#' If the distances are not unique, there might be multiple trees
#' spanning a given graph that meet the minimality property.
#'
#'
#' @seealso
#' \code{\link[quitefastmst]{mst_euclid}}
#'
#' @references
#' V. Jarník, O jistem problemu minimalnim (z dopisu panu O. Borůvkovi),
#' \emph{Prace Moravske Prirodovedecke Spolecnosti} 6, 1930, 57-63
#'
#' C.F. Olson, Parallel algorithms for hierarchical clustering,
#' \emph{Parallel Computing} 21, 1995, 1313-1325
#'
#' R. Prim, Shortest connection networks and some generalisations,
#' \emph{The Bell System Technical Journal} 36(6), 1957, 1389-1401
#'
#' O. Borůvka, O jistém problému minimálním, \emph{Práce Moravské
#' Přírodovědecké Společnosti} 3, 1926, 37–58
#'
#' J.L. Bentley, Multidimensional binary search trees used for associative
#' searching, \emph{Communications of the ACM} 18(9), 509–517, 1975,
#' \doi{10.1145/361002.361007}
#
#' W.B. March, R. Parikshit, A. Gray, Fast Euclidean minimum spanning
#' tree: Algorithm, analysis, and applications, \emph{Proc. 16th ACM SIGKDD
#' Intl. Conf. Knowledge Discovery and Data Mining (KDD '10)}, 2010, 603–612
#'
#' R.J.G.B. Campello, D. Moulavi, J. Sander,
#' Density-based clustering based on hierarchical density estimates,
#' \emph{Lecture Notes in Computer Science} 7819, 2013, 160-172,
#' \doi{10.1007/978-3-642-37456-2_14}
#'
#' M. Gagolewski, quitefastmst, in preparation, 2026, TODO
#'
#'
#' @param d either a numeric matrix (or an object coercible to one,
#' e.g., a data frame with numeric-like columns) or an
#' object of class \code{dist}; see \code{\link[stats]{dist}}
#'
#' @param distance metric used in the case where \code{d} is a matrix; one of:
#' \code{"euclidean"} (synonym: \code{"l2"}),
#' \code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}),
#' \code{"cosine"}
#'
#' @param M smoothing factor; \eqn{M=0} selects the requested
#' \code{distance}; otherwise, the corresponding degree-\code{M} mutual
#' reachability distance is used
#'
#' @param verbose logical; whether to print diagnostic messages
#' and progress information
#'
#' @param ... further arguments passed to \code{\link[quitefastmst]{mst_euclid}}
#' from \pkg{quitefastmst}
#'
#'
#' @return
#' Returns a numeric matrix of class \code{mst} with \eqn{n-1} rows and
#' three columns: \code{from}, \code{to}, and \code{dist}.
#' Its \eqn{i}-th row specifies the \eqn{i}-th edge of the MST
#' which is incident to the vertices \code{from[i]} and \code{to[i]} with
#' \code{from[i] < to[i]} (in \eqn{1,...,n}) and \code{dist[i]} gives
#' the corresponding weight, i.e., the distance between the point pair.
#' Edges are ordered increasingly with respect to their weights.
#'
#' The \code{Size} attribute specifies the number of points, \eqn{n}.
#' The \code{Labels} attribute gives the labels of the input points,
#' if available.
#' The \code{method} attribute provides the name of the distance function used.
#'
#' If \eqn{M>0}, the \code{nn.index} attribute gives the indexes
#' of the \code{M} nearest neighbours of each point
#' and \code{nn.dist} provides the corresponding distances,
#' both in the form of an \eqn{n} by \eqn{M} matrix.
#'
#'
#' @examples
#' library("datasets")
#' data("iris")
#' X <- jitter(as.matrix(iris[1:2])) # some data
#' T <- mst(X)
#' plot(X, asp=1, las=1)
#' segments(X[T[, 1], 1], X[T[, 1], 2],
#' X[T[, 2], 1], X[T[, 2], 2])
#'
#' @rdname mst
#' @export
mst <- function(d, ...)
{
UseMethod("mst")
}
#' @export
#' @rdname mst
#' @method mst default
mst.default <- function(
d,
M=0L,
distance=c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"),
verbose=FALSE,
...
) {
d <- as.matrix(d)
M <- as.integer(M)[1]
distance <- match.arg(distance)
verbose <- !identical(verbose, FALSE)
if (distance %in% c("euclidean", "l2")) {
.res <- mst_euclid(
d, M, ...,
verbose=verbose
)
result <- cbind(.res[["mst.index"]], .res[["mst.dist"]])
attr(result, "nn.index") <- .res[["nn.index"]]
attr(result, "nn.dist") <- .res[["nn.dist"]]
}
else {
result <- .oldmst.matrix(d, distance, M, verbose=verbose)
}
stopifnot(result[, 1] < result[, 2])
stopifnot(!is.unsorted(result[, 3]))
structure(
result,
class="mst",
Size=nrow(d),
Labels=dimnames(d)[[1]], # dist() returns `Labels`, not `labels`
method=if (M == 0L) distance else
sprintf("mutual reachability distance (%s, M=%d)", distance, M)
)
}
#' @export
#' @rdname mst
#' @method mst dist
mst.dist <- function(
d,
M=0L,
verbose=FALSE,
...
) {
#cast_float32 <- !identical(cast_float32, FALSE)
verbose <- !identical(verbose, FALSE)
M <- as.integer(M)[1]
result <- .oldmst.dist(d, M, verbose)
structure(
result,
class="mst",
Size=nrow(result)+1,
Labels=attr(d, "Labels"), # dist() returns `Labels`, not `labels`
method=if (M == 0L) attr(d, "method") else
sprintf("mutual reachability distance (%s, M=%d)", attr(d, "method"), M)
)
}
registerS3method("mst", "default", "mst.default")
registerS3method("mst", "dist", "mst.dist")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.