Nothing
#' hdbscan
#'
#' Implemenation of the hdbscan algorithm.
#'
#' @param edges An edge matrix of the type returned by \code{\link{buildEdgeMatrix}} or, alternatively, a \code{largeVis} object.
#' @param neighbors An adjacency matrix of the type returned by \code{\link{randomProjectionTreeSearch}}. Must be specified unless
#' \code{edges} is a \code{largeVis} object.
#' @param minPts The minimum number of points in a cluster.
#' @param K The number of points in the core neighborhood. (See details.)
#' @param threads Maximum number of threads. Determined automatically if \code{NULL} (the default). It is unlikely that
#' this parameter should ever need to be adjusted. It is only available to make it possible to abide by the CRAN limitation that no package
#' use more than two cores.
#' @param verbose Verbosity.
#'
#' @details The hyperparameter \code{K} controls the size of core neighborhoods.
#' The algorithm measures the density around a point as 1 / the distance between
#' that point and its \eqn{k}th nearest neighbor. A low value of \code{K} is similar
#' to clustering nearest neighbors rather than based on density. A high value of
#' \code{K} may cause the algorithm to miss some (usually contrived) clustering
#' patterns, such as where clusters are made up of points arranged in lines to form
#' shapes.
#'
#' The function must be provided sufficient nearest-neighbor data for whatever
#' is specified for \eqn{k}. If \eqn{k} = 5, for example, the edge matrix (and
#' neighbor matrix, if specified) must contain data on at least 5 neighbors for
#' each point. This should not be problematic in typical use in connection with
#' \code{\link{largeVis}}, which is ordinarily run with a far higher \eqn{k}-value
#' than hdbscan.
#'
#' @return An object of type \code{hdbscan} with the following fields:
#' \describe{
#' \item{'clusters'}{A vector of the cluster membership for each vertex. Outliers
#' are given \code{NA}}
#' \item{'probabilities'}{A vector of the degree of each vertex' membership. This
#' is calculated by standardizing each vertex' \eqn{lambda_p} against the \eqn{lambda_birth}
#' and \eqn{lambda_death} of the cluster.}
#' \item{'glosh'}{A vector of GLOSH outlier scores for each node assigned to a cluster. NA for nodes not
#' in a cluster.}
#' \item{'tree'}{The minimum spanning tree used to generate the clustering.}
#' \item{'hierarchy'}{A representation of the condensed cluster hierarchy.}
#' \item{'call'}{The call.}
#' }
#'
#' The hierarchy describes the complete post-condensation structure of the tree:
#' \describe{
#' \item{'nodemembership'}{The cluster ID of the vertex's immediate parent, after condensation.}
#' \item{'lambda'}{\eqn{\lambda_p} for each vertex.}
#' \item{'parent'}{The cluster ID of each cluster's parent.}
#' \item{'stability'}{The cluster's stability, taking into account child-node stabilities.}
#' \item{'selected'}{Whether the cluster was selected.}
#' \item{'coredistances'}{The core distance determined for each vertex.}
#' \item{'lamba_birth'}{\eqn{\lambda_b} for each cluster.}
#' \item{'lambda_deaeth'}{\eqn{\lambda_d} for each cluster.}
#' }
#'
#' @references R. Campello, D. Moulavi, and J. Sander, Density-Based Clustering Based on Hierarchical Density Estimates In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172. 2013
#' @seealso \url{https://github.com/lmcinnes/hdbscan}
#' @note This is not precisely the \code{HDBSCAN} algorithm because it relies on the
#' nearest neighbor data generated by the \code{LargeVis} algorithm. In particular,
#' \code{HDBSCAN} assumes that all points can be connected in a single minimal-spanning
#' tree. This implementation uses a minimal-spanning forest, because the graph may not
#' be fully connected depending on the amount of nearest-neighbor data provided.
#' If, for example, the data has distinct clusters where no member of some cluster is a
#' nearest neighbor of a point in any other cluster, which can easily happen, the algorithm will
#' generate distinct trees for each disconnected set of points. In testing, this
#' improved the performance of the algorithm.
#'
#' @note Do not rely on the content of the \code{probabilities} field for outliers. A future version
#' will hopefully provide some measure of outlier factor.
#' @examples
#' \dontrun{
#' library(largeVis)
#' library(clusteringdatasets) # See https://github.com/elbamos/clusteringdatasets
#' data(spiral)
#' dat <- as.matrix(spiral[, 1:2])
#' neighbors <- randomProjectionTreeSearch(t(dat), K = 10, tree_threshold = 100,
#' max_iter = 5, threads = 1)
#' edges <- buildEdgeMatrix(t(dat), neighbors)
#' clusters <- hdbscan(edges, neighbors = neighbors, verbose = FALSE, threads = 1)
#'
#' # Calling largeVis while setting sgd_batches to 1 is
#' # the simplest way to generate the data structures neeeded for hdbscan
#' spiralVis <- largeVis(t(dat), K = 10, tree_threshold = 100, max_iter = 5,
#' sgd_batches = 1, threads = 1)
#' clusters <- hdbscan(spiralVis, verbose = FALSE, threads = 1)
#' # The gplot function helps to visualize the clustering
#' largeHighDimensionalDataset <- matrix(rnorm(50000), ncol = 50)
#' vis <- largeVis(largeHighDimensionalDataset)
#' clustering <- hdbscan(vis)
#' gplot(clustering, t(vis$coords))
#' }
#' @export
#' @importFrom stats aggregate
hdbscan <- function(edges, neighbors = NULL, minPts = 20, K = 5,
threads = NULL,
verbose = getOption("verbose", TRUE)) {
if (inherits(edges, "edgematrix")) {
edges <- t(toMatrix(edges))
} else if (inherits(edges, "largeVis")) {
if (missing(neighbors)) neighbors <- edges$knns
edges <- t(toMatrix(edges$edges))
} else {
stop("edges must be either an edgematrix or a largeVis object")
}
if (is.null(edges) || is.null(neighbors)) stop("Both edges and neighbors must be specified (or use a largeVis object)")
if (!is.null(neighbors)) {
neighbors[is.na(neighbors)] <- -1
if (ncol(neighbors) != ncol(edges)) neighbors <- t(neighbors)
}
clustersout <- hdbscanc(edges = edges,
neighbors = neighbors,
K = as.integer(K),
minPts = as.integer(minPts),
threads = threads,
verbose = as.logical(verbose))
clusters = factor(clustersout$clusters)
probs <- data.frame(
probs = clustersout$lambdas
)
mins = stats::aggregate(probs, by = list(clusters), FUN = "min")$probs
maxes = stats::aggregate(probs, by = list(clusters), FUN = "max")$probs - mins
probs$probs[!is.na(clusters)] <- (probs$probs[!is.na(clusters)] -
mins[as.integer(clusters)[!is.na(clusters)]]) /
maxes[as.integer(clusters)[!is.na(clusters)]]
# Adjust C->R style numbering
hierarchy <- clustersout$hierarchy
hierarchy$nodemembership <- hierarchy$nodemembership + 1
hierarchy$parent <- hierarchy$parent + 1
# hierarchy$coredistances <- hierarchy$coredistances
tree <- clustersout$tree + 1
tree[tree == 0] <- NA
# GLOSH
fmax <- stats::aggregate(hierarchy$lambda, by = list(hierarchy$nodemembership), FUN = max,
na.rm = TRUE, simplify = TRUE, drop = FALSE)
maxes <- fmax$x[match(hierarchy$nodemembership, fmax$Group.1)]
glosh <- (maxes - hierarchy$lambda) / maxes
structure(
.Data = list(
clusters = clusters,
probabilities = probs$probs,
GLOSH = glosh,
tree = tree,
hierarchy = hierarchy
),
call = sys.call(),
class = "hdbscan")
}
#' gplot
#'
#' Plot an \code{hdbscan} object, using \code{\link[ggplot2]{ggplot}}. The
#' plot is primarily intended for diagnostic purposes, but can be useful to undertand
#' how clusters were generated.
#'
#' Point color corresponds to clusters, with outliers as the \code{NA} color. Alpha
#' corresponds to the node's GLOSH score. The segments on the plot
#' correspond to the connections on the minimum spanning tree. Segment alpha
#' corresponds to \eqn{\lambda_p}.
#'
#' If the parameter \code{text} is set to \code{TRUE} or \code{"parent"}, the nodes will
#' be labelled with the node index number or cluster index number, respectively.
#'
#' @param x An \code{hdbscan} object.
#' @param coords Coordinates for the points clustered in \code{x}.
#' @param text If \code{TRUE}, include on the plot labels for each node's index.
#' If \code{"parent"}, the labels will instead be the index number of the node's
#' cluster.
#' @param distances If \code{TRUE}, draw circles representing the core distances around each point.
#'
#' @return A \code{\link[ggplot2]{ggplot}} object
#' @export
#'
#' @examples
#' \dontrun{
#' library(largeVis)
#' # The aggregation dataset can be downloaded from https://github.com/elbamos/clusteringdatasets
#' data(Aggregation)
#' dat <- as.matrix(Aggregation[, 1:2])
#' aggregateVis <- largeVis(dat, K = 10, tree_threshold = 100,
#' max_iter = 5, sgd_batches = 1)
#' clusters <- hdbscan(aggregateVis, verbose = FALSE)
#' gplot(clusters, dat)
#' }
#' @importFrom ggplot2 ggplot unit geom_label geom_point geom_segment aes_
gplot <- function(x, coords, text = FALSE, distances = FALSE) {
dframe <- data.frame(coords)
colnames(dframe) <- c("x", "y")
dframe$cluster = x$clusters
dframe$glosh = x$GLOSH
dframe$distance = x$hierarchy$coredistances
dframe$glosh[is.nan(dframe$glosh)] <-
x$hierarchy$lambda[is.nan(dframe$glosh)]
dframe$label <- 0:(nrow(dframe) - 1)
dframe$parent <- x$hierarchy$nodemembership
xy <- data.frame(coords[x$tree, ])
colnames(xy) <- c("x2", "y2")
dframe <- cbind(dframe, xy)
plt <- ggplot2::ggplot(dframe,
ggplot2::aes_(x = quote(x), y = quote(y),
xend = quote(x2), yend = quote(y2), color = quote(cluster)))
if (distances) {
if (requireNamespace("ggforce", quietly = TRUE)) plt <- plt + ggforce::geom_circle(ggplot2::aes_(x0 = quote(x),
y0 = quote(y),
r = quote(distance),
fill = quote(cluster),
color = quote(cluster)),
inherit.aes = FALSE, alpha = 0.1, linetype = "dotted")
else warning("To display distance circles, the ggforce package must be installed.")
}
plt <- plt +
ggplot2::geom_point(aes_(alpha = quote(glosh)), size = 0.7) +
ggplot2::geom_segment(size = 0.5, ggplot2::aes_(alpha = quote(glosh))) +
ggplot2::scale_alpha_continuous(trans = "reverse")
if (text == "parent") {
plt <- plt + ggplot2::geom_label(ggplot2::aes_(label = quote(parent)), size = 2.5,
label.padding = ggplot2::unit(0.1, "lines"),
label.size = 0.1, alpha = 0.7)
} else if (text) {
plt <- plt + ggplot2::geom_label(ggplot2::aes_(label = quote(label)), size = 2.5,
label.padding = ggplot2::unit(0.1, "lines"),
label.size = 0.1, alpha = 0.7)
}
plt
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.