R/hdbscan.R
In largeVis: High-Quality Visualizations of Large, High-Dimensional Datasets

Documented in gplot hdbscan

#' hdbscan
#'
#' Implemenation of the hdbscan algorithm.
#'
#' @param edges An edge matrix of the type returned by \code{\link{buildEdgeMatrix}} or, alternatively, a \code{largeVis} object.
#' @param neighbors An adjacency matrix of the type returned by \code{\link{randomProjectionTreeSearch}}. Must be specified unless
#' \code{edges} is a \code{largeVis} object.
#' @param minPts The minimum number of points in a cluster.
#' @param K The number of points in the core neighborhood. (See details.)
#' @param threads Maximum number of threads. Determined automatically if \code{NULL} (the default).  It is unlikely that
#' this parameter should ever need to be adjusted.  It is only available to make it possible to abide by the CRAN limitation that no package
#' use more than two cores.
#' @param verbose Verbosity.
#'
#' @details The hyperparameter \code{K} controls the size of core neighborhoods.
#' The algorithm measures the density around a point as 1 / the distance between
#' that point and its \eqn{k}th nearest neighbor. A low value of \code{K} is similar
#' to clustering nearest neighbors rather than based on density. A high value of
#' \code{K} may cause the algorithm to miss some (usually contrived) clustering
#' patterns, such as where clusters are made up of points arranged in lines to form
#' shapes.
#'
#' The function must be provided sufficient nearest-neighbor data for whatever
#' is specified for \eqn{k}. If \eqn{k} = 5, for example, the edge matrix (and
#' neighbor matrix, if specified) must contain data on at least 5 neighbors for
#' each point. This should not be problematic in typical use in connection with
#' \code{\link{largeVis}}, which is ordinarily run with a far higher \eqn{k}-value
#' than hdbscan.
#'
#' @return An object of type \code{hdbscan} with the following fields:
#' \describe{
#'    \item{'clusters'}{A vector of the cluster membership for each vertex. Outliers
#'    are given \code{NA}}
#'    \item{'probabilities'}{A vector of the degree of each vertex' membership. This
#'    is calculated by standardizing each vertex' \eqn{lambda_p} against the \eqn{lambda_birth}
#'    and \eqn{lambda_death} of the cluster.}
#'    \item{'glosh'}{A vector of GLOSH outlier scores for each node assigned to a cluster. NA for nodes not
#'    in a cluster.}
#'    \item{'tree'}{The minimum spanning tree used to generate the clustering.}
#'    \item{'hierarchy'}{A representation of the condensed cluster hierarchy.}
#'    \item{'call'}{The call.}
#'  }
#'
#'  The hierarchy describes the complete post-condensation structure of the tree:
#'  \describe{
#'  \item{'nodemembership'}{The cluster ID of the vertex's immediate parent, after condensation.}
#'  \item{'lambda'}{\eqn{\lambda_p} for each vertex.}
#'  \item{'parent'}{The cluster ID of each cluster's parent.}
#'  \item{'stability'}{The cluster's stability, taking into account child-node stabilities.}
#'  \item{'selected'}{Whether the cluster was selected.}
#'  \item{'coredistances'}{The core distance determined for each vertex.}
#'  \item{'lamba_birth'}{\eqn{\lambda_b} for each cluster.}
#'  \item{'lambda_deaeth'}{\eqn{\lambda_d} for each cluster.}
#'  }
#'
#' @references R. Campello, D. Moulavi, and J. Sander, Density-Based Clustering Based on Hierarchical Density Estimates In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172. 2013
#' @seealso \url{https://github.com/lmcinnes/hdbscan}
#' @note This is not precisely the \code{HDBSCAN} algorithm because it relies on the
#' nearest neighbor data generated by the \code{LargeVis} algorithm. In particular,
#' \code{HDBSCAN} assumes that all points can be connected in a single minimal-spanning
#' tree. This implementation uses a minimal-spanning forest, because the graph may not
#' be fully connected depending on the amount of nearest-neighbor data provided.
#' If, for example, the data has distinct clusters where no member of some cluster is a
#' nearest neighbor of a point in any other cluster, which can easily happen, the algorithm will
#' generate distinct trees for each disconnected set of points. In testing, this
#' improved the performance of the algorithm.
#'
#' @note Do not rely on the content of the \code{probabilities} field for outliers. A future version
#' will hopefully provide some measure of outlier factor.
#' @examples
#' \dontrun{
#' library(largeVis)
#' library(clusteringdatasets)  # See https://github.com/elbamos/clusteringdatasets
#' data(spiral)
#' dat <- as.matrix(spiral[, 1:2])
#' neighbors <- randomProjectionTreeSearch(t(dat), K = 10, tree_threshold = 100,
#'                                        max_iter = 5, threads = 1)
#' edges <- buildEdgeMatrix(t(dat), neighbors)
#' clusters <- hdbscan(edges, neighbors = neighbors, verbose = FALSE, threads = 1)
#'
#' # Calling largeVis while setting sgd_batches to 1 is
#' # the simplest way to generate the data structures neeeded for hdbscan
#' spiralVis <- largeVis(t(dat), K = 10, tree_threshold = 100, max_iter = 5,
#'                       sgd_batches = 1, threads = 1)
#' clusters <- hdbscan(spiralVis, verbose = FALSE, threads = 1)
#' # The gplot function helps to visualize the clustering
#' largeHighDimensionalDataset <- matrix(rnorm(50000), ncol = 50)
#' vis <- largeVis(largeHighDimensionalDataset)
#' clustering <- hdbscan(vis)
#' gplot(clustering, t(vis$coords))
#' }
#' @export
#' @importFrom stats aggregate
hdbscan <- function(edges, neighbors = NULL, minPts = 20, K = 5,
										threads = NULL,
										verbose = getOption("verbose", TRUE)) {

	if (inherits(edges, "edgematrix")) {
		edges <- t(toMatrix(edges))
	} else if (inherits(edges, "largeVis")) {
		if (missing(neighbors)) neighbors <- edges$knns
		edges <- t(toMatrix(edges$edges))
	} else {
		stop("edges must be either an edgematrix or a largeVis object")
	}
	if (is.null(edges) || is.null(neighbors)) stop("Both edges and neighbors must be specified (or use a largeVis object)")

	if (!is.null(neighbors)) {
		neighbors[is.na(neighbors)] <- -1
		if (ncol(neighbors) != ncol(edges)) neighbors <- t(neighbors)
	}

	clustersout <- hdbscanc(edges = edges,
													neighbors = neighbors,
													K	= as.integer(K),
													minPts = as.integer(minPts),
													threads = threads,
													verbose = as.logical(verbose))

	clusters = factor(clustersout$clusters)
	probs <- data.frame(
		probs = clustersout$lambdas
	)
	mins = stats::aggregate(probs, by = list(clusters), FUN = "min")$probs
	maxes = stats::aggregate(probs, by = list(clusters), FUN = "max")$probs - mins
	probs$probs[!is.na(clusters)] <- (probs$probs[!is.na(clusters)] -
																			mins[as.integer(clusters)[!is.na(clusters)]]) /
		maxes[as.integer(clusters)[!is.na(clusters)]]

	# Adjust C->R style numbering
	hierarchy <- clustersout$hierarchy
	hierarchy$nodemembership <- hierarchy$nodemembership + 1
	hierarchy$parent <- hierarchy$parent + 1
#	hierarchy$coredistances <- hierarchy$coredistances
	tree <- clustersout$tree + 1
	tree[tree == 0] <- NA

	# GLOSH
	fmax <- stats::aggregate(hierarchy$lambda, by = list(hierarchy$nodemembership), FUN = max,
													 na.rm = TRUE, simplify = TRUE, drop = FALSE)
	maxes <- fmax$x[match(hierarchy$nodemembership, fmax$Group.1)]
	glosh <- (maxes - hierarchy$lambda) / maxes

	structure(
		.Data = list(
			clusters = clusters,
			probabilities = probs$probs,
			GLOSH = glosh,
			tree = tree,
			hierarchy = hierarchy
		),
		call = sys.call(),
		class = "hdbscan")
}

#' gplot
#'
#' Plot an \code{hdbscan} object, using \code{\link[ggplot2]{ggplot}}. The
#' plot is primarily intended for diagnostic purposes, but can be useful to undertand
#' how clusters were generated.
#'
#' Point color corresponds to clusters, with outliers as the \code{NA} color. Alpha
#' corresponds to the node's GLOSH score. The segments on the plot
#' correspond to the connections on the minimum spanning tree. Segment alpha
#' corresponds to \eqn{\lambda_p}.
#'
#' If the parameter \code{text} is set to \code{TRUE} or \code{"parent"}, the nodes will
#' be labelled with the node index number or cluster index number, respectively.
#'
#' @param x An \code{hdbscan} object.
#' @param coords Coordinates for the points clustered in \code{x}.
#' @param text If \code{TRUE}, include on the plot labels for each node's index.
#' If \code{"parent"}, the labels will instead be the index number of the node's
#' cluster.
#' @param distances If \code{TRUE}, draw circles representing the core distances around each point.
#'
#' @return A \code{\link[ggplot2]{ggplot}} object
#' @export
#'
#' @examples
#' \dontrun{
#' library(largeVis)
#' # The aggregation dataset can be downloaded from https://github.com/elbamos/clusteringdatasets
#' data(Aggregation)
#' dat <- as.matrix(Aggregation[, 1:2])
#' aggregateVis <- largeVis(dat, K = 10, tree_threshold = 100,
#'                          max_iter = 5, sgd_batches = 1)
#' clusters <- hdbscan(aggregateVis, verbose = FALSE)
#' gplot(clusters, dat)
#' }
#' @importFrom ggplot2 ggplot unit geom_label geom_point geom_segment aes_
gplot <- function(x, coords, text = FALSE, distances = FALSE) {
	dframe <- data.frame(coords)
	colnames(dframe) <- c("x", "y")
	dframe$cluster = x$clusters
	dframe$glosh = x$GLOSH
	dframe$distance = x$hierarchy$coredistances
	dframe$glosh[is.nan(dframe$glosh)] <-
		x$hierarchy$lambda[is.nan(dframe$glosh)]
	dframe$label <- 0:(nrow(dframe) - 1)
	dframe$parent <- x$hierarchy$nodemembership
	xy <- data.frame(coords[x$tree, ])
	colnames(xy) <- c("x2", "y2")
	dframe <- cbind(dframe, xy)
	plt <- ggplot2::ggplot(dframe,
												 ggplot2::aes_(x = quote(x), y = quote(y),
												 							xend = quote(x2), yend = quote(y2), color = quote(cluster)))
	if (distances) {
		if (requireNamespace("ggforce", quietly = TRUE)) plt <- plt + ggforce::geom_circle(ggplot2::aes_(x0 = quote(x),
																																 y0 = quote(y),
																																 r = quote(distance),
																																 fill = quote(cluster),
																																 color = quote(cluster)),
																									 inherit.aes = FALSE, alpha = 0.1, linetype = "dotted")
		else warning("To display distance circles, the ggforce package must be installed.")
	}
	plt <- plt +
		ggplot2::geom_point(aes_(alpha = quote(glosh)), size = 0.7) +
		ggplot2::geom_segment(size = 0.5, ggplot2::aes_(alpha = quote(glosh))) +
		ggplot2::scale_alpha_continuous(trans = "reverse")

	if (text == "parent") {
		plt <- plt + ggplot2::geom_label(ggplot2::aes_(label = quote(parent)), size = 2.5,
																		 label.padding = ggplot2::unit(0.1, "lines"),
																		 label.size = 0.1, alpha = 0.7)
	} else if (text) {
		plt <- plt + ggplot2::geom_label(ggplot2::aes_(label = quote(label)), size = 2.5,
																		 label.padding = ggplot2::unit(0.1, "lines"),
																		 label.size = 0.1, alpha = 0.7)
	}
	plt
}