#' Calculate clusters using Hartigan's Leader Algorithm
#'
#' Takes a matrix of coordinates and outputs cluster ids
#' from running the leader algorithm. The coordinates can either be on
#' points in the space R^n, or latitude/longitude pairs. A radius delta
#' must be provided.
#'
#' @param points A matrix, or something which can be coerced into a
#' matrix, of coordinates with rows representing points
#' and columns representing dimensions. If using
#' \code{haversine} distance, this must be a two column
#' matrix with the first column containing latitudes in
#' decimal degrees and the second containing longitudes
#' in decimal degrees.
#'
#' @param radius A scalar value giving the radius of the resulting
#' clusters; this is the main tuning parameter for the
#' algorithm. When using the \code{haversine} distance
#' this value should be in kilometres.
#'
#' @param weights An vector of weights, one per row of points, to apply
#' to the clustering algorithm.
#'
#' @param max_iter Maximum number of times to iterate the algorithm; can
#' safely set to 1 in many instances. See Details.
#'
#' @param distance The method to be used for calculating distances between
#' points. If this is set to \code{haversine}, the
#' \code{points} must be a two column matrix. If
#' \code{Lp}, then the \code{p} input specifies which
#' norm is being used.
#'
#' @param p When using \code{Lp} as the value for \code{distance},
#' this is a positive number specifing which Lp-norm to
#' implement. For p equal to 1,2, or Inf, a special
#' implementation will be used which is slightly more
#' efficent than the more general application.
#'
#' @return A list containing a vector of cluster ids, a matrix of cluster
#' centroids, the number of clusters, and the number iterations.
#'
#' @details
#' The value for delta defines an approximate radius of each
#' cluster. As the algorithm runs, a point within a distance
#' delta from the centroid of a cluster will be labeled with
#' the coorisponding cluster. As centroid clusters move, it
#' is possible for the final radius of each cluster to be
#' slightly larger than delta.
#'
#' Unlike many other iterative clustering algorithms, the
#' leader algorithm typically provides reasonable clusters
#' after just a single pass. When speed is of concern, the
#' max_iter value may be safely set to 1. However, the
#' algorithm typically fully converges in only a few cycles;
#' also, a convergent solution will usually have a smaller
#' number of clusters than a solution with only one pass.
#'
#' The algorithm scales nicely, and can fit a model with
#' 100s of columns and 100k's of rows in (on a relatively
#' modest machine) under a minute. However, the processing
#' time decays significantly if the radius is too small,
#' since the number of clusters will be very high.
#'
#' @author Taylor B. Arnold, \email{taylor.arnold@@acm.org}
#'
#' @references J. A. Hartigan. Clustering Algorithms. John Wiley & Sons,
#' New York, 1975.
#'
#'@examples
#'points <- 1:10
#'out <- leaderCluster(points, radius=2, distance="Lp", max_iter=1L)
#'
# A two-dimensional example
#'par(mar = c(0,0,0,0), mfrow = c(1,3))
#'set.seed(1)
#'points <- matrix(runif(100*2), ncol=2)
#'for(r in c(0.1, 0.2, 0.4)) {
#' out <- leaderCluster(points = points, radius = r, distance="L2")$cluster_id
#' cols <- rainbow(length(unique(out)))[out]
#' plot(points, pch = 19, cex = 0.7, col = cols, axes = FALSE)
#' points(points[!duplicated(out),,drop=FALSE], cex = 2, col = unique(cols))
#' box()
#'}
#'
#' @export
leaderCluster <- function(points, radius, weights = rep(1, nrow(points)), max_iter = 10L,
distance = c("Lp", "L1", "L2", "Linf", "haversine"), p = 2)
{
type_code = match(distance[[1]], c("Lp", "L1", "L2", "Linf", "haversine")) - 1L
if (!is.matrix(points)) points = as.matrix(points)
if (is.na(type_code))
{
stop(paste0("The distance method '", distance[[1]], "' is not supported."))
}
if (p <= 0) stop(paste0("The input p='", p, "' is invalid. Must be > 0."))
if (type_code == 0L & p == 1) type_code = 1L
if (type_code == 0L & p == 2) type_code = 2L
if (type_code == 0L & p == Inf) type_code = 3L
if (!is.finite(p)) p = 2
if (type_code == 4L & ncol(points) != 2L)
{
stop("Haversine distance requires two dimensional input.")
}
cluster_id = rep(0, nrow(points))
iter = 1L
index = 1L:nrow(points)
ord = order(weights, decreasing=TRUE)
points = points[ord,,drop=FALSE]
index = index[ord]
while (iter <= max_iter )
{
out = .C(C_leader_cluster,
delta = as.double(radius),
points = as.double(points),
weights = as.double(weights),
cluster_id = as.integer(cluster_id),
nrow = nrow(points),
ncol = ncol(points),
type = type_code,
p = p,
PACKAGE = "leaderCluster")$cluster_id + 1
if(all(out == cluster_id)) break
cluster_id = out
ord = order(out)
points = points[ord,,drop=FALSE]
index = index[ord]
iter = iter + 1
}
num_clusters = max(cluster_id)
cluster_centroids = matrix(NA, ncol=ncol(points), nrow=num_clusters)
for (i in 1:ncol(points)) cluster_centroids[,i] = tapply(points[,i], cluster_id, mean)
return(list(cluster_id = out[match(1:length(index), index)],
cluster_centroids = cluster_centroids,
num_clusters = num_clusters,
iter = iter))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.