Nothing
#' Minimize the convex clustering loss function
#'
#' @description Minimizes the convex clustering loss function for a given set of
#' values for lambda.
#'
#' @param X An \eqn{n} x \eqn{p} numeric matrix. This function assumes that each
#' row represents an object with \eqn{p} attributes.
#' @param W A \code{sparseweights} object, see \link{sparse_weights}.
#' @param lambdas A vector containing the values for the penalty parameter.
#' @param tau Parameter to compute the threshold to fuse clusters. Default is
#' 0.001.
#' @param center If \code{TRUE}, center \code{X} so that each column has mean
#' zero. Default is \code{TRUE}.
#' @param scale If \code{TRUE}, scale the loss function to ensure that the
#' cluster solution is invariant to the scale of \code{X}. Default is
#' \code{TRUE}. Not recommended to set to \code{FALSE} unless comparing to
#' algorithms that minimize the unscaled convex clustering loss function.
#' @param eps_conv Parameter for determining convergence of the minimization.
#' Default is 1e-6.
#' @param burnin_iter Number of updates of the loss function that are done
#' without step doubling. Default is 25.
#' @param max_iter_conv Maximum number of iterations for minimizing the loss
#' function. Default is 5000.
#' @param save_clusterpath If \code{TRUE}, store the solution that minimized
#' the loss function for each lambda. Is required for drawing the clusterpath.
#' Default is \code{FALSE}. To store the clusterpath coordinates, \eqn{n} x
#' \eqn{p} x \eqn{no. lambdas} have to be stored, this may require too much
#' memory for large data sets.
#' @param target_losses The values of the loss function that are used to
#' determine convergence of the algorithm (tested as: loss - target <=
#' \code{eps_conv} * target). If the input is not \code{NULL}, it should be a
#' vector with the same length as \code{lambdas}. Great care should be exercised
#' to make sure that the target losses correspond to attainable values for the
#' minimization. The inputs (\code{X}, \code{W}, \code{lambdas}) should be the
#' same, but also the same version of the loss function (centered, scaled)
#' should be used. Default is \code{NULL}.
#' @param save_losses If \code{TRUE}, return the values of the loss function
#' attained during minimization for each value of lambda. Default is
#' \code{FALSE}.
#' @param save_convergence_norms If \code{TRUE}, return the norm of the
#' difference between consecutive iterates during minimization for each value
#' of lambda. Default is \code{FALSE}. If timing the algorithm is of importance,
#' do not set this to \code{TRUE}, as additional computations are done for
#' bookkeeping that are irrelevant to the optimization.
#'
#' @return A \code{cvxclust} object containing the following
#' \item{\code{info}}{A dataframe containing for each value for lambda: the
#' number of different clusters, and the value of the loss function at the
#' minimum.}
#' \item{\code{merge}}{The merge table containing the order at which the
#' observations in \code{X} are clustered.}
#' \item{\code{height}}{The value for lambda at which each reduction in the
#' number of clusters occurs.}
#' \item{\code{order}}{The order of the observations in \code{X} in order to
#' draw a dendrogram without conflicting branches.}
#' \item{\code{elapsed_time}}{The number of seconds that elapsed while
#' running the code. Note that this does not include the time required for
#' input checking and possibly scaling and centering \code{X}.}
#' \item{\code{coordinates}}{The clusterpath coordinates. Only part of the
#' output in case that \code{save_clusterpath=TRUE}.}
#' \item{\code{lambdas}}{The values for lambda for which a clustering was
#' found.}
#' \item{\code{eps_fusions}}{The threshold for cluster fusions that was used by
#' the algorithm.}
#' \item{\code{num_clusters}}{The different numbers of clusters that have been
#' found.}
#' \item{\code{n}}{The number of observations in \code{X}.}
#' \item{\code{losses}}{Optional: if \code{save_losses = TRUE}, the values of
#' the loss function during minimization.}
#' \item{\code{convergence_norms}}{Optional: if
#' \code{save_convergence_norms = TRUE}, the norms of the differences between
#' consecutive iterates during minimization.}
#'
#' @examples
#' # Load data
#' data(two_half_moons)
#' data = as.matrix(two_half_moons)
#' X = data[, -3]
#' y = data[, 3]
#'
#' # Get sparse weights in dictionary of keys format with k = 5 and phi = 8
#' W = sparse_weights(X, 5, 8.0)
#'
#' # Set a sequence for lambda
#' lambdas = seq(0, 2400, 1)
#'
#' # Compute clusterpath
#' res = convex_clusterpath(X, W, lambdas)
#'
#' # Get cluster labels for two clusters
#' labels = clusters(res, 2)
#'
#' # Plot the clusterpath with colors based on the cluster labels
#' plot(res, col = labels)
#'
#' @seealso \link{convex_clustering}, \link{sparse_weights}
#'
#' @export
convex_clusterpath <- function(X, W, lambdas, tau = 1e-3, center = TRUE,
scale = TRUE, eps_conv = 1e-6, burnin_iter = 25,
max_iter_conv = 5000, save_clusterpath = TRUE,
target_losses = NULL, save_losses = FALSE,
save_convergence_norms = FALSE)
{
# Input checks
.check_array(X, 2, "X")
.check_weights(W)
.check_lambdas(lambdas)
.check_scalar(tau, TRUE, "tau", upper_bound = 1)
.check_boolean(center, "center")
.check_boolean(scale, "scale")
.check_scalar(eps_conv, TRUE, "eps_conv", upper_bound = 1)
.check_int(burnin_iter, FALSE, "burnin_iter")
.check_int(max_iter_conv, FALSE, "max_iter_conv")
.check_boolean(save_clusterpath, "save_clusterpath")
.check_boolean(save_losses, "save_losses")
.check_boolean(save_convergence_norms, "save_convergence_norms")
# Check the vector of target losses
if (!is.null(target_losses)) {
.check_array(target_losses, 1, "target_losses")
if (length(lambdas) != length(target_losses)) {
message = "target_losses should have the same length as lambdas"
stop(message)
}
use_target = TRUE
} else {
target_losses = rep(-1, length(lambdas))
use_target = FALSE
}
# Preliminaries
n = nrow(X)
# Set the means of each column of X to zero
if (center) {
X_ = X - matrix(apply(X, 2, mean), byrow = TRUE, ncol = ncol(X),
nrow = nrow(X))
} else {
X_ = X
}
# Transpose X
X_ = t(X_)
# Separate the weights into keys and values
W_idx = t(W$keys) - 1
W_val = W$values
# Compute fusion threshold
eps_fusions = .fusion_threshold(X_, tau)
t_start = Sys.time()
clust = .convex_clusterpath(X_, W_idx, W_val, lambdas, target_losses,
eps_conv, eps_fusions, scale, save_clusterpath,
use_target, save_losses,
save_convergence_norms, burnin_iter,
max_iter_conv)
elapsed_time = difftime(Sys.time(), t_start, units = "secs")
# Construct result
result = list()
result$info = data.frame(
clust$info_d[1, ],
clust$info_i[2, ],
clust$info_d[2, ],
clust$info_i[1, ]
)
names(result$info) = c("lambda", "clusters", "loss", "iterations")
# Merge table and height vector
result$merge = t(clust$merge)
result$height = clust$height
# Determine order of the observations for a dendrogram
# Start with an entry in a hashmap for each observation
d = hashmap()
for (i in 1:n) {
d[[-i]] = i
}
# Work through the merge table to make sure that everything that is
# merged is next to each other
if (nrow(result$merge) >= 1) {
for (i in 1:nrow(result$merge)) {
d[[i]] = c(d[[result$merge[i, 1]]], d[[result$merge[i, 2]]])
delete(d, result$merge[i, 1])
delete(d, result$merge[i, 2])
}
}
# Finally, create a vector with the order of the observations
result$order = c()
keys = keys(d)
for (key in keys) {
result$order = c(result$order, d[[key]])
}
# Add elapsed time
result$elapsed_time = elapsed_time
# Add clusterpath coordinates
if (save_clusterpath) {
result$coordinates = t(clust$clusterpath)
}
# Add lambdas
result$lambdas = result$info$lambda
# Add fusion threshold
result$eps_fusions = eps_fusions
# Add vector of possible cluster counts
result$num_clusters = unique(result$info$clusters)
# Add the number of observations
result$n = nrow(X)
# Give the result a class
class(result) = "cvxclust"
# Add losses
if (save_losses) {
result$losses = clust$losses
}
# Add convergence norms
if (save_convergence_norms) {
result$convergence_norms = clust$convergence_norms
}
return(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.