#' @title K-Means Clustering
#' @description \code{kMeansLloyd} performs the k-means algorithm based on
#' Lloyd's paper (1982) on a data matrix.
#' @usage kMeansLloyd(x, centroids, maxIter = 10L, nStart = 1L)
#'
#' @param x matrix or an object that can be coerced to a matrix
#' (e.g. data.frame): contains the observations that are clustered. Note:
#' \code{x} has to be numeric and does not contain missing values
#' @param centroids integer (numeric) or matrix: If an integer, a random set of
#' (distinct) rows in \code{x} is chosen as the intitial centres. If an
#' appropriate matrix, the first row corresponds to the first initial cluster
#' center. The provided integer or the number of rows of the provided matrix
#' determines the number of clusters.
#' @param maxIter integer (numeric): is the maximum number of iterations that
#' is allowed.
#' @param nStart integer (numeric): if \code{centroids} is a number,
#' \code{nStart} determines the number of random sets that are chosen.
#'
#' @return \code{kMeansLloyd} returns an object of class \code{kMeans}. Methods
#' implemented for class \code{kMeans} include: \code{print}, \code{summary},
#' \code{plot} and \code{fitted}.
#'
#' An object of class \code{kMeans} is a list containing the following
#' components:
#' \item{\code{cluster}}{a vector of integers indicating the cluster allocation
#' of each point.}
#' \item{\code{centroids}}{a matrix of cluster centroids.}
#' \item{\code{iterations}}{the number of iterations.}
#' \item{\code{groupSizes}}{the number of points in each cluster.}
#' \item{\code{data}}{the data matrix.}
#' \item{\code{withinSS}}{a vector of within-cluster sum of squares (one
#' component per cluster).}
#' \item{\code{withinTot}}{total within-cluster sum of squares (i.e.
#' \code{sum(withinss)}).}
#' @export
#' @aliases cluster
#' @seealso \code{\link{print.kMeans}}, \code{\link{summary.kMeans}},
#' \code{\link{plot.kMeans}}, \code{\link{fitted.kMeans}}
#' @examples
#' X <- rbind(matrix(rnorm(50, sd = 0.5), ncol = 2),
#' matrix(rnorm(50, mean = 1, sd = 0.5), ncol = 2))
#' result <- kMeansLloyd(x = X, centroids = 2, nStart = 2)
#' @references Lloyd, S. P. (1957, 1982). Least squares quantization in PCM.
#' Technical Note, Bell Laboratories. Published in 1982 in \emph{IEEE
#' Transactions on Information Theory}, \strong{28}, 128-137.
kMeansLloyd <- function(x, centroids, maxIter = 10L, nStart = 1L) {
# check data, retrieve dimensions, error messages ----
# check x
x <- as.matrix(x)
if (!is.numeric(x)) stop("cannot process non-numeric data")
if (any(is.na(x))) stop("'x' contains missing values")
n <- as.integer(nrow(x))
if (is.na(n)) stop("invalid nrow(x)")
p <- as.integer(ncol(x))
if (is.na(p)) stop("invalid ncol(x)")
if (p < 2L) stop("ncol(x) > 1 required")
# check centroids
if (missing(centroids)) stop("'centroids' missing")
# check maxIter
maxIterOld <- maxIter
bool1 <- is.logical(maxIterOld)
suppressWarnings(maxIter <- as.integer(maxIter))
if (is.na(maxIter) || maxIter < 1L || length(maxIter) > 1L || bool1) {
stop("'maxIter' must be positive number")
}
if (maxIterOld - maxIter != 0) {
message("Note: 'maxIter' is not an integer, input is truncated")
}
# check nStart
nStartOld <- nStart
bool2 <- is.logical(nStartOld)
suppressWarnings(nStart <- as.integer(nStart))
if (is.na(nStart) || nStart < 1L || length(nStart) > 1L || bool2) {
stop("'nStart' must be positive number")
}
if (nStartOld - nStart != 0) {
message("Note: 'nStart' is not an integer, input is truncated")
}
# determination of starting centroids ----
if (length(centroids) == 1L) {
# integer case
# check centroids
if (!is.numeric(centroids)) stop("'centroids' neither number nor matrix")
if (centroids - as.integer(centroids) != 0) {
message("Note: 'centroids' is not an integer, input is truncated")
}
k <- as.integer(centroids)
if (k < 1L) stop("'centroids' must be positive integer")
# draw random centroids
indic <- unique(x)
numUniquePoints <- nrow(indic)
if (numUniquePoints < k) stop("more clusters than distinct data points")
centroids <- indic[sample.int(numUniquePoints, k), , drop = FALSE]
} else {
# matrix case
centroids <- as.matrix(centroids)
if (any(duplicated(centroids))) stop("initial centroids are not distinct")
k <- as.integer(nrow(centroids))
if (n < k) stop("more clusters than data points")
if (p != ncol(centroids))
stop("must have same number of columns in 'x' and 'centroids'")
if (nStart > 1) {
warning("Note: nStart is ignored since centroids are not randomly chosen")
}
indic <- NULL
}
# apply Lloyd algorithm ----
res <- lloyd(x, centroids, maxIter)
# random sets----
if (!is.null(indic) && nStart >= 2L) {
for (i in 2:nStart) {
centroids <- indic[sample.int(numUniquePoints, k), , drop = FALSE]
updRes <- lloyd(x, centroids, maxIter)
if (updRes$withinTot < res$withinTot) {
res <- updRes
}
}
}
# define output ----
size <- as.vector(table(res$cluster))
colnames(res$centroids) <- colnames(x)
rownames(res$centroids) <- 1:k
out <- res
out$groupSizes <- size
out$data <- x
class(out) <- "kMeans"
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.