kMeans_Rpackage: Lloyd's K-Means Algorithm from Scratch

Documented in kMeansLloyd

#' @title K-Means Clustering
#' @description \code{kMeansLloyd} performs the k-means algorithm based on
#' Lloyd's paper (1982) on a data matrix.
#' @usage kMeansLloyd(x, centroids, maxIter = 10L, nStart = 1L)
#'
#' @param x matrix or an object that can be coerced to a matrix
#' (e.g. data.frame): contains the observations that are clustered. Note:
#' \code{x} has to be numeric and does not contain missing values
#' @param centroids integer (numeric) or matrix: If an integer, a random set of
#'  (distinct) rows in \code{x} is chosen as the intitial centres. If an
#'  appropriate matrix, the first row corresponds to the first initial cluster
#'  center. The provided integer or the number of rows of the provided matrix
#'   determines the number of clusters.
#' @param maxIter integer (numeric): is the maximum number of iterations that
#' is allowed.
#' @param nStart integer (numeric): if \code{centroids} is a number,
#' \code{nStart} determines the number of random sets that are chosen.
#'
#' @return \code{kMeansLloyd} returns an object of class \code{kMeans}. Methods
#' implemented for class \code{kMeans} include: \code{print}, \code{summary},
#' \code{plot} and \code{fitted}.
#'
#' An object of class \code{kMeans} is a list containing the following
#'  components:
#' \item{\code{cluster}}{a vector of integers indicating the cluster allocation
#' of each point.}
#' \item{\code{centroids}}{a matrix of cluster centroids.}
#' \item{\code{iterations}}{the number of iterations.}
#' \item{\code{groupSizes}}{the number of points in each cluster.}
#' \item{\code{data}}{the data matrix.}
#' \item{\code{withinSS}}{a vector of within-cluster sum of squares (one
#' component per cluster).}
#' \item{\code{withinTot}}{total within-cluster sum of squares (i.e.
#' \code{sum(withinss)}).}
#' @export
#' @aliases cluster
#' @seealso \code{\link{print.kMeans}}, \code{\link{summary.kMeans}},
#' \code{\link{plot.kMeans}}, \code{\link{fitted.kMeans}}
#' @examples
#' X <- rbind(matrix(rnorm(50, sd = 0.5), ncol = 2),
#' matrix(rnorm(50, mean = 1, sd = 0.5), ncol = 2))
#' result <- kMeansLloyd(x = X, centroids = 2, nStart = 2)
#' @references Lloyd, S. P. (1957, 1982). Least squares quantization in PCM.
#' Technical Note, Bell Laboratories. Published in 1982 in \emph{IEEE
#' Transactions on Information Theory}, \strong{28}, 128-137.
kMeansLloyd <- function(x, centroids, maxIter = 10L, nStart = 1L) {
  # check data, retrieve dimensions, error messages ----

  # check x
  x <- as.matrix(x)
  if (!is.numeric(x)) stop("cannot process non-numeric data")
  if (any(is.na(x))) stop("'x' contains missing values")
  n <- as.integer(nrow(x))
  if (is.na(n)) stop("invalid nrow(x)")
  p <- as.integer(ncol(x))
  if (is.na(p)) stop("invalid ncol(x)")
  if (p < 2L) stop("ncol(x) > 1 required")

  # check centroids
  if (missing(centroids)) stop("'centroids' missing")

  # check maxIter
  maxIterOld <- maxIter
  bool1 <- is.logical(maxIterOld)
  suppressWarnings(maxIter <- as.integer(maxIter))
  if (is.na(maxIter) || maxIter < 1L || length(maxIter) > 1L || bool1) {
    stop("'maxIter' must be positive number")
  }
  if (maxIterOld - maxIter != 0) {
    message("Note: 'maxIter' is not an integer, input is truncated")
  }

  # check nStart
  nStartOld <- nStart
  bool2 <- is.logical(nStartOld)
  suppressWarnings(nStart <- as.integer(nStart))
  if (is.na(nStart) || nStart < 1L || length(nStart) > 1L || bool2) {
    stop("'nStart' must be positive number")
  }
  if (nStartOld - nStart != 0) {
    message("Note: 'nStart' is not an integer, input is truncated")
  }

  # determination of starting centroids ----
  if (length(centroids) == 1L) {
    # integer case
    # check centroids
    if (!is.numeric(centroids)) stop("'centroids' neither number nor matrix")
    if (centroids - as.integer(centroids) != 0) {
      message("Note: 'centroids' is not an integer, input is truncated")
    }
    k <- as.integer(centroids)
    if (k < 1L) stop("'centroids' must be positive integer")
    # draw random centroids
    indic <- unique(x)
    numUniquePoints <- nrow(indic)
    if (numUniquePoints < k) stop("more clusters than distinct data points")
    centroids <- indic[sample.int(numUniquePoints, k), , drop = FALSE]
  } else {
    # matrix case
    centroids <- as.matrix(centroids)
    if (any(duplicated(centroids))) stop("initial centroids are not distinct")
    k <- as.integer(nrow(centroids))
    if (n < k) stop("more clusters than data points")
    if (p != ncol(centroids))
      stop("must have same number of columns in 'x' and 'centroids'")
    if (nStart > 1) {
      warning("Note: nStart is ignored since centroids are not randomly chosen")
    }
    indic <- NULL
  }

  # apply Lloyd algorithm ----
  res <- lloyd(x, centroids, maxIter)

  # random sets----
  if (!is.null(indic) && nStart >= 2L) {
    for (i in 2:nStart) {
      centroids <- indic[sample.int(numUniquePoints, k), , drop = FALSE]
      updRes <- lloyd(x, centroids, maxIter)
      if (updRes$withinTot < res$withinTot) {
        res <- updRes
      }
    }
  }

  # define output ----
  size <- as.vector(table(res$cluster))
  colnames(res$centroids) <- colnames(x)
  rownames(res$centroids) <- 1:k

  out <- res
  out$groupSizes <- size
  out$data <- x
  class(out) <- "kMeans"

  out
}