Nothing
## roxygen2::roxygenise("C:/users/valen/onedrive/myrepo/rrdev/robClus", load_code=roxygen2:::load_installed)
#'
#' TKMEANS method for robust K-means clustering
#'
#' @name tkmeans
#' @aliases print.tkmeans
#' @description This function searches for \code{k} (or less) spherical clusters
#' in a data matrix \code{x}, whereas the \code{ceiling(alpha n)} most outlying
#' observations are trimmed.
#' @param x A matrix or data.frame of dimension n x p, containing the observations (row-wise).
#' @param k The number of clusters initially searched for.
#' @param alpha The proportion of observations to be trimmed.
#' @param nstart The number of random initializations to be performed.
#' @param niter1 The number of concentration steps to be performed for the nstart initializations.
#' @param niter2 The maximum number of concentration steps to be performed for the
#' \code{nkeep} solutions kept for further iteration. The concentration steps are
#' stopped, whenever two consecutive steps lead to the same data partition.
#' @param nkeep The number of iterated initializations (after niter1 concentration
#' steps) with the best values in the target function that are kept for further iterations
#' @param iter.max (deprecated, use the combination \code{nkeep, niter1 and niter2})
#' The maximum number of concentration steps to be performed.
#' The concentration steps are stopped, whenever two consecutive steps lead
#' to the same data partition.
#' @param zero_tol The zero tolerance used. By default set to 1e-16.
#' @param points Optional initial mean vectors, \code{NULL} or a matrix with \code{k}
#' vectors used as means to initialize the algorithm. If initial mean vectors are
#' specified, \code{nstart} should be 1 (otherwise the same initial means are
#' used for all runs).
#' @param center Optional centering of the data: a function or a vector of length p
#' which can optionally be specified for centering x before calculation
#' @param scale Optional scaling of the data: a function or a vector of length p
#' which can optionally be specified for scaling x before calculation
#' @param store_x A logical value, specifying whether the data matrix \code{x} shall be
#' included in the result object. By default this value is set to \code{TRUE}, because
#' some of the plotting functions depend on this information. However, when big data
#' matrices are handled, the result object's size can be decreased noticeably
#' when setting this parameter to \code{FALSE}.
#' @param parallel A logical value, specifying whether the nstart initializations should be done in parallel.
#' @param n.cores The number of cores to use when paralellizing, only taken into account if parallel=TRUE.
#' @param drop.empty.clust Logical value specifying, whether empty clusters shall be
#' omitted in the resulting object. (The result structure does not contain center
#' estimates of empty clusters anymore. Cluster names are reassigned
#' such that the first l clusters (l <= k) always have at least one observation.
#' @param trace Defines the tracing level, which is set to 0 by default. Tracing level 1
#' gives additional information on the stage of the iterative process.
#'
#' @return The function returns the following values:
#' \itemize{
#' \item cluster - A numerical vector of size \code{n} containing the cluster assignment
#' for each observation. Cluster names are integer numbers from 1 to k, 0 indicates
#' trimmed observations. Note that it could be empty clusters with no observations
#' when \code{equal.weights=FALSE}.
#' \item obj - The value of the objective function of the best (returned) solution.
#' \item size - An integer vector of size k, returning the number of observations contained by each cluster.
#' \item centers - A matrix of size p x k containing the centers (column-wise) of each cluster.
#' \item code - A numerical value indicating if the concentration steps have
#' converged for the returned solution (2).
#' \item cluster.ini - A matrix with nstart rows and number of columns equal to
#' the number of observations and where each row shows the final clustering
#' assignments (0 for trimmed observations) obtained after the \code{niter1}
#' iteration of the \code{nstart} random initializations.
#' \item obj.ini - A numerical vector of length \code{nstart} containing the values
#' of the target function obtained after the \code{niter1} iteration of the
#' \code{nstart} random initializations.
#' \item x - The input data set.
#' \item k - The input number of clusters.
#' \item alpha - The input trimming level.
#' }
#'
#' @author Valentin Todorov, Luis Angel Garcia Escudero, Agustin Mayo Iscar.
#'
#' @references
#'
#' Cuesta-Albertos, J. A.; Gordaliza, A. and MatrĂ¡n, C. (1997), "Trimmed k-means:
#' an attempt to robustify quantizers". Annals of Statistics, Vol. 25 (2), 553-576.
#'
#' @export
#'
#' @examples
#'
#' \dontshow{
#' set.seed(0)
#' }
#' ##--- EXAMPLE 1 ------------------------------------------
#' sig <- diag(2)
#' cen <- rep(1,2)
#' x <- rbind(MASS::mvrnorm(360, cen * 0, sig),
#' MASS::mvrnorm(540, cen * 5, sig),
#' MASS::mvrnorm(100, cen * 2.5, sig))
#'
#' ## Two groups and 10\% trimming level
#' (clus <- tkmeans(x, k = 2, alpha = 0.1))
#'
#' plot(clus)
#' plot(clus, labels = "observation")
#' plot(clus, labels = "cluster")
#'
#' #--- EXAMPLE 2 ------------------------------------------
#' data(geyser2)
#' (clus <- tkmeans(geyser2, k = 3, alpha = 0.03))
#' plot(clus)
#'
tkmeans <- function(x, k, alpha=0.05, nstart=500, niter1=3, niter2=20, nkeep=5, iter.max, points=NULL,
center=FALSE, scale=FALSE, store_x=TRUE,
parallel=FALSE, n.cores=-1,
zero_tol=1e-16, drop.empty.clust=TRUE, trace=0) {
if(!missing(iter.max)) {
warning("The parameter 'iter.max' is deprecated, please read the help and use the combination of 'niter1', 'niter2', 'nkeep'.")
niter1 <- iter.max
}
if(!is.null(points)) {
if(nstart > 1)
warning("If initial mean vectors are specified, 'nstart' should be 1 (otherwise the same initial means are used for all runs)")
nstart <- nkeep <- 1
parallel <- FALSE
}
parlist <- list(k=k, alpha=alpha, nstart=nstart, niter1=niter1, niter2=niter2, nkeep=nkeep,
center=center, scale=scale,
zero_tol=zero_tol, drop.empty.clust=drop.empty.clust, trace=trace, store_x=store_x,
equal.weights=TRUE)
# Initial checks
if(is.data.frame(x))
x <- (data.matrix(x))
if(!is.matrix (x))
x <- as.matrix(x)
if(any(is.na(x)))
stop ("x cannot contain NA")
if(!is.numeric (x))
stop ("Parameter x: numeric matrix/vector expected")
scaled <- myscale(x, center=center, scale=scale)
x <- scaled$x
if(store_x)
parlist$x <- x
if (k < 1)
stop ("Parameter k: must be >= 1")
if (alpha < 0 || alpha > 1)
stop ("Parameter alpha: must be in [0,1]")
if (niter1 < 0 || as.integer(niter1) != niter1)
stop ("Parameter niter1: must be an integer >= 0")
if (niter2 < 0 || as.integer(niter2) != niter2)
stop ("Parameter niter2: must be an integer >= 0")
if (nkeep < 0 || as.integer(nkeep) != nkeep)
stop ("Parameter nkeep: must be an integer >= 0")
if (nkeep > nstart)
stop ("Parameter nkeep: must be <= nstart")
if(!is.logical(parallel))
stop ("Parameter parallel: must be a logical TRUE or FALSE")
if (n.cores < -2 || as.integer(n.cores) != n.cores)
stop ("Parameter n.cores: must be an integer >= 0, -1 or -2")
if(zero_tol < 0)
stop ("Parameter zero_tol: must be >= 0")
if(trace != 0 & trace !=1)
stop ("Parameter trace: must be 0 or 1")
###
# FIRST STEP: get nstart solutions with niter1 concentration steps
###
if(trace){
cat(paste("\nPhase 1: obtaining ", nstart, " solutions.\n", sep = ""))
if(parallel) cat(paste("\n Parallelizing initializations using ", n.cores,
" cores. Progress bar will not display accurate information.\n", sep = ""))
pb <- txtProgressBar(min = 0, max = nstart, style = 3)
}
if(!parallel) {
cluster.ini <- vector("list", nstart) ## for containing the best values for the parameters after several random starts
obj.ini <- rep(0, nstart) ## for containing best objective values
for(j in 1:nstart) {
assig_obj <- tkmeans_c1(x, k, alpha, niter1, zero_tol, points) # niter1 steps!
cluster.ini[[j]] <- assig_obj$cluster
obj.ini[j] <- assig_obj$obj
if(trace){
setTxtProgressBar(pb, j)
}
}
} else {
# Setup parallel cluster
if(n.cores == -1){
n.cores <- detectCores()
} else if (n.cores == -2){
n.cores <- detectCores() - 1
}
parclus <- makeCluster(n.cores)
registerDoParallel(parclus)
count <- 0
comb <- function(...) { # Custom combination function for the foreach loop
count <<- count + length(list(...)) - 1
setTxtProgressBar(pb, count)
flush.console()
c(...) # this can feed into .combine option of foreach
}
init.results <- foreach(j = 1:nstart,
.packages = "tclust",
.combine = ifelse(trace, "comb", "c"),
.multicombine = TRUE,
.inorder = FALSE) %dopar% {
assig_obj <- tkmeans_c1(x, k, alpha, niter1, zero_tol)
list(assig_obj$cluster, assig_obj$obj)
}
stopCluster(parclus)
cluster.ini <- init.results[0:(nstart-1) * 2 + 1] # Impair positions of cluster.ini
obj.ini <- unlist(init.results[1:nstart * 2]) # Pair positions of cluster.ini
}
###
# SECOND STEP: get nkeep best solutions so far
###
if(trace){
cat(paste("\n\nPhase 2: obtaining ", nkeep, " best solutions out of the intial ", nstart," solutions.\n", sep = ""))
}
best_index <- order(obj.ini)[1:nkeep]
best_assig_list <- cluster.ini[best_index]
###
# THIRD STEP: apply niter2 concentration steps to nkeep best solutions, return the best solution
###
if(trace){
cat(paste("\nPhase 3: applying ", niter2, " concentration steps to each of the ", nkeep," best solutions.\n", sep = ""))
pb2 <- txtProgressBar(min = 0, max = nkeep, style = 3)
}
best_iter <- NULL
best_iter_obj <- Inf
for(j in 1:nkeep){
iter <- tkmeans_c2(x, k, best_assig_list[[j]], alpha, niter2, zero_tol=1e-16)
if(iter$obj <= best_iter_obj){
best_iter <- iter
best_iter_obj <- iter$obj
}
if(trace){
setTxtProgressBar(pb2, j)
}
}
if(trace){
cat("\n\n")
}
## Adjust the returned object to be similar to 'tkmeans':
## Handle empty clusters
if(drop.empty.clust)
idxuse <- which(best_iter$size > 0)
else
idxuse <- 1:k
idxuse <- idxuse[order(best_iter$size[idxuse], decreasing = TRUE)]
ClusterIDs <- rep(0, k + 1)
ClusterIDs[idxuse + 1] <- 1:length(idxuse)
k.real <- length(idxuse)
## - A list of values internally used by functions related to tclust objects.
int <- list(
iter.successful=0, iter.converged=0,
dim=dim(x))
## - Transpose the 'centers' matrix
best_iter$centers <- t(best_iter$centers)
best_iter$centers <- best_iter$centers[, idxuse, drop = FALSE]
best_iter$cluster <- as.vector(best_iter$cluster)
best_iter$cluster <- ClusterIDs[best_iter$cluster + 1]
best_iter$size <- as.vector(best_iter$size)
best_iter$size <- best_iter$size[idxuse]
best_iter$cov <- NULL
best_iter$posterior <- NULL
best_iter$NlogL <- NULL
best_iter$disttom <- NULL
ret <- c(best_iter, list(cluster.ini=matrix(unlist(cluster.ini), byrow=TRUE, nrow=length(cluster.ini)),
obj.ini=obj.ini, int=int, par=parlist, k=sum(best_iter$size > 0)))
## - Dimnames of center and cov
dn.x <- dimnames(x)
rownames(ret$centers) <- if(is.null(colnames(x))) paste ("X", 1:ncol(x)) else colnames (x)
colnames(ret$centers) <- paste("C", 1:k.real)
## Back transform centers, cov and x
cmm <- scaled$scale %*% t (scaled$scale)
for(i in 1:ret$k) {
ret$centers[,i] <- ret$centers[,i] * scaled$scale + scaled$center
}
x <- sweep(x, 2L, scaled$scale, `*`, check.margin = FALSE)
x <- sweep(x, 2L, scaled$center, `+`, check.margin = FALSE)
if(store_x)
ret$par$x <- x
class(ret) <- "tkmeans"
return(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.