# R/dpmeans.R In maotai: Tools for Matrix Algebra, Optimization and Inference

#### Documented in dpmeans

#' DP-means Algorithm for Clustering Euclidean Data
#'
#' DP-means is a nonparametric clustering method motivated by DP mixture model in that
#' the number of clusters is determined by a parameter \eqn{\lambda}. The larger
#' the \eqn{\lambda} value is, the smaller the number of clusters is attained.
#' In addition to the original paper, we added an option to randomly permute
#' an order of updating for each observation's membership as a common
#' heuristic in the literature of cluster analysis.
#'
#' @param data an \eqn{(n\times p)} data matrix for each row being an observation.
#' @param lambda a threshold to define a new cluster.
#' @param maxiter maximum number of iterations.
#' @param abstol stopping criterion
#' @param permute.order a logical; \code{TRUE} if random order for permutation is used, \code{FALSE} otherwise.
#'
#' @return a named list containing
#' \describe{
#' \item{cluster}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{centers}{a list containing information for out-of-sample prediction.}
#' }
#'
#' @examples
#' ## define data matrix of two clusters
#' x1  = matrix(rnorm(50*3,mean= 2), ncol=3)
#' x2  = matrix(rnorm(50*3,mean=-2), ncol=3)
#' X   = rbind(x1,x2)
#' lab = c(rep(1,50),rep(2,50))
#'
#' ## run dpmeans with several lambda values
#' solA <- dpmeans(X, lambda= 5)$cluster #' solB <- dpmeans(X, lambda=10)$cluster
#' solC <- dpmeans(X, lambda=20)$cluster #' #' ## visualize the results #' opar <- par(no.readonly=TRUE) #' par(mfrow=c(1,4), pty="s") #' plot(X,col=lab, pch=19, cex=.8, main="True", xlab="x", ylab="y") #' plot(X,col=solA, pch=19, cex=.8, main="dpmeans lbd=5", xlab="x", ylab="y") #' plot(X,col=solB, pch=19, cex=.8, main="dpmeans lbd=10", xlab="x", ylab="y") #' plot(X,col=solC, pch=19, cex=.8, main="dpmeans lbd=20", xlab="x", ylab="y") #' par(opar) #' #' \donttest{ #' ## let's find variations by permuting orders of update #' ## used setting : lambda=20, we will 8 runs #' sol8 <- list() #' for (i in 1:8){ #' sol8[[i]] = dpmeans(X, lambda=20, permute.order=TRUE)$cluster
#' }
#'
#' ## let's visualize
#' vpar <- par(no.readonly=TRUE)
#' par(mfrow=c(2,4), pty="s")
#' for (i in 1:8){
#'   pm = paste("permute no.",i,sep="")
#'   plot(X,col=sol8[[i]], pch=19, cex=.8, main=pm, xlab="x", ylab="y")
#' }
#' par(vpar)
#' }
#'
#' @references
#' \insertRef{kulis_revisiting_2012}{maotai}
#'
#' @export
dpmeans <- function(data, lambda=1, maxiter=1234, abstol=1e-6, permute.order=FALSE){
############################################################
# Preprocessing
if (!check_datamat(data)){
stop("* dpmeans : an input 'data' should be a matrix without any missing/infinite values.")
}
# Parameter and Initialization
n = nrow(data)
p = ncol(data)
k = 1                                   # set k=1
labels = rep(1,n)                       # labels={1,2,...,n}
mu     = matrix(colMeans(data), nrow=1) # global mean
lambda = as.double(lambda)

############################################################
# Main Iteration
ss.old = compute.ss(data, labels, mu)+ k*lambda
ss.new = 0
for (iter in 1:maxiter){
# 0. updating order of observations
if (permute.order){
idseq = sample(1:n)
} else {
idseq = 1:n
}
# 1. update the class membership per each class
for (i in idseq){
# 1-1. compute distances to the centers
# dic = rep(0, k); for (j in 1:k){dic[j] = sum((as.vector(data[i,])-as.vector(mu[j,]))^2)}
dic = as.vector(dat2centers(data[i,], mu)); # cpp conversion
# 1-2. assign new or stay
if (min(dic) > lambda){
k = k+1
labels[i] = k
mu = rbind(mu, data[i,])
} else {
idmins = which(dic==min(dic))
if (length(idmins)>1){
labels[i] = sample(idmins, 1)
} else {
labels[i] = idmins
}
}
}

# 2. rearrange the label (remove empty ones)
labels = as.factor(labels)
ulabel = sort(unique(labels))
labnew = rep(0,n)
for (i in 1:length(ulabel)){
labnew[(labels==ulabel[i])] = i
}
labels = labnew
k      = round(max(labels))

# 3. compute per-class means
uassign = sort(unique(labels))
mu = array(0,c(k,p))
for (i in 1:k){
idmean = which(labels==uassign[i])
if (length(idmean)==1){
mu[i,] = as.vector(data[idmean,])
} else {
mu[i,] = as.vector(colMeans(data[idmean,]))
}
}

# 4. compute DPMEANS objective function
ss.new   = compute.ss(data, labels, mu) + k*lambda
ss.delta = ss.old-ss.new
ss.old   = ss.new

# 5. stop if updating is not significant
if (ss.delta < abstol){
break
}
}

############################################################
# Return the results
output = list()
output$cluster = as.factor(labels) output$centers = mu
return(output)
}

# auxiliary functions -----------------------------------------------------
#' @keywords internal
#' @noRd
compute.ss <- function(data, label, centers){
p = ncol(data)
if (is.vector(centers)){
centers = matrix(centers, nrow=1)
}
ulabel = sort(unique(label))
output = 0
for (i in 1:length(ulabel)){
subdata = data[(label==ulabel[i]),]
if (!is.vector(subdata)){
nn = nrow(subdata)
for (j in 1:nn){
output = output + sum((as.vector(subdata[j,])-as.vector(centers[i,]))^2)
}
}
}
return(output)
}


## Try the maotai package in your browser

Any scripts or data that you put into this service are public.

maotai documentation built on Feb. 3, 2022, 5:09 p.m.