#' wrk
#'
#'
#' This function performs robust (weighted) k-means clustering which is very useful in case of contaminated data. The method
#' aims at detecting both clusters and outliers.
#'
#' @param data A data matrix with n observations and p variables.
#' @param D A distance matrix.
#' @param k The number of clusters.
#' @param max.iter The maximum number of iterations to reach local optimum, the default is 30.
#' @param cutoff A cutoff value to determine outliers. An observation is
#' declared as an outlier if its weight is smaller than or equal to this cutoff, the default is 0.5.
#'
#' @return
#' \item{clusters}{An integer vector with values from 1 to k, indicating a resulting cluster membership.}
#' \item{obsweights}{A numeric vector of observation weights ranging between 0 and 1.}
#' \item{outclusters}{An integer vector with values from 0 to k, containing both cluster membership and identified outliers.
#' 0 corresponds to outlier.}
#' \item{WCSS}{The within-cluster sum of squares of the local optimum.}
#' \item{centers}{The set of cluster centers.}
#'
#' @export
#' @examples
#' # generate data
#' d <- SimData(size_grp=c(40,40,40),p_inf=50,
#' p_noise=750,p_out_noise=75)
#' dat <- scale(d$x)
#'
#' res <- wrk(data=dat,D=dist(dat),k=3)
#' table(d$y,res$clusters)
#' table(d$lb,res$outclusters)
#' plot(res$obsweights,col=d$lb +1)
#'
#' @details
#' \code{wrk} initializes the clustering procedure by ROBIN approach and
#' incorporates a weighting function on each detected clusters during k-means clustering.
#' The weighting function uses LOF in order to assign a weight for each observation. The resulting observation weights reflect
#' a degree of outlyingness and range between 0 and 1. These weights are used to
#' identify outliers as observations with a weight \code{<=cutoff}.
#'
#' @author Sarka Brodinova <sarka.brodinova@tuwien.ac.at>
#'
#' @references S. Brodinova, P. Filzmoser, T. Ortner, C. Breiteneder, M. Zaharieva. Robust and sparse k-means clustering for
#' high-dimensional data. Submitted for publication, 2017. Available at http://arxiv.org/abs/1709.10012
#'
#' @seealso \code{\link[wrsk]{ROBIN}}
#'
wrk <- function(data,D,k,max.iter=30,cutoff=0.5){
n <- nrow(data)
p <- ncol(data)
# Robin initialization
init <- ROBIN(D,data,k)
# neobernad fix:
if (all(is.na(init$centers))) {
init$centers <- order(init$lof, decreasing = TRUE)[1:k]
warning("ROBIN() failed to find centers. Using top LOF values as initialization.")
}
means <- data[init$centers,] %>% as.matrix()
go <- TRUE
iter <- 1
opt.crit <- +Inf
oldclass <- rep(1,n)
while(go){
# assign points to the closest clusters and calculate their distances
distToCenter <- matrix(NA,nrow=n,ncol=k)
for (j in 1:k){
muj <- means[j,]
distToCenter[,j] <- apply(scale(data,center=muj,scale=FALSE)^2,1,sum)
}
icluster <- apply(distToCenter,1,which.min)
wcss <- apply(distToCenter,1,min)
crit <- sum(wcss) # stop criterion
# calculate observetion weights and cluster centers as weighted means
A <- GetWeightsAll(n,data,k,icluster,means)
means <- A$means %>% as.matrix()
# rearange clustering solution based on the weights
outcluster <- icluster
outcluster[A$obsweights<=cutoff] <- 0
# local optimal solution
if( (iter<max.iter) & !identical(oldclass,outcluster) ){
go <- TRUE
if(crit<opt.crit){
#print(table(lb,out.iclass))
WCSS <- crit
outCluster <- outcluster
cluster <- icluster
optmeans <- A$means
optweights <- A$obsweights
}
iter <- iter +1
oldclass <- outcluster
}else{
go <- FALSE
break
}
} # end while
return(list(clusters=cluster,
obsweights=optweights,
outclusters=outCluster,
WCSS=WCSS,
centers=optmeans))
}
GetWeightsCl <- function(x,lof.k=10){
# x .... detected cluster
# lof.k ... the number of nearest neighbors for LOF
lf <- lof(x,k=lof.k)
lf.sc <- scale(lf)
M1 <- median(lf.sc) + 2.5 * mad(lf.sc)
const1 <- 2
v1 <- (1 - ((lf.sc - M1)/(const1 - M1))^2)^2
v1[lf.sc < M1] <- 1
v1[lf.sc > const1] <- 0
return(v1)
}
GetWeightsAll <- function(n,data,k,icluster,means){
v <-numeric(n)
for(l in 1:k){
cl <- data[icluster==l,,drop=FALSE] %>% as.matrix()
if(nrow(cl)>2){
q <- ifelse(nrow(cl)<=10,nrow(cl)-1,10)
vo <- GetWeightsCl(x=cl,lof.k=q)
vo <- as.vector(vo,mode="numeric")
means[l,] <- apply(vo * cl,2,sum) /sum(vo)
}else{
means[l,] <-apply(cl,2,mean)
vo <- rep(1,nrow(cl))
}
v[icluster==l] <- vo
}
return(list(obsweights=v,means=means))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.