Nothing
#' improvedktaucenters
#'
#' Robust Clustering algorithm for non-spherical data. This function estimate clusters taking into account that clusters may have
#' different size, volume or orientation.
#' @param X numeric matrix of size n x p.
#' @param K the number of cluster.
#' @param cutoff optional argument for getOutliers - quantiles of chi-square to be used as a threshold for outliers detection, defaults to 0.999
#' @param nstart optional the number of trials that the base algorithm ktaucenters_aux is run
#' at the first stage. #' If it is greater than 1 and center is not set as NULL,
#' a random set of (distinct) rows in x is chosen as the initial centres for each try.
#' @param INITcenters numeric matrix of size K x p indicating the initial centers for that clusters and robust covarianze matrices will be
#' computed, if it is set as NULL the algorithm will compute @param INITcenters from ktaucenters routine. Set to NULL by default.
#' @return A list including the estimated K centers and clusters labels for the observations
## @details text describing parameter inputs in more detail.
#' \itemize{
#' \item{\code{centers}}{: matrix of size K x p, with the estimated K centers.}
#' \item{\code{cluster}}{: array of size n x 1 integers labels between 1 and K.}
#' \item{\code{tauPath}}{: sequence of tau scale values at each iterations.}
#' \item{\code{Wni}}{: numeric array of size n x 1 indicating the weights associated to each observation.}
#' \item{\code{emptyClusterFlag}}{: a boolean value. True means that in some iteration there were clusters totally empty.}
#' \item{\code{niter}}{: number of iterations untill convergence is achived or maximun number of iteration is reached.}
#' \item{\code{sigmas}}{: a list containing the k covariance matrices found by the procedure at its second step.}
#' \item{\code{outliers}}{: indices observation that can be considered as outliers.}
#' }
#' @importFrom GSE GSE getOutliers getLocation getScatter
#' @importFrom MASS ginv
#' @importFrom stats mahalanobis qchisq
## #
#' @examples
#'
#' # Generate Sintetic data (three normal cluster in two dimension)
#' # clusters have different shapes and orentation.
#' # The data is contaminated uniformly (level 20%).
#'
#' ################################################
#' #### Start data generating process ############
#' ##############################################
#'
#' # generates base clusters
#' set.seed(1)
#' Z1 <- c(rnorm(100,0),rnorm(100,0),rnorm(100,0))
#' Z2 <- rnorm(300);
#' X <- matrix(0, ncol=2,nrow=300);
#' X[,1]=Z1;X[,2]=Z2
#' true.cluster= c(rep(1,100),rep(2,100),rep(3,100))
#'
#' # rotate, expand and translate base clusters
#' theta=pi/3;
#' aux1=matrix(c(cos(theta),-sin(theta),sin(theta),cos(theta)),nrow=2)
#' aux2=sqrt(4)*diag(c(1,1/4))
#' B=aux1%*%aux2%*%t(aux1)
#' X[true.cluster==3,]=X[true.cluster==3,]%*%aux2%*%aux1 + matrix(c(5,2),byrow = TRUE,nrow=100,ncol=2)
#' X[true.cluster==2,2]=X[true.cluster==2,2]*5
#' X[true.cluster==1,2]=X[true.cluster==1,2]*0.1
#' X[true.cluster==1, ]=X[true.cluster==1,]+ matrix(c(-5,-1),byrow = TRUE,nrow=100,ncol=2)
#' ### Generate 60 sintetic outliers (contamination level 20%)
#'
#' outliers=sample(1:300,60)
#' X[outliers, ] <- matrix(runif( 40, 2 * min(X), 2 * max(X) ),
#' ncol = 2, nrow = 60)
#' ###############################################
#' #### END data generating process ############
#' #############################################
#'
#' #############################################
#' ### Applying the algortihm ##################
#' #############################################
#' ret=improvedktaucenters(X,K=3,cutoff=0.999)
#'
#' #############################################
#' ### plotting results ########################
#' #############################################
#' oldpar=par(mfrow=c(2,1))
#' #' plot(X,main="actual clusters")
#' for (j in 1:3){
#' points(X[true.cluster==j,],pch=19, col=j+1)
#' }
#' points(X[outliers,],pch=19,col=1)
#' plot(X,main="clusters estimation")
#' for (j in 1:3){
#' points(X[ret$cluster==j,],pch=19, col=j+1)
#' }
#' points(X[ret$outliers,],pch=19)
#'
#' par(oldpar)
#' @references Gonzalez, J. D., Yohai, V. J., & Zamar, R. H. (2019).
#' Robust Clustering Using Tau-Scales. arXiv preprint arXiv:1906.08198.
#' @export
improvedktaucenters=function(X,K,cutoff=0.999,nstart =5,INITcenters=NULL){
n=dim(X)[1]
p=dim(X)[2]
sigmas <- vector(mode="list", length=K);
centers=INITcenters
#### FIRST STEP: determine the best centers #####
# if centers are not given, we calculate with ktaucenters routine
if (is.null(centers)){
ret_ktau <- ktaucenters(X, K, nstart = nstart)
centers <- ret_ktau$centers
newClusters<-ret_ktau$cluster
}
#### if centers are given, we just
#### calculate the clusters from these centers.
sphericalDistanceMatrix<- matrix(0, ncol = K, nrow = n)
if (!is.null(centers)){
for (j in 1:K){
sphericalDistanceMatrix[,j] <- mahalanobis(x=X,
center=centers[j, ]
,cov=diag(p))
}
newClusters <- apply(sphericalDistanceMatrix, 1,
function(x) which( x == min(x))[1])
}
newcentersaux <- 0*centers
for (j in 1:K){
sigmas[[j]] <- diag(p)
}
#=========== and SECOND STEP ========
iter=0
newcenters <- centers;
while(iter<1){
for (j in 1:K){
Xcluster <- X[newClusters==j,];
nk <- dim(Xcluster)[1];
# dim works properly when there are two or more observations, for that reason:
if (sum(newClusters==j)<2){
nk=sum(newClusters==j)
}
# if there is enough observations, we proceed to compute
# robust covarianze matrix
if(nk>(3*p)){
# Xcentered <- sweep(Xcluster, 2, centers[j,], FUN="-")
sal1 <- GSE(Xcluster, tol=1e-4, maxiter=150)
newcentersaux[j,] <- getLocation(sal1)
sigmas[[j]] <- getScatter(sal1)
}
# if there is not enough observations, we don't compute
# robust covarianze matrix
if(nk<=(3*p)){
newcentersaux[j,]= newcenters[j, ]
}
}
mahalanobisMatrix <- matrix(0, ncol = K, nrow = n)
for (j in 1:K){
mahalanobisMatrix[,j] <- mahalanobis(x=X,
center=newcentersaux[j, ]
,cov=ginv(sigmas[[j]]),
inverted = TRUE)
}
newClusters_Mah <- apply(mahalanobisMatrix, 1,
function(x) which( x == min(x))[1])
value <- qchisq(cutoff, df = p)
#IMPROVED oultiers determination.
outliers=c()
for (j in 1:K){
indices <- which(newClusters_Mah == j)
booleansubindices <- mahalanobisMatrix[indices, j] > value
outliersk <- indices[booleansubindices]
outliers <- c(outliersk, outliers)
}
# In the future: implement this lines should be worked better than current for unbalanced-cluster cases.
# qsMatrix=matrix(0,ncol=K,nrow=n)
# logdet=rep(0,K);
# for (j in 1:K){
# logalpha=sum(newClusters_Mah==j)/(n-length(outliers))
# logdet[j]=log(det(sigmas[[j]]))
# qsMatrix[,j]= logalpha[j] - .5*logdet[j] -.5*mahalanobisMatrix[,j]
# }
# newClusters_qs=apply(qsMatrix,1,function(x) which(x==min(x))[1])
newClusters <- newClusters_Mah;
newcenters <- newcentersaux;
iter <- iter + 1
}
list(cluster=newClusters, centers=newcenters, outliers=outliers, sigmas=sigmas);
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.