ktaucenterscpp: Robust Clustering Procedures

Documented in improvedktaucenters

#' improvedktaucenters
#'
#' Robust Clustering algorithm for non-spherical data. This function
#' estimate clusters taking into account that clusters may have
#' different size, volume or orientation.
#' @param X numeric matrix of size n x p.
#' @param K the number of cluster.
#' @param cutoff optional argument for getOutliers - quantiles of
#'     chi-square to be used as a threshold for outliers detection,
#'     defaults to 0.999
#' @param nstart optional the number of trials that the base algorithm
#'     ktaucenters_aux is run at the first stage. #' If it is greater
#'     than 1 and center is not set as NULL, a random set of
#'     (distinct) rows in x is chosen as the initial centres for each
#'     try.
#' @param INITcenters numeric matrix of size K x p indicating the
#'     initial centers for that clusters and robust covarianze
#'     matrices will be computed, if it is set as NULL the algorithm
#'     will compute @param INITcenters from ktaucenters routine. Set
#'     to NULL by default.
#' @return A list including the estimated K centers and clusters
#'     labels for the observations

## @details text describing parameter inputs in more detail.
#' \itemize{
#'  \item{\code{centers}}{:   matrix of size K x p, with the estimated K centers.}
#'  \item{\code{cluster}}{: array of size n x 1  integers labels between 1 and K.}
#'  \item{\code{tauPath}}{: sequence of tau scale values at each iterations.}
#'  \item{\code{Wni}}{: numeric array of size n x 1 indicating the weights associated to each observation.}
#'  \item{\code{emptyClusterFlag}}{: a boolean value. True means that in some iteration there were clusters totally empty.}
#'  \item{\code{niter}}{: number of iterations untill convergence is achived or maximun number of iteration is reached.}
#'  \item{\code{sigmas}}{: a list containing the k covariance matrices found by the procedure at its second step.}
#'  \item{\code{outliers}}{: indices observation that can be considered as outliers.}
#' }
#' @importFrom GSE GSE   getOutliers  getLocation  getScatter
#' @importFrom MASS ginv
#' @importFrom stats   mahalanobis  qchisq
## #
#' @examples
#'
#' # Generate Sintetic data (three normal cluster in two dimension)
#' # clusters have different shapes and orentation.  The data is
#' # contaminated uniformly (level 20%).
#'
#' #######################################
#' #### Start data generating process ####
#' #######################################
#'
#' # generates base clusters
#' set.seed(1)
#' Z1 <- c(rnorm(100,0),rnorm(100,0),rnorm(100,0))
#' Z2 <- rnorm(300);
#' X <-  matrix(0, ncol=2,nrow=300);
#' X[,1]=Z1;X[,2]=Z2
#' true.cluster= c(rep(1,100),rep(2,100),rep(3,100))
#'
#' # rotate, expand and translate base clusters
#' theta=pi/3;
#' aux1=matrix(c(cos(theta),-sin(theta),sin(theta),cos(theta)),nrow=2)
#' aux2=sqrt(4)*diag(c(1,1/4))
#' B=aux1%*%aux2%*%t(aux1)
#' X[true.cluster==3,]=X[true.cluster==3,]%*%aux2%*%aux1 + matrix(c(5,2),byrow = TRUE,nrow=100,ncol=2)
#' X[true.cluster==2,2]=X[true.cluster==2,2]*5
#' X[true.cluster==1,2]=X[true.cluster==1,2]*0.1
#' X[true.cluster==1, ]=X[true.cluster==1,]+ matrix(c(-5,-1),byrow = TRUE,nrow=100,ncol=2)

#' ### Generate 60 sintetic outliers (contamination level 20%)
#'
#' outliers=sample(1:300,60)
#' X[outliers, ] <- matrix(runif( 40, 2 * min(X), 2 * max(X) ),
#'                                 ncol = 2, nrow = 60)
#' #############################################
#' #### END data generating process ############
#' #############################################
#'
#' #############################################
#' ### Applying the algortihm ##################
#' #############################################
#' ret=improvedktaucenters(X,K=3,cutoff=0.999)
#'
#' #############################################
#' ### plotting results ########################
#' #############################################
#' oldpar=par(mfrow=c(2,1))
#' #' plot(X,main='actual clusters')
#' for (j in 1:3){
#'  points(X[true.cluster==j,],pch=19, col=j+1)
#' }
#' points(X[outliers,],pch=19,col=1)

#' plot(X,main='clusters estimation')
#' for (j in 1:3){
#'  points(X[ret$cluster==j,],pch=19, col=j+1)
#' }
#' points(X[ret$outliers,],pch=19)
#'
#' par(oldpar)
#' @references Gonzalez, J. D., Yohai, V. J., & Zamar, R. H. (2019).
#' Robust Clustering Using Tau-Scales. arXiv preprint arXiv:1906.08198.
#' @export

improvedktaucenters <- function(X, K, cutoff = 0.999,
                                nstart = 5, INITcenters = NULL) {

    n <- dim(X)[1]
    p <- dim(X)[2]
    sigmas <- vector(mode = "list", length = K)
    centers <- INITcenters
    ## FIRST STEP: determine the best centers ##

    ## if centers are not given, we calculate with ktaucenters routine
    if (is.null(centers)) {
        ret_ktau <- ktaucenters(X, K, nstart = nstart)
        centers <- ret_ktau$centers
        newClusters <- ret_ktau$cluster
    }

    ## if centers are given, we just calculate the clusters from these
    ## centers.
    sphericalDistanceMatrix <- matrix(0, ncol = K, nrow = n)
    if (!is.null(centers)) {
        for (j in 1:K) {
            sphericalDistanceMatrix[, j] <- mahalanobis(x = X,
                                                        center = centers[j, ],
                                                        cov = diag(p))

        }
        newClusters <- max.col(-sphericalDistanceMatrix, ties = "first")
    }

    newcentersaux <- 0 * centers
    for (j in 1:K) {
        sigmas[[j]] <- diag(p)
    }

    # =========== and SECOND STEP ========
    iter = 0
    newcenters <- centers
    while (iter < 1) {
        for (j in 1:K) {
            Xcluster <- X[newClusters == j, ]
            nk <- dim(Xcluster)[1]

            # dim works properly when there are two or more
            # observations, for that reason:
            if (sum(newClusters == j) < 2) {
                nk = sum(newClusters == j)
            }

            # if there is enough observations, we proceed to compute
            # robust covarianze matrix
            if (nk > (3 * p)) {
                # Xcentered <- sweep(Xcluster, 2, centers[j,], FUN='-')
                sal1 <- GSE(Xcluster, tol = 1e-04, maxiter = 150)
                newcentersaux[j, ] <- getLocation(sal1)
                sigmas[[j]] <- getScatter(sal1)
            }
            # if there is not enough observations, we don't compute
            # robust covarianze matrix
            if (nk <= (3 * p)) {
                newcentersaux[j, ] = newcenters[j, ]
            }

        }

        mahalanobisMatrix <- matrix(0, ncol = K, nrow = n)
        for (j in 1:K) {
            mahalanobisMatrix[, j] <- mahalanobis(x = X,
                                                  center = newcentersaux[j, ],
                                                  cov = ginv(sigmas[[j]]),
                                                  inverted = TRUE)

        }
        newClusters_Mah <- max.col(-mahalanobisMatrix, ties="first")
        value <- qchisq(cutoff, df = p)

        ## IMPROVED oultiers determination.
        outliers <- c()
        for (j in 1:K) {
            indices <- which(newClusters_Mah == j)
            booleansubindices <- mahalanobisMatrix[indices, j] > value
            outliersk <- indices[booleansubindices]
            outliers <- c(outliersk, outliers)
        }

        ## TODO:
        ## In the future: implement this lines should be worked better than current for
        ## unbalanced-cluster cases.  qsMatrix=matrix(0,ncol=K,nrow=n) logdet=rep(0,K);
        ## for (j in 1:K){ logalpha=sum(newClusters_Mah==j)/(n-length(outliers))
        ## logdet[j]=log(det(sigmas[[j]])) qsMatrix[,j]= logalpha[j] - .5*logdet[j]
        ## -.5*mahalanobisMatrix[,j] } newClusters_qs=apply(qsMatrix,1,function(x)
        ## which(x==min(x))[1])

        newClusters <- newClusters_Mah
        newcenters <- newcentersaux
        iter <- iter + 1
    }

    list(cluster = newClusters,
         centers = newcenters,
         outliers = outliers,
         sigmas = sigmas)
}