# R/unsupervised_clustering.R
#
# Defines functions: unsupervisedClustering
#
# Documented in: unsupervisedClustering

#' Variational Bayesian inference for unsupervised clustering
#'
#' @param X NxD data matrix.
#' @param K (Maximum) number of clusters.
#' @param prior Prior parameters (optional).
#' @param ms Boolean flag that indicates whether model selection is required or not. Default is FALSE.
#' @param vs Boolean flag that indicates whether variable selection is required or not. Default is FALSE.
#' @param labels_init Initial cluster labels (can be empty).
#' @param kmeans_init Boolean flag, which, if TRUE, initializes the cluster labels with the k-means algorithm. Default is FALSE.
#' @param tol Tolerance on lower bound. Default is 10e-20.
#' @param maxiter Maximum number of iterations of the VB algorithm. Default is 2000.
#' @param verbose Boolean flag which, if TRUE, prints the iteration numbers. Default is FALSE.
#' @param indep Boolean flag which, if TRUE, indicates that the covariates are independent. Default is TRUE.
#' @return A list containing L, the lower bound at each step of the algorithm, label, a vector containing the cluster
#' labels, model, a list containing the trained model structure, and a vector called n_comp which, if model selection
#' is required, contains the number of mixture components at every step of the VB algorithm.
#' @references Pattern Recognition and Machine Learning by Christopher M. Bishop.
#' @references This function is based on the MatLab code by Mo Chen (sth4nth@gmail.com).
#' @export
#'

unsupervisedClustering <- function(X, K, prior, ms = FALSE, vs = FALSE, labels_init,
                                   kmeans_init = FALSE, tol = 10e-20,
                                   maxiter = 2000, verbose = FALSE, indep = TRUE) {

    if (vs) stop('Variable selection has not been implemented yet')

    if (verbose) message(sprintf("Unsupervised clustering - Multivariate Gaussians \n"))

    # Problem dimensions: N observations, D covariates (D = 1 for a plain vector).
    if (is.vector(X)) {
        N <- length(X)
        D <- 1
    } else {
        N <- dim(X)[1]
        D <- dim(X)[2]
    }

    # The lower bound is recorded twice per iteration (after the E-step and
    # after the M-step); slot 1 stays at -Inf so the convergence check has a
    # previous value to compare against on the first iteration.
    L <- rep(-Inf, maxiter * 2 + 1)
    if (ms) n_comp <- rep(1000, maxiter)

    # Initial responsibilities, from labels_init or (optionally) k-means.
    # NOTE(review): exact shape/contents depend on init() — defined elsewhere.
    Resp <- init(X, K, labels_init, kmeans_init, verbose)

    if (missing(prior)) { # set default prior
        if (is.vector(X)) { # if D = 1
            prior <- list(alpha = 1, beta = 1, m = mean(X), v = 2, M = 1)
            prior$logW <- log(1 / prior$M)
        } else if (indep) { # if D > 1 and the covariates are independent
            prior <- list(alpha = 1, beta = 1, m = colMeans(X), v = D + 1, M = rep(1, D))
        } else { # if D > 1 and the covariates are not independent
            prior <- list(alpha = 1, beta = 1, m = colMeans(X), v = D + 1, M = diag(1, D, D))
            prior$logW <- log(1 / det(prior$M))
        }
    }

    model <- prior; model$Resp <- Resp # model initialisation

    model <- maximizeGauss(X, model, prior, ms, indep)
    for (iter in seq_len(maxiter)) {
        if (verbose) message(sprintf("Iteration number %d. ", iter))

        model <- expectGauss(X, model, ms, indep) # Expectation step
        L[iter * 2] <- boundGauss(X, model, prior, indep) / N # Lower bound

        model <- maximizeGauss(X, model, prior, ms, indep) # Maximisation step
        L[iter * 2 + 1] <- boundGauss(X, model, prior, indep) / N # Lower bound
        if (ms) n_comp[iter] <- sum(model$piw > 0.00001) # number of clusters still in use

        if (check_convergence(L, iter, tol, maxiter, verbose)) break # check for convergence
    }

    # Return the post-M-step bound of each completed iteration. The explicit
    # parentheses make the precedence clear: (1:iter)*2+1, i.e. the odd slots
    # 3, 5, ..., 2*iter+1 (the original `1:iter*2+1` already evaluated this way
    # because `:` binds tighter than `*`).
    output <- list(L = L[(1:iter) * 2 + 1],
                   label = apply(model$R, 1, which.max),
                   model = model)
    # Bug fix: the original assigned to a misspelled `ouput`, so n_comp was
    # silently dropped from the returned list when model selection was on.
    if (ms) output$n_comp <- n_comp

    return(output)
}
# acabassi/variational-ogc documentation built on May 23, 2019, 2:45 p.m.