#' Variational Bayesian inference for unsupervised clustering
#'
#' @param X NxD data matrix.
#' @param K (Maximum) number of clusters.
#' @param prior Prior parameters (optional).
#' @param ms Boolean flag that indicates whether model selection is required or not. Default is FALSE.
#' @param vs Boolean flag that indicates whether variable selection is required or not. Default is FALSE.
#' @param labels_init Initial cluster labels (can be empty).
#' @param kmeans_init Boolean flag, which, if TRUE, initializes the cluster labels with the k-means algorithm. Default is FALSE.
#' @param tol Tolerance on lower bound. Default is 10e-20.
#' @param maxiter Maximum number of iterations of the VB algorithm. Default is 2000.
#' @param verbose Boolean flag which, if TRUE, prints the iteration numbers. Default is FALSE.
#' @param indep Boolean flag which, if TRUE, indicates that the covariates are independent. Default is TRUE.
#' @return A list containing L, the lower bound at each step of the algorithm, label, a vector containing the cluster
#' labels, model, a list containing the trained model structure, and a vector called n_ comp which, if model selection
#' is required, contains the number of mixture components at every step of the VB algorithm.
#' @references Pattern Recognition and Machine Learning by Christopher M. Bishop.
#' @references This function is based on the MatLab code by Mo Chen (sth4nth@gmail.com).
#' @export
#'
unsupervisedClustering = function(X, K, prior, ms = F, vs = F, labels_init, kmeans_init = F, tol = 10e-20,
maxiter = 2000, verbose = F, indep = T){
if(vs) stop('Variable selection has not been implemented yet')
# ms = FALSE; labels_init = mylabels; tol = 0.000001; maxiter = 2000; verbose = TRUE
if(verbose) message(sprintf("Unsupervised clustering - Multivariate Gaussians \n"))
if(is.vector(X)){
N = length(X)
D = 1
}else{
N = dim(X)[1]
D = dim(X)[2]
}
L = rep(-Inf,maxiter*2+1)
if(ms) n_comp = rep(1000, maxiter)
Resp = init(X, K, labels_init, kmeans_init, verbose)
if(missing(prior)){ # set default prior
if(is.vector(X)){ # if D = 1
prior = list(alpha = 1, beta = 1, m = mean(X), v = 2, M = 1)
prior$logW = log(1/prior$M)
}else if(indep){ # if D > 1 and the covariates are indepedent
prior = list(alpha = 1, beta = 1, m = colMeans(X), v = D+1, M = rep(1,D));
}else{ # if D > 1 and the covariates are not independent
prior = list(alpha = 1, beta = 1, m = colMeans(X), v = D+1, M = diag(1,D,D))
prior$logW = log(1/det(prior$M))
}
}
model = prior; model$Resp = Resp # model initialisation
model = maximizeGauss(X, model, prior, ms, indep)
for (iter in 1:maxiter){
if(verbose) message(sprintf("Iteration number %d. ", iter))
model = expectGauss(X, model, ms, indep) # Expectation step
L[iter*2] = boundGauss(X, model, prior, indep)/N # Lower bound
model = maximizeGauss(X, model, prior, ms, indep) # Maximisation step
L[iter*2+1] = boundGauss(X, model, prior, indep)/N # Lower bound
if(ms) n_comp[iter] = sum(model$piw>0.00001) # number of clusters still in use
if(check_convergence(L, iter, tol, maxiter, verbose)) break # check for convergence
}
output = list(L = L[1:iter*2+1], label = apply(model$R, 1, which.max), model=model)
if(ms) ouput$n_comp = n_comp
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.