R/ogc_categorical_response.R

Defines functions ogc_categoricalResponse

Documented in ogc_categoricalResponse

#' Variational Bayesian inference for outcome-guided clustering with categorical response
#'
#' This function allows you do semi-supervised clustering using variational Bayesian inference, when the response
#' variable y is discrete.
#' @param X NxD data matrix
#' @param y Vector of response variables of length N.
#' @param K (Maximum) number of clusters
#' @param R Number of categories in y.
#' @param prior Prior hyperparameters (optional).
#' @param ms Boolean flag that indicates whether model selection is required or not. Default is FALSE.
#' @param vs Boolean flag that indicates whether variable selection is required or not. Default is FALSE.
#' @param labels_init User-defined initial cluster labels (can be empty).
#' @param kmeans_init Boolean flag, which, if TRUE, initializes the cluster labels with the k-means algorithm. Default is FALSE.
#' @param tol Tolerance for stopping criterion. Default is 10e-20.
#' @param maxiter Maximum number of iterations. Default is 2000.
#' @param verbose Boolean flag which, if TRUE, prints the iteration number. Default is FALSE.
#' @param indep Boolean flag which determines whether the covariates are considered as independent or not. Default is TRUE.
#' @return A list containing L, the lower bound at each step of the algorithm, label, a vector containing the cluster
#' labels, model, a list containing the trained model structure, and a vector called n_ comp which, if model selection
#' is required, contains the number of mixture components at every step of the VB algorithm.
#' @references Pattern Recognition and Machine Learning by Christopher M. Bishop
#' @export
#'
ogc_categoricalResponse = function(X, K, y, prior, R = 0, ms = F, vs = F, labels_init, kmeans_init = F,
                                                          tol = 10e-20, maxiter = 2000, verbose = F, indep = T){

    if(verbose) message(sprintf("Semi-supervised clustering \n y ~ Categorical, X ~ MtvGaussian"))

    N = dim(X)[1]
    D = dim(X)[2]
    L = rep(-Inf,maxiter*2+1)
    if(ms) n_comp = rep(1000, maxiter)

    Resp = init(X, K, labels_init, kmeans_init, verbose)

    if(missing(prior)){ # default prior
        prior = list(alpha = 1, beta = 1, m = colMeans(X), v = D+1, M = diag(1,D,D), eps = matrix(1, R, K));
        prior$logW = log(1/det(prior$M))
    }
    model = prior; model$Resp = Resp # model initialisation

    model = maximizeGauss(X, model, prior, ms, indep)
    model = maximizeCatResp(y, model, prior)

    for (iter in 1:maxiter){
        if(verbose) message(sprintf("Iteration number %d.\n", iter))

        model = expectGauss_catResp(X, y, model, ms, indep) # Expectation step
        L[iter*2] = (boundGauss(X, model, prior, indep) + boundCatResp(y, model, prior))/N # Lower bound

        model = maximizeGauss(X, model, prior, ms, indep) # Maximisation step
        model = maximizeCatResp(y, model, prior)
        L[iter*2+1] = (boundGauss(X, model, prior, indep) + boundCatResp(y, model, prior))/N # Lower bound
        if(ms) n_comp[iter] = sum(model$piw > 10e-3)

        if(check_convergence(L, iter, tol, maxiter, verbose)) break
    }
    output = list(L = L[1:iter*2+1], label = apply(model$R, 1, which.max), model=model)
    if(ms) ouput$n_comp = n_comp
    output
}
acabassi/variational-ogc documentation built on May 23, 2019, 2:45 p.m.