R/deep_mou_gibbs.R
In deepMOU: Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

#' @title Deep Mixture of Unigrams
#'
#' @description Performs parameter estimation by means of Gibbs sampling and cluster allocation
#' for the Deep Mixture of Unigrams.
#'
#' @param x            Document-term matrix describing the frequency of terms that occur in a collection of documents. Rows correspond to documents in the collection and columns correspond to terms.
#' @param k            Number of clusters/groups at the top layer.
#' @param g            Number of clusters at the bottom layer.
#' @param n_it         Number of Gibbs steps.
#' @param seed_choice  Set seed for reproducible results.
#' @param burn_in      Number of initial Gibbs samples to be discarded and not included in the computation of final estimates.
#' @return A list containing the following elements:
#' \item{x}{The data matrix.}
#' \item{clusters}{the clustering labels.}
#' \item{k}{the number of clusters at the top layer.}
#' \item{g}{the number of clusters at the bottom layer.}
#' \item{numobs}{the sample size.}
#' \item{p}{the vocabulary size.}
#' \item{z1}{the allocation variables at the top layer.}
#' \item{z2}{the allocation variables at the bottom layer.}
#' \item{Alpha}{the estimates of Alpha parameters.}
#' \item{Beta}{the estimates of the Beta parameters.}
#' \item{pi_hat}{estimated probabilities of belonging to the \code{k} clusters at the top layer conditional to the \code{g} clusters at the bottom layer.}
#' \item{pi_hat_2}{estimated probabilities of belonging to the \code{g} clusters at the bottom layer.}
#' 
#'
#' @importFrom Rfast Lgamma
#' @importFrom stats runif rmultinom
#' @importFrom extraDistr rdirichlet rdirmnom
#'
#' @details Starting from the data matrix \code{x}, the Deep Mixture of Unigrams is fitted
#' and \code{k} clusters are obtained.
#' The algorithm for the estimation of the parameters is the Gibbs sampling.
#' In particular, the function assigns initial values to all the parameters to be estimated. Then \code{n_it} samples for the parameters are obtained using
#' conditional distributions on all the other parameters. The final estimates are obtained by averaging the samples given that initial \code{burn_in} samples are
#' discarded. Clustering is eventually performed by maximizing the posterior distribution of the latent variables.
#' For further details see the references.
#'
#' @references
#' Viroli C, Anderlucci L (2020). "Deep mixtures of Unigrams for uncovering topics in textual data." \emph{Statistics and Computing}, pp. 1-18. \doi{10.1007/s11222-020-09989-9}.
#'
#' @examples
#' # Load the CNAE2 dataset
#' data("CNAE2")
#'
#' # Perform parameter estimation and clustering, very few iterations used for this example
#' deep_CNAE2 = deep_mou_gibbs(x = CNAE2, k = 2, g = 2, n_it = 5, burn_in = 2)
#'
#' # Shows cluster labels to documents
#' deep_CNAE2$clusters
#' @export
deep_mou_gibbs = function (x, k, g, n_it = 500, seed_choice = 1, burn_in = 200)
{
  set.seed(seed_choice)
  progress_bar = txtProgressBar(min = 0, max = n_it, style = 3)
  setTxtProgressBar(progress_bar, 0)
  if (is.null(colnames(x)))
    colnames(x) = paste("W", sep = "", 1:dim(x)[2])
  max_x = max(colSums(x))
  numobs = nrow(x)
  p = ncol(x)
  lik = NULL
  Alpha_dep = Beta_dep = pi_hat_1_dep = pi_hat_2_dep = 0
  delta = 1
  z1 = matrix(0, numobs, k)
  z2 = matrix(0, numobs, g)
  gamma_prev = array(0, c(numobs, k, g))
  Beta = matrix(runif(k * p, 0, 20), p, k)
  Alpha = matrix(runif(g * p, -0.1, 0.1), p, g)
  if (g == 1)
    Alpha = matrix(0, p, g)
  pi_hat_2 = rep(1/g, g)
  pi_hat_1 = matrix(rep(1/k, k), k, g)
  edu_guess = mou_EM(x, k)
  z1[cbind(seq_along(edu_guess$cl), edu_guess$cl)] = 1
  for (i in 1:k) Beta[, i] = colSums(x[edu_guess$cl == i, ])
  Beta = ifelse(Beta == 0, 0.001, Beta)
  Beta = ifelse(Beta > max_x, max_x, Beta)
  for (iter in 1:n_it) {
    setTxtProgressBar(progress_bar, iter)
    for (i in 1:k) for (j in 1:g) {
      beta_alf_prod = Beta[, i] * (Alpha[, j] + 1)
      mat_beta_alf = matrix(beta_alf_prod, numobs, p, byrow = TRUE)
      den = rowSums(Lgamma(mat_beta_alf) - Lgamma(x + mat_beta_alf))
      num = Lgamma(rep(sum(beta_alf_prod), numobs)) - Lgamma(rowSums(x +
                                                                       mat_beta_alf))
      gamma_prev[, i, j] = num - den
    }
    gammaa = gamma_prev - max(gamma_prev) + 700
    gammaa = exp(gammaa)
    if (g > 1) {
      prob = matrix(0, numobs, g)
      for (j in 1:g) {
        prob[, j] = rowSums((matrix(pi_hat_1[, j, drop = FALSE] *
                                      pi_hat_2[j], numobs, k, byrow = T) * matrix(gammaa[,
                                                                                           , j], numobs, k)) * z1)
      }
      prob = prob/rowSums(prob)
      prob = ifelse(is.na(prob), 1/g, prob)
      z2 = t(apply(prob, 1, function(x) rmultinom(n = 1,
                                                  size = 1, prob = x)))
      if (colSums(z2)[j] == 0) {
        index = sample(1:numobs, 1)
        z2[index, j] = 1
      }
      n_j = colSums(z2)
      pi_hat_2 = rdirichlet(1, n_j + delta)
    }
    for (j in 1:g) {
      n_i = colSums(z1 * matrix(z2[, j], numobs, k))
      pi_hat_1[, j] = rdirichlet(1, n_i + delta)
    }
    if (g > 1) {
      Beta_n = t(Beta[, apply(z1, 1, which.max)])
      for (j in 1:g) {
        Beta_ni = Beta_n[z2[, j] == 1, , drop = FALSE]
        xx = x[z2[, j] == 1, , drop = FALSE]
        xx = ifelse(xx == 0, 0.001, xx)
        n_j = nrow(xx)
        size = (Beta_ni %*% (1 + Alpha[, j]))
        prop = rdirmnom(length(size), round(size), xx)
        prop = ifelse(is.na(prop), 0.001, prop)
        prop = ifelse(prop == 0, 0.001, prop)
        prop = prop/Beta_ni - 1
        prop = colMeans(prop)
        prop = ifelse(prop > 1, 1, prop)
        Alpha[, j] = prop
      }
    }
    if (g > 1)
      Alpha_n = t(Alpha[, apply(z2, 1, which.max)])
    else Alpha_n = matrix(0, numobs, p)
    for (i in 1:k) {
      Alpha_ni = Alpha_n[z1[, i] == 1, , drop = FALSE]
      xx = x[z1[, i] == 1, , drop = FALSE]
      xx = ifelse(xx == 0, 0.001, xx)
      size = Beta[, i] %*% t(1 + Alpha_ni)
      prop = rdirmnom(length(size), round(size), xx)
      prop = ifelse(is.na(prop), 0.001, prop)
      prop = ifelse(prop == 0, 0.001, prop)
      prop = prop/(1 + Alpha_ni)
      prop = colMeans(prop)
      prop = ifelse(is.na(prop), 0.001, prop)
      prop = ifelse(prop > max_x, max_x, prop)
      Beta[, i] = prop
    }
    if (k > 1) {
      prob = matrix(0, numobs, k)
      for (i in 1:k) {
        prob[, i] = rowSums((matrix(pi_hat_1[i, ],
                                    numobs, g, byrow = TRUE) * gammaa[, i, ]) *
                              z2)
      }
      prob = prob/rowSums(prob)
      prob = ifelse(is.na(prob), 1/k, prob)
      z1 = t(apply(prob, 1, function(x) rmultinom(1, 1,
                                                  x)))
    }
    if (iter > burn_in) {
      Beta_dep = (Beta_dep * (iter - burn_in - 1) + Beta)/(iter -
                                                             burn_in)
      Alpha_dep = (Alpha_dep * (iter - burn_in - 1) + Alpha)/(iter -
                                                                burn_in)
      pi_hat_1_dep = (pi_hat_1_dep * (iter - burn_in -
                                            1) + pi_hat_1)/(iter - burn_in)
      pi_hat_2_dep = (pi_hat_2_dep * (iter - burn_in -
                                            1) + pi_hat_2)/(iter - burn_in)
    }
  }
  Beta = Beta_dep
  Alpha = Alpha_dep
  pi_hat_1 = pi_hat_1_dep
  pi_hat_2 = pi_hat_2_dep
  for (i in 1:k) for (j in 1:g) {
    den = rowSums(lgamma(matrix(Beta[, i] * (Alpha[, j] +
                                               1), numobs, p, byrow = TRUE)) - lgamma(x + matrix(Beta[,
                                                                                                      i] * (Alpha[, j] + 1), numobs, p, byrow = TRUE)))
    num = lgamma(rep(sum(Beta[, i] * (Alpha[, j] + 1)), numobs)) -
      lgamma(rowSums(x + matrix(Beta[, i] * (Alpha[, j] +
                                               1), numobs, p, byrow = TRUE)))
    gamma_prev[, i, j] = num - den
  }
  gammaa = gamma_prev - max(gamma_prev) + 700
  gammaa = exp(gammaa)
  if (g > 1) {
    prob = matrix(0, numobs, g)
    for (j in 1:g) prob[, j] = rowSums((matrix(pi_hat_1[,
                                                          j] * pi_hat_2[j], numobs, k, byrow = TRUE) * gammaa[,
                                                                                                                , j]) * z1)
    prob = prob/rowSums(prob)
    prob = ifelse(is.na(prob), 1/g, prob)
    z2 = t(apply(prob, 1, function(x) rmultinom(1, 1, x)))
    if (colSums(z2)[j] == 0) {
      index = sample(1:numobs, 1)
      z2[index, j] = 1
    }
    n_j = colSums(z2)
  }
  prob = matrix(0, numobs, k)
  for (i in 1:k) prob[, i] = rowSums((matrix(pi_hat_1[i,
                                                        ], numobs, g, byrow = TRUE) * gammaa[, i, ]) * z2)
  prob = prob/rowSums(prob)
  prob = ifelse(is.na(prob), 1/k, prob)
  z1 = t(apply(prob, 1, function(x) rmultinom(1, 1, x)))
  clusters = apply(z1, 1, which.max)
  output = list(x = x, k = k, g = g, numobs = numobs, p = p,
                clusters = clusters, z1 = z1, z2 = z2, Alpha = Alpha_dep,
                Beta = Beta_dep, pi_hat = pi_hat_1_dep, pi_hat_2 = pi_hat_2_dep)
  close(progress_bar)
  class(output) ="deepMOU"
  return(output)
}

Any scripts or data that you put into this service are public.

deepMOU documentation built on March 4, 2021, 9:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

deepMOU
Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

R/deep_mou_gibbs.R
In deepMOU: Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

Try the deepMOU package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

deepMOU Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

R/deep_mou_gibbs.R In deepMOU: Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

Try the deepMOU package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

deepMOU
Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions

R/deep_mou_gibbs.R
In deepMOU: Clustering of Short Texts by Mixture of Unigrams and Its Deep Extensions