# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' LDA: Serial Tempering with Perplexity Computation
#'
#' Implements the LDA serial tempering algorithm. Sampling of the
#' \eqn{z_{di}}'s is adapted from the collapsed Gibbs sampling chain of
#' Griffiths and Steyvers (2004). To compute perplexity, it first partitions
#' each document in the corpus into two sets of words: 
#'   (a) a test set (held-out set) and 
#'   (b) a training set, given a user defined \code{test_set_share}. 
#' Then, it runs the Markov chain based on the training set and computes 
#' perplexity for the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using 
#'   \code{\link{read_docs}} (term indices start at 0)
#' @param h_grid A 2-dimensional grid of hyperparameters \eqn{h = (\eta, 
#'   \alpha)}. It is a 2 x G matrix, where G is the number of grid points and 
#'   the first row is for \eqn{\alpha} values and the second row is for 
#'   \eqn{\eta} values
#' @param st_grid A 2-dimensional grid of hyperparameters \eqn{h = (\eta, 
#'   \alpha)}. It is a 2 x G matrix, where G is the number of grid points and 
#'   the first row is for \eqn{\alpha} values and the second row is for 
#'   \eqn{\eta} values. This is a subgrid of \code{h_grid} that is used for 
#'   serial tempering
#' @param st_grid_nbrs The neighbor indices, from [0, G-1], of each helper grid
#'   point
#' @param init_st_grid_index Index of the helper h grid, from [1, G], of the 
#'   initial hyperparameter \eqn{h = (\eta, \alpha)}
#' @param zetas  Initial guess for normalization constants
#' @param tuning_iter Number of tuning iterations
#' @param max_iter_tuning Maximum number of Gibbs iterations to be performed
#'   for the tuning iterations
#' @param max_iter_final Maximum number of Gibbs iterations to be performed for
#'   the final run
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param test_set_share Proportion of the test words in each document. Must be
#'   between 0 and 1
#' @param save_beta If 0 the function does not save \eqn{\beta} samples
#' @param save_theta If 0 the function does not save \eqn{\theta} samples
#' @param save_lp If 0 the function does not save computed log posterior for 
#'   iterations
#' @param save_hat_ratios If 0 the function does not save hat ratios for 
#'   iterations
#' @param save_tilde_ratios If 0 the function does not save tilde ratios for 
#'   iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#'
#' @return A list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#' 
#' @note 
#'  Modified on:
#'  
#'  October 01, 2016 - Created; adapted from lda_fgs_st.cpp 
#'
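#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to come from
#' # read_docs() on a Blei-format corpus, and `h.grid`, `st.grid`, and
#' # `st.grid.nbrs` are assumed to be built as described above (2 x G
#' # matrices with alpha in row 1 and eta in row 2, plus neighbor indices).
#' # All settings below are placeholders, not recommended defaults.
#' st <- lda_acgs_st(num_topics = 20, vocab_size = 5000, docs_tf = docs,
#'                   h_grid = h.grid, st_grid = st.grid,
#'                   st_grid_nbrs = st.grid.nbrs, init_st_grid_index = 1,
#'                   zetas = rep(1, ncol(st.grid)), tuning_iter = 5,
#'                   max_iter_tuning = 1000, max_iter_final = 5000,
#'                   burn_in = 1000, spacing = 10, test_set_share = 0.2,
#'                   save_beta = 0, save_theta = 0, save_lp = 1,
#'                   save_hat_ratios = 0, save_tilde_ratios = 0, verbose = 1)
#' st$perplexity
#' }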
lda_acgs_st <- function(num_topics, vocab_size, docs_tf, h_grid, st_grid, st_grid_nbrs, init_st_grid_index, zetas, tuning_iter, max_iter_tuning, max_iter_final, burn_in, spacing, test_set_share, save_beta, save_theta, save_lp, save_hat_ratios, save_tilde_ratios, verbose) {
    .Call('_ldamcmc_lda_acgs_st', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, h_grid, st_grid, st_grid_nbrs, init_st_grid_index, zetas, tuning_iter, max_iter_tuning, max_iter_final, burn_in, spacing, test_set_share, save_beta, save_theta, save_lp, save_hat_ratios, save_tilde_ratios, verbose)
}

#' LDA: Collapsed Gibbs Sampler with Perplexity Computation
#'
#' This implements the collapsed Gibbs sampler for the LDA model---a Markov
#' chain on \eqn{z}. To compute perplexity, it first
#' partitions each document in the corpus into two sets of words: (a) a test
#' set (held-out set) and (b) a training set, given a user defined
#' \code{test_set_share}. Then, it runs the Markov chain based on the training
#' set and computes perplexity for the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using
#'             \code{\link{read_docs}} (term indices start at 0)
#' @param alpha_h Hyperparameter for \eqn{\theta} sampling
#' @param eta_h Smoothing parameter for the \eqn{\beta} matrix
#' @param max_iter Maximum number of Gibbs iterations to be performed
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param save_beta if 0 the function does not save \eqn{\beta} samples
#' @param save_theta if 0 the function does not save \eqn{\theta} samples
#' @param save_lp if 0 the function does not save computed log posterior for
#'                iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#' @param test_set_share Proportion of the test words in each document. Must be
#'                       between 0 and 1
#'
#' @return The Markov chain output as a list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#'
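#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to be the output
#' # of read_docs() on a Blei-format corpus; all settings below are
#' # placeholders, not recommended defaults.
#' mc <- lda_cgs_perplexity(num_topics = 20, vocab_size = 5000, docs_tf = docs,
#'                          alpha_h = 0.1, eta_h = 0.01, max_iter = 2000,
#'                          burn_in = 1000, spacing = 10, save_theta = 0,
#'                          save_beta = 0, save_lp = 1, verbose = 1,
#'                          test_set_share = 0.2)
#' mc$perplexity
#' }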
lda_cgs_perplexity <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share) {
    .Call('_ldamcmc_lda_cgs_perplexity', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share)
}

#' LDA: Gibbs-EM with Perplexity Computation
#'
#' This implements the Gibbs-EM algorithm for LDA described in Wallach (2006),
#' Topic Modeling: Beyond Bag-of-Words.
#' 
#' It uses the LDA collapsed Gibbs sampler---a Markov chain on \eqn{z}---for
#' the E-step, and Minka's (2003) fixed-point iterations to optimize \eqn{h =
#' (\eta, \alpha)} in the M-step. To compute perplexity, it first partitions each
#' document in the corpus into two sets of words: (a) a test set (held-out set)
#' and (b) a training set, given a user defined \code{test_set_share}. Then, it
#' runs the Markov chain based on the training set and computes perplexity for
#' the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using
#'             \code{\link{read_docs}} (term indices start at 0)
#' @param alpha_h Hyperparameter for \eqn{\theta} sampling
#' @param eta_h Smoothing parameter for the \eqn{\beta} matrix
#' @param em_max_iter Maximum number of EM iterations to be performed
#' @param gibbs_max_iter Maximum number of Gibbs iterations to be performed
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param save_beta if 0 the function does not save \eqn{\beta} samples
#' @param save_theta if 0 the function does not save \eqn{\theta} samples
#' @param save_lp if 0 the function does not save computed log posterior for
#'                iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#' @param test_set_share Proportion of the test words in each document. Must be
#'                       between 0 and 1
#'
#' @return The Markov chain output as a list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#'
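#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to come from
#' # read_docs() on a Blei-format corpus; alpha_h and eta_h act only as
#' # initial values here, since the M-step re-estimates h. All settings are
#' # placeholders.
#' em <- lda_cgs_em_perplexity(num_topics = 20, vocab_size = 5000,
#'                             docs_tf = docs, alpha_h = 0.1, eta_h = 0.01,
#'                             em_max_iter = 20, gibbs_max_iter = 500,
#'                             burn_in = 100, spacing = 5, save_theta = 0,
#'                             save_beta = 0, save_lp = 1, verbose = 1,
#'                             test_set_share = 0.2)
#' em$perplexity
#' }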
lda_cgs_em_perplexity <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, em_max_iter, gibbs_max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share) {
    .Call('_ldamcmc_lda_cgs_em_perplexity', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, em_max_iter, gibbs_max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share)
}

#' LDA: Gibbs-EM
#'
#' This implements the Gibbs-EM algorithm for LDA described in Wallach (2006),
#' Topic Modeling: Beyond Bag-of-Words.
#'
#' @export
#'
#' @family MCMC
#'
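#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: the arguments mirror those of
#' # lda_cgs_em_perplexity() minus the save_* and test_set_share options;
#' # `docs` is assumed to come from read_docs() and all values are
#' # placeholders.
#' em <- lda_cgs_em(num_topics = 20, vocab_size = 5000, docs_tf = docs,
#'                  alpha_h = 0.1, eta_h = 0.01, em_max_iter = 20,
#'                  gibbs_max_iter = 500, burn_in = 100, spacing = 5,
#'                  verbose = 1)
#' }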
lda_cgs_em <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, em_max_iter, gibbs_max_iter, burn_in, spacing, verbose) {
    .Call('_ldamcmc_lda_cgs_em', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, em_max_iter, gibbs_max_iter, burn_in, spacing, verbose)
}

#' LDA: Full Gibbs Sampler with Perplexity Computation
#'
#' Implements the Full Gibbs sampler for the LDA model---a Markov chain on 
#' \eqn{(\beta, \theta, z)}. To compute perplexity, it first
#' partitions each document in the corpus into two sets of words: (a) a test
#' set (held-out set) and (b) a training set, given a user defined
#' \code{test_set_share}. Then, it runs the Markov chain based on the training
#' set and computes perplexity for the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using
#'             \code{\link{read_docs}} (term indices start at 0)
#' @param alpha_h Hyperparameter for \eqn{\theta} sampling
#' @param eta_h Smoothing parameter for the \eqn{\beta} matrix
#' @param max_iter Maximum number of Gibbs iterations to be performed
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param save_beta if 0 the function does not save \eqn{\beta} samples
#' @param save_theta if 0 the function does not save \eqn{\theta} samples
#' @param save_lp if 0 the function does not save computed log posterior for
#'                iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#' @param test_set_share Proportion of the test words in each document. Must be
#'                       between 0 and 1
#'
#' @return The Markov chain output as a list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#'
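#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to come from
#' # read_docs() on a Blei-format corpus; all settings are placeholders.
#' mc <- lda_fgs_perplexity(num_topics = 20, vocab_size = 5000, docs_tf = docs,
#'                          alpha_h = 0.1, eta_h = 0.01, max_iter = 2000,
#'                          burn_in = 1000, spacing = 10, save_theta = 0,
#'                          save_beta = 0, save_lp = 1, verbose = 1,
#'                          test_set_share = 0.2)
#' mc$perplexity
#' }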
lda_fgs_perplexity <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share) {
    .Call('_ldamcmc_lda_fgs_perplexity', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, verbose, test_set_share)
}

#' LDA: Estimate Bayes Factors using Full Gibbs Sampler
#'
#' Implements the Full Gibbs sampler for the LDA model---a Markov chain on
#' \eqn{(\beta, \theta, z)}---and uses its samples to estimate Bayes factors
#' over the hyperparameter grid \code{h_grid}. To compute perplexity, it first
#' partitions each document in the corpus into two sets of words: (a) a test
#' set (held-out set) and (b) a training set, given a user defined
#' \code{test_set_share}. Then, it runs the Markov chain based on the training
#' set and computes perplexity for the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using
#'             \code{\link{read_docs}} (term indices start at 0)
#' @param alpha_h Hyperparameter for \eqn{\theta} sampling
#' @param eta_h Smoothing parameter for the \eqn{\beta} matrix
#' @param h_grid Grid of \eqn{(\alpha, \eta)} values
#' @param max_iter Maximum number of Gibbs iterations to be performed
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param save_beta if 0 the function does not save \eqn{\beta} samples
#' @param save_theta if 0 the function does not save \eqn{\theta} samples
#' @param save_lp if 0 the function does not save computed log posterior for
#'                iterations
#' @param save_BF if 0 the function does not save the estimated Bayes factors
#'                for iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#' @param test_set_share Proportion of the test words in each document. Must be
#'                       between 0 and 1
#'
#' @return The Markov chain output as a list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#'
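#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to come from
#' # read_docs(), and h.grid is a hypothetical 2 x G grid with alpha values
#' # in row 1 and eta values in row 2. All settings are placeholders.
#' h.grid <- rbind(alpha = rep(seq(0.05, 0.25, by = 0.05), times = 4),
#'                 eta = rep(seq(0.25, 1, by = 0.25), each = 5))
#' bf <- lda_fgs_BF_perplexity(num_topics = 20, vocab_size = 5000,
#'                             docs_tf = docs, alpha_h = 0.1, eta_h = 0.5,
#'                             h_grid = h.grid, max_iter = 2000,
#'                             burn_in = 1000, spacing = 10, save_theta = 0,
#'                             save_beta = 0, save_lp = 0, save_BF = 1,
#'                             verbose = 1, test_set_share = 0.2)
#' bf$perplexity
#' }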
lda_fgs_BF_perplexity <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, h_grid, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, save_BF, verbose, test_set_share) {
    .Call('_ldamcmc_lda_fgs_BF_perplexity', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, h_grid, max_iter, burn_in, spacing, save_theta, save_beta, save_lp, save_BF, verbose, test_set_share)
}

#' LDA: Full Gibbs Sampler with Posterior Predictive Value
#'
#' Implements the Full Gibbs sampler for the LDA model---a Markov chain on
#' \eqn{(\beta, \theta, z)}. The log posterior predictive value is based on 
#' Zhe Chen (2015).
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using
#'             \code{\link{read_docs}} (term indices start at 0)
#' @param alpha_h Hyperparameter for \eqn{\theta} sampling
#' @param eta_h Smoothing parameter for the \eqn{\beta} matrix
#' @param max_iter Maximum number of Gibbs iterations to be performed
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param verbose Verbosity level; one of {0, 1, 2}
#'
#' @return The Markov chain output as a list of
#'   \item{lppv}{log posterior predictive value of each document}
#'   \item{lppc}{average of the log posterior predictive values}
#'
#' @export
#'
#' @family MCMC
#'
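#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs` is assumed to come from
#' # read_docs() on a Blei-format corpus; all settings are placeholders.
#' ppc <- lda_fgs_ppc(num_topics = 20, vocab_size = 5000, docs_tf = docs,
#'                    alpha_h = 0.1, eta_h = 0.01, max_iter = 2000,
#'                    burn_in = 1000, spacing = 10, verbose = 1)
#' ppc$lppc
#' }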
lda_fgs_ppc <- function(num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, verbose) {
    .Call('_ldamcmc_lda_fgs_ppc', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, alpha_h, eta_h, max_iter, burn_in, spacing, verbose)
}

#' LDA: Serial Tempering with Perplexity Computation
#'
#' Implements the LDA serial tempering algorithm. To compute perplexity, it 
#' first partitions each document in the corpus into two sets of words: 
#'   (a) a test set (held-out set) and 
#'   (b) a training set, given a user defined \code{test_set_share}. 
#' Then, it runs the Markov chain based on the training set and computes 
#' perplexity for the held-out set.
#'
#' @param num_topics Number of topics in the corpus
#' @param vocab_size  Vocabulary size
#' @param docs_tf A list of corpus documents read from the Blei corpus using 
#'   \code{\link{read_docs}} (term indices start at 0)
#' @param h_grid a 2-dimensional grid of hyperparameters \eqn{h = (\eta, 
#'   \alpha)}. It is a 2 x G matrix, where G is the number of grid points and 
#'   the first row is for \eqn{\alpha} values and the second row is for 
#'   \eqn{\eta} values
#' @param st_grid a 2-dimensional grid of hyperparameters \eqn{h = (\eta, 
#'   \alpha)}. It is a 2 x G matrix, where G is the number of grid points and 
#'   the first row is for \eqn{\alpha} values and the second row is for 
#'   \eqn{\eta} values. This is a subgrid of \code{h_grid} that is used for 
#'   serial tempering
#' @param st_grid_nbrs the neighbor indices, from [0, G-1], of each helper grid
#'   point
#' @param init_st_grid_index index of the helper h grid, from [1, G], of the 
#'   initial hyperparameter \eqn{h = (\eta, \alpha)}
#' @param zetas  Initial guess for normalization constants
#' @param tuning_iter number of tuning iterations
#' @param max_iter_tuning Maximum number of Gibbs iterations to be performed
#'   for the tuning iterations
#' @param max_iter_final Maximum number of Gibbs iterations to be performed for
#'   the final run
#' @param burn_in Burn-in-period for the Gibbs sampler
#' @param spacing Spacing between the stored samples (to reduce correlation)
#' @param test_set_share Proportion of the test words in each document. Must be
#'   between 0 and 1
#' @param save_beta if 0 the function does not save \eqn{\beta} samples
#' @param save_theta if 0 the function does not save \eqn{\theta} samples
#' @param save_lp if 0 the function does not save computed log posterior for 
#'   iterations
#' @param save_hat_ratios if 0 the function does not save hat ratios for 
#'   iterations
#' @param save_tilde_ratios if 0 the function does not save tilde ratios for 
#'   iterations
#' @param verbose Verbosity level; one of {0, 1, 2}
#'
#' @return The Markov chain output as a list of
#'   \item{corpus_topic_counts}{corpus-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{theta_counts}{document-level topic counts from last iteration
#'   of the Markov chain}
#'   \item{beta_counts}{topic word counts from last iteration of the Markov chain}
#'   \item{theta_samples}{\eqn{\theta} samples after the burn in period, if
#'   \code{save_theta} is set}
#'   \item{beta_samples}{\eqn{\beta} samples after the burn in period, if
#'   \code{save_beta} is set}
#'   \item{log_posterior}{the log posterior (up to a constant multiplier) of
#'   the hidden variable \eqn{\psi = (\beta, \theta, z)} in the LDA model,
#'   if \code{save_lp} is set}
#'   \item{perplexity}{perplexity of the held-out words' set}
#'
#' @export
#'
#' @family MCMC
#' 
#' @note 
#'  Modified on:
#'  
#'  May 30, 2015  
#'  
#'  January 28, 2016
#'  
#'  April 05, 2016 
#'
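#' @examples
#' \dontrun{
#' # A minimal sketch, not a tested call: `docs`, `h.grid`, `st.grid`, and
#' # `st.grid.nbrs` are assumed to be prepared as described above; the call
#' # mirrors lda_acgs_st() and all settings are placeholders.
#' st <- lda_fgs_st_perplexity(num_topics = 20, vocab_size = 5000,
#'                             docs_tf = docs, h_grid = h.grid,
#'                             st_grid = st.grid, st_grid_nbrs = st.grid.nbrs,
#'                             init_st_grid_index = 1,
#'                             zetas = rep(1, ncol(st.grid)), tuning_iter = 5,
#'                             max_iter_tuning = 1000, max_iter_final = 5000,
#'                             burn_in = 1000, spacing = 10,
#'                             test_set_share = 0.2, save_beta = 0,
#'                             save_theta = 0, save_lp = 1,
#'                             save_hat_ratios = 0, save_tilde_ratios = 0,
#'                             verbose = 1)
#' st$perplexity
#' }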
lda_fgs_st_perplexity <- function(num_topics, vocab_size, docs_tf, h_grid, st_grid, st_grid_nbrs, init_st_grid_index, zetas, tuning_iter, max_iter_tuning, max_iter_final, burn_in, spacing, test_set_share, save_beta, save_theta, save_lp, save_hat_ratios, save_tilde_ratios, verbose) {
    .Call('_ldamcmc_lda_fgs_st_perplexity', PACKAGE = 'ldamcmc', num_topics, vocab_size, docs_tf, h_grid, st_grid, st_grid_nbrs, init_st_grid_index, zetas, tuning_iter, max_iter_tuning, max_iter_final, burn_in, spacing, test_set_share, save_beta, save_theta, save_lp, save_hat_ratios, save_tilde_ratios, verbose)
}

#' Samples from the Antoniak distribution
#'
#' A draw is generated by sampling \eqn{N} Bernoulli variables.
#'
#' @references
#'   \url{http://www.jmlr.org/papers/volume10/newman09a/newman09a.pdf}
#'
#' @param N Number of samples
#' @param alpha strength parameter
#'
#' @export
#'
#' @family utils
#'
#' @note
#'
#' Created on: May 19, 2016
#'
#' Created by: Clint P. George
#'
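#' @examples
#' \dontrun{
#' # Illustrative values only, not a recommended setting.
#' sample_antoniak(N = 50, alpha = 0.5)
#' }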
sample_antoniak <- function(N, alpha) {
    .Call('_ldamcmc_sample_antoniak', PACKAGE = 'ldamcmc', N, alpha)
}

#' Fast sampling from a multinomial distribution
#'
#' @param theta a multinomial probability vector (K x 1 vector)
#'
#' @return A class index in [0, K)
#'
#' @note
#' Author: Clint P. George
#'
#' Created on: February 11, 2016
#'
#' @family utils
#'
#' @export
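#' @examples
#' \dontrun{
#' # Illustrative values only: draws one class index from the probability
#' # vector (0.2, 0.3, 0.5).
#' sample_multinomial(c(0.2, 0.3, 0.5))
#' }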
sample_multinomial <- function(theta) {
    .Call('_ldamcmc_sample_multinomial', PACKAGE = 'ldamcmc', theta)
}

#' Samples from a Dirichlet distribution given a hyperparameter
#'
#' @param num_elements the dimension of the Dirichlet distribution
#' @param alpha the hyperparameter vector (a column vector)
#'
#' @return A Dirichlet sample (a column vector)
#'
#' @note
#' Author: Clint P. George
#'
#' Created on: 2014
#'
#' @family utils
#'
#' @export
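#' @examples
#' \dontrun{
#' # Illustrative values only: a single draw from a symmetric Dirichlet(2)
#' # over 3 elements; the returned vector sums to 1.
#' sample_dirichlet(num_elements = 3, alpha = c(2, 2, 2))
#' }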
sample_dirichlet <- function(num_elements, alpha) {
    .Call('_ldamcmc_sample_dirichlet', PACKAGE = 'ldamcmc', num_elements, alpha)
}