R/RcppExports.R

Defines functions fit_lda_c create_lexicon

Documented in create_lexicon fit_lda_c

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Build the lexicon that the Gibbs sampler loops over
#' @keywords internal
#' @description
#'   Performs one run of the Gibbs sampler, plus other setup work, to
#'   initialize some objects. Works in concert with
#'   \code{\link[tidylda]{initialize_topic_counts}}.
#' @param Cd_in IntegerMatrix of topic counts within documents
#' @param Beta_in NumericMatrix of word probabilities within topics
#' @param dtm_in arma::sp_mat document term matrix
#' @param alpha NumericVector prior for topics over documents
#' @param freeze_topics bool; set to \code{TRUE} when making predictions
#' @return A list with five entries.
#'
#'   \code{Docs}: a list of vectors, one per document, whose contents are
#'   token indices. Used as an iterator for the Gibbs sampler.
#'
#'   \code{Zd}: a list of vectors shaped like \code{Docs}, but holding the
#'   topic assignment of each document/token pair. Used as an iterator for
#'   Gibbs sampling.
#'
#'   \code{Cd}: a matrix counting how many times each topic is sampled per
#'   document.
#'
#'   \code{Cv}: a matrix counting how many times each topic is sampled per
#'   token.
#'
#'   \code{Ck}: a vector counting how many times each topic is sampled
#'   overall.
#'
#'   \code{Cd}, \code{Cv}, and \code{Ck} are all derived from \code{Zd}.
#' @details
#'   Arguments whose names end in \code{_in} are copied, and the copies are
#'   modified in some way by this function. For \code{Cd_in} and
#'   \code{Beta_in} the only modification is a conversion from matrices to
#'   nested \code{std::vector} for speed, reliability, and thread safety.
#'   \code{dtm_in} is transposed for speed when looping over columns.
create_lexicon <- function(Cd_in, Beta_in, dtm_in, alpha, freeze_topics) {
  # Thin wrapper: forward every argument, in order, to the compiled routine.
  .Call(
    `_tidylda_create_lexicon`,
    Cd_in,
    Beta_in,
    dtm_in,
    alpha,
    freeze_topics
  )
}

#' Core C++ Gibbs sampler for Latent Dirichlet Allocation
#' @keywords internal
#' @description
#'   The C++ Gibbs sampler for LDA. "Abandon all hope, ye who enter here."
#' @param Docs List with one element for each document and one entry for
#'   each token, as formatted by
#'   \code{\link[tidylda]{initialize_topic_counts}}
#' @param Zd_in List with one element for each document and one entry for
#'   each token, as formatted by
#'   \code{\link[tidylda]{initialize_topic_counts}}
#' @param Cd_in IntegerMatrix of topic counts within documents
#' @param Cv_in IntegerMatrix of token counts within topics
#' @param Ck_in IntegerVector of topic counts across all tokens
#' @param alpha_in NumericVector prior for topics over documents
#' @param eta_in NumericMatrix prior for tokens over topics
#' @param iterations int total number of Gibbs iterations to run
#' @param burnin int number of burn in iterations
#' @param optimize_alpha bool; should alpha be optimized each iteration?
#' @param calc_likelihood bool; should the log likelihood be calculated each
#'   iteration?
#' @param Beta_in NumericMatrix of token probabilities within topics
#' @param freeze_topics bool; set to \code{TRUE} when making predictions
#' @param threads unsigned integer, number of parallel threads.
#'        For now, nothing is actually parallel
#' @param verbose bool; should a progress bar be printed?
#' @return A list with the following entries.
#'
#'   \code{Cd}: a matrix counting how many times each topic is sampled per
#'   document.
#'
#'   \code{Cv}: a matrix counting how many times each topic is sampled per
#'   token.
#'
#'   \code{Cd_mean}: like \code{Cd}, but with values averaged across
#'   iterations greater than \code{burnin} iterations.
#'
#'   \code{Cv_mean}: like \code{Cv}, but with values averaged across
#'   iterations greater than \code{burnin} iterations.
#'
#'   \code{Cd_sum}: like \code{Cd}, but with values summed across iterations
#'   greater than \code{burnin} iterations.
#'
#'   \code{Cv_sum}: like \code{Cv}, but with values summed across iterations
#'   greater than \code{burnin} iterations.
#'
#'   \code{log_likelihood}: a matrix with one row indexing iterations and one
#'   row of the log likelihood for each iteration.
#'
#'   \code{alpha}: a vector of the document-topic prior.
#'
#'   \code{_eta}: a matrix of the topic-token prior.
#' @details
#'   Arguments whose names end in \code{_in} are copied, and the copies are
#'   modified in some way by this function. For \code{eta_in} and
#'   \code{Beta_in} the only modification is a conversion from matrices to
#'   nested \code{std::vector} for speed, reliability, and thread safety.
#'   All the others may be explicitly modified during training.
fit_lda_c <- function(Docs,
                      Zd_in,
                      Cd_in,
                      Cv_in,
                      Ck_in,
                      alpha_in,
                      eta_in,
                      iterations,
                      burnin,
                      optimize_alpha,
                      calc_likelihood,
                      Beta_in,
                      freeze_topics,
                      threads = 1L,
                      verbose = TRUE) {
  # Thin wrapper: arguments are forwarded positionally, so the order below
  # must stay identical to the signature above.
  .Call(
    `_tidylda_fit_lda_c`,
    Docs,
    Zd_in,
    Cd_in,
    Cv_in,
    Ck_in,
    alpha_in,
    eta_in,
    iterations,
    burnin,
    optimize_alpha,
    calc_likelihood,
    Beta_in,
    freeze_topics,
    threads,
    verbose
  )
}

# Register entry points for exported C++ functions.
# The action is handed to methods::setLoadAction(); its `ns` argument is
# accepted but not used by the body.
methods::setLoadAction(
  function(ns) {
    .Call(`_tidylda_RcppExport_registerCCallable`)
  }
)

Try the tidylda package in your browser

Any scripts or data that you put into this service are public.

tidylda documentation built on July 26, 2023, 5:34 p.m.