#' @title LDA Replications
#'
#' @description
#' Performs multiple runs of Latent Dirichlet Allocation.
#'
#' @details The function generates multiple LDA runs with the possibility of
#' using parallelization. The integration is done by the
#' \code{\link[parallelMap:parallelMap]{parallelMap-package}}.
#'
#' The function returns a \code{LDARep} object. You can receive results and
#' all other elements of this object with getter functions (see \code{\link{getJob}}).
#'
#' @family replication functions
#' @family LDA functions
#' @family workflow functions
#'
#' @param docs [\code{list}]\cr
#' Documents as received from \code{\link[tosca]{LDAprep}}.
#' @param vocab [\code{character}]\cr
#' Vocabularies passed to \code{\link[lda]{lda.collapsed.gibbs.sampler}}.
#' For additional (and necessary) arguments passed, see ellipsis (three-dot argument).
#' @param n [\code{integer(1)}]\cr
#' Number of Replications.
#' @param seeds [\code{integer(n)}]\cr
#' Random Seeds for each Replication.
#' @param id [\code{character(1)}]\cr
#' Name for the computation.
#' @param pm.backend [\code{character(1)}]\cr
#' One of "multicore", "socket" or "mpi".
#' If \code{pm.backend} is set, \code{\link[parallelMap]{parallelStart}} is
#' called before computation is started and \code{\link[parallelMap]{parallelStop}}
#' is called after.
#' @param ncpus [\code{integer(1)}]\cr
#' Number of (physical) CPUs to use. If \code{pm.backend} is passed,
#' default is determined by \code{\link[future]{availableCores}}.
#' @param ... additional arguments passed to \code{\link[lda]{lda.collapsed.gibbs.sampler}}.
#' Arguments will be coerced to a vector of length \code{n}.
#' Default parameters are \code{alpha = eta = 1/K} and \code{num.iterations = 200}.
#' There is no default for \code{K}.
#' @return [\code{named list}] with entries \code{id} for computation's name,
#' \code{jobs} for the parameter settings and \code{lda} for the results itself.
#'
#' @examples
#' res = LDARep(docs = reuters_docs, vocab = reuters_vocab, n = 4, seeds = 1:4,
#' id = "myComputation", K = 7:10, alpha = 1, eta = 0.01, num.iterations = 20)
#' res
#' getJob(res)
#' getID(res)
#' getLDA(res, 4)
#'
#' \donttest{
#' LDARep(docs = reuters_docs, vocab = reuters_vocab,
#' K = 10, num.iterations = 100, pm.backend = "socket")
#' }
#'
#' @export LDARep
LDARep = function(docs, vocab, n = 100, seeds, id = "LDARep", pm.backend, ncpus, ...){
assert_string(id, min.chars = 1)
assert_list(docs, min.len = 1, names = "unique", types = "matrix", any.missing = FALSE)
stopifnot(all(sapply(docs, nrow) == 2),
all(sapply(docs, function(x) all(x[2,] == 1))))
assert_character(vocab, any.missing = FALSE, unique = TRUE, min.len = 2)
assert_int(n, lower = 1)
args = .paramList(n = n, ...)
if (missing(seeds) || length(seeds) != n){
message("No seeds given or length of given seeds differs from number of replications: sample seeds. Sampled seeds can be obtained via getJob().")
if (!exists(".Random.seed", envir = globalenv())){
runif(1)
}
oldseed = .Random.seed
seeds = sample(9999999, n)
.Random.seed <<- oldseed
}
if (anyDuplicated(seeds)){
message(sum(duplicated(seeds)), " duplicated seeds.")
}
args$seed = seeds
args$fun = function(seed, ...){
set.seed(seed)
LDA(lda.collapsed.gibbs.sampler(documents = docs, vocab = vocab, ...),
param = list(...))
}
if (!missing(pm.backend) && !is.null(pm.backend)){
if (missing(ncpus) || is.null(ncpus)) ncpus = future::availableCores()
assert_choice(pm.backend, choices = c("multicore", "socket", "mpi"))
assert_int(ncpus, lower = 1)
parallelMap::parallelStart(mode = pm.backend, cpus = ncpus)
parallelMap::parallelExport("docs", "vocab")
}else{
suppressWarnings(parallelMap::parallelExport("docs", "vocab"))
}
ldas = do.call(parallelMap::parallelMap, args = args)
if (!missing(pm.backend) && !is.null(pm.backend)) parallelMap::parallelStop()
job.id = seq_len(n)
args = data.table(job.id = job.id,
do.call(cbind, args[names(args) != "fun"]))
names(ldas) = job.id
res = list(id = id, lda = ldas, jobs = args)
class(res) = "LDARep"
res
}
#' @export
print.LDARep = function(x, ...){
jobs = getJob(x)
parameters = unique(jobs[, !colnames(jobs) %in% c("job.id", "seed"), with = FALSE])
if (nrow(parameters) == 1){
parameters = paste0("parameters ",
paste0(paste0(colnames(parameters), ": ", as.character(round(parameters, 4))), collapse = ", "))
}else{
parameters = paste0(nrow(parameters), " different parameter sets.")
}
cat(
"LDARep Object \"", getID(x), "\"\n ",
nrow(jobs), " LDA Runs", "\n ",
"with ", parameters, "\n\n",
sep = ""
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.