tests/benchmark/distributed.R

require(seededlda)
require(quanteda)

data_corpus_guardian <- readRDS('../Dropbox/Public/data_corpus_guardian2016.rds')
data_corpus_guardian <- readRDS('/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds')
#data_corpus_guardian <- readRDS('C:/Users/watan/Dropbox/Public/data_corpus_guardian2016-10k.rds')

toks <- tokens(data_corpus_guardian,
               remove_punct = TRUE,
               remove_symbols = TRUE,
               remove_number = TRUE)
dfmt <- dfm(toks) %>%
    dfm_remove(stopwords(), min_nchar = 2) %>%
    dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")

lda0 <- textmodel_lda(dfmt, k = 20, verbose = TRUE, max_iter = 200, batch_size = 1)
lda1 <- textmodel_lda(dfmt, k = 20, verbose = TRUE, max_iter = 200, batch_size = 0.5)
lda2 <- textmodel_lda(dfmt, k = 20, verbose = TRUE, max_iter = 200, batch_size = 0.2)
lda3 <- textmodel_lda(dfmt, k = 20, verbose = TRUE, max_iter = 200, batch_size = 0.1)
lda4 <- textmodel_lda(dfmt, k = 20, verbose = TRUE, max_iter = 200, batch_size = 0.05)

microbenchmark::microbenchmark(
    para = textmodel_lda(dfmt, k = 20, verbose = TRUE, batch_size = 0.01),
    auto = textmodel_lda(dfmt, k = 20, verbose = TRUE, batch_size = 0.01, auto_iter = TRUE),
    seri = textmodel_lda(dfmt, k = 20, verbose = TRUE),
    times = 1
)

system.time(textmodel_lda(dfmt, k = 20, verbose = TRUE, batch_size = 0.01))
system.time(textmodel_lda(dfmt, k = 20, verbose = TRUE, batch_size = 0.01, auto_iter = TRUE))
system.time(textmodel_lda(dfmt, k = 20, verbose = TRUE))

Try the seededlda package in your browser

Any scripts or data that you put into this service are public.

seededlda documentation built on April 4, 2025, 2:33 a.m.