test_that("models works", {
# set seed
seed <- 42L
set.seed(seed)
reticulate::py_set_seed(seed)
data(corpus, package = "gensimr")
docs <- prepare_documents(corpus)
dictionary <- corpora_dictionary(docs)
corpus_bow <- doc2bow(dictionary, docs)
corpus_mm <- serialize_mmcorpus(corpus_bow, auto_delete = FALSE)
mm <- read_serialized_mmcorpus(corpus_mm)
expect_type(mm, "environment")
# tfidf
tfidf_mm <- model_tfidf(mm)
tfidf_file <- model_tfidf(corpus_mm)
tfidf_object <- model_tfidf(corpus_mm)
expect_equal(tfidf_file, tfidf_object)
# apply
corpus_transformed_file <- wrap(tfidf_file, corpus_bow)
corpus_transformed_object <- wrap(tfidf_object, corpus_bow)
expect_equal(corpus_transformed_object, corpus_transformed_file)
# lsi
k <- 2L
lsi <- model_lsi(corpus_transformed_file, id2word = dictionary, num_topics = k)
expect_type(lsi, "environment")
wrapped_corpus <- wrap(lsi, corpus_transformed_file)
wrapped_corpus_docs <- get_docs_topics(wrapped_corpus)
expect_length(wrapped_corpus_docs, k * 2)
# fasttext
ft <- model_fasttext(size = 4L, window = 3L, min_count = 1L)
expect_type(ft, "environment")
ft$build_vocab(sentences = docs)
ft$train(sentences = docs, total_examples = length(docs), epochs = 10L)
sim <- ft$wv$most_similar(positive = c('computer', 'human'), negative = c('interface')) %>%
reticulate::py_to_r()
expect_equal(sim[[1]][[1]], "system")
match <- ft$wv$doesnt_match(c("human", "computer", "interface", "tree")) %>%
reticulate::py_to_r()
expect_equal(match, "tree")
# random projections
rp <- model_rp(corpus_transformed_file, id2word = dictionary, num_topics = k)
expect_type(rp, "environment")
wrapped_corpus <- wrap(rp, corpus_transformed_file)
wrapped_corpus_docs <- get_docs_topics(wrapped_corpus)
expect_length(wrapped_corpus_docs, k * 2)
# lda
lda <- model_lda(corpus_mm, id2word = dictionary, num_topics = k)
expect_type(lda, "environment")
lda <- model_lda(corpus_transformed_file, id2word = dictionary, num_topics = k)
expect_type(lda, "environment")
wrapped_corpus <- wrap(lda, corpus_transformed_file)
wrapped_corpus_docs <- get_docs_topics(wrapped_corpus)
expect_length(wrapped_corpus_docs, k * 2)
# ldamc
ldamc <- model_ldamc(mm, id2word = dictionary, num_topics = k)
expect_type(ldamc, "environment")
ldamc <- model_ldamc(corpus_mm, id2word = dictionary, num_topics = k)
expect_type(ldamc, "environment")
wrapped_corpus <- wrap(ldamc, corpus_transformed_file)
wrapped_corpus_docs <- get_docs_topics(wrapped_corpus)
expect_length(wrapped_corpus_docs, k * 2)
# log entropy
log_entropy <- model_logentropy(corpus_bow)
expect_type(log_entropy, "environment")
wrapped_corpus <- wrap(log_entropy, corpus_bow)
# hdp
hdp <- model_hdp(corpus_mm, id2word = dictionary)
expect_type(hdp, "environment")
hdp <- model_hdp(mm, id2word = dictionary)
expect_type(hdp, "environment")
topics <- reticulate::py_to_r(hdp$show_topic(topic_id = 1L, topn = 5L))
expect_length(topics, 5L)
# word2vec
word2vec <- model_word2vec(size = 100L, window = 5L, min_count = 1L)
expect_type(word2vec, "environment")
word2vec$build_vocab(docs)
word2vec$train(docs, total_examples = word2vec$corpus_count, epochs = 20L)
word2vec$init_sims(replace = TRUE)
sim <- word2vec$wv$most_similar(positive = c("interface")) %>%
reticulate::py_to_r()
match <- word2vec$wv$doesnt_match(c("human", "interface", "trees")) %>%
reticulate::py_to_r()
sim1 <- word2vec$wv$similarity("eps", "survey") %>%
reticulate::py_to_r()
sim2 <- word2vec$wv$similarity("human", "trees") %>%
reticulate::py_to_r()
# author topic
data("authors")
auth2doc <- auth2doc(authors, name, document)
expect_type(auth2doc, "environment")
# create temp to hold serialized data
temp <- tempfile("serialized")
# build model
atmodel <- model_at(
corpus_mm,
id2word = dictionary,
author2doc = auth2doc,
num_topics = 2L,
serialized = TRUE,
serialization_path = temp
)
expect_type(atmodel, "environment")
# delete temp
unlink(temp, recursive = TRUE)
atopics <- get_author_topics(atmodel)
expect_length(atopics, 5)
# map models
models <- map_model(
corpus = corpus_transformed_file,
id2word = dictionary,
num_topics = c(2, 4, 6)
)
# compute topic coherence of all models
models <- map_coherence(
models, corpus = corpus_transformed_file,
dictionary = dictionary,
coherence = 'u_mass'
)
plot(models)
print(models)
# coherence
coh <- get_coherence_data(models)
expect_length(coh, 3)
# perplexity
p <- get_perplexity_data(models)
expect_length(p, 3)
collect <- as_model_collection(list(lda, ldamc))
expect_length(collect, 2)
# cleanup
delete_mmcorpus(corpus_mm)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.