First we preprocess the corpus using the example data, a tiny corpus of 9 documents. This reproduces gensim's tutorial on corpora and vector spaces.
library(gensimr)

set.seed(42) # reproducibility

# sample data
data(corpus, package = "gensimr")
print(corpus)

# preprocess corpus
docs <- prepare_documents(corpus)

docs[[1]] # print first preprocessed document
Once preprocessed we can build a dictionary.
dictionary <- corpora_dictionary(docs)
A dictionary essentially assigns an integer to each term.
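To see this mapping we can inspect the dictionary's token2id attribute; a quick sketch, assuming dictionary is the raw gensim Dictionary object returned above (token2id is a Python dict, hence the conversion via reticulate):

# term -> integer id mapping of the underlying gensim Dictionary
reticulate::py_to_r(dictionary$token2id)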
doc2bow simply applies the method of the same name to every document (see example below); it counts the number of occurrences of each distinct word, converts the word to its integer word id, and returns the result as a sparse vector.
# native method on a single document
dictionary$doc2bow(docs[[1]])

# apply to all documents
corpus_bow <- doc2bow(dictionary, docs)
Then we serialise the corpus to Matrix Market format. The function returns the path to the file (the corpus is saved on disk for efficiency); if no path is passed, a temporary file is created. Here we set auto_delete to FALSE, otherwise the corpus is deleted after first use. Note that this means you should manually delete it with delete_mmcorpus.
(corpus_mm <- serialize_mmcorpus(corpus_bow, auto_delete = FALSE))
Then we initialise a model; later on we're going to use Latent Semantic Indexing (model_lsi), which requires tf-idf.
tfidf <- model_tfidf(corpus_mm)
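For reference, gensim's default tf-idf weighting multiplies a term's count in a document by its inverse document frequency and then L2-normalises each document vector (both weighting functions are configurable, so this is only the library default):

$$w_{i,j} = \mathrm{tf}_{i,j} \times \log_2\frac{D}{\mathrm{df}_i}$$

where $\mathrm{tf}_{i,j}$ is the count of term $i$ in document $j$, $\mathrm{df}_i$ is the number of documents containing term $i$, and $D$ is the total number of documents.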
We can then use the model to transform our original corpus.
corpus_transformed <- wrap(tfidf, corpus_bow)
Finally, we can build models. The number of topics of the model_* functions defaults to 2, which is too low for what we would generally do with gensimr but works for the small number of documents we have. Below we reproduce bits and bobs of the topics and transformations tutorial.
We start with latent semantic indexing; note that we use the transformed corpus.
lsi <- model_lsi(corpus_transformed, id2word = dictionary, num_topics = 2L)
lsi$print_topics()
We can then wrap the model around the corpus to extract further information; below we extract how each document contributes to each dimension (topic).
wrapped_corpus <- wrap(lsi, corpus_transformed)
(wrapped_corpus_docs <- get_docs_topics(wrapped_corpus))
plot(wrapped_corpus_docs$dimension_1_y, wrapped_corpus_docs$dimension_2_y)
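To make the scatter plot easier to read we can label each point with its document index using base R's text(); a minimal sketch, assuming get_docs_topics returns one row per document in corpus order:

plot(wrapped_corpus_docs$dimension_1_y, wrapped_corpus_docs$dimension_2_y)
# label each point with its (assumed) document index
text(
  wrapped_corpus_docs$dimension_1_y,
  wrapped_corpus_docs$dimension_2_y,
  labels = seq_len(nrow(wrapped_corpus_docs)),
  pos = 3
)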
Next, random projections; note that, here too, we use the transformed corpus.
rp <- model_rp(corpus_transformed, id2word = dictionary, num_topics = 2L)
wrapped_corpus <- wrap(rp, corpus_transformed)
wrapped_corpus_docs <- get_docs_topics(wrapped_corpus)
plot(wrapped_corpus_docs$dimension_1_y, wrapped_corpus_docs$dimension_2_y)
Next, latent Dirichlet allocation; note that we use the original, non-transformed corpus.
lda <- model_lda(corpus_mm, id2word = dictionary, num_topics = 2L)
lda_topics <- lda$get_document_topics(corpus_bow)
wrapped_corpus_docs <- get_docs_topics(lda_topics)
plot(wrapped_corpus_docs$dimension_1_y, wrapped_corpus_docs$dimension_2_y)
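As with LSI, the terms defining each topic can be listed with the print_topics method that gensim exposes on the fitted model:

# top terms per LDA topic, mirroring lsi$print_topics() above
lda$print_topics()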
The hierarchical Dirichlet process (model_hdp) infers the number of topics from the data, so no num_topics argument is passed.

hdp <- model_hdp(corpus_mm, id2word = dictionary)

# extract the top 5 words of topic 1
reticulate::py_to_r(hdp$show_topic(topic_id = 1L, topn = 5L))
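Since the number of topics is inferred, it can be useful to list several topics at once; a short sketch using gensim's print_topics method on the fitted model:

# first two inferred topics with their top five words
reticulate::py_to_r(hdp$print_topics(num_topics = 2L, num_words = 5L))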
We can also train fastText word embeddings on the preprocessed documents.

ft <- model_fasttext(size = 4L, window = 3L, min_count = 1L)
ft$build_vocab(sentences = unname(docs))
ft$train(sentences = unname(docs), total_examples = length(docs), epochs = 10L)

# most similar
ft$wv$most_similar(positive = c('computer', 'human'), negative = c('interface'))

# odd one out
ft$wv$doesnt_match(c("human", "computer", "interface", "tree"))

# similarity score
ft$wv$similarity('computer', 'human')
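A distinguishing feature of fastText is that word vectors are composed from character n-grams, so queries for words absent from the training vocabulary can still be answered; a minimal sketch, where the misspelled query token is a made-up example:

# "computr" never occurs in the corpus, but fastText builds a vector
# for it from character n-grams and still finds nearest neighbours
ft$wv$most_similar("computr")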
Next, the author-topic model; first we build the model.
# authors of corpus
data("authors", package = "gensimr")
auth2doc <- auth2doc(authors, name, document)

# create temp file to hold serialized data
temp <- tempfile("serialized")

# build model
atmodel <- model_at(
  corpus_mm,
  id2word = dictionary,
  author2doc = auth2doc,
  num_topics = 2L,
  serialized = TRUE,
  serialization_path = temp
)

# delete temp
unlink(temp, recursive = TRUE)
Then extract the topics for each author.
atmodel$get_author_topics("jack") # native method for a single author

# apply to all authors
get_author_topics(atmodel)
The log-entropy model is another weighting transformation; it normalises the bag-of-words corpus in a similar fashion to tf-idf.

log_entropy <- model_logentropy(corpus_bow)
wrap(log_entropy, corpus_bow)
Finally, clean up: delete the serialised corpus from disk.
delete_mmcorpus(corpus_mm)