# Document setup ----
# Chunk defaults for the rendered document: no warnings in the output,
# source and output collapsed into one block, output lines prefixed by "#>".
knitr::opts_chunk$set(
  warning = FALSE,
  collapse = TRUE,
  comment = "#>"
)

# Background colour used for base-graphics plots.
par(bg = "#f9f7f1")

# HTML tag builders used for the page header.
library(htmltools)

# Point reticulate at the Python virtual environment in ./env.
reticulate::use_virtualenv("./env")
# Page header ----
# Two line breaks, then a row holding the logo (col-md-4) next to the
# tagline and the navigation buttons (col-md-8).
br()
br()
div(
  class = "row",
  div(
    class = "col-md-4",
    img(
      src = "logo.png",
      class = "img-responsive responsive-img"
    )
  ),
  div(
    class = "col-md-8",
    p(
      "Topic Modeling for Humans with gensim.",
      br(),
      "Large scale efficient topic modeling in R and Python."
    ),
    br(),
    p(
      # Button linking to the installation article.
      tags$a(
        tags$i(class = "fa fa-level-down-alt"),
        class = "btn btn-primary",
        href = "articles/installation.html",
        style = "margin-bottom: 5px;",
        "Installation"
      ),
      # Button linking to the function reference.
      tags$a(
        tags$i(class = "fa fa-terminal"),
        class = "btn btn-default",
        href = "reference",
        style = "margin-bottom: 5px;",
        "Reference"
      )
    )
  )
)
Below we build a very basic Latent Dirichlet Allocation model aiming for 2 latent dimensions using example data.
# Build a two-topic LDA model on the package's example corpus.
# Each statement sits on its own line: with everything on a single line,
# the first `#` would comment out the rest of the pipeline.
library(gensimr)

# example corpus
data("corpus", package = "gensimr")

# preprocess documents
texts <- prepare_documents(corpus)
dictionary <- corpora_dictionary(texts)
# NOTE: this rebinds `corpus` from the raw example data to its
# bag-of-words representation; later steps use the bag-of-words version.
corpus <- doc2bow(dictionary, texts)

# create tf-idf model
tfidf <- model_tfidf(corpus)
tfidf_corpus <- wrap(tfidf, corpus)

# latent dirichlet allocation model with 2 topics
lda <- model_lda(tfidf_corpus, id2word = dictionary, num_topics = 2L)
topics <- lda$print_topics() # get topics
Objects returned by the package are not automatically converted to R data structures; use `reticulate::py_to_r()`,
as shown below, to convert them.
# Convert the `topics` object into native R data structures.
reticulate::py_to_r(topics)
We can then use our model to transform our corpus and then get the document-topic matrix.
# Apply the LDA model to the corpus.
corpus_wrapped <- wrap(lda, corpus)

# Extract the per-document topic values.
doc_topics <- get_docs_topics(corpus_wrapped)

# Plot the first topic dimension against the second.
plot(doc_topics$dimension_1_y, doc_topics$dimension_2_y)
# Apply the LDA model to the corpus.
corpus_wrapped <- wrap(lda, corpus)

# Extract the per-document topic values.
doc_topics <- get_docs_topics(corpus_wrapped)

# Set the plot background colour, then plot the first topic dimension
# against the second.
par(bg = "#f4f1e6")
plot(doc_topics$dimension_1_y, doc_topics$dimension_2_y)
The plot correctly identifies two topics/clusters. As stated in table 2 from this paper, the example corpus (`data(corpus)`)
essentially has two classes of documents: the first five are about human-computer interaction and the other four are about graphs.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.