First we preprocess the example data, a tiny corpus of 9 documents, reproducing the gensim tutorial on corpora and vector spaces.
```r
library(gensimr)

set.seed(42) # reproducibility

# sample data
data(corpus, package = "gensimr")
print(corpus)

# preprocess corpus
docs <- prepare_documents(corpus)

docs[[1]] # print first preprocessed document
```
Once preprocessed, we can build a dictionary.
```r
dictionary <- corpora_dictionary(docs)
```
A dictionary essentially assigns a unique integer id to each term. `doc2bow` simply applies the method of the same name to every document (see example below); it counts the number of occurrences of each distinct word, converts the word to its integer word id, and returns the result as a sparse vector.
```r
# native method on a single document
dictionary$doc2bow(docs[[1]])

# apply to all documents
corpus_bow <- doc2bow(dictionary, docs)
```
Then we serialise the bag-of-words corpus to Matrix Market format. The function returns the path to the file (the corpus is saved on disk for efficiency); if no path is passed, a temporary file is created. Here we set `auto_delete` to `FALSE`, otherwise the corpus would be deleted after its first use. Note that this means you should delete it manually with `delete_mmcorpus`.
```r
(corpus_mm <- serialize_mmcorpus(corpus_bow, auto_delete = FALSE))
```
Then we initialise a model. We're going to use Latent Semantic Indexing later on (`model_lsi`), which requires tf-idf.
```r
tfidf <- model_tfidf(corpus_mm)
```
We can then use the model to transform our original corpus.
```r
corpus_transformed <- wrap(tfidf, corpus_bow)
```
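As a quick sanity check (not part of the original tutorial), we can print the tf-idf weights of the first transformed document; the wrapped corpus can be indexed directly, as the scikit-learn comparison further down also does:

```r
# (word id, tf-idf weight) pairs for the first document
corpus_transformed[[1]]
```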
Next we fit an author-topic model, using gensimr's scikit-learn style API.
```r
# authors of corpus
data("authors", package = "gensimr")

# turn author-document pairs into the format expected by scikit-learn
auth2doc <- auth2doc(authors, name, document)

temp <- tempfile("serialized")
atmodel <- sklearn_at(
  id2word = dictionary,
  author2doc = auth2doc,
  num_topics = 2L,
  passes = 100L,
  serialized = TRUE,
  serialization_path = temp
)
unlink(temp, recursive = TRUE)

atmodel$fit(corpus_bow)$transform("jack")
```
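The transform returns a Python array; converting it yields an R matrix of per-topic probabilities for the author (a quick inspection of the model fit above):

```r
# topic distribution for the author "jack", as an R matrix
atmodel$transform("jack") %>%
  reticulate::py_to_r()
```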
Doc2vec, representing each document as a vector:

```r
d2v <- sklearn_doc2vec(min_count = 1, size = 5)
vectors <- d2v$fit_transform(docs)
```
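The result should be one 5-dimensional vector per document; converting to R makes that easy to verify (a minimal check, given the 9-document corpus above):

```r
# expect a 9 x 5 matrix: 9 documents, 5 dimensions each
dim(reticulate::py_to_r(vectors))
```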
A Hierarchical Dirichlet Process model:

```r
hdp <- sklearn_hdp(id2word = dictionary)
vectors <- hdp$fit_transform(corpus_bow)
```
Latent Dirichlet Allocation, plotting the document-topic probabilities:

```r
lda <- sklearn_lda(
  num_topics = 2L,
  id2word = dictionary,
  iterations = 20L,
  random_state = 1L
)

lda$fit_transform(corpus_bow) %>%
  reticulate::py_to_r() %>%
  plot()
```
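To see which words drive each topic, we can reach into the underlying gensim model. This sketch assumes the wrapper exposes the trained model as `gensim_model`, as gensim's own scikit-learn wrappers do:

```r
# print the two topics as weighted word lists
# note: `gensim_model` is an assumption about the wrapper's internals
lda$gensim_model$print_topics(2L)
```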
Create the stages for our pipeline, mixing gensim and sklearn models.
```r
lsi <- sklearn_lsi(id2word = dictionary, num_topics = 15L)

# L2 regularised classifier
clf <- sklearn_logistic(penalty = "l2", C = 0.1, solver = "lbfgs")

# sklearn pipeline
pipe <- sklearn_pipeline(lsi, clf)

# create some random binary labels for our documents
labels <- sample(c(0L, 1L), 9, replace = TRUE)

# how well does our pipeline perform on the training set?
pipe$fit(corpus_bow, labels)$score(corpus_bow, labels)
```
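Since the labels are random, the score above is only a smoke test of the pipeline wiring, not a meaningful benchmark. The fitted pipeline can also return per-document predictions via scikit-learn's standard `predict`:

```r
# predicted labels for the training documents (labels are random,
# so this only demonstrates that the pipeline runs end to end)
pipe$predict(corpus_bow)
```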
Random projections, a dimensionality-reduction technique:

```r
# initialise
rp_model <- sklearn_rp(id2word = dictionary)

# fit
rp_fit <- rp_model$fit(corpus_bow)

# use the trained model to transform the documents
result <- rp_fit$transform(corpus_bow)
```
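To peek at the projected space, convert the result to R; here we look at the first few components of the first document (a quick check, not part of the original tutorial):

```r
# first components of the first document's random projection
head(reticulate::py_to_r(result)[1, ])
```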
Phrase detection: combinations of words that frequently co-occur are merged into single tokens.

```r
# split phrases into vectors of words
# this should be further cleaned in a real application
corpus_split <- corpus %>%
  purrr::map(strsplit, " ") %>%
  purrr::map(function(x) {
    sentence <- x[[1]]
    tolower(sentence)
  })

# create the model: make sure no term is ignored and
# combinations seen 2+ times are captured
pt_model <- sklearn_pt(min_count = 1, threshold = 2)

# use sklearn fit_transform to see the transformation
pt_trans <- pt_model$fit_transform(corpus_split)

# since "graph" and "minors" were seen together 2+ times
# they are considered a phrase
c("This", "is", "graph_minors") %in% reticulate::py_to_r(pt_trans)[[9]]
```
`doc2bow` with scikit-learn. Note that in the example below we do not clean the text (no `preprocess`).
```r
# initialise
skbow_model <- sklearn_doc2bow()

# fit
corpus_skbow <- skbow_model$fit_transform(corpus)
```
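Because the raw corpus was not preprocessed here, the word ids and counts differ from the `corpus_bow` built earlier; printing the first document of each makes the difference visible (a quick comparison, not part of the original tutorial):

```r
# first document as (word id, count) pairs, without and with preprocessing
corpus_skbow[[1]]
corpus_bow[[1]]
```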
Tf-idf with scikit-learn; the results should match those obtained with gensim earlier.

```r
tfidf_model <- sklearn_tfidf(dictionary = dictionary)
tfidf_w_sklearn <- tfidf_model$fit_transform(corpus_bow)

# same as with gensim
corpus_transformed[[1]] == tfidf_w_sklearn[[1]]
```
Word2vec:

```r
# create a model to represent each word by a 10-dimensional vector
w2v_model <- sklearn_word2vec(size = 10L, min_count = 1L, seed = 1L)

# train
w2v_model <- w2v_model$fit(docs)

# what is the vector representation of the word "graph"?
w2v_model$transform(list("graph"))
```
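Beyond raw vectors, the trained embedding supports similarity queries. This sketch reaches into the underlying gensim model, assuming the wrapper exposes it as `gensim_model` like gensim's own scikit-learn wrappers:

```r
# words closest to "graph" in the embedding space
# note: `gensim_model` is an assumption about the wrapper's internals
w2v_model$gensim_model$wv$most_similar("graph")
```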
Finally, clean up by deleting the serialised corpus from disk.
```r
delete_mmcorpus(corpus_mm)
```