knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

par(bg = '#f9f7f1')
reticulate::use_virtualenv("../env")

Preprocessing

First we preprocess the corpus using example data, a tiny corpus of 9 documents. This reproduces the gensim tutorial on corpora and vector spaces.

library(gensimr)

set.seed(42) # reproducibility

# sample data
data(corpus, package = "gensimr")
print(corpus)

# preprocess corpus
docs <- prepare_documents(corpus)

docs[[1]] # print first preprocessed document 

Once preprocessed we can build a dictionary.

dictionary <- corpora_dictionary(docs)

A dictionary essentially assigns an integer to each term.
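We can inspect this mapping via the dictionary's token2id attribute, which is part of gensim's Dictionary API:

# term to integer id mapping
dictionary$token2id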

doc2bow simply applies the method of the same name to every document (see example below); it counts the number of occurrences of each distinct word, converts each word to its integer word id, and returns the result as a sparse vector.

# native method to a single document
dictionary$doc2bow(docs[[1]])

# apply to all documents
corpus_bow <- doc2bow(dictionary, docs)

Then serialise the corpus to the Matrix Market format. The function returns the path to the file (the corpus is saved on disk for efficiency); if no path is passed, a temporary file is created. Here we set auto_delete to FALSE, otherwise the corpus is deleted after first use. Note this means you should delete it manually with delete_mmcorpus.

(corpus_mm <- serialize_mmcorpus(corpus_bow, auto_delete = FALSE))

Then initialise a model. We're going to use a Latent Semantic Indexing method later on (model_lsi), which requires tf-idf.

tfidf <- model_tfidf(corpus_mm)

We can then use the model to transform our original corpus.

corpus_transformed <- wrap(tfidf, corpus_bow)
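To check the result, we can print the tf-idf representation of the first document, using the same [[ indexing as in the scikit-learn comparison further below:

# tf-idf weights of the first preprocessed document
corpus_transformed[[1]]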

Author-topic Model

Fit an author-topic model, which associates each author with a distribution over the topics found in the documents they wrote.

# authors of corpus
data("authors", package = "gensimr")

# turn the author-document mapping into the format expected by the sklearn API
auth2doc <- auth2doc(authors, name, document)

temp <- tempfile("serialized")
atmodel <- sklearn_at(
  id2word = dictionary, 
  author2doc = auth2doc, 
  num_topics = 2L, 
  passes = 100L,
  serialized = TRUE,
  serialization_path = temp
)
unlink(temp, recursive = TRUE)

atmodel$fit(corpus_bow)$transform("jack")

Doc2vec

# represent each document by a 5-dimensional vector
d2v <- sklearn_doc2vec(min_count = 1L, size = 5L)
vectors <- d2v$fit_transform(docs)
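Doc2vec learns one fixed-length vector per document. As a quick sanity check, and assuming fit_transform returns a document-by-dimension matrix as in gensim's sklearn API, converting the result should yield 9 rows of 5 columns:

# convert to an R matrix and check its dimensions
dim(reticulate::py_to_r(vectors))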

Hierarchical Dirichlet Process

hdp <- sklearn_hdp(id2word = dictionary)
vectors <- hdp$fit_transform(corpus_bow)
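Assuming the result is again a document-by-topic matrix, each row gives the inferred topic weights of one document:

# topic weights inferred for the first document
reticulate::py_to_r(vectors)[1, ]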

Latent Dirichlet Allocation

lda <- sklearn_lda(
  num_topics = 2L, 
  id2word = dictionary, 
  iterations = 20L, 
  random_state = 1L
)
lda$fit_transform(corpus_bow) %>% 
  reticulate::py_to_r() %>% 
  plot()

Latent Semantic Indexing

Create stages for our pipeline; gensim and sklearn models can be mixed freely.

lsi <- sklearn_lsi(id2word = dictionary, num_topics = 15L)

# L2 reg classifier
clf <- sklearn_logistic(penalty = "l2", C = 0.1, solver = "lbfgs")

# sklearn pipeline
pipe <- sklearn_pipeline(lsi, clf)

# Create some random binary labels for our documents.
labels <- sample(c(0L, 1L), 9, replace = TRUE)

# How well does our pipeline perform on the training set?
pipe$fit(corpus_bow, labels)$score(corpus_bow, labels)
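score here is the mean accuracy on the given data. To inspect the fitted labels themselves, we can call predict, a standard method of scikit-learn pipelines:

# predicted labels for the training documents
pipe$predict(corpus_bow) %>% 
  reticulate::py_to_r()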

Random Projections

# initialise
rp_model <- sklearn_rp(id2word = dictionary)

# fit
rp_fit <- rp_model$fit(corpus_bow)

# Use the trained model to transform a document.
result <- rp_fit$transform(corpus_bow)

Phrase Detection

# split phrases into vectors of words
# this should be further cleaned
corpus_split <- corpus %>% 
  purrr::map(strsplit, " ") %>% 
  purrr::map(function(x){
    sentence <- x[[1]]
    tolower(sentence)
  })

# Create the model. Make sure no term is ignored and combinations seen 2+ times are captured.
pt_model <- sklearn_pt(min_count = 1, threshold = 2)

# Use sklearn fit_transform to see the transformation.
pt_trans <- pt_model$fit_transform(corpus_split)

# Since 'graph' and 'minors' were seen together 2+ times, they are considered a phrase.
c("This", "is", "graph_minors") %in% reticulate::py_to_r(pt_trans)[[9]]

Word ID Mapping

doc2bow with scikit-learn. Note that in the example below we do not clean the text (no preprocessing).

# initialise
skbow_model <- sklearn_doc2bow()

# fit
corpus_skbow <- skbow_model$fit_transform(corpus)

Tf-idf

tfidf_model <- sklearn_tfidf(dictionary = dictionary)
tfidf_w_sklearn <- tfidf_model$fit_transform(corpus_bow)

# same as with gensim
corpus_transformed[[1]] == tfidf_w_sklearn[[1]]

Word2vec

# Create a model to represent each word by a 10 dimensional vector.
w2v_model <- sklearn_word2vec(size = 10L, min_count = 1L, seed = 1L)

# train
w2v_model <- w2v_model$fit(docs)

# What is the vector representation of the word 'graph'?
w2v_model$transform(list("graph"))
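We can also query similarities between words. This sketch assumes the wrapper exposes the trained gensim model as gensim_model, as gensim's own sklearn API does:

# words closest to 'graph' in the learned embedding space
w2v_model$gensim_model$wv$most_similar("graph")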

Clean up: delete the corpus from disk.

delete_mmcorpus(corpus_mm)

