Background

I used this website as an introduction to gensim and to understand the data structure.

# Name under which the fitted model is stored, and the directory it is
# written to (the temporary directory of the current R session).
modelname <- "germaparlmini"
outdir <- tempdir()

Preparing Python

# Attach reticulate (the R-to-Python bridge) and bind the gensim Python
# module to an R object; all later gensim calls go through `gensim$...`.
library(reticulate)
gensim <- import(module = "gensim")
# Number of worker processes for model fitting: all cores but one, and never
# fewer than one. detectCores() may return NA when the core count cannot be
# determined, and returns 1 on a single-core machine (which would give 0
# workers), so clamp to a minimum of 1 and ignore NA.
threads <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)

Preparing R

# Corpus to model and the language passed to the stopword filter later on.
coi <- "GERMAPARLMINI" # corpus of interest
language <- "german"

# polmineR is attached for corpus handling, slam for the sparse-matrix helpers
# (row_sums / col_sums) used during preprocessing.
library(polmineR)
# NOTE(review): GERMAPARLMINI is usually shipped with polmineR itself --
# confirm that the GermaParl package actually registers this corpus.
use("GermaParl")
library(slam)

Data preparation and preprocessing with R

# Build the document-term matrix in two explicit steps: split the corpus into
# speeches (keyed on the "speaker" attribute), then count the tokens of the
# "word" attribute per speech.
speeches <- as.speeches(corpus(coi), s_attribute_name = "speaker")
dtm <- as.DocumentTermMatrix(speeches, p_attribute = "word")
dim(dtm)

Computing the topic models will be much faster if we remove common and noisy words. The following code block is taken from the template for LDA topic models using the R topicmodels package.

# Enforce a minimum document length: drop every document whose total token
# count is below 100.
doc_lengths <- slam::row_sums(dtm)
docs_to_drop_length <- which(doc_lengths < 100L)
if (length(docs_to_drop_length) > 0L) {
  dtm <- dtm[-docs_to_drop_length, ]
}

# Remove noisy words: whatever noise() flags among the vocabulary, plus
# stopwords for `language`.
noise_to_drop <- noise(colnames(dtm), specialChars = NULL, stopwordsLanguage = language)

# Also cover the capitalised variant of each stopword (e.g. at the start of
# a sentence) by upper-casing its first character.
stopwords_found <- noise_to_drop[["stopwords"]]
noise_to_drop[["stopwords"]] <- c(
  stopwords_found,
  paste0(
    toupper(substr(stopwords_found, 1L, 1L)),
    substr(stopwords_found, 2L, nchar(stopwords_found))
  )
)

# Guard the negative index: if nothing matches, which() returns integer(0)
# and `dtm[, -integer(0)]` would select zero columns, i.e. silently drop the
# whole vocabulary instead of nothing.
terms_to_drop_noise <- which(colnames(dtm) %in% unique(unlist(noise_to_drop)))
if (length(terms_to_drop_noise) > 0L) dtm <- dtm[, -terms_to_drop_noise]

# Drop rare terms: every term whose total count over all remaining documents
# is 10 or less.
term_totals <- slam::col_sums(dtm)
terms_to_drop_rare <- which(term_totals <= 10L)
if (length(terms_to_drop_rare) > 0L) {
  dtm <- dtm[, -terms_to_drop_rare]
}

# The term filters above can leave documents without any tokens; drop those,
# then report the final dimensions of the matrix.
remaining_tokens <- slam::row_sums(dtm)
empty_docs <- which(remaining_tokens == 0L)
if (length(empty_docs) > 0L) {
  dtm <- dtm[-empty_docs, ]
}
dim(dtm)

Data transition from R to Python

Prepare the 'corpus' object

We need a list of lists with tuples inside: one inner list per document, each holding (token id, count) tuples.

# Build the gensim corpus: one list per document of (token_id, count) tuples.

# Python indices are 0-based. Work on a copy so that `dtm` itself stays a
# valid (1-based) matrix and re-running this chunk does not shift twice.
token_ids <- dtm$j - 1L

# Group token ids and counts by document (row index). as.list() keeps every
# document a Python list: reticulate would otherwise turn a length-1 vector
# (a document with a single distinct term) into a scalar, which zip() cannot
# iterate.
py$j <- r_to_py(lapply(unname(split(x = token_ids, f = dtm$i)), as.list), convert = TRUE)
py$v <- r_to_py(lapply(unname(split(x = dtm$v, f = dtm$i)), as.list), convert = TRUE)

# Materialise each zip() as a list: under Python 3 zip() is a one-shot
# iterator, so the documents would be empty after the first pass over the
# corpus and would not support len().
py_run_string("corpus = [list(zip(j[x], v[x])) for x in range(len(j))]")

Prepare the 'Dictionary' class object

# Build the gensim Dictionary that maps token ids back to terms.

# Declare the term strings as UTF-8 before handing them to Python.
# NOTE(review): this only marks the encoding, it does not convert -- assumes
# the underlying bytes already are UTF-8; confirm for the corpus in use.
Encoding(dtm$dimnames$Terms) <- "UTF-8"
py$terms <- dtm$dimnames$Terms
# Map every term to its 0-based position in the vocabulary.
py_run_string("token2id = dict(zip(terms, range(len(terms))))")
# Create an empty Dictionary and inject the mapping directly into the object,
# rather than letting gensim build it from tokenised documents.
py$dictionary <- gensim$corpora$dictionary$Dictionary()
py_run_string("dictionary.__dict__['token2id'] = token2id")

Computing the topic model

And now we can run the gensim topic modelling engine.

# Fit the LDA topic model, single-core or multicore depending on `threads`.

# Number of topics, defined once so that both branches fit the same model.
# The branches previously used different values (250 vs. 20), which made the
# result depend on how many cores the machine happens to have.
num_topics <- 20L

# Use the single-core implementation whenever fewer than two workers are
# available, or the worker count is unknown; LdaMulticore needs workers >= 1.
single_core <- is.na(threads) || as.integer(threads) <= 1L

started <- Sys.time()
if (single_core) {
  lda_model <- gensim$models$ldamodel$LdaModel(
    corpus = py$corpus,
    id2word = py$dictionary,
    num_topics = num_topics,
    random_state = 100L,
    update_every = 1L,
    chunksize = 100L,
    passes = 10L,
    alpha = "auto",
    per_word_topics = TRUE
  )
} else {
  lda_model <- gensim$models$ldamulticore$LdaMulticore(
    corpus = py$corpus,
    id2word = py$dictionary,
    num_topics = num_topics,
    random_state = 100L,
    chunksize = 100L,
    passes = 10L,
    per_word_topics = TRUE,
    workers = as.integer(threads) # required to be integer
  )
}

Time passed for computing the topic model: `r Sys.time() - started`

# Persist the fitted model under `modelname` inside `outdir`.
model_path <- file.path(outdir, modelname)
lda_model$save(model_path)


PolMine/polmineR.topics documentation built on March 6, 2020, 6:03 p.m.