I used this website as an introduction to gensim and to understand the data structure.
# Output location and file name for the fitted model.
outdir <- tempdir()
modelname <- "germaparlmini"

# gensim is a Python library; it is imported through reticulate.
library(reticulate)
gensim <- import("gensim")

# Leave one core free, but never go below one worker: detectCores() may
# return NA, and on a single-core machine the subtraction would yield 0.
threads <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)

library(polmineR)
use("GermaParl")
coi <- "GERMAPARLMINI" # corpus of interest
language <- "german"

library(slam)
# Split the corpus of interest into speeches (one document per speech,
# delimited by the "speaker" s-attribute) and count word forms per speech.
speeches_by_speaker <- as.speeches(corpus(coi), s_attribute_name = "speaker")
dtm <- as.DocumentTermMatrix(speeches_by_speaker, p_attribute = "word")

# Report the size of the matrix (documents x terms).
dim(dtm)
Computing the topic models will be much faster if we remove common and noisy words. The following code block is taken from the template for LDA topic models using the R topicmodels package.
# Trim the document-term matrix before fitting the model. Every filter is
# guarded by a length check, because negative indexing with an empty index
# vector (x[, -integer(0)]) selects nothing and would wipe the matrix.

# Minimum document length: drop documents with fewer than 100 tokens.
docs_to_drop_length <- which(slam::row_sums(dtm) < 100L)
if (length(docs_to_drop_length) > 0L) dtm <- dtm[-docs_to_drop_length, ]

# Remove noisy words.
noise_to_drop <- noise(
  colnames(dtm),
  specialChars = NULL,
  stopwordsLanguage = language
)
# For every stopword, also add the variant with a capitalised first letter.
stopwords_found <- noise_to_drop[["stopwords"]]
noise_to_drop[["stopwords"]] <- c(
  stopwords_found,
  paste0(
    toupper(substr(stopwords_found, 1, 1)),
    substr(stopwords_found, 2, nchar(stopwords_found))
  )
)
terms_to_drop_noise <- which(
  colnames(dtm) %in% unique(unlist(noise_to_drop))
)
if (length(terms_to_drop_noise) > 0L) dtm <- dtm[, -terms_to_drop_noise]

# Remove rare words: terms occurring 10 times or less in total.
terms_to_drop_rare <- which(slam::col_sums(dtm) <= 10L)
if (length(terms_to_drop_rare) > 0L) dtm <- dtm[, -terms_to_drop_rare]

# Remove documents that are empty now.
empty_docs <- which(slam::row_sums(dtm) == 0L)
if (length(empty_docs) > 0L) dtm <- dtm[-empty_docs, ]

dim(dtm)
We need a list of lists with tuples inside: one inner list per document, each holding (term id, count) tuples.
# Build the gensim corpus: one list per document, holding
# (term id, count) tuples.

# Python indexes from zero. The shifted ids are kept in a local vector so
# that `dtm` itself stays a valid matrix and this chunk can be re-run.
term_ids <- dtm$j - 1L
py$j <- r_to_py(unname(split(x = term_ids, f = dtm$i)), convert = TRUE)
py$v <- r_to_py(unname(split(x = dtm$v, f = dtm$i)), convert = TRUE)

# - list(zip(...)): in Python 3 a bare zip() is a one-shot iterator, so the
#   documents would be empty from the second training pass onwards.
# - isinstance check: reticulate converts a length-one R vector to a Python
#   scalar, so a document with a single term arrives as a number, not a list.
py_run_string(paste(
  "corpus = [",
  "    list(zip(j[x], v[x])) if isinstance(j[x], list) else [(j[x], v[x])]",
  "    for x in range(len(j))",
  "]",
  sep = "\n"
))

# Build the gensim dictionary: the id of a term is its zero-based column
# position in the document-term matrix.
Encoding(dtm$dimnames$Terms) <- "UTF-8"
py$terms <- dtm$dimnames$Terms
py_run_string("token2id = dict(zip(terms, range(len(terms))))")
py$dictionary <- gensim$corpora$dictionary$Dictionary()
# Inject the mapping directly into the (empty) Dictionary object.
py_run_string("dictionary.__dict__['token2id'] = token2id")
And now we can run the gensim topic modelling engine.
started <- Sys.time()

# Number of topics, shared by both branches so that the fitted model does
# not depend on how many cores the machine happens to have.
num_topics <- 20L

# gensim requires the worker count to be an integer.
workers <- as.integer(threads)

if (is.na(workers) || workers <= 1L) {
  # No spare core (or core count unknown): use the single-process
  # implementation.
  lda_model <- gensim$models$ldamodel$LdaModel(
    corpus = py$corpus,
    id2word = py$dictionary,
    num_topics = num_topics,
    random_state = 100L,
    update_every = 1L,
    chunksize = 100L,
    passes = 10L,
    alpha = "auto",
    per_word_topics = TRUE
  )
} else {
  lda_model <- gensim$models$ldamulticore$LdaMulticore(
    corpus = py$corpus,
    id2word = py$dictionary,
    num_topics = num_topics,
    random_state = 100L,
    chunksize = 100L,
    passes = 10L,
    per_word_topics = TRUE,
    workers = workers
  )
}
Time passed for computing the topic model: `r Sys.time() - started`
# Write the fitted model to disk, inside the output directory and under the
# configured model name.
model_file <- file.path(outdir, modelname)
lda_model$save(model_file)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.