In MansMeg/tidytopics: Tidy Topic Modeling Toolbox for R

This is a short introduction how to play with topic models using the topicmodelsamplers R package. The purpose of this packag is that is much more nitty-gritty than more stand alone topicmodel packages such as the LDA package and the.

It is also a testbed where one can start out your own topic model sampler in a similar fashion to that of mallet. It is also possible to transfer the topic state to and from mallet.

library(tidytopics)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()

tidy_books <- original_books %>%
  unnest_tokens(word, text)

As a first step we need to initialize the number of topic randomly. This is alos one way to initialize the number of topics.

tidy_books <- filter(tidy_books, book == "Pride & Prejudice")
tidy_books$topic <- sample(x = 1:20, replace = TRUE, size = nrow(tidy_books))
tidy_books$word <- as.factor(tidy_books$word)

Now let us sample one iteration with a gibbs sampler

doc <- as.integer(tidy_books$chapter)
D <- length(unique(doc))
checkmate::assert_integer(doc, 0, D)

type <- as.integer(tidy_books$word) - 1L
V <- length(unique(type))
checkmate::assert_integer(type, 0, V)

z <- tidy_books$topic - 1L
K <- length(unique(z))
checkmate::assert_integer(z, 0, K)
system.time(
z2 <- tidytopics:::sample_vanilla_lda(doc = doc, type = type, z = z, K = K, D = D, V = V,
                   iter = 100,
                   beta = 0.1, alpha = 0.1)
)
tidy_books$topic <- z2 + 1L


# Test
X <- t(table(tidy_books$topic, tidy_books$word))
i <- 5
X[,i][order(X[,i], decreasing = TRUE)][1:30]

stat_txt_path <- system.file("extdata", "stats.txt", package="tidytopics")
stat_txt <- 
  data_frame(text = readLines(stat_txt_path)) %>% 
  mutate(doc = row_number()) %>% 
  unnest_tokens(word, text)
stat_txt$topic <- sample(x = 1:10, replace = TRUE, size = nrow(stat_txt))
stat_txt$word <- as.factor(stat_txt$word)
stat_txt$doc <- as.integer(as.factor(stat_txt$doc))

doc <- as.integer(stat_txt$doc)
D <- length(unique(doc))
checkmate::assert_integer(doc, 0, D)

type <- as.integer(stat_txt$word) - 1L
V <- length(unique(type))
checkmate::assert_integer(type, 0, V)

z <- stat_txt$topic - 1L
K <- length(unique(z))
checkmate::assert_integer(z, 0, K)

system.time(
z2 <- tidytopics:::sample_vanilla_lda(doc = doc, type = type, z = z, K = K, D = D, V = V, iter = 100,
                   beta = 0.1, alpha = 0.1)
)
stat_txt$topic <- z2 + 1L