library(rvw)
# In this demo, we will take a look at the topic modeling problem.
# For this, we will use Latent Dirichlet Allocation (LDA) method implemented in Vowpal Wabbit (VW).
# First, switch to a temporary directory.
curr_dir <- getwd()
setwd(tempdir())
# Here we prepare our dataset. We consider the WebKB dataset.
# It consists of web pages collected from various Universities and manually classified into seven different classes (topics).
# Original reference: The 4 Universities Data Set
# http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/
# We use a preprocessed version of this dataset from Ana Cardoso-Cachopo PhD thesis:
# http://ana.cachopo.org/datasets-for-single-label-text-categorization
data_url <- "http://ana.cachopo.org/datasets-for-single-label-text-categorization/webkb-test-stemmed.txt"
lda_data <- read.delim(file = data_url, header = F, stringsAsFactors = F)
names(lda_data) <- c("topic", "text")
# Clear out empty lines.
lda_data <- lda_data[!(lda_data$text == ""), ]
# Prepare a vocabulary from all documents.
lda_vocab <- sort(unique(unlist(strsplit(lda_data$text, " "))))
# In order to use VW LDA algorithm, we have to convert plain text to "word:word_count word:word_count ..." format.
# Also, we replace the words with their indexes in the vocabulary.
# This is needed if we want to easily decode feature hashes later and show topics in a human-readable format.
lda_data$features <- sapply(lda_data$text, function(x) {
splitted_words <- unlist(strsplit(x, " "))
counted_words <- aggregate(data.frame(count=splitted_words), list(word=splitted_words), length)
res_str <- paste0(apply(counted_words, 1, function(x){
paste0( (which(lda_vocab == x[["word"]]) - 1) , ":", as.numeric(x[["count"]]))
# Or use this if no replacement with index is needed:
# paste0(x[["word"]], ":", as.numeric(x[["count"]]))
}),
collapse = " ")
res_str
})
# Calculate required number of bits (b) for feature hashes range: [0, 2^(b) - 1].
bits <- ceiling(log2(length(lda_vocab)))
# Total number of unique documents in data
num_docs <- as.numeric(nrow(lda_data))
# Now we can set up a LDA model.
lda_model <- vwsetup(feature_params = list(bit_precision=bits),
optimization_params = list(initial_t=1, power_t=0.5), # Parameters for learning rate schedule
option = "lda", # Enable LDA algorithm
num_topics = 7, # Specify the number of topics to learn (the same as were manually classified)
lda_D = num_docs,
minibatch = 16) # Analyze 16 documents at a time
# And start learning a set of topics.
vwtrain(vwmodel = lda_model,
data = lda_data,
namespaces = list(" " = "features"),
fixed = "features")
# Here we get our topic predictions for each word from regressor values.
vwout <- vwaudit(vwmodel = lda_model)
# Each line of vwout corresponds to a single feature (a single word in our case)
# Output contains following columns:
# Names - feature names
# Hashes - feature hashes
# V1-V7 - Regressor values for each topic
# Now we need to post-process this output to get final word - topic correspondences.
# First, filter out zero valued features.
selected_rows <- apply(vwout[, 3:9], 1, function(x) {
!all(x == 0)
})
vwout<- vwout[selected_rows,]
# And finaly:
# 1) Connect words from prepared vocabulary with feature hashes from our model.
# 2) Connect words with a maximum valued topic prediction.
lda_results <- data.frame(
word = lda_vocab,
topic = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) {
which.max(x)
}),
value = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) {
max(x)
})
)
head(lda_results)
# Switch back.
setwd(curr_dir)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.