## Code extracted from the introduction vignette of the 'mallet' R package.
## ---- echo=FALSE--------------------------------------------------------------
# Vignette setup: silence rmarkdown's vignette-title check for this document.
options(rmarkdown.html_vignette.check_title = FALSE)
## ---- eval=FALSE--------------------------------------------------------------
# Install the package from CRAN (shown, not run):
# install.packages("mallet")
## ---- eval=FALSE--------------------------------------------------------------
# Give the JVM a 4 GB heap. This must be set BEFORE the JVM is started,
# i.e. before library(mallet) attaches rJava (shown, not run):
# options(java.parameters = "-Xmx4g")
## -----------------------------------------------------------------------------
library(mallet)
## -----------------------------------------------------------------------------
# Locate the folder where the stoplists ship inside the mallet R package.
# Point `directory` at any other folder to read its txt files instead.
directory <- system.file("stoplists", package = "mallet")
files_in_directory <- list.files(directory, full.names = TRUE)
# Read every file and collapse its lines into one string per file.
txt_file_content <- vapply(
  files_in_directory,
  function(path) paste(readLines(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
# Inspect the structure of what was read.
str(txt_file_content)
## -----------------------------------------------------------------------------
# NOTE(review): dplyr is attached here but not visibly used in this excerpt —
# presumably needed elsewhere in the vignette; confirm before removing.
library(dplyr)
data(sotu)
# Peek at the text of the first two State of the Union addresses.
sotu[["text"]][1:2]
## -----------------------------------------------------------------------------
# List the stoplist languages bundled with the package, then resolve the
# file path of the English stoplist for use below.
mallet_supported_stoplists()
stopwords_en_file_path <- mallet_stoplist_file_path("en")
## -----------------------------------------------------------------------------
# Import the documents into a MALLET instance list, with explicit ids, a
# stoplist file, and a token regexp. The regexp keeps tokens of at least
# three characters that start and end with a letter, allowing letters or
# punctuation in between.
sotu.instances <-
mallet.import(id.array = row.names(sotu),
text.array = sotu[["text"]],
stoplist = stopwords_en_file_path,
token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
## -----------------------------------------------------------------------------
# Minimal import: only the text array is required; everything else defaults.
sotu.instances.short <-
mallet.import(text.array = sotu[["text"]])
## -----------------------------------------------------------------------------
# The stoplist can also be supplied as a character vector of words
# instead of a file path.
stop_vector <- readLines(stopwords_en_file_path)
sotu.instances.short <-
mallet.import(text.array = sotu[["text"]],
stoplist = stop_vector)
## -----------------------------------------------------------------------------
# Create an LDA trainer with 10 topics. alpha.sum is the document-topic
# Dirichlet prior summed over all topics; beta is the topic-word prior.
topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
## -----------------------------------------------------------------------------
# Register the imported instances with the trainer.
topic.model$loadDocuments(sotu.instances)
## -----------------------------------------------------------------------------
# Vocabulary built from the loaded documents.
vocabulary <- topic.model$getVocabulary()
head(vocabulary)
## -----------------------------------------------------------------------------
# Corpus-level frequency statistics for each vocabulary term.
word_freqs <- mallet.word.freqs(topic.model)
head(word_freqs)
## -----------------------------------------------------------------------------
# Enable hyperparameter optimization.
# NOTE(review): per MALLET's ParallelTopicModel these are (optimization
# interval, burn-in iterations), i.e. re-optimize every 20 iterations after
# an initial 50 — confirm against the mallet documentation.
topic.model$setAlphaOptimization(20, 50)
## -----------------------------------------------------------------------------
# Run 200 training iterations of the sampler.
topic.model$train(200)
## -----------------------------------------------------------------------------
# Follow up with 10 maximization iterations (presumably picking the single
# most probable topic per token instead of sampling — see mallet docs).
topic.model$maximize(10)
## -----------------------------------------------------------------------------
# Posterior summaries. Usage below (doc.topics[, 2] and topic.words[2, ])
# shows doc.topics has one row per document / one column per topic, and
# topic.words has one row per topic / one column per vocabulary word.
# smoothed adds the priors; normalized rescales each row to probabilities.
doc.topics <- mallet.doc.topics(topic.model, smoothed=TRUE, normalized=TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)
## -----------------------------------------------------------------------------
# The 5 highest-weighted words for topic 2.
mallet.top.words(topic.model, word.weights = topic.words[2,], num.top.words = 5)
## -----------------------------------------------------------------------------
# Documents where topic 2 carries more than half the probability mass.
docs <- which(doc.topics[, 2] > 0.50)
# Among those, show the longest one (by character count).
doc_lengths <- nchar(sotu[["text"]][docs])
longest <- docs[order(doc_lengths, decreasing = TRUE)[1]]
sotu[["text"]][longest]
## -----------------------------------------------------------------------------
# Topic-word weights recomputed on only the documents from after 1975,
# then the top 5 words of topic 2 within that subset.
post1975_topic_words <- mallet.subset.topic.words(topic.model, sotu[["year"]] > 1975)
mallet.top.words(topic.model, word.weights = post1975_topic_words[2,], num.top.words = 5)
## ---- fig.height=5, fig.width=5-----------------------------------------------
# Label each topic by its top 2 words, hierarchically cluster the topics
# (balance mixes document co-occurrence and word-weight similarity), and
# plot the resulting dendrogram.
topic_labels <- mallet.topic.labels(topic.model, num.top.words = 2)
topic_clusters <- mallet.topic.hclust(doc.topics, topic.words, balance = 0.5)
# Fix: the original call ended in a stray empty argument ("xlab = "", )"),
# which silently matched a positional formal of plot.hclust and errors in
# other plot methods.
plot(topic_clusters, labels = topic_labels, xlab = "")
## -----------------------------------------------------------------------------
# Persist the sampler state to a gzipped file in the session temp directory.
state_file <- file.path(tempdir(), "temp_mallet_state.gz")
save.mallet.state(topic.model = topic.model, state.file = state_file)
## -----------------------------------------------------------------------------
# Keep the raw (unsmoothed, unnormalized) counts for comparison, then
# discard the model object entirely.
doc.topics.counts <- mallet.doc.topics(topic.model, smoothed=FALSE, normalized=FALSE)
rm(topic.model)
## -----------------------------------------------------------------------------
# Rebuild a model with the same settings and data, restore the saved state,
# and verify the restored counts match the ones saved before removal.
new.topic.model <- MalletLDA(num.topics=10, alpha.sum = 1, beta = 0.1)
new.topic.model$loadDocuments(sotu.instances)
load.mallet.state(topic.model = new.topic.model, state.file = state_file)
doc.topics.counts[1:3, 1:6]
mallet.doc.topics(new.topic.model, smoothed=FALSE, normalized=FALSE)[1:3, 1:6]
## -----------------------------------------------------------------------------
# Alternatively, serialize the whole model object to disk, read it back,
# and verify the re-read model reproduces the saved counts.
model_file <- file.path(tempdir(), "temp_mallet.model")
mallet.topic.model.save(new.topic.model, model_file)
read.topic.model <- mallet.topic.model.read(model_file)
doc.topics.counts[1:3, 1:6]
mallet.doc.topics(read.topic.model, smoothed=FALSE, normalized=FALSE)[1:3, 1:6]
# (Website boilerplate from the page this code was extracted from; commented
# out so the file parses as R.)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.