# Path the estimated topic model is written to at the end of the script.
modelfile <- tempfile() # Use findable path

# Minimum number of tokens a document must have to be kept.
min_doc_length <- 50L
The memory available to the Java Virtual Machine (JVM) needs to be set before the JVM is created.
# JVM start-up flags: cap the Java heap at 4 GB.
# Increase further for larger data.
options(list(java.parameters = "-Xmx4g"))
library(biglda)
library(polmineR) # min v0.8.7.9013
library(purrr)
library(stringi)

# Activate the REUTERS corpus from the RcppCWB package
# (presumably a demo corpus shipped with that package - confirm).
use("RcppCWB", corpus = "REUTERS")
# Stopwords to remove, in lower-case and capitalized form. The vector shares
# its name with purrr::discard(); the name is kept because the `subset`
# expression below refers to it, and the purrr function is called with an
# explicit namespace to avoid confusion.
discard <- c(tm::stopwords("en"), capitalize(tm::stopwords("en")))

# Build one newline-separated string of tokens per document (split by "id").
articles <- corpus("REUTERS") %>%
  split(s_attribute = "id") %>%
  get_token_stream(p_attribute = "word", subset = {!word %in% discard}) %>%
  keep(function(x) length(x) >= min_doc_length) %>% # drop short documents
  vapply(stri_c, character(1L), collapse = "\n") %>% # type-stable, keeps names
  purrr::discard(function(x) nchar(x) == 0L) # drop empty documents

# Keep the document ids separately and strip them from the text vector.
docnames <- names(articles)
names(articles) <- NULL # equivalent to unname() - but without copy
# Convert the document strings into the instance list consumed by the model.
instance_list <- as.instance_list(articles)

# Configure a topic model with 25 topics and hand it the documents.
BTM <- BigTopicModel(n_topics = 25L)
BTM$addInstances(instance_list)
BTM$setNumThreads(1L)
# NOTE(review): presumably (interval, number of words) for progress output,
# as in MALLET's ParallelTopicModel - confirm.
BTM$setTopicDisplay(50L, 10L)
BTM$setNumIterations(1000L)
Starting to estimate topic model at: `r (format(started <- Sys.time(), format = "%T"))`
# Run the estimation by calling the model object's estimate() method.
BTM$estimate()
Finished computation: `r (format(finished <- Sys.time(), "%T"))`
Total time: `r format(Sys.time() - started, format = "%T")`
# Create a java.io.File object for `modelfile` and pass it to the model's
# write() method.
BTM$write(
  rJava::.jnew(class = "java/io/File", modelfile)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.