# Run configuration taken from the command line. Only the arguments that
# follow the script name are considered (trailingOnly = TRUE); of those,
# positions 2 to 5 are read.
args <- commandArgs(trailingOnly = TRUE)
package_to_use <- args[2]     # package handed to use() further down
corpus <- toupper(args[3])    # corpus id, upper-cased
k <- as.integer(args[4])      # number of topics for the LDA
language <- args[5]           # passed on as stopwordsLanguage to noise()

# Report the settings of this run.
message("Package to use: ", package_to_use)
message("Corpus to use: ", corpus)
message(sprintf("Number of topics (k): %s", k))

# The output file name is derived from the corpus id and the number of topics.
lda_filename <- sprintf("./lda_%s_%s.Rdata", corpus, k)
message(sprintf("LDA model will be written to: %s", lda_filename))
library(polmineR) library(data.table) library(pbapply) library(topicmodels)
# Activate the package named on the command line.
use(package_to_use)

# Break the corpus up into speeches, using the s-attributes "date" and
# "name", then count the tokens of the p-attribute "word" per speech.
speeches <- as.speeches(
  .Object = corpus,
  sAttributeDates = "date",
  sAttributeNames = "name"
)
count_bundle <- count(speeches, pAttribute = "word", verbose = TRUE)
# The speeches object is not used again once the counts exist.
rm(speeches)

# Turn the counts into a DocumentTermMatrix, taking the cell values from
# the "count" column.
message("... create DocumentTermMatrix")
dtm <- as.DocumentTermMatrix(
  count_bundle,
  pAttribute = "word",
  col = "count"
)
# Prune the DocumentTermMatrix before the model is fitted: short documents,
# noisy terms, rare terms and finally documents left without any term.
# Every removal is guarded, because negative indexing with an empty index
# vector would select nothing at all instead of dropping nothing.

# Minimum document length: drop documents with fewer than 100 tokens.
docs_to_drop_length <- which(slam::row_sums(dtm) < 100)
if (length(docs_to_drop_length) > 0) {
  dtm <- dtm[-docs_to_drop_length, ]
}

# Remove noisy words: noise() is given the terms (column names) and the
# stopword language supplied on the command line.
noise_to_drop <- noise(
  colnames(dtm),
  specialChars = NULL,
  stopwordsLanguage = language
)
# Add a variant with an upper-case first letter for every stopword found.
stopwords_found <- noise_to_drop[["stopwords"]]
noise_to_drop[["stopwords"]] <- c(
  stopwords_found,
  paste0(
    toupper(substr(stopwords_found, 1, 1)),
    substr(stopwords_found, 2, nchar(stopwords_found))
  )
)
# Look up the noise terms among the columns of the matrix. The positions
# must refer to colnames(dtm); positions within the noise vector would
# remove unrelated columns.
noise_cols_to_drop <- which(
  colnames(dtm) %in% unique(unlist(noise_to_drop))
)
if (length(noise_cols_to_drop) > 0) {
  dtm <- dtm[, -noise_cols_to_drop]
}

# Remove rare words: terms with a total count of 10 or less.
terms_to_drop_rare <- which(slam::col_sums(dtm) <= 10)
if (length(terms_to_drop_rare) > 0) {
  dtm <- dtm[, -terms_to_drop_rare]
}

# Remove documents that are empty now.
empty_docs <- which(slam::row_sums(dtm) == 0)
if (length(empty_docs) > 0) {
  dtm <- dtm[-empty_docs, ]
}
# Fit a topic model with k topics by Gibbs sampling.
# NOTE(review): iter = 3L looks very low next to burnin = 1000 and
# keep = 50 -- possibly a leftover test value; confirm before a real run.
gibbs_control <- list(burnin = 1000, iter = 3L, keep = 50, verbose = TRUE)
lda <- LDA(dtm, k = k, method = "Gibbs", control = gibbs_control)

# Serialize the fitted model with saveRDS(), so it has to be read back with
# readRDS().
# NOTE(review): lda_filename ends in ".Rdata" although the file is written
# in RDS format; load() will not work on it -- confirm the intended naming.
saveRDS(lda, file = lda_filename)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.