Das Skript soll von dem Verzeichnis aus aufgerufen werden, in welches die Ergebnisse geschrieben werden.

Aufruf zum Beispiel mit:

Rscript topicmodelling.R --args ParlSpeech BUNDESTAG 250 en

````r
# Read the positional command-line arguments.
# The documented call is
#   Rscript topicmodelling.R --args <package> <corpus> <k> <language>
# in which the literal "--args" token arrives as the first trailing argument
# (the values were previously read from positions 2-5). It is dropped here so
# that the script also works when it is called without that token.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 0 && identical(args[1], "--args")) {
  args <- args[-1]
}
if (length(args) < 4) {
  stop(
    "Usage: Rscript topicmodelling.R --args <package> <corpus> <k> <language>",
    call. = FALSE
  )
}

package_to_use <- args[1]
corpus <- toupper(args[2]) # corpus ids are used in upper case
k <- as.integer(args[3])
language <- args[4]

# Fail early on an unusable topic count instead of deep inside the model fit.
if (is.na(k) || k < 2L) {
  stop(
    "Number of topics (k) must be an integer >= 2, got: ", args[3],
    call. = FALSE
  )
}

message(sprintf("Package to use: %s", package_to_use))
message(sprintf("Corpus to use: %s", corpus))
message(sprintf("Number of topics (k): %s", k))

# The result is written relative to the current working directory.
lda_filename <- sprintf("./lda_%s_%s.Rdata", corpus, k)
message("LDA model will be written to: ", lda_filename)

Load libraries


Activate Corpus Package


Get going

# Derive speeches from the corpus and count the "word" attribute for each.
speeches <- as.speeches(
  .Object = corpus,
  sAttributeDates = "date",
  sAttributeNames = "name"
)
count_bundle <- count(speeches, pAttribute = "word", verbose = TRUE)
message("... create DocumentTermMatrix")

# Turn the counts into a DocumentTermMatrix (documents in rows, terms in
# columns).
dtm <- as.DocumentTermMatrix(count_bundle, pAttribute = "word", col = "count")

# Enforce a minimum document length: drop rows whose total count is below 100.
# The guard matters because negative indexing with an empty vector would
# select nothing at all.
short_docs <- which(slam::row_sums(dtm) < 100)
if (length(short_docs) > 0) {
  dtm <- dtm[-short_docs, ]
}

# Remove noisy words.
# noise() classifies the DTM's terms; every term it returns is dropped below.
noise_to_drop <- noise(
  colnames(dtm),
  specialChars = NULL,
  stopwordsLanguage = language
)

# Also treat capitalised variants of the stopwords (e.g. at the start of a
# sentence) as noise: upper-case the first character and glue the remainder
# back on. paste0() is required here; c(..., sep = "") would merely
# concatenate the fragments instead of building words.
stopword_list <- noise_to_drop[["stopwords"]]
noise_to_drop[["stopwords"]] <- c(
  stopword_list,
  paste0(
    toupper(substr(stopword_list, 1, 1)),
    substr(stopword_list, 2, nchar(stopword_list))
  )
)

# Locate the noise terms among the DTM's columns. The positions must index
# colnames(dtm), not the noise vector. The guard avoids dtm[, -integer(0)],
# which would drop every column when nothing matches.
noise_cols <- which(colnames(dtm) %in% unique(unlist(noise_to_drop)))
if (length(noise_cols) > 0) {
  dtm <- dtm[, -noise_cols]
}

# Drop rare terms: columns whose total count over all documents is 10 or less.
rare_terms <- which(slam::col_sums(dtm) <= 10)
if (length(rare_terms) > 0) {
  dtm <- dtm[, -rare_terms]
}

# Removing terms can leave documents without any remaining counts; drop those
# rows as well.
zero_docs <- which(slam::row_sums(dtm) == 0)
if (length(zero_docs) > 0) {
  dtm <- dtm[-zero_docs, ]
}
# Fit the LDA topic model with k topics using Gibbs sampling.
# NOTE(review): iter = 3L is tiny compared with burnin = 1000 and keep = 50 --
# it looks like a quick-test setting; confirm the intended iteration count.
lda <- LDA(
  dtm, k = k, method = "Gibbs",
  control = list(burnin = 1000, iter = 3L, keep = 50, verbose = TRUE)
)

# Persist the fitted model. The file is written with saveRDS(), so despite the
# ".Rdata" extension it has to be read back with readRDS(), not load().
saveRDS(object = lda, file = lda_filename)

PolMine/polmineR.topics documentation built on March 6, 2020, 6:03 p.m.