From 'bag-of-words' to algorithmic text analysis {.smaller}

set.seed(42)

Initialization {.smaller}

if (packageVersion("polmineR") < package_version("0.7.10.9006"))
  devtools::install_github("PolMine/polmineR", ref = "dev")
library(polmineR)
use("UNGA")
for (pkg in c("magrittr", "slam", "tm", "quanteda", "Matrix")){
  if (!pkg %in% rownames(installed.packages())) install.packages(pkg)
  library(package = pkg, character.only = TRUE)
}

Sparse, transformable matrices {.smaller}

Direttissima {.smaller}

dtm <- polmineR::as.DocumentTermMatrix("UNGA", p_attribute = "word", s_attribute = "year")

Flexibility via partition_bundle {.smaller}

unga2016 <- partition("UNGA", year = 2016)
unga2016_speakers <- partition_bundle(unga2016, s_attribute = "speaker", progress = TRUE)
unga2016_speakers <- enrich(unga2016_speakers, p_attribute = "word", progress = TRUE)
dtm <- polmineR::as.DocumentTermMatrix(unga2016_speakers, col = "count")

Flexibility via partition_bundle (cont.) {.smaller}

as.matrix(dtm)[sort(sample(1:913, 10)), c(1:5, 300:304)]

Using as.speeches() to create a partition_bundle {.smaller}

doit <- !file.exists("~/lab/tmp/unga2000s_speeches_lds.RData")
unga_2000s <- partition("UNGA", year = 2000:2010)
unga_2000s_speeches <- as.speeches(unga_2000s, s_attribute_date = "date", s_attribute_name = "speaker")
unga_2000s_speeches <- enrich(unga_2000s_speeches, p_attribute = "word")
dtm <- polmineR::as.DocumentTermMatrix(unga_2000s_speeches, col = "count")

Shrinking the Matrix {.smaller}

short_docs <- which(slam::row_sums(dtm) < 100)
if (length(short_docs) > 0) dtm <- dtm[-short_docs,]
rare_words <- which(slam::col_sums(dtm) < 5)
if (length(rare_words) > 0) dtm <- dtm[,-rare_words]

Additional Filtering {.smaller}

noisy_tokens <- noise(colnames(dtm), specialChars = NULL, stopwordsLanguage = "en")
noisy_tokens_where <- which(unique(unlist(noisy_tokens)) %in% colnames(dtm))
dtm <- dtm[,-noisy_tokens_where]
stopit <- tm::stopwords("en")
stopit_upper <- paste(toupper(substr(stopit, 1, 1)), substr(stopit, 2, nchar(stopit)), sep = "")
stopit_upper_where <- which(stopit_upper %in% colnames(dtm))
if (length(stopit_upper_where) > 0) dtm <- dtm[, -stopit_upper_where]

Fitting a Topic Model {.smaller}

empty_docs <- which(slam::row_sums(dtm) == 0)
if (length(empty_docs) > 0) dtm <- dtm[-empty_docs,]
lda <- topicmodels::LDA(
  dtm, k = 150, method = "Gibbs",
  control = list(burnin = 1000, iter = 3L, keep = 50, verbose = TRUE)
)
if (doit == TRUE){
  saveRDS(lda, file = "~/lab/tmp/unga2000s_speeches_lds.RData")
} else {
  lda <- readRDS(file = "~/lab/tmp/unga2000s_speeches_lds.RData")
}
lda_terms <- terms(lda, 10)

Topic-Term-Matrix {.smaller}

n_terms <- 5L
lda_terms <- terms(lda, n_terms)
y <- t(lda_terms)
colnames(y) <- paste("Term", 1:n_terms, sep = " ")
DT::datatable(y)

Filtering by Part-of-Speech Annotation {.smaller}

pb <- partition("UNGA", year = 2000:2010) %>%
  as.speeches(s_attribute_date = "date", s_attribute_name = "speaker") %>% 
  enrich(p_attribute = c("word", "pos"), progress = TRUE) %>%
  subset(pos == "NN")
pb@objects <- lapply(pb@objects, function(x){x@stat[, "pos" := NULL]; x@p_attribute <- "word"; x})

The next Topic Model {.smaller}

dtm <- polmineR::as.DocumentTermMatrix(pb, col = "count")

short_docs <- which(slam::row_sums(dtm) < 100)
if (length(short_docs) > 0) dtm <- dtm[-short_docs,]

rare_words <- which(slam::col_sums(dtm) < 5)
if (length(rare_words) > 0) dtm <- dtm[,-rare_words]

empty_docs <- which(slam::row_sums(dtm) == 0)
if (length(empty_docs) > 0) dtm <- dtm[-empty_docs,]

lda <- topicmodels::LDA(
  dtm, k = 150, method = "Gibbs",
  control = list(burnin = 1000, iter = 3L, keep = 50, verbose = TRUE)
)
if (doit == TRUE){
  saveRDS(lda, file = "~/lab/tmp/lda_unga2000s_speeches_pos.RData")
} else {
  lda <- readRDS(file = "~/lab/tmp/lda_unga2000s_speeches_pos.RData")
}

Topic-Term-Matrix {.smaller}

n_terms <- 5L
lda_terms <- terms(lda, n_terms)
y <- t(lda_terms)
colnames(y) <- paste("Term", 1:n_terms, sep = " ")
DT::datatable(y)

Data transformation: Creating a document-feature-matrix {.smaller}

pb <- partition("UNGA", speaker = "Clinton|Bush|Obama|Trump", regex = TRUE) %>%
  partition_bundle(s_attribute = "speaker")
pb <- enrich(pb, p_attribute = "lemma")
dtm <- polmineR::as.sparseMatrix(pb, col = "count")
dtm <- Matrix::t(dtm)

Data transformation (cont.) {.smaller}

pg_dfm <- new(
  "dfm",
  i = dtm@i,
  p = dtm@p,
  x = dtm@x,
  Dim = dtm@Dim,
  Dimnames = list(
    docs = dtm@Dimnames$Docs,
    features = dtm@Dimnames$Terms
  )
)

Use Case Wordfish I {.smaller}

pg_dfm_red <- dfm_trim(pg_dfm, min_termfreq = 10)

Use Case Wordfish II {.smaller}

wfm_1 <- textmodel_wordfish(pg_dfm_red, c(3,4))
wordfish_summary <- summary(wfm_1)

Use Case Wordfish II (cont.) {.smaller}

wordfish_summary$estimated.document.positions
head(wordfish_summary$estimated.feature.scores, 4) # show first 4 terms

Use Case Wordfish III {.smaller}

textplot_scale1d(wfm_1, doclabels = pg_dfm_red@Dimnames$docs)
betaterm <- data.frame(terms = wfm_1$features, beta = wfm_1$beta)
head(betaterm[order(betaterm$beta),], 10)
head(betaterm[order(betaterm$beta, decreasing = TRUE),], 10)

Use Case Wordfish IV {.smaller}

textplot_scale1d(wfm_1, margin = "features",
                 highlighted = c("America", "liberty", "development", "terror", "border", "inequality"))

Use Case Wordfish V {.smaller}

textplot_scale1d(wfm_1, margin = "features",
                 highlighted = c("America", "liberty", "development", "terror", "border", "inequality"))

Conclusion {.smaller}

References



PolMine/UCSSR documentation built on June 13, 2022, 10:23 p.m.