Detecting distinct vocabulary {.smaller}

Statistically significant vocabulary {.smaller}

Initialization {.smaller}

# Attach polmineR and activate the corpus provided by the UNGA package.
library(polmineR)
use("UNGA")

# Install (if missing) and attach the helper packages used in this deck.
# requireNamespace() checks a single package directly, which avoids rebuilding
# the full installed.packages() table on every loop iteration.
for (pkg in c("magrittr", "data.table", "RColorBrewer", "tm", "wordcloud")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
  library(package = pkg, character.only = TRUE)
}

# DT is only called with an explicit namespace (DT::datatable()), so it has to
# be installed but does not need to be attached.
if (!requireNamespace("DT", quietly = TRUE)) install.packages("DT")

The Logic of the Statistical Test {.smaller}

A First Example {.smaller}

# Corpus of interest (coi): everything whose date matches 19-23 or 25
# September 2017.
coi <- partition("UNGA", date = "2017-09-(19|20|21|22|23|25)", regex = TRUE)
coi <- enrich(coi, p_attribute = "word")

# Reference corpus (ref): dates matching the regex "2009-09-2." (the trailing
# dot matches any single character).
ref <- partition("UNGA", date = "2009-09-2.", regex = TRUE)
ref <- enrich(ref, p_attribute = "word")

# NOTE(review): enrich(p_attribute = "word") presumably adds the word counts
# that features() compares later - confirm against the polmineR docs.

Many Ways of Counting {.smaller}

# Variant 1: create the partition, then pipe it into enrich().
coi <- partition(
  "UNGA",
  date = "2017-09-(19|20|21|22|23|25)", regex = TRUE
) %>%
  enrich(p_attribute = "word")

# Variant 2: hand p_attribute to partition() directly. This assignment
# replaces the coi object created by variant 1.
coi <- partition(
  "UNGA",
  date = "2017-09-(19|20|21|22|23|25)", regex = TRUE,
  p_attribute = "word"
)
ref <- partition(
  "UNGA",
  date = "2009-09-2.", regex = TRUE,
  p_attribute = "word"
)

# Show the class of coi together with the classes it extends.
is(coi)

Finally, Features {.smaller}

# Compare the counts in coi against those in ref.
f <- polmineR::features(coi, ref)

# Number of rows before any filtering is applied.
nrow(f)

# Keep rows with at least 5 occurrences in coi and a chi-square value of at
# least 10.83. 10.83 is approximately qchisq(0.999, df = 1), so this is
# presumably meant as the p < 0.001 critical value.
f <- f %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

The significant vocabulary 2017 {.smaller}

# Auto-print the filtered features object. Kept as a bare expression (rather
# than print(f)) so that the rendering done for auto-printed values applies.
f

Filtering and Reduction {.smaller}

# NOTE(review): polmineR.pagelength presumably sets how many rows are shown
# per page when a result table is rendered - confirm in the polmineR docs.
options("polmineR.pagelength" = 5L)

# Tokens to exclude: English stop words from tm plus a few hand-picked tokens.
# NOTE(review): "\\'" is a backslash followed by an apostrophe. If the corpus
# tokens are the plain "'" and "'s", the last two entries never match - verify.
terms_to_drop <- c(
  tm::stopwords("en"),
  "17", "70", "2014", "2015",
  "\\'", "\\'s"
)

# Show the features whose word is not in the drop list; f itself is unchanged.
subset(f, !word %in% terms_to_drop)

Filtering using Part-of-Speech annotation {.smaller}

# Count over the combination of word form and part-of-speech tag so that the
# pos column is available for filtering afterwards.
coi <- partition(
  "UNGA",
  date = "2017-09-(19|20|21|22|23|25)", regex = TRUE
) %>%
  count(p_attribute = c("word", "pos"))

ref <- partition("UNGA", date = "2009-09-2.", regex = TRUE) %>%
  count(p_attribute = c("word", "pos"))

# Same thresholds as before, then keep only rows tagged "NN"
# (presumably the Penn Treebank tag for singular common nouns - confirm the
# tagset used by the corpus).
f <- polmineR::features(coi, ref) %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83) %>%
  subset(pos == "NN")

Table of Results 2017 - using POS Filtering {.smaller}

# Raise the page length option to 10 before showing the POS-filtered result.
options(polmineR.pagelength = 10L)

# Auto-print the table.
f

Advanced Scenarios {.smaller}

# Counts (word + pos) for everything attributed to the speaker "Merkel".
merkel <- partition("UNGA", speaker = "Merkel") %>%
  count(p_attribute = c("word", "pos"))

# Counts (word + pos) for the years 2005 to 2017 as the reference.
unga <- partition("UNGA", year = 2005:2017) %>%
  count(p_attribute = c("word", "pos"))

# included = TRUE is passed because the Merkel partition is drawn from the
# same corpus as the reference.
# NOTE(review): presumably this makes features() subtract the coi counts from
# the reference counts - confirm in the polmineR docs.
am_features <- polmineR::features(
  merkel,
  unga,
  included = TRUE
)

Merkel in the UNGA {.smaller}

# Reduce the Merkel features to frequent, statistically salient rows whose
# part-of-speech tag is one of NN, JJ, VB or VBP.
am_features_min <- am_features %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83) %>%
  subset(pos %in% c("NN", "JJ", "VB", "VBP"))

# Plot at most the first 30 rows. head() is used instead of [1:30] because
# indexing past the end pads the vectors with NA when fewer than 30 rows
# survive the filters.
wordcloud::wordcloud(
  words = head(am_features_min[["word"]], 30L),
  freq = head(am_features_min[["count_coi"]], 30L),
  # The 8 "Dark2" colours repeated 7 times; random.color picks from this pool.
  colors = rep(RColorBrewer::brewer.pal(8, "Dark2"), times = 7),
  random.color = TRUE
)

Merkel in the UNGA - a word cloud {.flexbox .vcenter}

# Word cloud of at most the first 30 rows of am_features_min. head() is used
# instead of [1:30] because indexing past the end pads the vectors with NA
# when fewer than 30 rows are available.
wordcloud::wordcloud(
  words = head(am_features_min[["word"]], 30L),
  freq = head(am_features_min[["count_coi"]], 30L),
  # The 8 "Dark2" colours repeated 7 times; random.color picks from this pool.
  colors = rep(RColorBrewer::brewer.pal(8, "Dark2"), times = 7),
  random.color = TRUE
)

Merkel in the UNGA - Table of results {.smaller}

# Auto-print the filtered Merkel features. Kept as a bare expression so that
# the rendering done for auto-printed values applies.
am_features_min

Multiword Expressions {.smaller}

options(polmineR.pagelength = 5L)

# Bigrams (n = 2) over word forms, for the Merkel partition ...
merkel_ngrams <- partition("UNGA", speaker = "Merkel") %>%
  polmineR::ngrams(n = 2, p_attribute = "word")

# ... and for the 2005-2017 reference partition.
unga_ngrams <- partition("UNGA", year = 2005:2017) %>%
  polmineR::ngrams(n = 2, p_attribute = "word")

# Compare both bigram counts and show the rows that pass the frequency and
# chi-square thresholds. The result is printed, not stored.
polmineR::features(merkel_ngrams, unga_ngrams, included = TRUE) %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

Statistically significant multiword expressions {.smaller}

# Raise the page length option to 10, then recompute and show the filtered
# bigram comparison. The result is printed, not stored.
options(polmineR.pagelength = 10L)

polmineR::features(merkel_ngrams, unga_ngrams, included = TRUE) %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

Formula and Patterns {.smaller}

# Bigrams over word form and part-of-speech tag, which yields the pos_1 and
# pos_2 columns used for filtering below.
merkel_ngrams <- partition("UNGA", speaker = "Merkel") %>%
  polmineR::ngrams(n = 2, p_attribute = c("word", "pos"))

unga_ngrams <- partition("UNGA", year = 2005:2017) %>%
  polmineR::ngrams(n = 2, p_attribute = c("word", "pos"))

# Compare, convert to a data.table, and keep only the JJ + NN pattern.
# No count or chi-square threshold is applied in this chunk.
dt <- polmineR::features(merkel_ngrams, unga_ngrams, included = TRUE) %>%
  data.table::as.data.table() %>%
  subset(pos_1 == "JJ") %>%
  subset(pos_2 == "NN")

# Modify dt by reference: drop the (now constant) tag columns and round the
# two statistics to two decimals for display.
dt[, c("pos_1", "pos_2") := NULL][
  , "exp_coi" := round(exp_coi, 2)][
  , "chisquare" := round(chisquare, 2)]

Adjective-Noun Constructions {.smaller}

# Render dt as an HTML table widget.
DT::datatable(data = dt)

Noun-Adjective-Noun Sequences {.smaller}

# Trigrams (n = 3) over word form and part-of-speech tag for two
# non-overlapping periods: 2015-2017 versus 1993-2014.
post2015_ngrams <- partition("UNGA", year = 2015:2017) %>%
  polmineR::ngrams(n = 3, p_attribute = c("word", "pos"))

pre2015_ngrams <- partition("UNGA", year = 1993:2014) %>%
  polmineR::ngrams(n = 3, p_attribute = c("word", "pos"))

# included = FALSE: the year ranges of the two partitions are disjoint.
f <- polmineR::features(
  post2015_ngrams,
  pre2015_ngrams,
  included = FALSE
) %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

# Convert to a data.table and keep only the NN + JJ + NN pattern.
dt <- data.table::as.data.table(f) %>%
  subset(pos_1 == "NN") %>%
  subset(pos_2 == "JJ") %>%
  subset(pos_3 == "NN")

Noun-Adjective-Noun Sequences {.smaller}

# Modify dt by reference: drop the three tag columns, which are constant
# after the NN + JJ + NN filter.
dt[, c("pos_1", "pos_2", "pos_3") := NULL]

# Round the two statistics to two decimals for display.
dt[, "exp_coi" := round(exp_coi, 2)][, "chisquare" := round(chisquare, 2)]

# Render dt as an HTML table widget.
DT::datatable(data = dt)

Conclusion and Perspectives {.smaller}

References



PolMine/UCSSR documentation built on June 13, 2022, 10:23 p.m.