Zur Ermittlung der Ungleichheit des Vokabulars {.smaller}

Statistisch signifikantes Vokabular {.smaller}

Initialisierung

library(polmineR)
# Attach the helper packages used in the code below, installing any that are
# missing first. requireNamespace() checks for a single package directly;
# scanning the full installed.packages() matrix is slow and is discouraged
# for this purpose.
for (pkg in c("magrittr", "data.table", "RColorBrewer", "tm", "wordcloud")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
  library(package = pkg, character.only = TRUE)
}
# DT is only ever called with an explicit `DT::` prefix further down, so it
# needs to be installed but not attached.
if (!requireNamespace("DT", quietly = TRUE)) install.packages("DT")

Die Logik des statistischen Tests {.smaller}

Ein erstes Beispiel {.smaller}

# Corpus of interest: the year 2016 of GERMAPARL, with interjection = FALSE,
# enriched with counts over the "word" attribute.
coi <- enrich(
  partition("GERMAPARL", year = 2016, interjection = FALSE),
  p_attribute = "word"
)

# Reference corpus: the years 2002 to 2015, prepared the same way.
ref <- enrich(
  partition("GERMAPARL", year = 2002:2015, interjection = FALSE),
  p_attribute = "word"
)

Varianten des Weges zur Zählung {.smaller}

# Variant 1: create the partition, then pipe it into enrich().
coi <- partition("GERMAPARL", year = 2016, interjection = FALSE) %>%
  enrich(p_attribute = "word")

# Variant 2: pass p_attribute directly to partition().
coi <- partition(
  "GERMAPARL",
  year = 2016, interjection = FALSE,
  p_attribute = "word"
)
ref <- partition(
  "GERMAPARL",
  year = 2002:2015, interjection = FALSE,
  p_attribute = "word"
)

# Show the class (and superclasses) of the resulting object.
is(coi)

Features, endlich {.smaller}

# Compare the corpus of interest against the reference corpus.
f <- polmineR::features(coi, ref)

# Number of rows before any filtering.
nrow(f)

# Keep terms that occur at least 5 times in the corpus of interest and whose
# chi-square value reaches 10.83.
f <- f %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

Das signifikante Vokabular von 2016 {.smaller}

# Auto-print the filtered features object so that it is rendered as a table.
f

Filter- und Reduktionsschritte {.smaller}

# Set the polmineR page length option to 5
# (presumably the number of rows shown per table page -- confirm).
options(polmineR.pagelength = 5L)

# Terms to exclude: German stop words from tm, a few punctuation tokens,
# year numbers and the formula "Vielen Dank".
terms_to_drop <- c(
  tm::stopwords("de"),
  "--", "``", "[", "]",
  "2016", "2017", "2015",
  "Vielen", "Dank"
)

# Show the features table without the excluded terms (f itself is unchanged).
subset(f, !(word %in% terms_to_drop))

Filtern mit der Part-of-Speech-Annotation {.smaller}

# Count over word/part-of-speech pairs so that the result can later be
# filtered by POS tag.
coi <- partition("GERMAPARL", year = 2016, interjection = FALSE) %>%
  count(p_attribute = c("word", "pos"))
ref <- partition("GERMAPARL", year = 2002:2015, interjection = FALSE) %>%
  count(p_attribute = c("word", "pos"))

# Apply the frequency and chi-square thresholds, then keep only tokens
# tagged "NN".
f <- polmineR::features(coi, ref) %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83) %>%
  subset(pos == "NN")

Ergebnistabelle 2016 - mit POS-Filter {.smaller}

# Set the polmineR page length option to 10, then auto-print the
# POS-filtered features table.
options(polmineR.pagelength = 10L)
f

Vertiefung {.smaller}

# Word/POS counts for Angela Merkel's speeches in 2008-2009.
merkel <- count(
  partition(
    "GERMAPARL",
    speaker = "Angela Merkel", year = 2008:2009, interjection = FALSE
  ),
  p_attribute = c("word", "pos")
)

# Word/POS counts for the whole corpus in the same years.
bt <- count(
  partition("GERMAPARL", year = 2008:2009, interjection = FALSE),
  p_attribute = c("word", "pos")
)

# included = TRUE because Merkel's speeches are part of the reference corpus.
am_features <- polmineR::features(merkel, bt, included = TRUE)

Merkel 2008/2009 {.smaller}

# Reduce Merkel's features to terms that occur at least 5 times in her
# speeches, reach a chi-square value of 10.83 and carry one of the
# part-of-speech tags NN, ADJA or VVFIN.
am_features_min <- am_features %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83) %>%
  subset(pos %in% c("NN", "ADJA", "VVFIN"))

# Plot at most the first 50 rows. `words` and `freq` must be indexed with the
# same positions: the earlier 1:50 / 1:59 mismatch produced vectors of
# different length. seq_len(min(...)) also avoids NA entries when fewer than
# 50 rows survive the filters.
top_idx <- seq_len(min(50L, nrow(am_features_min)))
wordcloud::wordcloud(
  words = am_features_min[["word"]][top_idx],
  freq = am_features_min[["count_coi"]][top_idx],
  colors = rep(RColorBrewer::brewer.pal(8, "Dark2"), times = 7),
  random.color = TRUE
)

Merkel 2008/2009 - eine Wortwolke {.flexbox .vcenter}

# Word cloud of at most the first 50 rows of am_features_min. `words` and
# `freq` share one index vector so they always have the same length: the
# earlier 1:50 / 1:59 mismatch did not. seq_len(min(...)) also avoids NA
# entries when the table has fewer than 50 rows.
top_idx <- seq_len(min(50L, nrow(am_features_min)))
wordcloud::wordcloud(
  words = am_features_min[["word"]][top_idx],
  freq = am_features_min[["count_coi"]][top_idx],
  colors = rep(RColorBrewer::brewer.pal(8, "Dark2"), times = 7),
  random.color = TRUE
)

Merkel 2008/2009 - Tabelle {.smaller}

# Auto-print the filtered Merkel features so that they are rendered as a table.
am_features_min

Mehrworteinheiten {.smaller}

# Set the polmineR page length option to 5.
options(polmineR.pagelength = 5L)

# Bigrams (n = 2) over the "word" attribute for Merkel's 2008-2009 speeches
# and for the whole corpus in the same years.
merkel_ngrams <- polmineR::ngrams(
  partition(
    "GERMAPARL",
    speaker = "Angela Merkel", year = 2008:2009, interjection = FALSE
  ),
  n = 2, p_attribute = "word"
)
bt_ngrams <- polmineR::ngrams(
  partition("GERMAPARL", year = 2008:2009, interjection = FALSE),
  n = 2, p_attribute = "word"
)

# Compare the bigram counts, then apply the frequency and chi-square
# thresholds one after the other.
significant_mwes <- polmineR::features(merkel_ngrams, bt_ngrams, included = TRUE)
significant_mwes <- subset(significant_mwes, count_coi >= 5)
significant_mwes <- subset(significant_mwes, chisquare >= 10.83)

# Render the statistics table held in the `stat` slot as an interactive widget.
DT::datatable(significant_mwes@stat, options = list(dom = "ftip"))

Statistisch signifikante Mehrworteinheiten {.smaller}

# Set the polmineR page length option to 10.
options(polmineR.pagelength = 10L)

# Recompute the significant bigrams from the existing n-gram counts.
mwe_features <- polmineR::features(merkel_ngrams, bt_ngrams, included = TRUE)
significant_mwes <- mwe_features %>%
  subset(count_coi >= 5) %>%
  subset(chisquare >= 10.83)

# Render the statistics table held in the `stat` slot as an interactive widget.
DT::datatable(significant_mwes@stat, options = list(dom = "ftip"))

Formeln und Formelhaftigkeit {.smaller}

# Bigrams over word and POS for Merkel's speeches and for the whole corpus,
# both restricted to lp = 17.
merkel_ngrams <- polmineR::ngrams(
  partition("GERMAPARL", speaker = "Angela Merkel", lp = 17, interjection = FALSE),
  n = 2, p_attribute = c("word", "pos")
)
bt_ngrams <- polmineR::ngrams(
  partition("GERMAPARL", lp = 17, interjection = FALSE),
  n = 2, p_attribute = c("word", "pos")
)

# Compare both bigram counts and continue with a plain data.table.
dt <- data.table::as.data.table(
  polmineR::features(merkel_ngrams, bt_ngrams, included = TRUE)
)

# Keep bigrams whose first token is tagged ADJA and whose second is tagged NN.
dt <- subset(dt, pos_1 == "ADJA" & pos_2 == "NN")

# Drop the POS columns and round the numeric columns to two digits, all by
# reference and in a single chained expression.
dt[, c("pos_1", "pos_2") := NULL][
  , `:=`(exp_coi = round(exp_coi, 2), chisquare = round(chisquare, 2))
]

Adjektiv-Nomen-Konstruktionen {.smaller}

# Render the data.table `dt` as an interactive DT widget.
DT::datatable(dt)

Nomen-Artikel-Nomen-Konstruktionen {.smaller}

# Trigrams (n = 3) over word and POS for speakers with party "CDU" in lp = 17.
cdu_ngrams <- polmineR::ngrams(
  partition("GERMAPARL", party = "CDU", lp = 17, interjection = FALSE),
  n = 3, p_attribute = c("word", "pos")
)

# The same for the four other parties listed; this serves as the reference.
non_cdu_ngrams <- polmineR::ngrams(
  partition(
    "GERMAPARL",
    party = c("SPD", "FDP", "LINKE", "GRUENE"), lp = 17, interjection = FALSE
  ),
  n = 3, p_attribute = c("word", "pos")
)

# Apply the minimum-frequency filter before the comparison rather than after.
cdu_ngrams <- subset(cdu_ngrams, count >= 5)

# included = FALSE: the two partitions select different parties.
f <- polmineR::features(cdu_ngrams, non_cdu_ngrams, included = FALSE) %>%
  subset(chisquare >= 10.83)

# Continue with a data.table and keep trigrams tagged NN - ART - NN.
dt <- data.table::as.data.table(f)
dt <- subset(dt, pos_1 == "NN" & pos_2 == "ART" & pos_3 == "NN")

Nomen-Artikel-Nomen-Konstruktionen {.smaller}

# Remove the three POS columns by reference.
dt[, c("pos_1", "pos_2", "pos_3") := NULL]

# Round the expected count and the chi-square value to two digits.
dt[, `:=`(exp_coi = round(exp_coi, 2), chisquare = round(chisquare, 2))]

# Render the result as an interactive DT widget.
DT::datatable(dt)

Ausblick {.smaller}

Literatur



PolMine/UCSSR documentation built on June 13, 2022, 10:23 p.m.