# This code is not visible in the slides; it only installs packages that are
# missing. knitr (which provides kable) and kableExtra are required to
# generate table output.
#
# system.file(package = ) returns "" for a package that is not installed, so
# the check does not need to build the full installed.packages() matrix.
required_pkgs <- c("knitr", "kableExtra")
is_installed <- vapply(
  required_pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (!all(is_installed)) install.packages(required_pkgs[!is_installed])

Initialization {.smaller}

# Attach polmineR; the corpus functions called in the chunks below
# (count, kwic, partition, ...) are presumably provided by it.
library(polmineR)
# Make the "UNGA" corpus available for the queries below.
# NOTE(review): the exact effect of use() is defined by polmineR - confirm there.
use("UNGA")
# magrittr supplies the %>% pipe used in later chunks.
library(magrittr)
library(data.table)

CWB, CQP and 'polmineR' {.smaller}

Looking for Words {.smaller}

# Count matches for a single-token CQP query. The outer single quotes delimit
# the R string; the inner double quotes belong to the CQP expression.
query_discrimination <- '"Discrimination"'
count("UNGA", query = query_discrimination, cqp = TRUE)

Looking for Words (cont.) {.smaller}

# Two CQP queries evaluated in one call: the token "Hunger" as written, and
# the same token with the %c flag appended.
hunger_queries <- c('"Hunger"', '"Hunger" %c')
count("UNGA", query = hunger_queries, cqp = TRUE)

Regular Expressions: Character Classes {.smaller}

| Expression | Description |
|:-------:| --------------|
| . | A period character matches any character |
| \d | 'digit', i.e. 0 to 9 |

# Break the count down by matching word form: "." stands for any single
# character in front of "iece". Show the first rows of the result.
iece_matches <- count("UNGA", query = '".iece"', cqp = TRUE, breakdown = TRUE)
head(iece_matches)

Regular Expressions: Quantifiers {.smaller}

| Expression | Description |
|:-------:| --------------|
|?|The previous expression occurs between zero and one times.|
|+|The previous expression occurs between one and multiple times.|
|*|The previous expression occurs in any frequency between zero and multiple times.|
|{n}|The previous expression occurs exactly n times.|
|{min,}|The previous expression occurs at least min times.|
|{min,max}|The previous expression occurs between min and max times.|
|{0,max}|The previous expression occurs at most max times.|

Regular Expressions: Examples I {.smaller}

# Lower-case prefix only: "multicult" followed by any characters.
multicult_lower <- count(
  "UNGA", query = '"multicult.*"', cqp = TRUE, breakdown = TRUE
)
head(multicult_lower, n = 3)

# Character class [Mm] additionally admits a capitalised first letter.
multicult_any_case <- count(
  "UNGA", query = '"[Mm]ulticult.*"', cqp = TRUE, breakdown = TRUE
)
head(multicult_any_case, n = 3)

Regular Expressions: Examples II {.smaller}

# Alternation inside a token: "im" or "e" in front of "migration", followed
# by any characters. Show the first rows of the per-form breakdown.
migration_matches <- count(
  "UNGA", query = '"(im|e)migration.*"', cqp = TRUE, breakdown = TRUE
)
head(migration_matches)

CQP I: Basics {.smaller}

Token Stream with Positional Attributes {.smaller}

# Build a table of the token stream at the beginning of one speech, with one
# column per positional attribute, and render it as an interactive table.
P <- partition("UNGA", date = "2017-09-19", speaker = "The Secretary-General")

# Left boundary of the first region stored in the partition's cpos matrix.
cpos_left <- P@cpos[1, 1]

# Number of tokens shown after cpos_left (the window is inclusive on both
# ends). Kept in one place so the fetched range and the index column agree.
window_size <- 1000L

p_attributes <- c("word", "pos", "lemma", "ner")
tokenstream_list <- lapply(
  p_attributes,
  function(x) {
    get_token_stream(
      "UNGA",
      p_attribute = x,
      left = cpos_left,
      right = cpos_left + window_size
    )
  }
)
tokenstream_df <- as.data.frame(tokenstream_list)
colnames(tokenstream_df) <- p_attributes

# Prefix a leading "$" in POS tags with a backslash.
# NOTE(review): presumably prevents the "$" from being rendered as math in the
# HTML output - confirm.
tokenstream_df[["pos"]] <- gsub("^\\$", "\\\\$", tokenstream_df[["pos"]])

# Index column counting from 0.
# NOTE(review): these are offsets relative to cpos_left, not absolute corpus
# positions, although the column is labelled "cpos".
tokenstream_df[["cpos"]] <- 0L:window_size
tokenstream_df <- tokenstream_df[, c("cpos", p_attributes)]
DT::datatable(tokenstream_df)

CQP II: Searching in the Token Stream {.smaller}

# A token tagged "NN" directly followed by the tokens "of" and "migration".
Q <- '[pos = "NN"] "of" "migration"'
# Pass cqp = TRUE explicitly, as the other count() calls in this file do,
# instead of relying on automatic detection of CQP syntax.
C <- count("UNGA", query = Q, cqp = TRUE, breakdown = TRUE)
# Show only the matched text with its frequency and share.
head(C[, c("match", "count", "share")])

CQP III: Quantifiers {.smaller}

# "World", then one token slot written as [], then a token tagged "NN".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
count(
  "UNGA", query = '"World" [] [pos = "NN"]', cqp = TRUE, breakdown = TRUE
) %>%
  head(n = 3) %>%
  subset(select = c("match", "count", "share"))

# "just" or "unjust", then []{0,3} (zero to three token slots), then a token
# starting with "sanction".
count(
  "UNGA", query = '"(un|)just" []{0,3} "sanction.*"',
  cqp = TRUE, breakdown = TRUE
) %>%
  head(n = 3) %>%
  subset(select = c("match", "count", "share"))

CQP IV: Neighbours {.smaller}

# Two alternatives joined by "|": a "terror..." token followed, after zero to
# nine token slots, by an "islam..." token, or the same pair in reverse order.
Q <- '("[tT]error.*" []{0,9} "[iI]slam.*" | "[iI]slam.*" []{0,9} "[tT]error.*")'
Y <- count("UNGA", query = Q, cqp = TRUE)
# Show only the "count" column of the result.
Y[, "count"]

CQP: Useful for all polmineR methods {.smaller}

# Set the polmineR.pagelength option to 6 before producing the kwic output.
options(polmineR.pagelength = 6L)
# "conflict", then zero to five token slots, then a token containing
# "development".
kwic(
  "UNGA",
  query = '"conflict" []{0,5} ".*development.*"',
  cqp = TRUE
)

CQP and the kwic() method {.smaller}

# Set the polmineR.pagelength option to 5 for this slide.
options(polmineR.pagelength = 5L)
# Concordances for "islam..." tokens, with "[tT]error.*" passed as a regex
# positivelist; the same pattern is then highlighted in yellow.
islam_kwic <- kwic(
  "UNGA",
  query = '"[iI]slam.*"',
  positivelist = "[tT]error.*",
  regex = TRUE,
  cqp = TRUE
)
highlight(islam_kwic, yellow = "[tT]error.*", regex = TRUE)

CQP and the kwic() method (cont.) {.smaller}

# A token with ner "ORGANIZATION", then zero to five token slots, then a token
# whose lemma is "sanction" and whose POS tag matches "V.+".
organization_sanction_query <-
  '[ner = "ORGANIZATION"] []{0,5} [(lemma = "sanction") & (pos = "V.+")]'
kwic(
  "UNGA",
  query = organization_sanction_query,
  regex = TRUE,
  cqp = TRUE
)

CQP: Useful for dispersion() as well {.smaller}

# Matches for "discrimination..." (either capitalisation) broken down by the
# structural attribute "year".
# cqp = TRUE is passed explicitly, as in the other CQP calls in this file,
# instead of relying on automatic detection of CQP syntax.
dispersion(
  "UNGA",
  query = '"[Dd]iscrimination.*"',
  s_attribute = "year",
  cqp = TRUE
)

CQP: Useful for cooccurrences() as well {.smaller}

# Cooccurrences of tokens whose lemma is "love" and whose POS tag matches
# "N.+", reduced to the rows with rank_ll below 5 and a fixed set of columns.
love_cooccurrences <- cooccurrences(
  "UNGA",
  query = '[(lemma = "love") & (pos = "N.+")]',
  cqp = TRUE
)
love_table <- data.table::as.data.table(love_cooccurrences)
love_table <- subset(love_table, rank_ll < 5)
love_table <- subset(
  love_table,
  select = c("word", "count_partition", "count_coi", "count_ref", "ll", "rank_ll")
)
DT::datatable(love_table) # Embedding in slides as htmlwidget

Conclusion {.smaller}

# Cooccurrences for word forms beginning with "Muslim"/"muslim" or
# "Islam"/"islam" in the years 2002 to 2009.
# Both alternatives end in ".*" (any suffix, including none). A bare "." would
# require exactly one further character, and "m*" would only repeat the final
# "m", so forms such as "Muslim" or "Islamic" would not be matched.
partition("UNGA", year = 2002:2009) %>%
  cooccurrences(query = '"([mM]uslim.*|[iI]slam.*)"', cqp = TRUE)

Good Luck!

Appendix: Part-of-speech Tagsets I - Stuttgart-Tübingen-Tagset (Excerpt) {.smaller}

|Notation|Description|Example|
|:------:|------------|--------|
|ADJA| attributive adjective | [das] große [Haus]|
|ART| determiner | der, die, das, ein, eine, ... |
|NN| common noun | Tisch, Herr, [das] Reisen|
|NE| proper noun | Hans, Hamburg, HSV|
|VV.*| full verb, all forms | [du] gehst, [wir] kommen [an] |
|VAFIN| finite verb, aux | [du] bist, [wir] werden |

Appendix: Part-of-speech Tagsets II - PENN Tagset (Excerpt) {.smaller}

|Notation|Description|Example|
|:------:|------------|--------|
|DT| determiner | a, the |
|JJ.| adjective, all forms | global [partnership], least [fortunate], sustainable [development] |
|NN.| common noun, all forms | document, vote |
|PR.| pronoun, personal or possessive | I [would like], its [48th meeting], their [financial contributions] |
|RB| adverb | [I] therefore [take it], properly [discussed] |
|VB.| verb, any form | be, consider, remind, [has been] postponed |

References



PolMine/UCSSR documentation built on June 13, 2022, 10:23 p.m.