The Art of Counting {.smaller}

Initialization {.smaller}

library(polmineR)
use("UNGA")
for (pkg in c("magrittr", "data.table", "xts", "lubridate"))
  if (!pkg %in% rownames(installed.packages())) install.packages(pkg)

library(magrittr)
library(data.table)
library(xts)

Basics of Counting: The count() method {.smaller}

count("UNGA", query = "refugee")
count("UNGA", query = "refugee")[["count"]] / size("UNGA")
count("UNGA", query = c("refugee", "migrant"))

Using the result of the counting method {.smaller}

queries <- c(
  "alien", "emigrant", "evacuee", "expatriate", "foreigner", "immigrant", "migrant", "refugee"
  )
dt <- count("UNGA", query = queries)
df <- as.data.frame(dt)
df <- df[order(df$count, decreasing = TRUE),] # sorting
par(mar = c(8,4,2,2)) # enlarge plane for more room for labels
barplot(height = df$count, names.arg = df$query, las = 2)

Frequencies of terms related to Asylum {.flexbox .vcenter}

par(mar = c(8,4,2,2)) # enlarge plane for more room for labels
barplot(height = df$count, names.arg = df$query, las = 2)

count() method and partition objects {.smaller}

unga_2015 <- partition("UNGA", year = 2015)
count(unga_2015, query = "refugee")
partition("UNGA", year = 2015) %>%
  count(query = "refugee")

Example: Variation of language use {.smaller}

queries <- c("America", "borders", "crisis", "development", "economy", "freedom", "liberty", "markets", "wealth")

par(mar = c(6,5,2,2), mfrow = c(2,2),  cex = 0.6)

for (us_president in c("Clinton", "Bush", "Obama", "Trump")){
  dt <- partition("UNGA", speaker = us_president) %>%
    count(query = queries)
  barplot(
    height = dt$freq * 100000, names.arg = dt$query, # labels with query terms
    las = 2, # rotate labels to improve visuals
    main = us_president,
    ylab = "Count of Terms  (per 100.000 Tokens)",
    ylim = c(0, 350) # shared scale for comparison
    )
}

Variation of language use {.flexbox .vcenter}

queries <- c("America", "borders", "crisis", "development", "economy", "freedom", "liberty", "markets", "wealth")

par(mar = c(6,5,2,2), mfrow = c(2,2),  cex = 0.6)

for (us_president in c("Clinton", "Bush", "Obama", "Trump")){
  dt <- partition("UNGA", speaker = us_president) %>%
    count(query = queries)
  barplot(
    height = dt$freq * 100000, names.arg = dt$query, # labels with query terms
    las = 2, # rotate labels to improve visuals
    main = us_president,
    ylab = "Count of Terms  (per 100.000 Tokens)",
    ylim = c(0, 350) # shared scale for comparison
    )
}

Using regular expressions and CQP {.smaller}

count("UNGA", query = "'refugee.*'", cqp = TRUE) # using CQP syntax
dt <- count("UNGA", query = "'refugee.*'", cqp = TRUE, breakdown = TRUE)

Hits for regular expression {.smaller}

DT::datatable(dt)

Counting with positional attributes {.smaller}

count("UNGA", query = "refugee", p_attribute = "lemma")

Linguistic Variations: Matching of Inflections {.smaller}

queries <- c(
  asylum = "'.*asylum.*'",
  border = '"border.*"',
  migrant = '"(|e|im)migrant(|s)"', 
  migration = "'.*migration.*'",
  refugee = '"refugee.*"', 
  visa = "'visa'"
  )
par(mar = c(6,5,2,2), mfrow = c(2,2),  cex = 0.6)
for (us_president in c("Clinton", "Bush", "Obama", "Trump")) {
  partition("UNGA", speaker = us_president) %>%
    count(query = unname(queries), cqp = TRUE, p_attribute = "word") -> dt
  barplot(
    height = dt$freq * 100000,
    names.arg = names(queries),
    las = 2, main = us_president,
    ylim = c(0, 100)
  )
}

Linguistic Variations: Matching of Inflections {.flexbox .vcenter}

queries <- c(
  asylum = "'.*asylum.*'",
  border = '"border.*"',
  migrant = '"(|e|im)migrant(|s)"', 
  migration = "'.*migration.*'",
  refugee = '"refugee.*"', 
  visa = "'visa'"
  )
par(mar = c(6,5,2,2), mfrow = c(2,2),  cex = 0.6)
for (us_president in c("Clinton", "Bush", "Obama", "Trump")) {
  partition("UNGA", speaker = us_president) %>%
    count(query = unname(queries), cqp = TRUE, p_attribute = "word") -> dt
  barplot(
    height = dt$freq * 100000,
    names.arg = names(queries),
    las = 2, main = us_president,
    ylim = c(0, 100)
  )
}

Preliminary conclusions and 'Learnings' {.smaller}

Frequency Distribution {.smaller}

dt <- dispersion("UNGA", query = "refugee", s_attribute = "year")
head(dt) # just looking at the top of the table

Simple Visualization of Frequencies {.smaller}

par(mfrow = c(1,1))
dt <- dispersion("UNGA", query = "refugee", s_attribute = "year", freq = TRUE)
barplot(
  height = dt[["freq"]] * 100000,
  names.arg = dt[["year"]],
  las = 2, ylab = "Hits per 100.000 Terms"
  )

The term 'refugee' in the United Nations General Assembly, per year {.flexbox .vcenter}

barplot(
  height = dt[["freq"]] * 100000,
  names.arg = dt[["year"]],
  las = 2, ylab = "Hits per 100.000 Terms"
  )

Frequency Distribution - Two Dimensions {.smaller}

dt <- dispersion("UNGA", query = '"[Rr]efugee(|s)"', cqp = TRUE, s_attribute = c("year", "state_organization"))

# creating the index for columns with a sum greater than 200
idx <- which(colSums(dt[,2:ncol(dt)], na.rm = TRUE) > 200) + 1

# subsetting the dt before by this index (as well as the year column)
dt_min <- dt[,c(1, idx), with = FALSE]

# removing the column NA and the rows for 1993 and 2018 which are only partly in the corpus
dt_min <- dt_min[2:(nrow(dt_min)-1),-"NA"]

Frequency Distribution - Two Dimensions (cont.) {.smaller}

ts <- xts(x = dt_min[,c(2:ncol(dt_min)), with = FALSE],
          order.by = as.Date(sprintf("%s-01-01", dt_min[["year"]]))
          )
head(ts)

Visualization using xts {.smaller}

plot.xts(
  ts,
  multi.panel = TRUE,
  col = RColorBrewer::brewer.pal(12, "Set3"),
  lwd = 2,
  yaxs = "r"
  )

Visualization using xts (cont.) {.smaller}

plot.xts(
  ts,
  multi.panel = TRUE,
  col = RColorBrewer::brewer.pal(12, "Set3"),
  lwd = 2,
  yaxs = "r"
  )

A date specific time series {.smaller}

par(mar = c(4,2,2,2))
dt <- dispersion("UNGA", query = '"[Rr]efugee(s|)"', cqp = TRUE, s_attribute = "date")
dt <- dt[!is.na(as.Date(dt[["date"]]))]
ts <- xts(x = dt[["count"]], order.by = as.Date(dt[["date"]]))
plot(ts)

Aggregation by week - month - quarter - year {.smaller}

ts_week <- aggregate(ts, {a <- lubridate::ymd(paste(lubridate::year(index(ts)), 1, 1, sep = "-")); lubridate::week(a) <- lubridate::week(index(ts)); a})
ts_month <- aggregate(ts, as.Date(as.yearmon(index(ts))))
ts_qtr <- aggregate(ts, as.Date(as.yearqtr(index(ts))))
ts_year <- aggregate(ts, as.Date(sprintf("%s-01-01", gsub("^(\\d{4})-.*?$", "\\1", index(ts)))))
par(mfrow = c(2,2), mar = c(2,2,3,1))
plot(as.xts(ts_week), main = "Aggregation: Week")
plot(as.xts(ts_month), main = "Aggregation: Month");
plot(as.xts(ts_qtr), main = "Aggregation: Quarter")
plot(as.xts(ts_year), main = "Aggregation: Year")

Aggregation by week - month - quarter - year {.flexbox .vcenter}

par(mfrow = c(2,2), mar = c(2,2,3,1))
plot(as.xts(ts_week), main = "Aggregation: Week")
plot(as.xts(ts_month), main = "Aggregation: Month");
plot(as.xts(ts_qtr), main = "Aggregation: Quarter")
plot(as.xts(ts_year), main = "Aggregation: Year")

Working with time series: 'learnings' {.smaller}

Dictionary based classification I | Advanced Applications {.smaller}

unga_2016 <- partition("UNGA", year = 2016)
pb <- partition_bundle(unga_2016, s_attribute = "date")
nested <- lapply(
  pb@objects,
  function(x) partition_bundle(x, s_attribute = "state_organization", verbose = FALSE)
)
debates <- flatten(nested)
names(debates) <- paste(
  blapply(debates, function(x) s_attributes(x, "date")),
  blapply(debates, function(x) name(x)), 
  sep = "_"
)

Dictionary based classification II {.smaller}

dict <- c("asylum", "escaping", "fleeing", "migration", "refugee")
dt <- count(debates, query = dict) %>% data.table::setorderv(cols = "TOTAL", order = -1L)
debates_mig <- debates[[ subset(dt, TOTAL >= 10)[["partition"]] ]]
debates_mig[[1]] %>% read() %>% highlight(yellow = dict)

Counting all words in the corpus / partition {.smaller}

p <- partition("UNGA", year = 2008)
cnt <- count(p, p_attribute = "word")
sum(cnt[["count"]]) == size(p)
unga_2008 <- partition("UNGA", year = 2008)
dt <- count(unga_2008, p_attribute = c("word", "pos")) %>% subset(pos %in% c("NN", "JJ")) %>%
  data.table::as.data.table(.) %>% data.table::setorderv(., cols = "count", order = -1L) %>% head()

Counting for (more than) algorithms {.smaller}

Appendix {.smaller}

word <- get_token_stream("UNGA", p_attribute = "word")
Encoding(word) <- registry_get_encoding("UNGA")
lemma <- get_token_stream("UNGA", p_attribute = "lemma")
Encoding(lemma) <- registry_get_encoding("UNGA")

dt <- data.table::data.table(word = word, lemma = lemma)

token <- "refugee"
q <- iconv(token, from = "UTF-8", to = "latin1")
dt2 <- dt[lemma == q]
dt2[, .N, by = .(word)]

References



PolMine/UCSSR documentation built on June 13, 2022, 10:23 p.m.