This vignette illustrates using the TEI tags to explore a single volume of the Foreign Relations of the United States.

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(tidyverse)
library(tidytext)
library(TEIdytext)
# Path to the FRUS 1946 volume VI TEI file bundled with the package.
fname = system.file("extdata", "frus1946v06.xml", package = "TEIdytext")

# Parse the TEI into a tidy data frame, skipping the low-level <c> and <w> tags.
all = TEIdy(fname, ignore = c("c", "w"))
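
TEIdy() flattens the XML into a long data frame, with (roughly) one row per stretch of text and a column for each enclosing tag and attribute. A quick peek at the handful of columns used below (a sketch; the exact set of columns depends on the tags present in the volume):

all %>%
  select(div.document.n, div.document.subtype, persName.from, plaintext) %>%
  head()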

Get word counts by document sender (the 'persName.from' tag) and document number.

counts = all %>% 
  # Keep only rows that fall inside the historical documents.
  filter(div.document.subtype == "historical-document") %>%
  group_by(div.document.n) %>% 
  # Lift the plaintext inside 'persName.from' tags to a new 'name_from' field.
  lift_data(persName.from, plaintext, name_from) %>%
  select(name_from, div.document.n, plaintext) %>%
  # Tokenize into one row per word, then count words per sender and document.
  unnest_tokens(word, plaintext) %>%
  group_by(name_from, word, div.document.n) %>%
  filter(name_from != "") %>%
  summarize(count = n()) %>% 
  arrange(desc(count)) %>% 
  ungroup()
# Total word counts per sender; keep the ten most prolific.
top_senders = counts %>% 
  group_by(name_from) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>% 
  head(10)

top_senders %>%
  ggplot() + 
  geom_bar(aes(x = name_from, y = count), stat = "identity") + 
  coord_flip()
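
The bars above come out in alphabetical order; if you would rather sort them by volume, a reorder() on the x aesthetic is enough (a small variant, not part of the original code):

top_senders %>%
  ggplot() + 
  geom_bar(aes(x = reorder(name_from, count), y = count), stat = "identity") + 
  coord_flip() + 
  labs(x = "sender", y = "words")
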
counts %>% 
  group_by(name_from, div.document.n) %>% 
  summarize(words = sum(count)) %>%
  inner_join(top_senders, by = "name_from") %>%
  arrange(desc(words)) %>%
  head(40) %>%
  ggplot() + 
  geom_bar(aes(x = reorder(interaction(name_from, div.document.n), words), 
               y = words, fill = name_from), 
           stat = "identity") + 
  coord_flip() + 
  labs(title = "Whoa, that's one long telegram!")
# For senders with more than 10,000 words, score how distinctively each
# sender uses each word (a Dunning log-likelihood, per the column name below).
j = counts %>%
  group_by(name_from) %>% 
  filter(sum(count) > 10000) %>% 
  summarize_llr(word, count)
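
The dunning_llr column suggests summarize_llr() computes Dunning's log-likelihood ratio (the G-test), comparing how often a sender uses a word against how often everyone else does. A minimal sketch of that statistic for a single word, assuming the standard two-by-two formulation (the helper below is hypothetical and not part of TEIdytext):

dunning_g2 = function(a, b, c, d) {
  # a: this word, this sender      b: this word, other senders
  # c: other words, this sender    d: other words, other senders
  n = a + b + c + d
  e = c((a + b) * (a + c), (a + b) * (b + d),
        (c + d) * (a + c), (c + d) * (b + d)) / n   # expected counts
  o = c(a, b, c, d)                                 # observed counts
  2 * sum(ifelse(o > 0, o * log(o / e), 0))
}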

j %>% 
  group_by(name_from) %>%
  filter(n() > 100) %>% 
  top_n(10, dunning_llr) %>%
  ggplot() + 
  geom_bar(aes(x = word, y = dunning_llr), stat = "identity") + 
  facet_wrap(~name_from, scales = "free_y") + 
  coord_flip()
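
Because the word scale is shared across panels, each facet above lists the union of everyone's top words. A variant sketch using tidytext's reorder_within() so each panel shows and orders only its own words (assumes ggplot2 >= 3.3 for horizontal bars via the y aesthetic; not part of the original vignette):

j %>% 
  group_by(name_from) %>%
  filter(n() > 100) %>% 
  top_n(10, dunning_llr) %>%
  ungroup() %>%
  ggplot() + 
  geom_col(aes(x = dunning_llr, y = reorder_within(word, dunning_llr, name_from))) + 
  scale_y_reordered() + 
  facet_wrap(~name_from, scales = "free_y") + 
  labs(y = NULL)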

