README.md

sotuAnn: State of the Union addresses as an annotated corpus

The data were made available via The Programming Historian and are collated here for demo purposes. The addresses have been annotated using the spacyr package and include named entity tags.

library(sotuAnn)#devtools::install_github("jaytimm/sotuAnn")
library(tidyverse)
library(corpuslingr) #devtools::install_github("jaytimm/corpuslingr")
# Hand the annotated corpus and its metadata table to clr_set_corpus() to
# build the corpus object used in the rest of this walkthrough.
sotu <- corpuslingr::clr_set_corpus(
  sotuAnn::cdr_sotu_ann,
  meta = sotuAnn::cdr_sotu_meta
)

Some corpus descriptives

# Corpus descriptives. The arguments name the columns holding the document id,
# sentence id, token, part-of-speech tag and the grouping ("genre") variable;
# the result has $text, $corpus and $genre components (printed below).
summary <- corpuslingr::clr_desc_corpus(
  sotu,
  doc = "doc_id",
  sent = "sentence_id",
  tok = "token",
  upos = "pos",
  genre = "party"
)
## $text
##      doc_id textLength textType textSent
##   1:      1       1169      462       26
##   2:      2       1504      593       42
##   3:      3       2473      817       62
##   4:      4       2287      772       63
##   5:      5       2120      803       57
##  ---                                    
## 232:    232       8027     1850      422
## 233:    233       7687     1829      368
## 234:    234       8036     1927      386
## 235:    235       7762     1787      383
## 236:    236       6950     1691      367
## 
## $corpus
##    n_docs textLength textType textSent
## 1:    236    2143369    37633    69758
## 
## $genre
##                    party n_docs textLength textType textSent
## 1:           Nonpartisan      8      18019     3013      468
## 2:            Federalist      4       7717     1691      201
## 3: Democratic-Republican     28     125525     7407     2937
## 4:            Democratic     93     918462    25076    31269
## 5:                  Whig      8      78775     6464     1960
## 6:            Republican     95     994871    26874    33947

SOTU length (in words) by party and medium over time:

# Address length against year: colour encodes party, point shape encodes the
# delivery medium (sotu_type).
ggplot(
  data = left_join(summary$text, cdr_sotu_meta),
  mapping = aes(x = year, y = textLength, color = party, shape = sotu_type)
) +
  geom_point()

SOTU length for spoken addresses over the last century by political party. Democrats have grown chatty.

# Spoken addresses after 1915 only: length by year, with a loess trend and
# confidence band drawn separately for each party (grouped via `color`).
summary$text %>%
  left_join(cdr_sotu_meta) %>%
  filter(sotu_type == "speech", year > 1915) %>%
  ggplot(aes(x = year, y = textLength, color = party)) +
  geom_point() +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "speech length over time")

Geo-political entities in SOTU

# Total word count per decade. The decade is obtained by overwriting the last
# character of the year with "0" (e.g. 1794 -> 1790) and converting to numeric.
by_decade <- left_join(summary$text, cdr_sotu_meta) %>%
  group_by(decade = as.numeric(gsub(".$", "0", year))) %>%
  summarise(total = sum(textLength))

SOTU word count by decade:

## # A tibble: 23 x 2
##    decade  total
##     <dbl>  <int>
##  1  1790.  24247
##  2  1800.  25862
##  3  1810.  35973
##  4  1820.  76612
##  5  1830. 119576
##  6  1840. 132877
##  7  1850. 125724
##  8  1860.  95900
##  9  1870. 103129
## 10  1880. 130611
## # ... with 13 more rows
library(spacyr)
# Per-decade relative frequency of entities tagged "GPE" in the annotated
# corpus, expressed per one million words of SOTU text in that decade.
agged <- cdr_sotu_ann %>%
  spacyr::entity_extract() %>%
  filter(entity_type == "GPE") %>%
  left_join(cdr_sotu_meta) %>%
  mutate(decade = as.numeric(gsub(".$", "0", year))) %>%
  group_by(decade, entity) %>%
  summarise(freq = n()) %>%
  # summarise() only peels off the last grouping variable; drop the leftover
  # `decade` grouping so the result is a plain, ungrouped table.
  ungroup() %>%
  # by_decade holds only `decade` and `total`, so `decade` is the sole shared
  # column; naming it avoids relying on (and being messaged about) key guessing.
  left_join(by_decade, by = "decade") %>%
  # Normalise raw counts to occurrences per million words.
  mutate(freq = (freq / total) * 1e6)

Frequency (per 1 million words) of some political friends/foes by decade in SOTU:

# One panel per country: mentions per million words, by decade. The legend is
# hidden because the facet strips already label each series.
# NOTE(review): these names must match the `entity` strings produced by
# spacyr::entity_extract() exactly (including how multi-word names are
# joined) -- confirm against the extracted data.
agged %>%
  filter(
    entity %in% c(
      "France", "Spain", "Great Britain", "Germany", "China",
      "Japan", "Egypt", "Vietnam", "Russia"
    )
  ) %>%
  ggplot(aes(x = decade, y = freq, color = entity)) +
  geom_line(size = 1) +
  facet_wrap(~entity) +
  theme(legend.position = "none") +
  labs(title = "friends/foes & time")

Frequency of "America" and some geo-synonyms by decade:

# Compare three ways of naming the country: mentions per million words, by
# decade, one coloured line per variant.
agged %>%
  filter(
    entity %in% c(
      "America", "the United States", "the United States of America"
    )
  ) %>%
  ggplot(aes(x = decade)) +
  geom_line(aes(y = freq, color = entity), size = 1) +
  # Title typo fixed: "synonymns" -> "synonyms".
  labs(title = "synonyms & time")



jaytimm/sotuAnn documentation built on May 30, 2019, 11:40 p.m.