README.md
In jaytimm/sotuAnn:

The data were made available via The Programming Historian and are collated here for demo purposes. The addresses have been annotated using the spacyr package and include named entity tags.

library(sotuAnn)#devtools::install_github("jaytimm/sotuAnn")

library(tidyverse)
library(corpuslingr) #devtools::install_github("jaytimm/corpuslingr")

# Pair the annotated SOTU tokens with their document-level metadata.
sotu <- corpuslingr::clr_set_corpus(
  sotuAnn::cdr_sotu_ann,
  meta = sotuAnn::cdr_sotu_meta
)

# Descriptive statistics for the corpus. As the printed output below shows,
# the result has three components: $text (one row per document), $corpus
# (overall totals) and $genre (totals per party).
# NOTE: the variable is named `summary`; calls such as summary(x) still
# resolve to the base function because R skips non-function bindings when
# looking up a function name.
summary <- corpuslingr::clr_desc_corpus(
  sotu,
  doc = 'doc_id',
  sent = 'sentence_id',
  tok = 'token',
  upos = 'pos',
  genre = 'party'
)

## $text
##      doc_id textLength textType textSent
##   1:      1       1169      462       26
##   2:      2       1504      593       42
##   3:      3       2473      817       62
##   4:      4       2287      772       63
##   5:      5       2120      803       57
##  ---                                    
## 232:    232       8027     1850      422
## 233:    233       7687     1829      368
## 234:    234       8036     1927      386
## 235:    235       7762     1787      383
## 236:    236       6950     1691      367
## 
## $corpus
##    n_docs textLength textType textSent
## 1:    236    2143369    37633    69758
## 
## $genre
##                    party n_docs textLength textType textSent
## 1:           Nonpartisan      8      18019     3013      468
## 2:            Federalist      4       7717     1691      201
## 3: Democratic-Republican     28     125525     7407     2937
## 4:            Democratic     93     918462    25076    31269
## 5:                  Whig      8      78775     6464     1960
## 6:            Republican     95     994871    26874    33947

SOTU length (in words) by party and medium over time:

# Scatter plot of per-address length (textLength) against year. Colour
# encodes party; point shape encodes the delivery medium (sotu_type).
# left_join() is called without `by`, so it joins on every column name the
# two tables share (presumably doc_id -- verify against cdr_sotu_meta).
summary$text %>%
  left_join(cdr_sotu_meta) %>%
  ggplot(
    aes(x = year, y = textLength, color = party, shape = sotu_type)
  ) +
  geom_point()

SOTU length for spoken addresses over the last century by political party. Democrats have grown chatty.

# Length of spoken addresses (sotu_type == 'speech') delivered after 1915,
# coloured by party, with a LOESS smoother and its standard-error band.
# left_join() is called without `by`, so it joins on every column name the
# two tables share (presumably doc_id -- verify against cdr_sotu_meta).
summary$text %>%
  left_join(cdr_sotu_meta) %>%
  filter(sotu_type == 'speech', year > 1915) %>%
  ggplot(aes(x = year, y = textLength, color = party)) +
  geom_point() +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "speech length over time")

# Total token count per decade. The decade is derived by replacing the last
# character of `year` with "0" (e.g. 1847 -> 1840) and converting the result
# to numeric; group_by() creates that column on the fly.
by_decade <- summary$text %>%
  left_join(cdr_sotu_meta) %>%
  group_by(decade = as.numeric(gsub(".$", "0", year))) %>%
  summarise(total = sum(textLength))

SOTU word count by decade:

## # A tibble: 23 x 2
##    decade  total
##     <dbl>  <int>
##  1  1790.  24247
##  2  1800.  25862
##  3  1810.  35973
##  4  1820.  76612
##  5  1830. 119576
##  6  1840. 132877
##  7  1850. 125724
##  8  1860.  95900
##  9  1870. 103129
## 10  1880. 130611
## # ... with 13 more rows

library(spacyr)
# Relative frequency of place-name entities per decade.
#   1. Extract named entities from the annotated corpus and keep those
#      tagged 'GPE'.
#   2. Attach document metadata to obtain `year`, then derive `decade` by
#      replacing the last character of the year with "0".
#   3. Count mentions per (decade, entity).
#   4. Normalise by the decade's total token count (from `by_decade`) and
#      scale to occurrences per one million tokens.
agged <- cdr_sotu_ann %>%
  spacyr::entity_extract() %>%
  filter(entity_type == 'GPE') %>%
  # No `by` given: joins on all shared column names (presumably doc_id --
  # verify against cdr_sotu_meta).
  left_join(cdr_sotu_meta) %>%
  mutate(decade = as.numeric(gsub(".$", "0", year))) %>%
  group_by(decade, entity) %>%
  summarise(freq = n()) %>%
  # `decade` is the only column shared with by_decade (decade, total);
  # naming it makes the key explicit and silences the join message.
  left_join(by_decade, by = "decade") %>%
  mutate(freq = (freq / total) * 1000000) %>%
  # summarise() drops only the last grouping level, so the table would
  # otherwise stay grouped by decade for every downstream step.
  ungroup()

Frequency (per 1 million words) of some political friends/foes by decade in SOTU:

# One panel per country: normalised mention frequency by decade. The legend
# is hidden because each facet strip already names its entity.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept here so the snippet still runs on older ggplot2 versions.
agged %>%
  filter(
    entity %in% c('France', 'Spain', 'Great Britain', 'Germany',
                  'China', 'Japan', 'Egypt', 'Vietnam', 'Russia')
  ) %>%
  ggplot(aes(x = decade, y = freq, color = entity)) +
  geom_line(size = 1) +
  facet_wrap(~entity) +
  labs(title = "friends/foes & time") +
  theme(legend.position = "none")

Frequency of "America" and some geo-synonyms by decade:

# Normalised mention frequency by decade for "America" and two longer
# variants of the country's name, drawn as one coloured line per entity.
agged %>%
  filter(
    entity %in% c('America', 'the United States',
                  'the United States of America')
  ) %>%
  ggplot(aes(x = decade)) +
  geom_line(aes(y = freq, color = entity), size = 1) +
  # Title typo fixed ("synonymns" -> "synonyms").
  labs(title = "synonyms & time")