State of the Union (SOTU) address data made available via The Programming Historian and collated here for demo purposes. The addresses have been annotated
using the spacyr package and include named entity tags.
library(sotuAnn)#devtools::install_github("jaytimm/sotuAnn")
library(tidyverse)
library(corpuslingr) #devtools::install_github("jaytimm/corpuslingr")
# Build the working corpus object from the annotated SOTU data frame,
# handing corpuslingr the per-address metadata table through `meta`.
sotu <- corpuslingr::clr_set_corpus(
  sotuAnn::cdr_sotu_ann,
  meta = sotuAnn::cdr_sotu_meta
)
# Descriptive statistics for the corpus. The arguments name the columns that
# hold the document id, sentence id, token and part-of-speech tag, and use
# `party` as the genre variable. The printed result has three tables:
# `$text` (per document), `$corpus` (overall) and `$genre` (per party).
summary <- corpuslingr::clr_desc_corpus(
  sotu,
  doc = 'doc_id',
  sent = 'sentence_id',
  tok = 'token',
  upos = 'pos',
  genre = 'party'
)
## $text
## doc_id textLength textType textSent
## 1: 1 1169 462 26
## 2: 2 1504 593 42
## 3: 3 2473 817 62
## 4: 4 2287 772 63
## 5: 5 2120 803 57
## ---
## 232: 232 8027 1850 422
## 233: 233 7687 1829 368
## 234: 234 8036 1927 386
## 235: 235 7762 1787 383
## 236: 236 6950 1691 367
##
## $corpus
## n_docs textLength textType textSent
## 1: 236 2143369 37633 69758
##
## $genre
## party n_docs textLength textType textSent
## 1: Nonpartisan 8 18019 3013 468
## 2: Federalist 4 7717 1691 201
## 3: Democratic-Republican 28 125525 7407 2937
## 4: Democratic 93 918462 25076 31269
## 5: Whig 8 78775 6464 1960
## 6: Republican 95 994871 26874 33947
SOTU length (in words) by party and medium over time:
# Scatter plot of address length against year. Point colour encodes party and
# point shape encodes the delivery medium (`sotu_type`). The per-document
# statistics are first joined to the metadata table to pick up those columns.
ggplot(
  data = left_join(summary$text, cdr_sotu_meta),
  mapping = aes(x = year)
) +
  geom_point(
    mapping = aes(y = textLength, color = party, shape = sotu_type)
  )
SOTU length for spoken addresses over the last century by political party. Democrats have grown chatty.
# Spoken addresses after 1915 only: address length by year, coloured by party,
# with a LOESS trend line per party and its confidence band.
summary$text %>%
  left_join(cdr_sotu_meta) %>%
  filter(sotu_type == 'speech', year > 1915) %>%
  ggplot(aes(x = year, y = textLength, color = party)) +
  geom_point() +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "speech length over time")
# Total word count per decade. The decade is derived by replacing the final
# character of `year` with "0" (e.g. 1793 -> 1790); the grouping column is
# created directly inside group_by().
by_decade <- left_join(summary$text, cdr_sotu_meta) %>%
  group_by(decade = as.numeric(gsub(".$", "0", year))) %>%
  summarise(total = sum(textLength))
SOTU word count by decade:
## # A tibble: 23 x 2
## decade total
## <dbl> <int>
## 1 1790. 24247
## 2 1800. 25862
## 3 1810. 35973
## 4 1820. 76612
## 5 1830. 119576
## 6 1840. 132877
## 7 1850. 125724
## 8 1860. 95900
## 9 1870. 103129
## 10 1880. 130611
## # ... with 13 more rows
library(spacyr)
# Relative frequency of place-name entities per decade.
# Keep only entities tagged GPE, count mentions per decade and entity, then
# scale by the decade's total word count (`total` from `by_decade`) to get
# occurrences per one million words.
agged <- cdr_sotu_ann %>%
  spacyr::entity_extract() %>%
  filter(entity_type == 'GPE') %>%
  left_join(cdr_sotu_meta) %>%
  mutate(decade = as.numeric(gsub(".$", "0", year))) %>%
  group_by(decade, entity) %>%
  summarise(freq = n()) %>%
  # summarise() only peels off the last grouping variable, so the result is
  # still grouped by decade; drop that so later verbs see a plain table.
  ungroup() %>%
  # `decade` is the only column shared with by_decade; name it explicitly so
  # the join key cannot change silently if either table gains columns.
  left_join(by_decade, by = "decade") %>%
  mutate(freq = (freq / total) * 1000000)
Frequency (per 1 million words) of some political friends/foes by decade in SOTU:
# One small panel per country: mentions per million words by decade.
# The legend is hidden because each facet is already labelled with the entity.
agged %>%
  filter(
    entity %in% c(
      'France', 'Spain', 'Great Britain',
      'Germany', 'China', 'Japan',
      'Egypt', 'Vietnam', 'Russia'
    )
  ) %>%
  ggplot(aes(x = decade, y = freq, color = entity)) +
  geom_line(size = 1) +
  facet_wrap(~entity) +
  labs(title = "friends/foes & time") +
  theme(legend.position = "none")
Frequency of "America" and some geo-synonyms by decade:
# Mentions per million words, by decade, of three ways of naming the country,
# drawn as one coloured line per variant on a shared panel.
agged %>%
  filter(
    entity %in% c('America', 'the United States', 'the United States of America')
  ) %>%
  ggplot(aes(x = decade)) +
  geom_line(aes(y = freq, color = entity), size = 1) +
  # Title spelling corrected (was "synonymns").
  labs(title = "synonyms & time")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.