In favstats/demdebates2020: What the Package Does (One Line, Title Case)

# Global knitr chunk options for this document: `collapse` is switched on and
# the `comment` prefix is set to "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Start

First, I load in some packages.

# Install these packages if you don't have them yet
# if (!require("pacman")) install.packages("pacman")
# devtools::install_github("favstats/demdebates2020")

pacman::p_load(tidyverse,       # powerful data wrangling
               knitr,           # for tables
               extrafont,       # extra fonts
               ggtext,          # markdown in ggplot!
               rvest,           # for emoji scraping 
               tidytext,        # text processing
               demdebates2020,  # democratic debates datasets
               ggthemes,        # custom themes
               scales)          # for prettying up plot labels

Eleventh Debate

Most common distinct words

# Per-candidate word counts for debate 11, later scored with tf-idf.
# Stop words are removed from the lower-cased speech before tokenising, so
# they never reach the counts.
speaker_words <- demdebates2020::debates %>%
  filter(debate == 11, type == "Candidate") %>%
  mutate(speech = tm::removeWords(str_to_lower(speech), stop_words$word)) %>%
  unnest_tokens(word, speech, token = "ngrams", n = 1) %>%
  count(speaker, word, sort = TRUE)

# Total number of counted words per speaker.
total_words <- speaker_words %>%
  group_by(speaker) %>%
  summarize(total = sum(n))

# `speaker` is the only column the two tables share; naming it explicitly
# silences the join message and keeps the key from changing silently if either
# table ever gains another common column.
speaker_words <- left_join(speaker_words, total_words, by = "speaker")

# tf-idf is computed on the full counts; words seen fewer than three times by
# a speaker are dropped only afterwards.
speaker_words <- speaker_words %>%
  bind_tf_idf(word, speaker, n) %>%
  filter(n > 2) %>%
  drop_na(word)



# Faceted bar chart: the 15 highest tf-idf words of each listed candidate.
gg9 <- speaker_words %>%
  filter(speaker %in% c("Bernie Sanders", "Elizabeth Warren",
                        "Joe Biden", "Pete Buttigieg",
                        "Mike Bloomberg", "Amy Klobuchar")) %>%
  group_by(speaker) %>%
  # arrange() ignores the grouping, slice() does not: after the global sort the
  # first 15 rows of every speaker are that speaker's top 15.
  arrange(desc(tf_idf)) %>%
  slice(1:15) %>%
  ungroup() %>%
  # A plain fct_reorder() ranks each word once across all speakers, so a word
  # shared by several candidates can sit out of order inside a facet.
  # reorder_within() orders the words separately for every speaker;
  # scale_x_reordered() below removes the helper suffix from the axis labels.
  mutate(word = reorder_within(word, tf_idf, speaker)) %>%
  ggplot(aes(word, tf_idf, fill = speaker)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~speaker, ncol = 3, scales = "free") +
  scale_x_reordered() +
  coord_flip() +
  ggthemes::theme_hc() +
  ggthemes::scale_fill_economist() +
  theme(
    text = element_text(family = "Fira Code Retina"),
    legend.position = "top",
    plot.title = element_markdown(size = 15, margin = margin(0, 0, 20, 0),
                                  face = "bold", hjust = 0.5)
  ) +
  # Single labs() call: the earlier labs(x = NULL, y = "tf-idf") was completely
  # overridden by this one.
  labs(x = "", y = "tf-idf",
       title = "Most Common Distinct Words for each Candidate during 11th Debate",
       caption = "\n11th Democratic Debate\nData Visualization: @favstats\nSource: Transcript by rev.com")

# rath::ggpreview(gg9, width = 14, height = 9)
  # C:\Users\fabio\Desktop\demdebates2020

# NOTE(review): `gg11` has not been assigned at this point (the plot is stored
# as `gg9`). The call is left unchanged because ggsave_it() is defined outside
# this file -- confirm whether it only uses the symbol as an output name or
# needs an existing plot object.
tidytemplate::ggsave_it(gg11,
                        width = 10,
                        height = 6)

Who got the most Boos?

# Count the "(AUDIENCE BOOS)" annotations per listed candidate and draw them
# as a horizontal bar chart.
# NOTE(review): only debate 9 is selected here, while the caption lists
# debates "1 - 7 & 9 - 10" -- confirm which of the two is intended.
demdebates2020::debates %>%
  filter(debate == 9, type == "Candidate") %>%
  filter(speaker %in% c("Bernie Sanders", "Elizabeth Warren",
                        "Joe Biden", "Pete Buttigieg",
                        "Mike Bloomberg", "Amy Klobuchar")) %>%
  # Counting factors with .drop = FALSE keeps speaker/background combinations
  # that have no rows, so they appear with n = 0 instead of vanishing.
  mutate(
    speaker = as.factor(speaker),
    background = as.factor(background)
  ) %>%
  # TRUE/FALSE spelled out: T and F are ordinary variables that can be
  # reassigned.
  dplyr::count(background, speaker, sort = TRUE, .drop = FALSE) %>%
  drop_na() %>%
  filter(background == "(AUDIENCE BOOS)") %>%
  mutate(speaker = fct_reorder(speaker, n)) %>%
  ggplot(aes(speaker, n)) +
  geom_col(aes(fill = speaker), width = 0.25) +
  coord_flip() +
  ggthemes::theme_hc() +
  geom_label(aes(label = n), size = 4) +
  ggthemes::scale_fill_gdocs() +
  theme(
    text = element_text(family = "Fira Code Retina"),
    legend.position = "none",
    plot.title = element_markdown(size = 14,
                                  hjust = 0.5,
                                  # margin=margin(0,0,15,0),
                                  face = "bold"),
    plot.caption = element_text(size = 5)
  ) +
  labs(x = "", y = "Audience Boos",
       title = "Who got the <span style='color: #3E7ACF'>most audience boos</span><br>in Democratic Debates?",
       caption = "\nDemocratic Debates: 1 - 7 & 9 - 10\nData Visualization: @favstats\nSource: Transcripts by Washington Post, Des Moines Register & NBC News") +
  scale_y_continuous(breaks = 0:2)

# NOTE(review): the plot above is not assigned to `boos3`. Left unchanged
# because ggsave_it() is defined outside this file -- confirm whether it only
# uses the symbol as an output name.
tidytemplate::ggsave_it(boos3, width = 6, height = 4)

Who got the most applause?

# Count the "(APPLAUSE)" annotations per listed candidate in debate 11 and
# draw them as a horizontal bar chart.
demdebates2020::debates %>%
  filter(debate == 11, type == "Candidate") %>%
  filter(speaker %in% c("Bernie Sanders", "Elizabeth Warren",
                        "Joe Biden", "Pete Buttigieg",
                        "Mike Bloomberg", "Amy Klobuchar")) %>%
  # TRUE spelled out: T is an ordinary variable that can be reassigned.
  dplyr::count(background, speaker, type, sort = TRUE) %>%
  drop_na() %>%
  filter(background == "(APPLAUSE)") %>%
  mutate(speaker = fct_reorder(speaker, n)) %>%
  # Pluralised speaker type; it is only used as the fill aesthetic below.
  mutate(type = paste0(type, "s")) %>%
  ggplot(aes(speaker, n)) +
  geom_col(aes(fill = type), width = 0.5) +
  coord_flip() +
  ggthemes::theme_hc() +
  geom_label(aes(label = n), size = 4) +
  # facet_wrap(~type, scales = "free") +
  ggthemes::scale_fill_gdocs() +
  theme(
    text = element_text(family = "Fira Code Retina"),
    legend.position = "none",
    plot.title = element_markdown(hjust = 0.5, size = 30,
                                  margin = margin(0, 0, 15, 0), face = "bold")
  ) +
  # The data is filtered to debate 11, so title and caption name the 11th
  # debate (they previously said 10th / NBC News). The source line follows the
  # other 11th-debate captions in this file.
  # NOTE(review): confirm the transcript source for debate 11.
  labs(x = "", y = "Applause",
       title = "Who got the <span style='color: #3E7ACF'>most Applause</span><img src='https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/120/apple/237/clapping-hands-sign_1f44f.png' width='20'/><br>in 11th Democratic Debate?",
       caption = "\nDemocratic Debate: 11\nData Visualization: @favstats\nSource: Transcript by rev.com")


# NOTE(review): the plot above is not assigned to `applause4`. Left unchanged
# because ggsave_it() is defined outside this file -- confirm whether it only
# uses the symbol as an output name.
tidytemplate::ggsave_it(applause4, width = 10, height = 8)

# Count the "(LAUGHTER)" annotations per listed candidate in debate 11 and
# draw them as a horizontal bar chart.
demdebates2020::debates %>%
  filter(debate == 11, type == "Candidate") %>%
  filter(speaker %in% c("Bernie Sanders", "Elizabeth Warren",
                        "Joe Biden", "Pete Buttigieg",
                        "Mike Bloomberg", "Amy Klobuchar")) %>%
  # TRUE spelled out: T is an ordinary variable that can be reassigned.
  dplyr::count(background, speaker, type, sort = TRUE) %>%
  drop_na() %>%
  filter(background == "(LAUGHTER)") %>%
  mutate(speaker = fct_reorder(speaker, n)) %>%
  ggplot(aes(speaker, n)) +
  geom_col(aes(fill = type), width = 0.5) +
  # geom_point(aes(fill = type), size = 9) +
  coord_flip() +
  ggthemes::theme_hc() +
  geom_label(aes(label = n), size = 4) +
  # facet_grid(~type, scales = "free_x", space = "free") +
  ggthemes::scale_fill_gdocs() +
  theme(
    text = element_text(family = "Fira Code Retina"),
    legend.position = "none",
    plot.title = element_markdown(size = 30, margin = margin(0, 0, 15, 0),
                                  face = "bold")
  ) +
  # The data is filtered to debate 11, so title and caption name the 11th
  # debate (they previously said 10th / debates 1 - 7 & 9 - 10 / NBC News).
  # The source line follows the other 11th-debate captions in this file.
  # NOTE(review): confirm the transcript source for debate 11.
  labs(x = "", y = "Laughs",
       title = "Who got the <span style='color: #3E7ACF'>most Laughs</span><img src='https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/120/apple/237/face-with-tears-of-joy_1f602.png' width='20'/><br>in 11th Democratic Debate?",
       caption = "\nDemocratic Debate: 11\nData Visualization: @favstats\nSource: Transcript by rev.com")

# NOTE(review): the plot above is not assigned to `laughs4`. Left unchanged
# because ggsave_it() is defined outside this file -- confirm whether it only
# uses the symbol as an output name.
tidytemplate::ggsave_it(laughs4, width = 8, height = 8)

# spacyr supplies spacy_extract_entity(), which the custom tokenizer defined
# further down calls.
library(spacyr)
# Initialise spaCy through spacyr; no arguments are given, so the package
# defaults apply.
spacy_initialize()

Tidytext can take a custom tokenization function

library(tidyverse)
library(demdebates2020)
library(tidytext)

# Custom tokenizer for tidytext::unnest_tokens(): turns every input text into
# the lower-cased entities that spaCy finds in it.
#
# text: character vector, one document per element.
# Returns an unnamed list with exactly one character vector per element of
# `text`, in input order. A text without any entity yields character(0).
#
# The earlier group_by() + nest() version returned only the documents that had
# at least one entity, so the list could be shorter than the input, and its
# order was whatever order the grouped nest produced for the doc ids.
tokenize_scispacy_entities <- function(text) {
  if (length(text) == 0) {
    return(list())
  }

  # Explicit ids, so every entity row can be matched back to its input
  # position. (Presumably identical to the ids spacyr generates for unnamed
  # input -- confirm.)
  doc_ids <- sprintf("text%d", seq_along(text))
  entities <- spacy_extract_entity(stats::setNames(text, doc_ids))

  # No entity found in any text.
  if (is.null(entities) || nrow(entities) == 0) {
    return(rep(list(character(0)), length(text)))
  }

  # Splitting on a factor that carries every doc id as a level keeps empty
  # documents (as character(0)) and fixes the order to the input order.
  by_doc <- split(
    str_to_lower(entities$text),
    factor(entities$doc_id, levels = doc_ids)
  )
  unname(by_doc)
}

# Try the tokenizer on two example sentences; being a top-level call, its
# result is printed.
tokenize_scispacy_entities(c("Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity.", "They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC)."))

# Collapse each candidate's speeches into one document per speaker and extract
# entities from it with the custom spaCy tokenizer.
# NOTE(review): no `debate` filter is applied here, so all debates in the data
# set are used -- confirm this is intended.
abstract_entities <- debates %>%
  filter(type == "Candidate") %>%
  select(speaker, speech) %>%
  group_by(speaker) %>%
  summarise(speech = paste0(speech, collapse = "\n")) %>%
  ungroup() %>%
  unnest_tokens(entity, speech, token = tokenize_scispacy_entities)


# Entity counts per speaker; rows whose entity is the literal string "na" are
# removed first.
# TRUE spelled out: T is an ordinary variable that can be reassigned.
speaker_words <- abstract_entities %>%
  filter(entity != "na") %>%
  dplyr::count(speaker, entity, sort = TRUE)


# Total number of counted entities per speaker.
total_words <- speaker_words %>%
  group_by(speaker) %>%
  summarize(total = sum(n))


# `speaker` is the only column the two tables share; naming it explicitly
# silences the join message and keeps the key from changing silently.
speaker_words <- left_join(speaker_words, total_words, by = "speaker")


# tf-idf is computed on the full counts; entities seen fewer than three times
# by a speaker are dropped only afterwards.
speaker_words <- speaker_words %>%
  bind_tf_idf(entity, speaker, n) %>%
  filter(n > 2) %>%
  drop_na(entity)

library(ggtext)

# Faceted bar chart: the 15 highest tf-idf entities of each listed candidate.
# NOTE(review): the title and caption name the 11th debate, but the
# `speaker_words` table built before this plot is not filtered by debate --
# confirm which is intended.
gg11 <- speaker_words %>%
  filter(speaker %in% c("Bernie Sanders", "Elizabeth Warren",
                        "Joe Biden", "Pete Buttigieg",
                        "Mike Bloomberg", "Amy Klobuchar")) %>%
  group_by(speaker) %>%
  # arrange() ignores the grouping, slice() does not: after the global sort the
  # first 15 rows of every speaker are that speaker's top 15.
  arrange(desc(tf_idf)) %>%
  slice(1:15) %>%
  ungroup() %>%
  # A plain fct_reorder() ranks each entity once across all speakers, so an
  # entity shared by several candidates can sit out of order inside a facet.
  # reorder_within() orders the entities separately for every speaker;
  # scale_x_reordered() below removes the helper suffix from the axis labels.
  mutate(entity = reorder_within(entity, tf_idf, speaker)) %>%
  ggplot(aes(entity, tf_idf, fill = speaker)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~speaker, ncol = 3, scales = "free") +
  scale_x_reordered() +
  coord_flip() +
  ggthemes::theme_hc() +
  ggthemes::scale_fill_economist() +
  theme(
    text = element_text(family = "Fira Code Retina"),
    legend.position = "top",
    plot.title = element_markdown(size = 15, margin = margin(0, 0, 20, 0),
                                  face = "bold", hjust = 0.5)
  ) +
  # Single labs() call: the earlier labs(x = NULL, y = "tf-idf") was completely
  # overridden by this one. The title misspelling "entitys" is corrected.
  labs(x = "", y = "tf-idf",
       title = "Most Common Distinct Entities for each Candidate during 11th Debate",
       caption = "\n11th Democratic Debate\nData Visualization: @favstats\nSource: Transcript by rev.com")

# rath::ggpreview(gg9, width = 14, height = 9)
  # C:\Users\fabio\Desktop\demdebates2020

# NOTE(review): `gg12` is not assigned anywhere in this file (the plot is
# stored as `gg11`). The call is left unchanged because ggsave_it() is defined
# outside this file -- confirm whether it only uses the symbol as an output
# name or needs an existing plot object.
tidytemplate::ggsave_it(gg12,
                        width = 10,
                        height = 6)