inst/doc/tidying_casting.R

## -----------------------------------------------------------------------------
knitr::opts_chunk$set(
  message = FALSE, warning = FALSE,
  eval = requireNamespace("tm", quietly = TRUE) && requireNamespace("quanteda", quietly = TRUE) && requireNamespace("topicmodels", quietly = TRUE) && requireNamespace("ggplot2", quietly = TRUE)
)

## -----------------------------------------------------------------------------
library(ggplot2)
theme_set(theme_bw())

## -----------------------------------------------------------------------------
library(tm)
data("AssociatedPress", package = "topicmodels")
AssociatedPress

## -----------------------------------------------------------------------------
library(dplyr)
library(tidytext)

ap_td <- tidy(AssociatedPress)

## -----------------------------------------------------------------------------
ap_sentiments <- ap_td %>%
  inner_join(get_sentiments("bing"), join_by(term == word))

ap_sentiments

## -----------------------------------------------------------------------------
library(tidyr)

ap_sentiments %>%
  count(document, sentiment, wt = count) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)

## -----------------------------------------------------------------------------
library(ggplot2)

ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(n, term, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

## -----------------------------------------------------------------------------
library(methods)

data("data_corpus_inaugural", package = "quanteda")
d <- quanteda::tokens(data_corpus_inaugural) %>%
  quanteda::dfm()

d

tidy(d)

## -----------------------------------------------------------------------------
ap_td

# cast into a Document-Term Matrix
ap_td %>%
  cast_dtm(document, term, count)

# cast into a Term-Document Matrix
ap_td %>%
  cast_tdm(term, document, count)

# cast into quanteda's dfm
ap_td %>%
  cast_dfm(term, document, count)


# cast into a Matrix object
m <- ap_td %>%
  cast_sparse(document, term, count)
class(m)
dim(m)

## -----------------------------------------------------------------------------
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578),
                   readerControl = list(reader = readReut21578XMLasPlain))

reuters

## -----------------------------------------------------------------------------
reuters_td <- tidy(reuters)
reuters_td

## -----------------------------------------------------------------------------
library(quanteda)

data("data_corpus_inaugural")

data_corpus_inaugural

inaug_td <- tidy(data_corpus_inaugural)
inaug_td

## -----------------------------------------------------------------------------
inaug_words <- inaug_td %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

inaug_words

## -----------------------------------------------------------------------------
inaug_freq <- inaug_words %>%
  count(Year, word) %>%
  complete(Year, word, fill = list(n = 0)) %>%
  group_by(Year) %>%
  mutate(year_total = sum(n), percent = n / year_total) %>%
  ungroup()

inaug_freq

## -----------------------------------------------------------------------------
library(broom)
models <- inaug_freq %>%
  group_by(word) %>%
  filter(sum(n) > 50) %>%
  group_modify(
    ~ tidy(glm(cbind(n, year_total - n) ~ Year, ., family = "binomial"))
  ) %>%
  ungroup() %>%
  filter(term == "Year")

models

models %>%
  filter(term == "Year") %>%
  arrange(desc(abs(estimate)))

## -----------------------------------------------------------------------------
library(ggplot2)

models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point() +
  scale_y_log10() +
  geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
  labs(x = "Estimated change over time", y = "Adjusted p-value")

## -----------------------------------------------------------------------------
library(scales)

models %>%
  slice_max(abs(estimate), n = 6) %>%
  inner_join(inaug_freq) %>%
  ggplot(aes(Year, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(word)) +
  scale_y_continuous(labels = percent_format()) +
  labs(y = "Frequency of word in speech")

Try the tidytext package in your browser

Any scripts or data that you put into this service are public.

tidytext documentation built on May 29, 2024, 5:42 a.m.