```{r}
#| echo = FALSE
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  eval = requireNamespace("tm", quietly = TRUE) &&
    requireNamespace("quanteda", quietly = TRUE) &&
    requireNamespace("topicmodels", quietly = TRUE) &&
    requireNamespace("ggplot2", quietly = TRUE)
)
```
```{r}
#| echo = FALSE
library(ggplot2)
theme_set(theme_bw())
```
Many existing text mining datasets are in the form of a DocumentTermMatrix class (from the tm package). For example, consider the corpus of 2246 Associated Press articles from the topicmodels package:
```{r}
library(tm)

data("AssociatedPress", package = "topicmodels")
AssociatedPress
```
If we want to analyze this with tidy tools, we need to turn it into a one-term-per-document-per-row data frame first. The tidy function does this. (For more on the tidy verb, see the broom package.)
```{r}
library(dplyr)
library(tidytext)

ap_td <- tidy(AssociatedPress)
```
As shown in the main tidytext vignette, having the text in this format is convenient for analysis with the tidytext package. For example, you can perform sentiment analysis on these newspaper articles.
```{r}
ap_sentiments <- ap_td %>%
  inner_join(get_sentiments("bing"), join_by(term == word))

ap_sentiments
```
We can find the most negative documents:
```{r}
library(tidyr)

ap_sentiments %>%
  count(document, sentiment, wt = count) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
```
Or visualize which words contributed to positive and negative sentiment:
```{r}
#| fig.width = 7,
#| fig.height = 4,
#| fig.alt = 'Bar charts for the contribution of words to sentiment scores. The words "like" and "work" contribute the most to positive sentiment, and the words "killed" and "death" contribute the most to negative sentiment'
library(ggplot2)

ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(n, term, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)
```
Note that a tidier is also available for the dfm class from the quanteda package:
```{r}
library(methods)

data("data_corpus_inaugural", package = "quanteda")
d <- quanteda::tokens(data_corpus_inaugural) %>%
  quanteda::dfm()

d

tidy(d)
```
Some existing text mining tools or algorithms work only on sparse document-term matrices. Therefore, tidytext provides cast_ verbs for converting from a tidy form to these matrices.
```{r}
ap_td

# cast into a Document-Term Matrix
ap_td %>%
  cast_dtm(document, term, count)

# cast into a Term-Document Matrix
ap_td %>%
  cast_tdm(term, document, count)

# cast into quanteda's dfm (documents as rows, terms as columns)
ap_td %>%
  cast_dfm(document, term, count)

# cast into a Matrix object
m <- ap_td %>%
  cast_sparse(document, term, count)

class(m)
dim(m)
```
This allows you to read, filter, and process data with dplyr and other tidy tools, after which it can be converted back into a document-term matrix for machine learning applications.
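As a minimal sketch of that workflow (the five-document cutoff below is purely illustrative, not part of the original analysis), we might drop rare terms with dplyr before casting the result back into a Document-Term Matrix:

```{r}
# Illustrative only: keep terms that appear in more than 5 documents
# (each document-term pair occupies one row of ap_td, so n() counts documents)
ap_td %>%
  group_by(term) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  cast_dtm(document, term, count)
```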
You can also tidy Corpus objects from the tm package. For example, consider a Corpus containing 20 documents, one for each of the Reuters news articles about crude oil included as example data in the tm package:
```{r}
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578),
                   readerControl = list(reader = readReut21578XMLasPlain))

reuters
```
The tidy verb creates a table with one row per document:
```{r}
reuters_td <- tidy(reuters)
reuters_td
```
Similarly, you can tidy a corpus object from the quanteda package:
```{r}
library(quanteda)

data("data_corpus_inaugural")

data_corpus_inaugural

inaug_td <- tidy(data_corpus_inaugural)
inaug_td
```
This lets us work with tidy tools like unnest_tokens to analyze the text alongside the metadata.
```{r}
inaug_words <- inaug_td %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

inaug_words
```
We could then, for example, see how the appearance of a word changes over time:
```{r}
inaug_freq <- inaug_words %>%
  count(Year, word) %>%
  complete(Year, word, fill = list(n = 0)) %>%
  group_by(Year) %>%
  mutate(year_total = sum(n),
         percent = n / year_total) %>%
  ungroup()

inaug_freq
```
We can use the broom package to perform logistic regression on each word, modeling its frequency as a function of Year.
```{r}
library(broom)

models <- inaug_freq %>%
  group_by(word) %>%
  filter(sum(n) > 50) %>%
  group_modify(
    ~ tidy(glm(cbind(n, year_total - n) ~ Year, ., family = "binomial"))
  ) %>%
  ungroup() %>%
  filter(term == "Year")

models

models %>%
  arrange(desc(abs(estimate)))
```
You can show these models as a volcano plot, which compares the effect size with the significance:
```{r}
#| fig.width = 7,
#| fig.height = 5,
#| fig.alt = 'Volcano plot showing that words like "america" and "world" have increased over time with small p-values, while words like "public" and "institution" have decreased'
library(ggplot2)

models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point() +
  scale_y_log10() +
  geom_text(aes(label = word), vjust = 1, hjust = 1,
            check_overlap = TRUE) +
  labs(x = "Estimated change over time", y = "Adjusted p-value")
```
We can also use the ggplot2 package to display the top 6 terms that have changed in frequency over time.
```{r}
#| fig.width = 7,
#| fig.height = 6,
#| fig.alt = 'Scatterplot with LOESS smoothing lines showing that the words "america", "americans", "century", "children", "democracy", and "god" have increased over time'
library(scales)

models %>%
  slice_max(abs(estimate), n = 6) %>%
  inner_join(inaug_freq) %>%
  ggplot(aes(Year, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(word)) +
  scale_y_continuous(labels = percent_format()) +
  labs(y = "Frequency of word in speech")
```