Nothing
## -----------------------------------------------------------------------------
# Global knitr chunk options applied to every chunk in this vignette.
knitr::opts_chunk$set(
  # Output formatting: do not collapse source and output; prefix output lines.
  collapse = FALSE,
  comment = "#>",
  # Keep warnings and messages out of the rendered document.
  warning = FALSE,
  message = FALSE,
  # Figure dimensions and the directory prefix figures are written to.
  fig.width = 7,
  fig.height = 6,
  fig.path = "../man/figures/"
)
## -----------------------------------------------------------------------------
# Attach gutenbergr. If attaching the installed package raises an error,
# load the package from the parent directory instead.
tryCatch(
  library(gutenbergr),
  error = function(cnd) {
    # Fallback for Windows check environments
    devtools::load_all("..")
  }
)
## -----------------------------------------------------------------------------
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)
## -----------------------------------------------------------------------------
# Look up works whose title contains "Persuasion".
gutenberg_works(str_detect(title, "Persuasion"))
## -----------------------------------------------------------------------------
# Downloading the text is disabled here; the bundled sample data is used below.
# persuasion <- gutenberg_download(105, meta_fields = "title")
## -----------------------------------------------------------------------------
# For vignette building, use sample data: keep only the rows for work 105 and
# the three columns used in the rest of the analysis.
persuasion <- select(
  filter(gutenbergr::sample_books, gutenberg_id == 105),
  gutenberg_id, text, title
)
## -----------------------------------------------------------------------------
persuasion
## -----------------------------------------------------------------------------
# Add a numeric `chapter` column by detecting chapter headings in the text.
persuasion <- persuasion |>
  gutenberg_add_sections(
    pattern = "^Chapter [IVXLCDM]+",
    section_col = "chapter",
    # Convert a heading to its chapter number: strip the "Chapter" prefix and
    # a trailing period, then read the remaining Roman numeral.
    format_fn = function(x) {
      x |>
        str_trim() |>
        # Strip the prefix case-insensitively so the result does not depend on
        # whether the heading is written "Chapter" or "CHAPTER".
        str_remove(regex("^chapter\\s+", ignore_case = TRUE)) |>
        str_remove("\\.$") |>
        as.roman() |>
        as.numeric()
    }
  )
# Preview the new structure (rows without a chapter value are dropped)
persuasion |>
  filter(!is.na(chapter)) |>
  head()
## -----------------------------------------------------------------------------
# Split the text into one word per row, then drop every word that appears in
# the `stop_words` table.
words <- anti_join(
  unnest_tokens(persuasion, word, text),
  stop_words,
  by = "word"
)
## -----------------------------------------------------------------------------
# Tally each remaining word, most frequent first, and print the table.
word_counts <- count(words, word, sort = TRUE)
word_counts
## -----------------------------------------------------------------------------
# Horizontal bar chart of the highest-count words.
word_counts |>
  # Keep the rows with the 20 largest counts.
  slice_max(order_by = n, n = 20) |>
  # Order the word factor by count so the bars are drawn sorted.
  mutate(word = reorder(word, n)) |>
  ggplot(mapping = aes(x = n, y = word, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = expression(paste("Most Common Words in ", italic("Persuasion"))),
    x = "Frequency",
    y = NULL
  ) +
  theme_minimal()
## -----------------------------------------------------------------------------
# nrc_sentiments <- get_sentiments("nrc")
#
# word_sentiments <- words |>
# inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
# count(sentiment, sort = TRUE)
## -----------------------------------------------------------------------------
# word_sentiments |>
# mutate(sentiment = reorder(sentiment, n)) |>
# ggplot(aes(x = n, y = sentiment, fill = sentiment)) +
# geom_col(show.legend = FALSE) +
# labs(
# title = expression(paste(
# "Sentiment Distribution in ",
# italic("Persuasion")
# )),
# x = "Word Count",
# y = NULL
# ) +
# theme_minimal()
## -----------------------------------------------------------------------------
# nrc_by_chapter <- words |>
# inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
# count(chapter, sentiment) |>
# filter(!is.na(chapter))
#
# nrc_by_chapter |>
# filter(sentiment %in% c("joy", "sadness", "anger", "fear")) |>
# ggplot(aes(x = chapter, y = n, fill = factor(sentiment))) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
# labs(
# title = expression(paste("Sentiment by Chapter in ", italic("Persuasion"))),
# x = "Chapter",
# y = "Word Count"
# ) +
# theme_minimal() +
# theme(
# axis.text.x = element_text(angle = 45, hjust = 1),
# strip.text = element_text(face = "bold")
# )
## -----------------------------------------------------------------------------
# # Add a running index to preserve order and calculate bins
# words_with_index <- words |>
# mutate(word_index = row_number()) |>
# mutate(bin = (word_index - 1) %/% 500 + 1)
#
# nrc_binned <- words_with_index |>
# inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
# count(bin, sentiment)
#
# # Add labels for chapters
# # NOTE(review): word_index below is numbered AFTER rows with an NA chapter
# # are dropped, whereas words_with_index numbers every row of `words`. If any
# # words have an NA chapter, the bins computed here are offset relative to
# # nrc_binned. Deriving chapter_breaks from words_with_index would keep the
# # two indices aligned -- confirm before re-enabling this chunk.
# chapter_breaks <- words |>
# filter(!is.na(chapter)) |>
# mutate(word_index = row_number()) |>
# group_by(chapter) |>
# slice_min(word_index, n = 1) |>
# ungroup() |>
# mutate(
# bin = (word_index - 1) %/% 500 + 1
# ) |>
# filter(chapter %% 2 == 0)
#
# nrc_binned |>
# filter(sentiment %in% c("joy", "sadness", "anger", "fear")) |>
# ggplot(aes(x = bin, y = n, color = sentiment)) +
# geom_line(linewidth = 1, show.legend = FALSE) +
# facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
# scale_x_continuous(
# name = "Word Bin (500 words)",
# sec.axis = sec_axis(
# ~.,
# breaks = chapter_breaks$bin,
# labels = chapter_breaks$chapter,
# name = "Chapter"
# )
# ) +
# labs(
# title = expression(paste(
# "Sentiment Progression in ",
# italic("Persuasion")
# )),
# subtitle = "NRC sentiments by word bin with chapter reference",
# y = "Word Count"
# ) +
# theme_minimal()
## -----------------------------------------------------------------------------
# Per-chapter TF-IDF, with each chapter acting as one document. Rows with no
# chapter value are dropped first so they are not fed into the TF-IDF
# computation as an extra NA "document".
chapter_words <- persuasion |>
  filter(!is.na(chapter)) |>
  unnest_tokens(word, text) |>
  count(chapter, word, sort = TRUE) |>
  bind_tf_idf(word, chapter, n)
# Look at the most "important" words for chapters 10 through 13
chapter_words |>
  filter(chapter %in% 10:13) |>
  group_by(chapter) |>
  slice_max(tf_idf, n = 5) |>
  ungroup() |>
  # Order words within each chapter rather than globally, so every facet is
  # sorted even when the same word ranks highly in more than one chapter.
  mutate(word = reorder_within(word, tf_idf, chapter)) |>
  ggplot(aes(tf_idf, word, fill = factor(chapter))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~chapter, scales = "free") +
  # Remove the per-facet suffix that reorder_within() adds to the axis labels.
  scale_y_reordered() +
  labs(
    title = "Highest TF-IDF words in Chapters 10-13",
    x = "TF-IDF",
    y = NULL
  ) +
  theme_minimal()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.