text-mining.R
In gutenbergr: Download and Process Public Domain Works from Project Gutenberg

## -----------------------------------------------------------------------------
# Global knitr chunk options for this document: keep source and output in
# separate blocks, prefix printed output with "#>", write 7 x 6 figures under
# ../man/figures/, and leave warnings and messages out of the rendered output.
knitr::opts_chunk$set(
  # Output formatting
  comment = "#>",
  collapse = FALSE,
  # Condition handling
  message = FALSE,
  warning = FALSE,
  # Figures
  fig.path = "../man/figures/",
  fig.width = 7,
  fig.height = 6
)

## -----------------------------------------------------------------------------
# Attach gutenbergr. If attaching the installed package raises an error, load
# it from the parent directory with devtools instead (fallback for Windows
# check environments).
tryCatch(
  expr = library(gutenbergr),
  error = \(cond) devtools::load_all("..")
)

## -----------------------------------------------------------------------------
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)

## -----------------------------------------------------------------------------
# Search the gutenbergr metadata for works whose title contains "Persuasion".
gutenberg_works(str_detect(title, "Persuasion"))

## -----------------------------------------------------------------------------
# persuasion <- gutenberg_download(105, meta_fields = "title")

## -----------------------------------------------------------------------------
# For vignette building, use sample data: keep the id, text and title columns
# of the bundled sample_books table, restricted to work 105.
persuasion <- gutenbergr::sample_books |>
  select(gutenberg_id, text, title) |>
  filter(gutenberg_id == 105)

## -----------------------------------------------------------------------------
# Print the resulting table.
persuasion

## -----------------------------------------------------------------------------
# Add a numeric `chapter` column by detecting chapter headings.
#
# `format_fn` turns a matched heading into a chapter number: it strips the
# leading "chapter" word and a trailing period, then converts the remaining
# Roman numeral to a number.
#
# The heading pattern is written in title case, so the prefix is removed
# case-insensitively. Whichever casing the matched heading uses ("Chapter I",
# "CHAPTER I."), the numeral is isolated and converts cleanly instead of
# becoming NA.
# NOTE(review): gutenberg_add_sections appears to match `pattern`
# case-insensitively by default -- confirm against the package documentation.
persuasion <- persuasion |>
  gutenberg_add_sections(
    pattern = "^Chapter [IVXLCDM]+",
    section_col = "chapter",
    format_fn = function(x) {
      x |>
        str_remove(regex("^chapter\\s+", ignore_case = TRUE)) |>
        str_remove("\\.$") |>
        as.roman() |>
        as.numeric()
    }
  )

# Preview the new structure (only rows that were assigned a chapter)
persuasion |>
  filter(!is.na(chapter)) |>
  head()

## -----------------------------------------------------------------------------
# Split the text into one word per row, then discard every word that appears
# in the stop_words lexicon.
words <- persuasion |>
  unnest_tokens(output = word, input = text) |>
  filter(!word %in% stop_words$word)

## -----------------------------------------------------------------------------
# Tally how often each remaining word occurs, most frequent first.
word_counts <- words |>
  count(word) |>
  arrange(desc(n))

word_counts

## -----------------------------------------------------------------------------
# Horizontal bar chart of the 20 highest word counts. The words are turned
# into a factor ordered by count so the bars are sorted by frequency.
word_counts |>
  slice_max(order_by = n, n = 20) |>
  mutate(word = reorder(word, n)) |>
  ggplot(mapping = aes(n, word, fill = word)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  labs(
    title = expression(paste("Most Common Words in ", italic("Persuasion"))),
    x = "Frequency",
    y = NULL
  )

## -----------------------------------------------------------------------------
# nrc_sentiments <- get_sentiments("nrc")
# 
# word_sentiments <- words |>
#   inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
#   count(sentiment, sort = TRUE)

## -----------------------------------------------------------------------------
# word_sentiments |>
#   mutate(sentiment = reorder(sentiment, n)) |>
#   ggplot(aes(x = n, y = sentiment, fill = sentiment)) +
#   geom_col(show.legend = FALSE) +
#   labs(
#     title = expression(paste(
#       "Sentiment Distribution in ",
#       italic("Persuasion")
#     )),
#     x = "Word Count",
#     y = NULL
#   ) +
#   theme_minimal()

## -----------------------------------------------------------------------------
# nrc_by_chapter <- words |>
#   inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
#   count(chapter, sentiment) |>
#   filter(!is.na(chapter))
# 
# nrc_by_chapter |>
#   filter(sentiment %in% c("joy", "sadness", "anger", "fear")) |>
#   ggplot(aes(x = chapter, y = n, fill = factor(sentiment))) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
#   labs(
#     title = expression(paste("Sentiment by Chapter in ", italic("Persuasion"))),
#     x = "Chapter",
#     y = "Word Count"
#   ) +
#   theme_minimal() +
#   theme(
#     axis.text.x = element_text(angle = 45, hjust = 1),
#     strip.text = element_text(face = "bold")
#   )

## -----------------------------------------------------------------------------
# # Add a running index to preserve order and calculate bins
# words_with_index <- words |>
#   mutate(word_index = row_number()) |>
#   mutate(bin = (word_index - 1) %/% 500 + 1)
# 
# nrc_binned <- words_with_index |>
#   inner_join(nrc_sentiments, by = "word", relationship = "many-to-many") |>
#   count(bin, sentiment)
# 
# # Add labels for chapters. Reuse words_with_index so that each chapter's
# # first word keeps the same word_index and bin as the binned counts plotted
# # on the x-axis (re-indexing after dropping NA chapters would shift them).
# chapter_breaks <- words_with_index |>
#   filter(!is.na(chapter)) |>
#   group_by(chapter) |>
#   slice_min(word_index, n = 1) |>
#   ungroup() |>
#   filter(chapter %% 2 == 0)
# 
# nrc_binned |>
#   filter(sentiment %in% c("joy", "sadness", "anger", "fear")) |>
#   ggplot(aes(x = bin, y = n, color = sentiment)) +
#   geom_line(linewidth = 1, show.legend = FALSE) +
#   facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
#   scale_x_continuous(
#     name = "Word Bin (500 words)",
#     sec.axis = sec_axis(
#       ~.,
#       breaks = chapter_breaks$bin,
#       labels = chapter_breaks$chapter,
#       name = "Chapter"
#     )
#   ) +
#   labs(
#     title = expression(paste(
#       "Sentiment Progression in ",
#       italic("Persuasion")
#     )),
#     subtitle = "NRC sentiments by word bin with chapter reference",
#     y = "Word Count"
#   ) +
#   theme_minimal()

## -----------------------------------------------------------------------------
# Per-chapter word counts with TF-IDF scores, treating each chapter as one
# document.
#
# Lines without a chapter (NA, e.g. anything ahead of the first detected
# heading) are dropped first: they are not a real document, and leaving them
# in would let their words count towards document frequencies and distort
# the idf of every word they contain.
chapter_words <- persuasion |>
  filter(!is.na(chapter)) |>
  unnest_tokens(word, text) |>
  count(chapter, word, sort = TRUE) |>
  bind_tf_idf(word, chapter, n)

# Look at the most "important" words for chapters 10 through 13: the five
# highest TF-IDF words per chapter, one facet per chapter.
#
# reorder_within() + scale_y_reordered() (tidytext) sort the bars inside each
# facet. A plain reorder() ranks every word once across all chapters, so a
# word shared between chapters would be misplaced within its facets.
chapter_words |>
  filter(chapter %in% 10:13) |>
  group_by(chapter) |>
  slice_max(tf_idf, n = 5) |>
  ungroup() |>
  mutate(word = reorder_within(word, tf_idf, chapter)) |>
  ggplot(aes(tf_idf, word, fill = factor(chapter))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~chapter, scales = "free") +
  # Strip the per-facet suffix that reorder_within() appends to the labels
  scale_y_reordered() +
  labs(
    title = "Highest TF-IDF words in Chapters 10-13",
    x = "TF-IDF",
    y = NULL
  ) +
  theme_minimal()

Any scripts or data that you put into this service are public.

gutenbergr documentation built on March 15, 2026, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

gutenbergr
Download and Process Public Domain Works from Project Gutenberg

inst/doc/text-mining.R
In gutenbergr: Download and Process Public Domain Works from Project Gutenberg

Try the gutenbergr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gutenbergr Download and Process Public Domain Works from Project Gutenberg

inst/doc/text-mining.R In gutenbergr: Download and Process Public Domain Works from Project Gutenberg

Try the gutenbergr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gutenbergr
Download and Process Public Domain Works from Project Gutenberg

inst/doc/text-mining.R
In gutenbergr: Download and Process Public Domain Works from Project Gutenberg