In lassehjorthmadsen/salmer: A Data Set with Hymns in Danish

# Chunk defaults for this document: merge each chunk's source and output into
# one block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

library(salmer)
#ggplot2::theme_set(ggplot2::theme_minimal())

Introduction

`salmer`

# Reduce `author` to two groups: the most frequent author level is kept and
# every other author is lumped together; missing authors are also labelled
# "Other".
# NOTE(review): forcats::fct_explicit_na() is deprecated as of forcats 1.0.0 in
# favour of fct_na_value_to_level(); confirm the replacement folds NAs into the
# existing "Other" level before switching.
df <- dplyr::mutate(
  hymns,
  author = forcats::fct_explicit_na(
    forcats::fct_lump_n(forcats::fct_infreq(author), n = 1),
    na_level = "Other"
  )
)

# Danish stop words as a one-column tibble; the column is named `word` so it
# can be joined against a table of tokens that uses the same column name.
stopda <- dplyr::tibble(word = stopwords::stopwords("da"))

# Word counts per author with tidylo's log odds, plus hand-computed comparison
# columns (`p`, `odds`, `log_odds3`, `log_odds2`) built from add-one-adjusted
# counts.
toks <- df %>%
  tidytext::unnest_tokens(word, text) %>%
  dplyr::count(author, word, sort = TRUE) %>%
  # anti_join(stopda) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  # With unweighted = TRUE tidylo is asked to return its unweighted log odds
  # alongside the weighted one.
  tidylo::bind_log_odds(author, word, n, unweighted = TRUE) %>%
  # Sum of `n` over every row (assumes the table is ungrouped at this point).
  dplyr::mutate(sum_n = sum(n)) %>%
  # Sum of `n` within each author.
  dplyr::group_by(author) %>%
  dplyr::mutate(sum_author = sum(n)) %>%
  # Sum of `n` for each word across authors.
  dplyr::group_by(word) %>%
  dplyr::mutate(sum_word = sum(n)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    # Share of the author's tokens taken by this word, with +1 on both counts.
    p = (n + 1) / (sum_author + 1),
    odds = p / (1 - p),
    log_odds3 = log(odds),
    # Log ratio of the word's share for this author to its share among all
    # remaining rows, each count again offset by +1.
    log_odds2 = log(
      ((n + 1) / (sum_author + 1)) /
        ((sum_word - n + 1) / (sum_n - sum_author + 1))
    )
  )

# Bar chart of the highest weighted-log-odds words for each author, one facet
# per author. top_n() keeps ties, so a facet may show more than ten words.
ggplot2::ggplot(
  data = toks %>%
    dplyr::group_by(author) %>%
    dplyr::top_n(10, log_odds_weighted) %>%
    dplyr::ungroup() %>%
    # Order the word factor by its log odds so bars are drawn in sorted order.
    dplyr::mutate(word = forcats::fct_reorder(word, log_odds_weighted)),
  mapping = ggplot2::aes(x = word, y = log_odds_weighted, fill = author)
) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(~ author, scales = "free_y") +
  # Flip so the words run down the vertical axis.
  ggplot2::coord_flip() +
  # No padding on the value axis.
  ggplot2::scale_y_continuous(expand = c(0, 0)) +
  ggplot2::labs(
    title = "What are the most characteristic words for each writer?",
    x = NULL,
    y = "Log odds ratio, weighted by uninformative Dirichlet prior"
  )