# knitr chunk defaults for this document: prefix printed output with "#>" and
# merge each chunk's source and output into a single block.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# Attach the package this document demonstrates.
# NOTE(review): later chunks use `%>%` without attaching magrittr or dplyr;
# presumably salmer re-exports the pipe -- confirm.
library(salmer)
# ggplot2::theme_set(ggplot2::theme_minimal())

# Auto-print the object named `salmer` (presumably exported by the package of
# the same name -- verify against the package contents).
salmer
# Working copy of `hymns` (presumably supplied by the salmer package -- TODO
# confirm) with `author` recoded as a factor:
#   1. order levels by frequency,
#   2. keep only the most frequent level (ties may keep more) and lump the
#      rest into forcats' default "Other" level,
#   3. fold missing authors into that same "Other" level.
df <- dplyr::mutate(
  hymns,
  author = forcats::fct_explicit_na(
    forcats::fct_lump_n(forcats::fct_infreq(author), n = 1),
    na_level = "Other"
  )
)
# Danish stop words as a one-column tibble. Only the disabled anti_join() step
# below refers to it, so stop words currently stay in the counts.
stopda <- dplyr::tibble(word = stopwords::stopwords(language = "da"))

# Per-author word counts plus several log-odds style scores.
#
# Starting from one row per (author, word) with its count `n`, this adds:
#   - the columns produced by tidylo::bind_log_odds() (with `unweighted = TRUE`)
#   - sum_n      : sum of n over the whole table (assumes bind_log_odds()
#                  hands back ungrouped data -- TODO confirm)
#   - sum_author : sum of n within each author
#   - sum_word   : sum of n within each word
#   - p          : (n + 1) / (sum_author + 1), the add-one smoothed share of
#                  the author's tokens taken by this word
#   - odds       : p / (1 - p)
#   - log_odds3  : log(odds)
#   - log_odds2  : log of the word's smoothed share for this author divided by
#                  its smoothed share among all remaining tokens
#
# Each pipeline step sits on its own line so the commented-out anti_join()
# disables only itself instead of the rest of the chain.
toks <- df %>%
  tidytext::unnest_tokens(word, text) %>%
  dplyr::count(author, word, sort = TRUE) %>%
  # anti_join(stopda) %>%
  tidylo::bind_log_odds(author, word, n, unweighted = TRUE) %>%
  dplyr::mutate(sum_n = sum(n)) %>%
  dplyr::group_by(author) %>%
  dplyr::mutate(sum_author = sum(n)) %>%
  # group_by() replaces the previous grouping, so this is per word only.
  dplyr::group_by(word) %>%
  dplyr::mutate(sum_word = sum(n)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    p = (n + 1) / (sum_author + 1),
    odds = p / (1 - p),
    log_odds3 = log(odds),
    log_odds2 = log(
      ((n + 1) / (sum_author + 1)) /
        ((sum_word - n + 1) / (sum_n - sum_author + 1))
    )
  )
# Faceted bar chart of the ten words with the highest `log_odds_weighted`
# per author (top_n() keeps ties, so a facet can show more than ten bars).
# NOTE(review): the y-axis label mentions an uninformative Dirichlet prior;
# confirm that matches how `log_odds_weighted` was computed upstream.
toks %>%
  dplyr::group_by(author) %>%
  dplyr::top_n(n = 10, wt = log_odds_weighted) %>%
  dplyr::ungroup() %>%
  # Reorder the word factor by score so bars are sorted within the plot.
  dplyr::mutate(
    word = forcats::fct_reorder(.f = word, .x = log_odds_weighted)
  ) %>%
  ggplot2::ggplot(
    mapping = ggplot2::aes(x = word, y = log_odds_weighted, fill = author)
  ) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(facets = ~ author, scales = "free_y") +
  # Flip so the words run down the vertical axis and bars extend sideways.
  ggplot2::coord_flip() +
  ggplot2::scale_y_continuous(expand = c(0, 0)) +
  ggplot2::labs(
    title = "What are the most characteristic words for each writer?",
    x = NULL,
    y = "Log odds ratio, weighted by uninformative Dirichlet prior"
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.