knitr::opts_chunk$set(echo = FALSE, cache = TRUE, message = FALSE, warning = FALSE, collapse = TRUE)

library(pacman)
p_load(myScrapers, tidyverse, quanteda)
The first step is to create a list of terms to look up in documents. We can use the create_lookup
function to achieve this. You can use * as a wildcard and group search terms into named categories.
lookups <- create_lookup(
  phe = c("phe", "public_health*"),
  cancer = "cancer*",
  `mental health` = "mental*",
  profile = "profile*",
  fingertips = "fingertips*",
  phof = c("phof*", "outcome*_frame*"),
  local = "local_auth*",
  heart = c("cardi*", "heart*"),
  stp = "stp*",
  harlow = "harlow",
  gp = "gp*",
  surveillance = "surveil*",
  sugar = "sugar*",
  tobacco = c("e-cig*", "smok*")
)

lookups
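If create_lookup is not available, a dictionary built directly with quanteda should give a comparable lookup object; the sketch below assumes create_lookup is essentially a wrapper around quanteda::dictionary().

# Sketch: an equivalent lookup built directly with quanteda
# (assumption: create_lookup() wraps quanteda::dictionary())
lookups_q <- quanteda::dictionary(list(
  phe = c("phe", "public_health*"),
  cancer = "cancer*",
  `mental health` = "mental*"
  # ... remaining categories as above
))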
We'll use Duncan Selbie's Friday messages, which we can extract and clean as follows using the get_page_links and get_page_text functions.
library(magrittr)

url_ds <- "https://publichealthmatters.blog.gov.uk/category/duncan-selbie-friday-message/"
url_ds1 <- paste0(url_ds, "page/", 2:8)
urls_ds <- c(url_ds, url_ds1)

links <- purrr::map(urls_ds, ~ get_page_links(.x))

friday_message <- links %>%
  purrr::flatten() %>%
  .[grepl("duncan-selbies-friday-message", .)] %>%
  .[!grepl("comments", .)] %>%
  unique()
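get_page_links comes from myScrapers; if it isn't available, a rough equivalent can be sketched with rvest, assuming the function simply collects the href attributes of a page's anchor tags (the helper name below is hypothetical).

# Sketch of a minimal link scraper, assuming get_page_links()
# just returns every href found on a page
get_page_links_sketch <- function(url) {
  rvest::read_html(url) %>%
    rvest::html_elements("a") %>%
    rvest::html_attr("href")
}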
Next we count how often each lookup term occurs in each post with get_blog_term_frequency and plot the counts as a heat map.

results <- get_blog_term_frequency(l = friday_message, d = lookups, n = 2)

head(results, 10)

results %>%
  data.frame() %>%
  gather(term, count, 2:ncol(.)) %>%
  ggplot(aes(term, rev(forcats::fct_inorder(document)), fill = log(count + .0005))) +
  geom_tile() +
  viridis::scale_fill_viridis(option = "B") +
  coord_flip() +
  scale_y_discrete(position = "bottom") +
  theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 0),
        axis.text.y = element_text(size = 12))
We can calculate a mention rate - the proportion of posts that mention each search term at least once.
results %>%
  data.frame() %>%
  gather(term, count, 2:ncol(.)) %>%
  group_by(term) %>%
  mutate(mention = ifelse(count > 0, 1, 0)) %>%
  summarise(`mention rate` = round(100 * mean(mention), 1)) %>%
  arrange(desc(`mention rate`))
The same approach can be applied to local files, for example a folder of PDF reports, using the get_text_term_frequency function.

here::set_here()
here::here()

source("../R/get_text_term_frequency.R")

files <- list.files(path = here::here(), pattern = "pdf")
head(files)

test3 <- get_text_term_frequency(l = files, d = lookups, 2)
test3
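get_text_term_frequency is sourced from a local script, so its internals aren't shown here. A rough sketch of the likely core step, assuming it reads each PDF with pdftools and counts dictionary matches with quanteda (the real function may differ), is:

# Sketch only: count dictionary matches in a set of PDF files
# (assumes pdftools + quanteda; not the package's actual implementation)
count_pdf_terms <- function(files, dict) {
  texts <- purrr::map_chr(files, ~ paste(pdftools::pdf_text(.x), collapse = " "))
  corp  <- quanteda::corpus(texts, docnames = files)
  toks  <- quanteda::tokens(corp)
  quanteda::dfm(quanteda::tokens_lookup(toks, dictionary = dict))
}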