knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>", 
  cache = TRUE,
  warning = FALSE, 
  fig.width = 8,
  fig.height = 6
)

PHE publishes a range of information at https://www.gov.uk/phe, including blogs, statistics and other publications.

This vignette introduces myScrapers, an internal PHE R package with tools to extract published public health information from sources including GOV.UK, the PHE blog and NHS Digital.

The functions used in this vignette include get_dsph_england(), get_page_links(), get_page_text(), get_page_csvs(), get_phe_catalogue() and create_bigrams().

These tools are intended to support text mining and analysis of published content, to assist in developing outputs such as product and service catalogues, and to make it easier to identify public health content on relevant websites.

The functions are assembled as an R package, currently available on GitHub at https://github.com/julianflowers/myScrapers or on GitLab at https://gitlab.phe.gov.uk/packges/myScrapers

Getting started

The first step is to install the package. This can be achieved with the code below:

library(devtools)
library(pacman)

## install myScrapers from GitHub if it is not already available
if (!require("myScrapers")) install_github("julianflowers/myScrapers", build_vignettes = TRUE)

## load the packages used in this vignette
p_load(myScrapers, tidyverse, tidytext, quanteda, Rcrawler)

Get a list of Directors of Public Health

This is easily achieved with the get_dsph_england() function.

dsph <- get_dsph_england()

head(dsph)

PHE Blogs

The URL for PHE blogs is "https://publichealthmatters.blog.gov.uk".

We can obtain a list of blog topic categories and authors by looping over the blog's index pages (89 at the time of writing) and extracting the links on each page.

url <- "https://publichealthmatters.blog.gov.uk"
no_pages <- 89
url1 <- paste0(url,  "/page/", 2:no_pages)
urls <- c(url, url1)

links <- purrr::map(urls, get_page_links ) ## loops over all urls to extract links

## extract categories
category <- links %>%
  unlist() %>%
  data.frame() %>%
  mutate(cat = str_detect(., "category"), 
         category = ifelse(cat == TRUE, str_extract(., "category/.*"), "NA"), 
         category = str_remove_all(category, "/$") ,
         category = str_remove(category, "category/")) %>%
  pull(category) %>%
  unique()


## extract authors
author <- links %>%
  unlist() %>%
  data.frame() %>%
  mutate(cat = str_detect(., "author"), 
         author = ifelse(cat == TRUE, str_extract(., "author/.*"), "NA"), 
         author = str_remove_all(author, "author/") ,
         author = str_remove(author, "/")) %>%
  pull(author) %>%
  unique()

author %>% knitr::kable()

Getting blogs for a given author

Let's try Isabel Oliver:

url_author <- paste0("https://publichealthmatters.blog.gov.uk/author/", author)

## iterate over links to extract blog text

url_author %>%
  .[grepl("isabel", .)] %>%        ## keep the author page for Isabel Oliver
  get_page_links(.) %>%            ## extract all links from the author page
  .[c(13, 15)] %>%                 ## select two blog post links by position
  purrr::map(., get_page_text)     ## extract the text of each post

Blog text for a blog category

We can extract blog posts for any given category, e.g. data-blog.

url_cat <- paste0(url, "/category/", category)

data_blog <- url_cat %>%
  .[grepl("data-blog", .)] %>%  ## filter data blogs
   get_page_links(.) %>%
  .[grepl(paste0(url, "/"),.)] %>% ## clean up
  .[!grepl("author|category",.)] %>%
  .[-c(22:29)] %>% ## drop a block of unwanted links by position
  .[!grepl("#comments",.)] %>% ## remove comments
  purrr::map(., get_page_text)

titles <- purrr::map(data_blog, 2) ## extract blog titles
names(data_blog) <- titles

## remove excess lines and add date of blog
data_blog_df <- data_blog %>%
  purrr::map(., data.frame) %>%
  purrr::map_df(., bind_rows, .id = "titles" ) %>%
  distinct() %>%
  group_by(titles) %>%
  mutate(row = row_number()) %>%
  dplyr::filter(row > 3, row < max(row)-3) %>%
  mutate(date = as.Date(str_extract(titles, "\\d{4}/\\d{2}/\\d{2}")))

Now we have a dataframe of the text of the data-blog posts, which we can analyse further, for example as wordclouds...

palette <- viridis::viridis(10)

data_blog_df %>%
  select(titles, text = .x..i..) %>%
  create_bigrams(text) %>%
  group_by(bigram) %>%
  count(sort=TRUE) %>%
  ungroup() %>%
  mutate(bigram = tm::removeNumbers(bigram)) %>%
  dplyr::filter(!bigram %in% c("public health", "we’re pleased", "past week", "phe’s online", "week here’s", "online activity", "friday messages", "wishes friday", "NA NA")) %>%
  with(., wordcloud::wordcloud(bigram, n, colors = palette, random.order = FALSE, random.color = FALSE, max.words = Inf, rot.per = 0.5, scale = c(5, 0.01)))

Duncan Selbie's Friday message (n = 90) wordcloud

n <- 7
url <- "https://publichealthmatters.blog.gov.uk/category/duncan-selbie-friday-message/"
url1 <- paste0(url, "page/", 2:n)
urls <- if (n > 1) c(url, url1) else url
#cat <- "local-authority-public-health/"
links <- purrr::map(urls, get_page_links)
links1 <-  links %>% purrr::flatten() %>% .[grepl("duncan-selbies-friday-message", .)] %>% unique()

blogs_ds <- purrr::map(links1, get_page_text)

titles <- purrr::map(blogs_ds, 2) ## extract blog titles
names(blogs_ds) <- titles

## remove excess lines
ds_blog_df <- blogs_ds %>%
  purrr::map(., data.frame) %>%
  purrr::map_df(., bind_rows, .id = "titles" ) %>%
  distinct() %>%
  group_by(titles) %>%
  mutate(row = row_number()) %>%
  dplyr::filter(row>3, row < max(row)-3) %>%
  mutate(date = as.Date(str_extract(titles, "\\d{4}/\\d{2}/\\d{2}")))


palette <- viridis::viridis(5, direction = -1)

ds_blog_df %>%
  create_bigrams(.x..i..) %>%
  group_by(bigram) %>%
  count(sort=TRUE) %>%
  ungroup() %>%
  mutate(bigram = tm::removeNumbers(bigram)) %>%
  dplyr::filter(!bigram %in% c("public health", "we’re pleased", "past week", "phe’s online", "week here’s", "online activity", "friday messages", "wishes friday", "NA NA")) %>%
  with(., wordcloud::wordcloud(bigram, n, colors = palette, random.order = FALSE, random.color = FALSE, max.words = 500, rot.per = 0.5, scale = c(3, 0.01)))

PHE publications

It's hard to get everything PHE has published on GOV.UK. To assist with this, we have written a function which produces an interactive table of all PHE publications by category (NB at the moment it is over-inclusive). This makes use of the DT package, which allows us to add download options so the data can be exported in various formats.

phe_cat <- myScrapers::get_phe_catalogue(url = "https://www.gov.uk/government/publications?departments%5B%5D=public-health-england", n = 103)

phe_cat
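
get_phe_catalogue() produces the interactive table directly. If you need to customise the table yourself, for example to add specific export buttons, the DT Buttons extension can be applied to the underlying data. A minimal sketch, assuming the catalogue is available as a plain data frame called phe_cat_df (a hypothetical name, not part of myScrapers):

## a minimal sketch: `phe_cat_df` is assumed to be a plain data frame of the catalogue
library(DT)

datatable(
  phe_cat_df,
  extensions = "Buttons",                 ## enable the export buttons extension
  filter = "top",                         ## column filters for browsing by category
  options = list(
    dom = "Bfrtip",                       ## place the buttons above the table
    buttons = c("copy", "csv", "excel")   ## export formats offered to the user
  )
)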

Identifying spreadsheets on NHS Digital

The principles outlined in this vignette can be applied to other sites. We can try to identify data on the NHS Digital website.

## create list of links
urls_nhs <- paste0("https://digital.nhs.uk/search/document-type/publication/publicationStatus/true?area=data&sort=date&r25_r2:page=", 1:10, "&r25_r2:pageSize=10")

nhsd_links <- purrr::map(urls_nhs, get_page_links)

The relevant links contain "data-and-information", so we will filter for these.

nhsd_links <- nhsd_links %>%
  purrr::flatten() %>%                ## creates a single list
  .[grepl("data-and-information", .)]

We now have a list of more than 100 pages. It needs further filtering, and we need to add the root URL back to the links to create full URLs for further searching. This gives more than 200 spreadsheet links which can be downloaded.

nhsd_links %>%
  .[!grepl("^/search", .)] %>%                                ## drop search navigation links
  .[!grepl("information$", .)] %>%                            ## drop links ending in "information"
  purrr::map(., ~(paste0("https://digital.nhs.uk", .x))) %>%  ## add the root URL back
  purrr::map(., get_page_csvs) %>%                            ## extract spreadsheet links from each page
  compact() %>%                                               ## remove pages with no spreadsheets
  purrr::flatten()
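
The pipeline above prints the spreadsheet links; to save the files locally you can assign the result and download each one with base R. A minimal sketch, assuming the flattened links have been assigned to a character vector called nhsd_files (a hypothetical name):

## a minimal sketch: `nhsd_files` is assumed to hold the file URLs produced above
dir.create("nhsd_downloads", showWarnings = FALSE)   ## local folder for the downloads

nhsd_files %>%
  purrr::walk(~ download.file(.x,
                              destfile = file.path("nhsd_downloads", basename(.x)),
                              mode = "wb"))           ## "wb" keeps binary spreadsheet files intact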

