knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>", 
  cache = TRUE,
  warning = FALSE, 
  fig.width = 8,
  fig.height = 6
)

PHE publishes a range of information at https://www.gov.uk/phe, including blogs, statistics and other publications.

This vignette introduces myScrapers, an internal PHE R package with tools to extract published public health information from sources including GOV.UK, the PHE blog and NHS Digital.

The functions used in this vignette include get_dsph_england(), get_page_links(), get_page_text(), get_page_csvs(), get_phe_catalogue() and create_bigrams().

These tools are intended to support text mining and analysis of published content, to assist in developing outputs such as product and service catalogues, and to make it easier to identify public health content on relevant websites.

The functions are assembled as an R package, currently available on GitHub at https://github.com/julianflowers/myScrapers or on GitLab at https://gitlab.phe.gov.uk/packges/myScrapers

Getting started

The first step is to install the package. This can be achieved with the code below:

library(devtools)
library(pacman)

## install myScrapers from GitHub if it is not already available
if (!require("myScrapers")) install_github("julianflowers/myScrapers", build_vignettes = TRUE)

## load the packages used in this vignette
p_load(myScrapers, tidyverse, tidytext, quanteda, Rcrawler)

Get a list of Directors of Public Health

This is easily achieved with the get_dsph_england() function.

dsph <- get_dsph_england()

head(dsph)

PHE Blogs

The URL for PHE blogs is "https://publichealthmatters.blog.gov.uk".

We can obtain a list of blog topic categories and authors by looping over the blog's index pages (89 at the time of writing) and extracting the links on each page.

url <- "https://publichealthmatters.blog.gov.uk"
no_pages <- 89
url1 <- paste0(url,  "/page/", 2:no_pages)
urls <- c(url, url1)

links <- purrr::map(urls, get_page_links ) ## loops over all urls to extract links

## extract categories
category <- links %>%
  unlist() %>%
  data.frame() %>%
  mutate(cat = str_detect(., "category"), 
         category = ifelse(cat == TRUE, str_extract(., "category/.*"), "NA"), 
         category = str_remove_all(category, "/$") ,
         category = str_remove(category, "category/")) %>%
  pull(category) %>%
  unique()


## extract authors
author <- links %>%
  unlist() %>%
  data.frame() %>%
  mutate(cat = str_detect(., "author"), 
         author = ifelse(cat == TRUE, str_extract(., "author/.*"), "NA"), 
         author = str_remove_all(author, "author/") ,
         author = str_remove(author, "/")) %>%
  pull(author) %>%
  unique()

author %>% knitr::kable()

Getting blogs for a given author

Let's try Isabel Oliver:

url_author <- paste0("https://publichealthmatters.blog.gov.uk/author/", author)

## iterate over links to extract blog text

url_author %>%
  .[grepl("isabel", .)] %>%        ## keep the author page for Isabel Oliver
  get_page_links(.) %>%            ## extract all links from the author page
  .[c(13, 15)] %>%                 ## select two blog post links by position
  purrr::map(., get_page_text)     ## extract the text of each post

Blog text for a blog category

We can extract blog posts for any given category, e.g. data-blog.

url_cat <- paste0(url, "/category/", category)

data_blog <- url_cat %>%
  .[grepl("data-blog", .)] %>%  ## filter data blogs
   get_page_links(.) %>%
  .[grepl(paste0(url, "/"),.)] %>% ## clean up
  .[!grepl("author|category",.)] %>%
  .[-c(22:29)] %>% ## drop a block of unwanted links by position
  .[!grepl("#comments",.)] %>% ## remove comments
  purrr::map(., get_page_text)

titles <- purrr::map(data_blog, 2) ## extract blog titles
names(data_blog) <- titles

## remove excess lines and add date of blog
data_blog_df <- data_blog %>%
  purrr::map(., data.frame) %>%
  purrr::map_df(., bind_rows, .id = "titles" ) %>%
  distinct() %>%
  group_by(titles) %>%
  mutate(row = row_number()) %>%
  dplyr::filter(row > 3, row < max(row)-3) %>%
  mutate(date = as.Date(str_extract(titles, "\\d{4}/\\d{2}/\\d{2}")))

Now we have a dataframe of the text of the data-blog posts, which we can analyse further, for example as wordclouds...

palette <- viridis::viridis(10)

data_blog_df %>%
  select(titles, text = .x..i..) %>%
  create_bigrams(text) %>%
  group_by(bigram) %>%
  count(sort=TRUE) %>%
  ungroup() %>%
  mutate(bigram = tm::removeNumbers(bigram)) %>%
  dplyr::filter(!bigram %in% c("public health", "we’re pleased", "past week", "phe’s online", "week here’s", "online activity", "friday messages", "wishes friday", "NA NA")) %>%
  with(., wordcloud::wordcloud(bigram, n, colors = palette, random.order = FALSE, random.color = FALSE, max.words = Inf, rot.per = 0.5, scale = c(5, 0.01)))

Duncan Selbie's Friday message (n = 90) wordcloud

n <- 7
url <- "https://publichealthmatters.blog.gov.uk/category/duncan-selbie-friday-message/"
url1 <- paste0(url, "page/", 2:n)
urls <- if (n > 1) c(url, url1) else url
#cat <- "local-authority-public-health/"
links <- purrr::map(urls, get_page_links)
links1 <-  links %>% purrr::flatten() %>% .[grepl("duncan-selbies-friday-message", .)] %>% unique()

blogs_ds <- purrr::map(links1, get_page_text)

titles <- purrr::map(blogs_ds, 2) ## extract blog titles
names(blogs_ds) <- titles

## remove excess lines
ds_blog_df <- blogs_ds %>%
  purrr::map(., data.frame) %>%
  purrr::map_df(., bind_rows, .id = "titles" ) %>%
  distinct() %>%
  group_by(titles) %>%
  mutate(row = row_number()) %>%
  dplyr::filter(row>3, row < max(row)-3) %>%
  mutate(date = as.Date(str_extract(titles, "\\d{4}/\\d{2}/\\d{2}")))


palette <- viridis::viridis(5, direction = -1)

ds_blog_df %>%
  create_bigrams(.x..i..) %>%
  group_by(bigram) %>%
  count(sort=TRUE) %>%
  ungroup() %>%
  mutate(bigram = tm::removeNumbers(bigram)) %>%
  dplyr::filter(!bigram %in% c("public health", "we’re pleased", "past week", "phe’s online", "week here’s", "online activity", "friday messages", "wishes friday", "NA NA")) %>%
  with(., wordcloud::wordcloud(bigram, n, colors = palette, random.order = FALSE, random.color = FALSE, max.words = 500, rot.per = 0.5, scale = c(3, 0.01)))

PHE publications

It's hard to get everything PHE has published on GOV.UK. To assist with this, we have written a function which produces an interactive table of all PHE publications by category (NB at the moment it is over-inclusive). This makes use of the DT package, which allows us to add download options so the data can be exported in various formats.

phe_cat <- myScrapers::get_phe_catalogue(url = "https://www.gov.uk/government/publications?departments%5B%5D=public-health-england", n = 103)

phe_cat
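
get_phe_catalogue() produces the interactive table directly. If you need to customise the table yourself, for example to add specific export buttons, the DT Buttons extension can be applied to the underlying data. A minimal sketch, assuming the catalogue is available as a plain data frame called phe_cat_df (a hypothetical name, not part of myScrapers):

## a minimal sketch: `phe_cat_df` is assumed to be a plain data frame of the catalogue
library(DT)

datatable(
  phe_cat_df,
  extensions = "Buttons",                 ## enable the export buttons extension
  filter = "top",                         ## column filters for browsing by category
  options = list(
    dom = "Bfrtip",                       ## place the buttons above the table
    buttons = c("copy", "csv", "excel")   ## export formats offered to the user
  )
)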

Identifying spreadsheets on NHS Digital

The principles outlined in this vignette can be applied to other sites. We can try to identify data on the NHS Digital website.

## create list of links
urls_nhs <- paste0("https://digital.nhs.uk/search/document-type/publication/publicationStatus/true?area=data&sort=date&r25_r2:page=", 1:10, "&r25_r2:pageSize=10")

nhsd_links <- purrr::map(urls_nhs, get_page_links)

The relevant links contain "data-and-information", so we will filter for these.

nhsd_links <- nhsd_links %>%
  purrr::flatten() %>%                ## creates a single list
  .[grepl("data-and-information", .)]

We now have a list of more than 100 pages. It needs further filtering, and we need to add the root URL back to the links to create full URLs for further searching. This gives more than 200 spreadsheet links which can be downloaded.

nhsd_links %>%
  .[!grepl("^/search", .)] %>%                                ## drop search navigation links
  .[!grepl("information$", .)] %>%                            ## drop links ending in "information"
  purrr::map(., ~(paste0("https://digital.nhs.uk", .x))) %>%  ## add the root URL back
  purrr::map(., get_page_csvs) %>%                            ## extract spreadsheet links from each page
  compact() %>%                                               ## remove pages with no spreadsheets
  purrr::flatten()
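
The pipeline above prints the spreadsheet links; to save the files locally you can assign the result and download each one with base R. A minimal sketch, assuming the flattened links have been assigned to a character vector called nhsd_files (a hypothetical name):

## a minimal sketch: `nhsd_files` is assumed to hold the file URLs produced above
dir.create("nhsd_downloads", showWarnings = FALSE)   ## local folder for the downloads

nhsd_files %>%
  purrr::walk(~ download.file(.x,
                              destfile = file.path("nhsd_downloads", basename(.x)),
                              mode = "wb"))           ## "wb" keeps binary spreadsheet files intact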

