library("here")
library("readxl")
library("rvest")
library("tidyr")
library("dplyr")
library("lubridate")
library("ggplot2")
library("purrr")
library("janitor")
library("socialmixr")
library("readr")
source(here::here("data-raw", "extract_publication_dates.r"))
## create directory for antibody data if it doesn't exist
ab_dir <- here::here("data-processed", "ab")
dir.create(ab_dir, showWarnings = FALSE, recursive = TRUE)
## creata URLs that list spreadsheets
years <- c(2020 + seq(1, 3))
urls <- paste0("https://www.ons.gov.uk/peoplepopulationandcommunity/",
"healthandsocialcare/conditionsanddiseases/datasets/",
"/coronaviruscovid19antibodydatafortheuk/", years)
## get URLs of the spreadsheets, scraped from the web pages
file_urls <- lapply(urls, function(url) {
session <- session(url)
file_url <- session %>%
html_nodes(xpath = paste0(
"//a[contains(concat(' ', ",
"normalize-space(@class),' '),' btn--primary ')]"
)) %>%
html_attr("href")
return(file_url)
}) %>%
unlist() %>%
grep("\\.xlsx?$", value = TRUE, .)
## construct tibble with files to download
df_dl <- tibble(file_url = file_urls) %>%
mutate(file_name = sub("^.*/([^/]+)$", "\\1", file_url),
file_path = file.path(ab_dir, file_name),
full_url = paste0("https://www.ons.gov.uk", file_url)) %>%
filter(!file.exists(file_path))
## define levels to extract and table structure
columns <- c(national = 5, regional = 6, age_school = 6)
super_headers <- c(regional = "region", age_school = "lower_age_limit")
threshold_levels <- c(standard = "", higher = "higher threshold")
threshold_ng_ml <- c(standard = 179, higher = 800)
## if no new URLs there is nothing to do
if (nrow(df_dl) > 0) {
df_dl <- df_dl %>%
rowwise() %>%
mutate(ret = download.file(full_url, file_path)) %>%
ungroup()
if (any(df_dl$ret != 0)) warning("Some downloads failed")
}
## list all files
files <- list.files(here::here("data-processed", "ab"), full.names = TRUE)
list_file <- here::here("data-processed", "ab_files.rds")
## define geography codes not in data
geography_codes <- c(England = "E92000001",
`North East` = "E12000001",
`North West` = "E12000002",
`Yorkshire and The Humber` = "E12000003",
`East Midlands` = "E12000004",
`West Midlands` = "E12000005",
`East of England` = "E12000006",
`London` = "E12000007",
`South East` = "E12000008",
`South West` = "E12000009")
if (file.exists(list_file) && setequal(files, readRDS(list_file))) {
warning("Nothing new to extract")
}
## construct list of data frames with positivity
ab <- list()
for (threshold_level in names(threshold_levels)) {
message(threshold_level)
for (level in names(columns)) {
message(" ", level)
full_level <- paste(threshold_level, level, sep = "_")
ab[[full_level]] <- lapply(files, function(x) {
message(" ", x)
## first, get table of contents sheet to work out which sheet we want
contents_sheet <- suppressMessages(read_excel(x, sheet = "Contents")) %>%
clean_names()
contents_sheet <- contents_sheet %>%
filter(grepl(threshold_levels[threshold_level], contents))
if (level == "national") {
contents_sheet <- contents_sheet %>%
filter(grepl("Antibodies", contents)) %>%
head(n = 1)
} else if (level == "regional") {
contents_sheet <- contents_sheet %>%
filter(grepl("by region", contents)) %>%
head(n = 1)
} else if (level == "age_school") {
contents_sheet <- contents_sheet %>%
filter(grepl("by age group$", contents)) %>%
head(n = 1)
} else {
stop("Unknown level: ", level)
}
## extract table number
sheet <- sub("^Table ([^ ]+) ?- .*$", "\\1", contents_sheet$contents)
if (length(sheet) == 1) {
## we found the sheet, now we get a preview so we can work out where in
## the sheet the actual table is
preview <- suppressMessages(read_excel(x, sheet = sheet)) %>%
remove_empty("cols") %>%
clean_names()
## get row that contains the headers
headers_row <- min(which(!is.na(preview[, 2])))
## add 1 if not a true header
if (sum(!is.na(unlist(preview[headers_row, ]))) == 1) {
headers_row <- headers_row + 1
}
skip <- which(preview$contents == "Weekly period")
if (level %in% c("regional", "age_school")) {
headers <- preview[headers_row, 2:ncol(preview)] %>%
t() %>%
as_tibble(.name_repair = "minimal") %>%
rename(header = 1) %>%
fill(header)
if (level == "age_school") {
headers <- headers %>%
mutate(header =
sub("^Age ([0-9]+) *- *([0-9]+).*$", "\\1|\\2", header),
header =
sub("^Age ([0-9]+)(\\+| years and over)", "\\1|", header)) %>%
separate(header, c("from", "to"), sep = "\\|")
}
}
## having figured out where the table is and extracted the date,
## read the table
if (is.infinite(skip)) return(NULL) ## couldn't find data
## find max row to read
n_max <-
min(which(!grepl("^[0-9]",
unlist(preview[(skip + 1):nrow(preview), 1])))) - 1
data <- suppressMessages(read_excel(x, sheet = sheet, skip = skip,
n_max = n_max, .name_repair = "minimal")) %>%
remove_empty("cols") %>%
clean_names()
if (ncol(data) == 0) return(NULL)
## remove trailing numbers
colnames(data) <-
c("weekly_period",
sub("(_[0-9]+)+$", "", colnames(data)[2:ncol(data)]))
if (level == c("regional")) {
colnames(data)[2:ncol(data)] <-
paste(colnames(data)[2:ncol(data)], headers$header, sep = "|")
} else if (level == "age_school") {
colnames(data)[2:ncol(data)] <-
paste(colnames(data)[2:ncol(data)], headers$from, sep = "|")
} else if (level == "national" && any(grepl("ng_ml", colnames(data)))) {
remove_cols <- unlist(lapply(
which(grepl("ng_ml", colnames(data)) &
!grepl(threshold_ng_ml[threshold_level], colnames(data))),
function(x) x + 0:2
))
if (length(remove_cols) > 0) data <- data[, -remove_cols]
}
data <- data[, !duplicated(colnames(data))]
data <- data %>%
select(!matches("^number"))
data <- data %>%
pivot_longer(2:ncol(.))
if (level %in% names(super_headers)) {
data <- data %>%
separate(name, c("name", super_headers[level]), sep = "\\|")
}
data <- data %>%
pivot_wider() %>%
separate(weekly_period, c("start_date", "end_date"), sep = " to ") %>%
mutate(start_date = dmy(start_date),
end_date = dmy(end_date))
data <- data %>%
select(1:columns[level]) %>%
rename(proportion_pos = columns[[level]] - 2,
proportion_pos_low_95 = columns[[level]] - 1,
proportion_pos_high_95 = columns[[level]]) %>%
mutate_at(vars(starts_with("proportion")), as.numeric) %>%
pivot_longer(starts_with("proportion")) %>%
replace_na(list(value = 0)) %>%
pivot_wider()
data <- data %>%
mutate(threshold_level = {{ threshold_level }})
if (level %in% c("national", "regional", "age_school")) {
if (level %in% c("national", "age_school")) {
data <- data %>%
mutate(region = NA_character_,
geography = "England")
} else if (level == "regional") {
data <- data %>%
mutate(region = sub("the Humber", "The Humber", region)) %>%
mutate(geography = region)
}
data <- data %>%
mutate(geography_code = geography_codes[geography])
}
if (level == "age_school") {
data <- data %>%
mutate(lower_age_limit = as.integer(lower_age_limit)) %>%
select(start_date, end_date, geography, geography_code,
lower_age_limit, threshold_level, starts_with("proportion"))
} else {
data <- data %>%
select(start_date, end_date, geography, geography_code,
threshold_level, starts_with("proportion"))
}
return(data %>%
mutate(file_name = x))
} else {
return(NULL)
}
})
ab[[full_level]] <- ab[[full_level]] %>%
bind_rows() %>%
distinct() %>% ## avoid duplicate rows
mutate(level = level)
}
}
## combine it all into one data frame
combined <- ab %>%
bind_rows() %>%
mutate(publication_date = extract_publication_dates(file_name)) %>%
arrange(publication_date, start_date) %>%
pivot_longer(starts_with("proportion_")) %>%
mutate(value =
if_else(publication_date %in% as.Date(c("2021-06-09", "2021-06-22")),
value, value / 100)) %>%
pivot_wider()
pop_file <- here::here("data-raw", "uk_pop.xls")
if (!file.exists(pop_file)) {
download.file("https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fpopulationandmigration%2fpopulationestimates%2fdatasets%2fpopulationestimatesforukenglandandwalesscotlandandnorthernireland%2fmid2020/ukpopestimatesmid2020on2021geography.xls", destfile = pop_file) # nolint
}
pop <- suppressMessages(read_excel(pop_file, sheet = "MYE2 - Persons", skip = 7)) %>%
clean_names()
pop_geo <- pop %>%
mutate(all_caps_geography = sub("[^a-zA-Z]*$", "", toupper(name))) %>%
select(all_caps_geography, population = all_ages) %>%
mutate(
all_caps_geography =
recode(all_caps_geography, EAST = "EAST OF ENGLAND")
)
pop_age <- pop %>%
filter(name %in% c("ENGLAND", "SCOTLAND",
"WALES", "NORTHERN IRELAND")) %>%
select(name, starts_with("x")) %>%
pivot_longer(starts_with("x"), names_to = "lower_age_limit") %>%
mutate(
lower_age_limit = as.integer(sub("^x", "", lower_age_limit)),
lower_age_limit =
reduce_agegroups(
lower_age_limit,
unique(na.omit(combined$lower_age_limit))
)
) %>%
filter(!is.na(lower_age_limit)) %>%
group_by(all_caps_geography = name, lower_age_limit) %>%
summarise(age_population = sum(value), .groups = "drop")
populations <- combined %>%
mutate(all_caps_geography = toupper(geography)) %>%
left_join(pop_geo, by = "all_caps_geography") %>%
left_join(pop_age, by = c("lower_age_limit", "all_caps_geography")) %>%
mutate(
population = if_else(!is.na(age_population),
age_population, population)
) %>%
select(level, lower_age_limit, geography, geography_code, population) %>%
distinct()
## save
write_csv(combined %>%
filter(level %in% c("national", "regional")) %>%
remove_empty(which = "cols"),
here::here("data-processed", "ab.csv"))
write_csv(combined %>%
filter(level %in% c("age_school")) %>%
remove_empty(which = "cols"),
here::here("data-processed", "ab_age.csv"))
write_csv(populations, here::here("data-processed", "populations_ab.csv"))
saveRDS(files, list_file)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.