# TODO: include table names and descriptions
# Values "..C" for "confidential" and "..." for "not applicable" are both
# imported as `NA`
library(tidyverse)
library(stringr)
library(tidyxl)
library(unpivotr)
library(here)
book_path <- here("inst", "extdata", "2013-census-electorate-tables.xlsx")
out_path <- here("inst", "extdata", "nzcensus2013.tsv")
book <- tidy_xlsx(book_path)
# Combine all sheets into one data frame (the new tidyxl api will do this)
sheets <- bind_rows(book$data, .id = "sheet")
formats <- book$formats
# Omit the first sheet
datasheets <- filter(sheets, sheet != "Contents")
# Formatting vectors for lookup
bottom_border_ids <- which(!is.na(formats$local$border$bottom$style))
top_border_ids <- which(!is.na(formats$local$border$top$style))
indents <- formats$local$alignment$indent
# Functions to help locate the top-left data cell
get_top_left_header_cell <- function(cells, top_border_ids) {
# First cell in col 1 that has a top border
cells %>%
filter(col == 1, local_format_id %in% top_border_ids) %>%
top_n(1, desc(row))
}
get_top_left_data_cell <- function(cells, top_left_header_cell) {
# From the top-left header cell, go down until the first non-blank cell (which
# is the first row header), then right one column. That cell might be blank,
# but it definitely is not a header.
top_left_header_cell %>%
offset_S(cells, boundary = ~ !is.na(character), include = TRUE) %>%
offset_E(cells, 1)
}
get_bottom_cell <- function(cells, bottom_border_ids) {
# The final cell in column 1 that has a bottom border
cells %>%
filter(col == 1, local_format_id %in% bottom_border_ids) %>%
arrange(desc(row)) %>%
slice(1)
}
# Functions to split headers into levels
get_row_headers <- function(cells, top_left_header_cell, bottom_cell) {
# Row headers are all in the same column, so use the amount of indentation as
# the hierarchy
out <-
cells %>%
filter(col == 1,
between(row,
top_left_header_cell$row + 1,
bottom_cell$row),
!is.na(character)) %>%
mutate(indent = indents[local_format_id]) %>%
select(row, col, indent, header = character) %>%
mutate(indent = if_else(is.na(indent), 0L, indent)) %>%
split(.$indent) %>%
map(~ select(.x, -indent))
c(list(select(top_left_header_cell, row, col, header = character)), out)
}
get_col_headers <- function(cells, top_left_header_cell, top_left_data_cell) {
# Column header hierarchy is given by the row number
headers <-
cells %>%
filter(col >= 2,
between(row,
top_left_header_cell$row,
top_left_data_cell$row - 1),
!is.na(character)) %>%
select(row, col, header = character)
# Fill down, fill right, and split by row into separate data frames
headers %>%
expand(row, col) %>%
left_join(headers, by = c("row", "col")) %>%
group_by(col) %>%
arrange(col, row) %>%
fill(header) %>%
group_by(row) %>%
arrange(row, col) %>%
fill(header) %>%
ungroup() %>%
split(.$row)
}
# Function to find the data cells
get_data_cells <- function(cells,
top_left_data_cell,
bottom_cell,
row_headers,
col_headers) {
# Data cells are at the intersection of row and column headers
cells %>%
filter(col >= 2,
between(row,
top_left_data_cell$row,
bottom_cell$row)) %>%
semi_join(bind_rows(row_headers), by = "row") %>%
semi_join(bind_rows(col_headers), by = "col") %>%
filter(!is.na(numeric) | character %in% c("..C", "...")) %>%
select(row, col, value = numeric)
}
# Function to match data cells to their nearest headers in each tier
match_headers <- function(datacells, row_headers, col_headers) {
out <- datacells
for (i in seq_along(row_headers)) {
out <- WNW(out, row_headers[[i]])
}
for (i in seq_along(col_headers)) {
out <- NNW(out, col_headers[[i]])
}
new_colnames <-
c("row", "col", "value",
paste0("col_header", rev(seq_along(col_headers))),
paste0("row_header", seq_along(row_headers)))
datacells <- arrange(datacells, row, col)
colnames(out) <- new_colnames
out
}
# Function to import the table on a worksheet
tidy_data <- function(cells) {
top_left_header_cell <- get_top_left_header_cell(cells, top_border_ids)
top_left_data_cell <- get_top_left_data_cell(cells, top_left_header_cell)
bottom_cell <- get_bottom_cell(cells, bottom_border_ids)
row_headers <- get_row_headers(cells, top_left_header_cell, bottom_cell)
col_headers <- get_col_headers(cells, top_left_header_cell, top_left_data_cell)
data_cells <- get_data_cells(cells, top_left_data_cell, bottom_cell, row_headers, col_headers)
out <- match_headers(data_cells, row_headers, col_headers)
out$title <- filter(cells, row == 3, col == 1)$character
out$dimensions <- filter(cells, row == 4, col == 1)$character
out$population <- filter(cells, row == 5, col == 1)$character
out
}
# Import all sheets
nzcensus2013 <-
datasheets %>%
nest(-sheet) %>%
mutate(tidy_data = map(data, tidy_data)) %>%
select(sheet, tidy_data) %>%
unnest() %>%
# Sort the columns by name, apart from sheet, row, col and value
select(sort(colnames(.))) %>%
select(sheet, title, dimensions, population, row, col, value, everything())
devtools::use_data(nzcensus2013, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.