HTML Tables
In unpivotr: Unpivot Complex and Irregular Data Layouts

This vignette for the unpivotr package demonstrates unpivoting html tables of various kinds.

The HTML files are in the package directory at system.file("extdata", c("rowspan.html", "colspan.html", "row-and-colspan.html", "nested.html", "url.html"), package = "unpivotr").

library(dplyr)
library(rvest)
library(htmltools)
library(unpivotr)

Rowspan and colspan examples

If a table has cells merged across rows or columns (or both), then as_cells() does not attempt to fill the cell contents across the rows or columns. This is different from other packages, e.g. rvest. However, if merged cells cause a table not to be square, then as_cells() pads the missing cells with blanks.

Rowspan

# Locate the rowspan example shipped with unpivotr and embed it in the page
rowspan <- system.file("extdata", "rowspan.html", package = "unpivotr")
includeHTML(rowspan)

# rvest
html_table(read_html(rowspan))

# unpivotr
as_cells(read_html(rowspan))

Colspan

# Locate the colspan example shipped with unpivotr and embed it in the page
colspan <- system.file("extdata", "colspan.html", package = "unpivotr")
includeHTML(colspan)

# rvest
html_table(read_html(colspan))

# unpivotr
as_cells(read_html(colspan))

Both rowspan and colspan: non-square

# Locate the combined rowspan/colspan example and embed it in the page
rowandcolspan <- system.file(
  "extdata", "row-and-colspan.html",
  package = "unpivotr"
)
includeHTML(rowandcolspan)

# rvest
html_table(read_html(rowandcolspan))

# unpivotr
as_cells(read_html(rowandcolspan))

Nested example

as_cells() never descends into cells. If there is a table inside a cell, then to parse that table, call read_html() and as_cells() again on the html of that cell.

# Locate the nested-table example shipped with unpivotr and embed it in the page
nested <- system.file("extdata", "nested.html", package = "unpivotr")
includeHTML(nested)

# rvest parses both tables
html_table(read_html(nested), fill = TRUE)

# unpivotr: keep the first element of what as_cells() returns for the page
nested_tables <- as_cells(read_html(nested))
x <- nested_tables[[1]]
x

# The html of the table inside a cell
inner_cell <- dplyr::filter(x, row == 2, col == 2)
cell <- inner_cell$html
cell

# Parsing the table inside the cell
as_cells(read_html(cell))

URL example

A motivation for using unpivotr::as_cells() is that it extracts more than just text -- it can extract whatever part of the HTML you need.

Here, we extract URLs.

# Locate the example file containing links and embed it in the page
urls <- system.file(
  "extdata", "url.html",
  package = "unpivotr"
)
includeHTML(urls)

# Extract the `href` attribute of every <a> element in one cell's html.
#
# x: a single html string (one value of the `html` column), or NA.
#
# Returns a character vector with one element per <a> element found.
# When `x` is NA the result is NA_character_ rather than a logical NA, so
# the function returns character on every path and its results combine
# without type coercion.
cell_url <- function(x) {
  # Nothing to parse when the cell has no html
  if (is.na(x)) return(NA_character_)
  x %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")
}

# Extract the text of every <a> element in one cell's html.
#
# x: a single html string (one value of the `html` column), or NA.
#
# Returns a character vector with one element per <a> element found.
# When `x` is NA the result is NA_character_ rather than a logical NA, so
# the function returns character on every path and its results combine
# without type coercion.
cell_text <- function(x) {
  # Nothing to parse when the cell has no html
  if (is.na(x)) return(NA_character_)
  x %>%
    read_html() %>%
    html_nodes("a") %>%
    html_text()
}

# Read the page, take the first element of what as_cells() returns, pull the
# link text and link target out of each cell's html into list-columns, then
# expand those list-columns so that every link gets its own row.
urls %>%
  read_html() %>%
  as_cells() %>%
  .[[1]] %>%
  mutate(text = purrr::map(html, cell_text),
         url = purrr::map(html, cell_url)) %>%
  # Name the list-columns through `cols`; passing them as bare arguments
  # is deprecated since tidyr 1.0.0 and triggers a warning.
  tidyr::unnest(cols = c(text, url))