#' Flexibly parse HTML table contents into a data frame
#'
#' Extract data from HTML tables using custom functions. This enables the user
#' to extract text, much as one would with rvest::html_table, but also allows
#' more complex extraction of HTML element attributes (href, src, etc.), raw
#' HTML, and more.
#'
#' @param table_node an HTML table in an object of class "xml_node".
#' @param cell_fn a function, e.g. function_x or anonymous function(x) {…}.
#' @param ... additional arguments to be passed to cell_fn.
#'
#' @return A data frame of list columns.
#' @export
#' @examples
#' library(xml2)
#' url <- "https://en.wikipedia.org/wiki/Political_party_strength_in_Michigan"
#' html_doc <- read_html(url)
#' wikitable_elements <- xml_find_all(html_doc,
#' "//table[contains(@class, 'wikitable')]")
#' wikitable_list <- lapply(
#' wikitable_elements,
#' parse_table,
#' cell_fn = function(.) {
#' xml_attr(xml_find_first(., ".//a"), "href")
#' }
#' )
parse_table <-
function(table_node, cell_fn = xml2::xml_text, ...) {
if (class(table_node) == "xml_nodeset" && length(table_node) > 1) {
stop("The function processes one table at a time.", call. = F)
}
ns <- xml2::xml_ns(table_node)
trs_list <- unclass(xml2::xml_find_all(table_node, ".//tr", ns))
for (i in seq_along(trs_list)) {
tds <- xml2::xml_children(trs_list[[i]])
cspns <- as.integer(xml2::xml_attr(tds, "colspan", ns, "1"))
rspns <- rep(as.integer(xml2::xml_attr(tds, "rowspan", ns, "1")), cspns)
rvals <- rep(lapply(tds, cell_fn, ...), cspns)
if (i == 1) {
rspns_keep <- rspns
trs_previous <- rvals
} else {
rspns_keep <- rspns_keep - 1
rspns_lgl <- rspns_keep == 0
rspns_keep[rspns_lgl] <- rspns[1:sum(rspns_lgl)]
rspns_keep[is.na(rspns_keep)] <- 1
trs_previous <- trs_list[[i-1]]
trs_previous[rspns_lgl] <- rvals[1:sum(rspns_lgl)]
}
trs_list[[i]] <-
lapply(trs_previous,
function(.) if (length(.) == 0) {NA_character_} else {.}
)
}
as.data.frame(do.call("rbind", trs_list))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.