R/parse_table.R

Defines functions parse_table

Documented in parse_table

#' Flexibly parse HTML table contents into a data frame
#'
#' Extract data from HTML tables using custom functions. This enables the user
#' to extract text, much as one would with rvest::html_table, but also allows
#' more complex extraction of HTML element attributes (href, src, etc.), raw
#' HTML, and more.
#'
#' @param table_node an HTML table in an object of class "xml_node".
#' @param cell_fn a function, e.g. function_x or anonymous function(x) {…}.
#' @param ... additional arguments to be passed to cell_fn.
#'
#' @return A data frame of list columns.
#' @export
#' @examples
#' library(xml2)
#' url <- "https://en.wikipedia.org/wiki/Political_party_strength_in_Michigan"
#' html_doc <- read_html(url)
#' wikitable_elements <- xml_find_all(html_doc,
#'                                    "//table[contains(@class, 'wikitable')]")
#' wikitable_list <- lapply(
#'     wikitable_elements,
#'     parse_table,
#'     cell_fn = function(.) {
#'         xml_attr(xml_find_first(., ".//a"), "href")
#'     }
#' )
parse_table <-
    function(table_node, cell_fn = xml2::xml_text, ...) {
        if (class(table_node) == "xml_nodeset" && length(table_node) > 1) {
            stop("The function processes one table at a time.", call. = F)
        }

        ns <- xml2::xml_ns(table_node)
        trs_list <- unclass(xml2::xml_find_all(table_node, ".//tr", ns))

        for (i in seq_along(trs_list)) {
            tds <- xml2::xml_children(trs_list[[i]])
            cspns <- as.integer(xml2::xml_attr(tds, "colspan", ns, "1"))
            rspns <- rep(as.integer(xml2::xml_attr(tds, "rowspan", ns, "1")), cspns)
            rvals <- rep(lapply(tds, cell_fn, ...), cspns)

            if (i == 1) {
                rspns_keep <- rspns
                trs_previous <- rvals
            } else {
                rspns_keep <- rspns_keep - 1
                rspns_lgl <- rspns_keep == 0
                rspns_keep[rspns_lgl] <- rspns[1:sum(rspns_lgl)]
                rspns_keep[is.na(rspns_keep)] <- 1
                trs_previous <- trs_list[[i-1]]
                trs_previous[rspns_lgl] <- rvals[1:sum(rspns_lgl)]
            }

            trs_list[[i]] <-
                lapply(trs_previous,
                       function(.) if (length(.) == 0) {NA_character_} else {.}
                )
        }

        as.data.frame(do.call("rbind", trs_list))
    }
gershomtripp/gttoolkit documentation built on Dec. 20, 2021, 10:41 a.m.