library(magrittr)
# For every element of `xml_nodeset`, collect the descendant nodes matched by
# the CSS selector "block". Returns a list with one entry per input element.
get_nodes_by_block <- function(xml_nodeset) {
  find_blocks <- function(node) {
    rvest::html_nodes(node, "block")
  }
  purrr::map(xml_nodeset, find_blocks)
}
# Turn the attributes of every element of `nodes_by_flow_and_block` into a
# wide tibble (one column per attribute name). Returns a list with one tibble
# per input element.
#
# NOTE(review): `node_name` is forwarded as the second positional argument of
# `xml2::xml_attrs()`, which xml2 documents as `ns` (a namespace vector), not
# as a node selector. As written, the attributes appear to be read from the
# element itself regardless of `node_name` -- confirm this is intended.
get_attrs_enframe_pivot <- function(nodes_by_flow_and_block, node_name) {
  attrs_to_wide <- function(node) {
    # name/value pairs, one row per attribute
    attr_pairs <- tibble::enframe(xml2::xml_attrs(node, node_name))
    # spread the pairs so each attribute name becomes its own column
    tidyr::pivot_wider(attr_pairs, names_from = "name", values_from = "value")
  }
  purrr::map(nodes_by_flow_and_block, attrs_to_wide)
}
# Extract the text of all "word" nodes found under each element of
# `nodes_by_flow_and_block`. Returns a list with one character vector per
# input element.
get_word <- function(nodes_by_flow_and_block) {
  extract_words <- function(node) {
    word_nodes <- rvest::html_nodes(node, "word")
    xml2::xml_text(word_nodes)
  }
  purrr::map(nodes_by_flow_and_block, extract_words)
}
# Read the attributes of `xml_nodeset` with `xml2::xml_attrs()` and convert
# each resulting element into a wide tibble (one column per attribute name).
# Returns a list of tibbles.
get_page_attrs <- function(xml_nodeset) {
  attrs_per_node <- xml2::xml_attrs(xml_nodeset)
  purrr::map(attrs_per_node, function(attrs) {
    attr_pairs <- tibble::enframe(attrs)
    tidyr::pivot_wider(attr_pairs, names_from = "name", values_from = "value")
  })
}
# Build a nested tibble with one row per element of `xml_nodeset`.
# For every element, the "flow" nodes are located first and the "block" nodes
# beneath them second; the list-columns `block_attr`, `line_attr`, `word_attr`
# and `word_value` are then derived from those block nodes.
# Page attributes are not added here (no `page_attr` column is produced).
get_nested_content <- function(xml_nodeset) {
  # flows found under each input element
  flows_per_page <- purrr::map(
    xml_nodeset,
    function(page) rvest::html_nodes(page, "flow")
  )
  # blocks found under those flows
  blocks_per_page <- get_nodes_by_block(flows_per_page)
  # attribute tibbles for every block, tagged with the requested node name
  attrs_for <- function(node_name) {
    purrr::map(
      blocks_per_page,
      function(blocks) get_attrs_enframe_pivot(blocks, node_name)
    )
  }
  tibble::tibble(
    block_attr = attrs_for("block"),
    line_attr = attrs_for("line"),
    word_attr = attrs_for("word"),
    word_value = purrr::map(blocks_per_page, get_word)
  )
}
# `unnest_content()` flattens the nested data frame into one row per word,
# with the page, word, line and block attribute columns spread out wide
# (prefixed `page_attr_`, `word_attr_`, `line_attr_`, `block_attr_`).
# `nested_df` must contain the list-columns `page_attr`, `word_value`,
# `word_attr`, `line_attr` and `block_attr`; note that `page_attr` is not
# produced by `get_nested_content()` itself and has to be added by the caller.
unnest_content <- function(nested_df) {
  # spread page attributes into `page_attr_*` columns
  with_page_cols <- tidyr::unnest_wider(nested_df, page_attr, names_sep = "_")
  # expand the parallel list-columns together
  expanded <- tidyr::unnest(
    with_page_cols,
    cols = c(word_value, word_attr, line_attr, block_attr)
  )
  # expand the words themselves
  per_word <- tidyr::unnest(expanded, cols = "word_value")
  # spread the remaining attribute tibbles into prefixed columns
  with_word_cols <- tidyr::unnest_wider(per_word, word_attr, names_sep = "_")
  with_line_cols <- tidyr::unnest_wider(
    with_word_cols, line_attr,
    names_sep = "_"
  )
  tidyr::unnest_wider(with_line_cols, block_attr, names_sep = "_")
}
#' Read PDF element positions from {pdftotext}.
#'
#' \code{read_bbox_layout_xhtml} parses PDF layout files produced
#' by \code{pdftotext} command \code{pdftotext file -bbox-layout}.
#' The function returns a tibble with bounding box information for
#' each word, line, block and page.
#'
#' @param path_to_html Path to HTML file generated by {pdftotext}.
#' @return The tibble produced by \code{unnest_content()}: the word text in
#'   \code{word_value}, attribute columns prefixed \code{page_attr_},
#'   \code{word_attr_}, \code{line_attr_} and \code{block_attr_}, plus
#'   \code{doc_name} (the file name without directory or extension) and
#'   \code{page_nr} (position of the page in the document, starting at 1).
#' @examples
#' doc <- system.file(
#'   "extdata", "edi_2009_frcho43c6mmlx5lyohqy_doc#immrrkosg.html",
#'   package = "pdfparser"
#' )
#' read_bbox_layout_xhtml(doc)
#' @export
read_bbox_layout_xhtml <- function(path_to_html) {
  # parse html
  doc <- xml2::read_html(path_to_html)
  # nodesets of document by page
  pages <- rvest::html_nodes(doc, "page")
  # create nested structure with one row per page
  nested <- tibble::tibble(
    get_nested_content(pages),
    page_attr = get_page_attrs(pages),
    doc_name = basename(tools::file_path_sans_ext(path_to_html)),
    # seq_along() instead of 1:length(): gives integer(0) rather than c(1, 0)
    # when no <page> nodes were found, so the column length matches the others
    page_nr = seq_along(pages)
  )
  # unnest and return
  unnest_content(nested)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.