# R/parse_pdftotext_layout_output.R
#
# Defines functions: read_bbox_layout_xhtml, unnest_content,
# get_nested_content, get_page_attrs, get_word, get_attrs_enframe_pivot,
# get_nodes_by_block
#
# Documented in: read_bbox_layout_xhtml

library(magrittr)

# Select the nodes matching the CSS selector "block" within every element of
# `xml_nodeset`.
#
# `xml_nodeset` is iterated with purrr::map(); each element is handed to
# rvest::html_nodes(). Returns a list with one entry per input element.
get_nodes_by_block <- function(xml_nodeset) {
  purrr::map(
    xml_nodeset,
    function(flow_nodes) rvest::html_nodes(flow_nodes, "block")
  )
}

# Turn the attributes of each node into a wide tibble.
#
# Iterates over `nodes_by_flow_and_block`; for every element the attributes
# returned by xml2::xml_attrs() are enframed (columns `name` / `value`) and
# then pivoted so that each attribute name becomes its own column. Returns a
# list of tibbles, one per element.
#
# NOTE(review): `node_name` is forwarded as the second positional argument of
# xml2::xml_attrs(), which is its `ns` (namespace) parameter. It therefore
# does not appear to select child nodes by tag name -- confirm whether
# line/word attributes were meant to be extracted here.
get_attrs_enframe_pivot <- function(nodes_by_flow_and_block, node_name) {
  attrs_to_wide <- function(node) {
    attrs <- xml2::xml_attrs(node, node_name)
    long <- tibble::enframe(attrs)
    tidyr::pivot_wider(long, names_from = "name", values_from = "value")
  }

  purrr::map(nodes_by_flow_and_block, attrs_to_wide)
}

# Extract the text of the nodes matching the CSS selector "word" within each
# element of `nodes_by_flow_and_block`.
#
# Returns a list with one entry per element, each holding the result of
# xml2::xml_text() on that element's "word" nodes.
get_word <- function(nodes_by_flow_and_block) {
  extract_words <- function(node) {
    word_nodes <- rvest::html_nodes(node, "word")
    xml2::xml_text(word_nodes)
  }

  purrr::map(nodes_by_flow_and_block, extract_words)
}

# Convert the attributes of every node in `xml_nodeset` into a wide tibble.
#
# xml2::xml_attrs() is called once on the whole nodeset; each element of its
# result is enframed (columns `name` / `value`) and pivoted so that every
# attribute name becomes a column. Returns a list of tibbles.
get_page_attrs <- function(xml_nodeset) {
  attrs_per_node <- xml2::xml_attrs(xml_nodeset)

  purrr::map(attrs_per_node, function(attrs) {
    long <- tibble::enframe(attrs)
    tidyr::pivot_wider(long, names_from = "name", values_from = "value")
  })
}



# Build a nested tibble with one row per element of `xml_nodeset`.
#
# For every element the "flow" nodes are selected first, then the "block"
# nodes inside those flows. The resulting per-element block nodesets feed
# four list-columns:
#   block_attr, line_attr, word_attr -- get_attrs_enframe_pivot() called with
#                                       "block", "line" and "word"
#   word_value                       -- get_word()
#
# Page attributes are not added here; read_bbox_layout_xhtml() attaches them
# as `page_attr`.
get_nested_content <- function(xml_nodeset) {
  # "flow" nodes for each input element
  flows_per_page <- purrr::map(
    xml_nodeset,
    function(page) rvest::html_nodes(page, "flow")
  )

  # "block" nodes found inside those flows, one nodeset per input element
  blocks_per_page <- get_nodes_by_block(flows_per_page)

  # attribute tibbles for one `node_name`, computed for every input element
  attrs_for <- function(node_name) {
    purrr::map(blocks_per_page, get_attrs_enframe_pivot, node_name)
  }

  tibble::tibble(
    block_attr = attrs_for("block"),
    line_attr = attrs_for("line"),
    word_attr = attrs_for("word"),
    word_value = purrr::map(blocks_per_page, get_word)
  )
}

# `unnest_content()` flattens the nested tibble into one row per word.
#
# `nested_df` must contain the list-columns `page_attr`, `word_value`,
# `word_attr`, `line_attr` and `block_attr` (the columns from
# `get_nested_content()` plus the `page_attr` column added by
# `read_bbox_layout_xhtml()`). Attribute columns are spread into
# `<column>_<attribute>` columns via `names_sep = "_"`.
unnest_content <- function(nested_df) {
  # spread the page attributes into `page_attr_*` columns
  with_page_cols <- tidyr::unnest_wider(
    nested_df,
    page_attr,
    names_sep = "_"
  )

  # expand the four list-columns in parallel
  per_block <- tidyr::unnest(
    with_page_cols,
    cols = c(word_value, word_attr, line_attr, block_attr)
  )

  # expand `word_value` once more so that every word gets its own row
  per_word <- tidyr::unnest(per_block, cols = "word_value")

  # spread the remaining attribute tibbles into prefixed columns
  with_word_cols <- tidyr::unnest_wider(
    per_word,
    word_attr,
    names_sep = "_"
  )
  with_line_cols <- tidyr::unnest_wider(
    with_word_cols,
    line_attr,
    names_sep = "_"
  )
  tidyr::unnest_wider(with_line_cols, block_attr, names_sep = "_")
}



#' Read PDF element positions from \code{pdftotext}.
#'
#' \code{read_bbox_layout_xhtml} parses PDF layout files produced
#' by the \code{pdftotext} command \code{pdftotext file -bbox-layout}.
#' The file is read with \code{xml2::read_html()}, its \code{page} nodes
#' are collected, and the nested page/block/word content is flattened
#' into a single tibble.
#'
#' @param path_to_html Path to HTML file generated by \code{pdftotext}.
#' @return A tibble with one row per extracted word. It contains the
#'   word text (\code{word_value}), the attribute columns prefixed with
#'   \code{page_attr_}, \code{word_attr_}, \code{line_attr_} and
#'   \code{block_attr_}, the document name without directory and
#'   extension (\code{doc_name}) and the page index (\code{page_nr}).
#' @examples
#' doc <- system.file(
#'   "extdata", "edi_2009_frcho43c6mmlx5lyohqy_doc#immrrkosg.html",
#'   package = "pdfparser"
#' )
#' read_bbox_layout_xhtml(doc)
#' @export
read_bbox_layout_xhtml <- function(path_to_html) {
  # parse html
  doc <- xml2::read_html(path_to_html)
  # nodeset with one entry per page of the document
  pages <- rvest::html_nodes(doc, "page")

  # create nested structure; the unnamed tibble from get_nested_content()
  # contributes its columns directly
  nested <- tibble::tibble(
    get_nested_content(pages),
    page_attr = get_page_attrs(pages),
    doc_name = basename(tools::file_path_sans_ext(path_to_html)),
    # seq_along() yields integer(0) when there are no pages, whereas
    # 1:length(pages) would count down as c(1, 0)
    page_nr = seq_along(pages)
  )

  # unnest and return
  unnest_content(nested)
}
# balthasars/pdfparser documentation built on May 10, 2020, 12:33 a.m.