R/pdf_xml.R

#
# pdf_xml.R
#
# Functions for working with XML files generated by pdftohtml.


#' Get Text Tags
#'
#' Collects the `text` children of a page node into a data frame with one row
#' per tag.
#'
#' @param page Node that is searched with the XPath `./text`.
#'
#' @return A data frame with integer columns `left`, `top`, `right`, `bottom`
#'   and `font`, plus a character column `text`. `right` is the tag's `left`
#'   attribute plus its `width` attribute; `bottom` is `top` plus `height`.
pdf_text = function(page) {
  nodes = xml_find_all(page, "./text")

  # Read one attribute from every text node as an integer vector.
  int_attr = function(name) as.integer(xml_attr(nodes, name))

  x0 = int_attr("left")
  y0 = int_attr("top")

  # The tags store a size (width/height); convert it to far-edge coordinates.
  data_frame(
    left   = x0,
    top    = y0,
    right  = x0 + int_attr("width"),
    bottom = y0 + int_attr("height"),
    font   = int_attr("font"),
    text   = xml_text(nodes)
  )
}


#' Get Bounding Boxes
#'
#' This function gets the bounding boxes for all nodes in a nodeset.
#'
#' `line` and `rect` nodes are read from their comma-separated `bbox`
#' attribute. `text` nodes are read from their `left`, `top`, `width` and
#' `height` attributes, with width and height converted to far-edge
#' coordinates (`left + width`, `top + height`). Any other tag is an error.
#'
#' @param nodeset Nodes to process; each must be a `line`, `rect` or `text`
#'   node.
#'
#' @return A numeric matrix with one row per node and four columns. Within
#'   each row, column 1 <= column 3 and column 2 <= column 4. For `text` nodes
#'   the columns are left, top, right, bottom. Rows in which a compared
#'   coordinate is `NA` are returned unswapped.
pdf_bbox = function(nodeset) {
  # vapply() enforces that every node yields exactly four numbers.
  coords = vapply(nodeset, function(node) {
    tag = xml_name(node)

    if (tag %in% c("line", "rect")) {
      bbox = xml_attr(node, "bbox")
      bbox = as.numeric(strsplit(bbox, ",")[[1]])

    } else if (tag == "text") {
      bbox = xml_attrs(node)[c("left", "top", "width", "height")]
      bbox = as.numeric(bbox)
      # Turn (width, height) into the far-edge coordinates.
      bbox[3:4] = bbox[3:4] + bbox[1:2]

    } else {
      stop(sprintf("Cannot get bbox for node '%s'.\n", tag))
    }

    bbox
  }, numeric(4))

  # vapply() returns one column per node; callers expect one row per node.
  bbox_mat = t(coords)

  # Normalise each box so that column 1 <= column 3 and column 2 <= column 4.
  # which() drops NA comparisons: indexing an assignment with a logical mask
  # that contains NA fails with "NAs are not allowed in subscripted
  # assignments", so rows with missing coordinates are left as they are.
  flip = which(bbox_mat[, 1] > bbox_mat[, 3])
  bbox_mat[flip, c(1, 3)] = bbox_mat[flip, c(3, 1), drop = FALSE]

  flip = which(bbox_mat[, 2] > bbox_mat[, 4])
  bbox_mat[flip, c(2, 4)] = bbox_mat[flip, c(4, 2), drop = FALSE]

  return (bbox_mat)
}
dsidavis/LCAP documentation built on May 15, 2019, 4:19 p.m.