#
# pdf_xml.R
#
# Functions for working with XML files generated by pdftohtml.
#' Get Text Tags
#'
#' Collect the `text` children of a page node into a data frame with one row
#' per tag.
#'
#' @param page An XML node. Its direct children are selected with the XPath
#'   `./text`.
#' @return The value of `data_frame()` with columns `left`, `top`, `right`,
#'   `bottom` and `font` (integer, parsed from the attributes) and `text`
#'   (the result of `xml_text()`). `right` is `left + width` and `bottom` is
#'   `top + height`.
pdf_text = function(page) {
  nodes = xml_find_all(page, "./text")

  # Read one attribute from every text node and coerce it to integer.
  int_attr = function(name) as.integer(xml_attr(nodes, name))

  x0 = int_attr("left")
  y0 = int_attr("top")

  # The XML stores the extent of each tag (width/height); the result exposes
  # the far edges as absolute coordinates instead.
  data_frame(
    left = x0,
    top = y0,
    right = x0 + int_attr("width"),
    bottom = y0 + int_attr("height"),
    font = int_attr("font"),
    text = xml_text(nodes)
  )
}
#' Get Bounding Boxes
#'
#' This function gets the bounding boxes for all nodes in a nodeset.
#'
#' `line` and `rect` nodes are read from their comma-separated `bbox`
#' attribute. `text` nodes are built from the `left`, `top`, `width` and
#' `height` attributes, with width and height converted to absolute
#' coordinates by adding `left` and `top`.
#'
#' @param nodeset A set of XML nodes whose tags are `line`, `rect` or `text`.
#'   Any other tag raises an error, as does a `bbox` attribute that does not
#'   contain exactly four comma-separated values.
#' @return A numeric matrix with one row per node and four columns. Within
#'   each row, column 1 <= column 3 and column 2 <= column 4. A coordinate
#'   pair that contains `NA` cannot be ordered and is returned unchanged.
pdf_bbox = function(nodeset) {
  bbox_mat = vapply(nodeset, function(node) {
    tag = xml_name(node)
    if (tag %in% c("line", "rect")) {
      bbox = xml_attr(node, "bbox")
      bbox = as.numeric(strsplit(bbox, ",", fixed = TRUE)[[1]])
      if (length(bbox) != 4L) {
        stop(sprintf("Malformed bbox attribute on node '%s'.", tag))
      }
    } else if (tag == "text") {
      bbox = xml_attrs(node)[c("left", "top", "width", "height")]
      bbox = as.numeric(bbox)
      # Turn (width, height) into the absolute far corner.
      bbox[3:4] = bbox[3:4] + bbox[1:2]
    } else {
      stop(sprintf("Cannot get bbox for node '%s'.\n", tag))
    }
    bbox
  }, numeric(4))

  # vapply() yields one column per node; callers get one row per node.
  bbox_mat = t(bbox_mat)

  # Make sure left <= right and bottom <= top (despite plotting top-down).
  # which() drops NA comparisons, so rows with a missing coordinate are left
  # alone rather than failing with "NAs are not allowed in subscripted
  # assignments".
  for (pair in list(c(1L, 3L), c(2L, 4L))) {
    rows = which(bbox_mat[, pair[1]] > bbox_mat[, pair[2]])
    if (length(rows) > 0L) {
      bbox_mat[rows, pair] = bbox_mat[rows, rev(pair), drop = FALSE]
    }
  }

  return (bbox_mat)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.