#library(XML)
# See MarkLundy/readPDF2HTML.R
# Create the class hierarchy for a document generated by our modified pdftohtml.
pdftohtmlDoc =
#
# read an XML or a PDF document. If it is XML, it is assumed to be
# already converted by pdftohtml. If it is a PDF document, we convert it.
#
function(file)
{
if(grepl("\\.pdf$", file))
convertPDF2XML(file)
else {
doc = xmlParse(file)
class(doc) = c("PDFToXMLDoc", "ConvertedPDFDoc", class(doc))
doc
}
}
showPage =
# toplevel function. Provide a file and a page number, and we render that page.
function(x, pageNum = 1, doc = xmlParse(x), page = getNodeSet(doc, "//page")[[pageNum]], ...)
{
if(is(x, "XMLInternalDocument") && missing(doc))
doc = x
else if(missing(pageNum) && is(x, "XMLInternalElementNode") && xmlName(x) == "page")
page = x
renderPage(page, ...)
}
plot.PDFToXMLPage = showPage
plot.PDFToXMLDoc =
function(x, y, ...)
{
np = getNumPages(x)
r = ceiling(sqrt(np))
c = np/r
opar = par(no.readonly = TRUE)
on.exit(par(opar))
par(mfrow = c(r, c))
invisible(sapply(getPages(x), renderPage))
}
pageTitle =
function(page, docname = docName(page), fullName = TRUE)
{
if(!fullName)
docname = basename(docname)
sprintf("%s, page %s", URLdecode(docname), xmlGetAttr(page, "number"))
}
renderPage =
#
# Give us the page and we draw its elements - line, rect, img, text
# The text is not proportional so is often much shorter in the horizontal direction
# than it would be when displayed as PDF.
#
# No color information on the text nodes at this point.
#
function(page, cex.text = .5, adj = c(0, 1), showText = TRUE, showBoxes = FALSE,
title = pageTitle(page, fullName = fullName), fullName = TRUE) # , showColors = TRUE)
{
p = page
psize = as.integer(xmlAttrs(p)[c("height", "width")])
h = psize[1]
plot(0, type = "n", xlab = "", ylab = "", xlim = c(0, psize[2]), ylim = c(0, psize[1]))
title(title)
renderLinesRects(page, h = h)
renderCoords(page, h = h)
imgs = getNodeSet(page, ".//img")
if(length(imgs)) {
bb = getBBox2(imgs, attrs = c("x", "y"))
#XXX??? Should these y values be subtracted from h???
# And are these widths and heights that need to be added!
rect(bb[,1], h - bb[,2], bb[,1] + bb[,3], h - bb[,4] - bb[,4], border = "blue", lty = 3)
}
if(showText) {
txt = getNodeSet(page, ".//text")
if(length(txt)) {
bb = t(sapply(txt, xmlAttrs))
if(!("rotation" %in% colnames(bb)))
bb = cbind(bb, rotation = rep(0, nrow(bb)))
storage.mode(bb) = "double"
prot = xmlGetAttr(page, "rotation", NA, as.numeric)
if(!is.na(prot))
bb[, "rotation"] = bb[, "rotation"] + prot
colors = getNodeColors(txt)
text = sapply(txt, xmlValue)
by(1:nrow(bb), bb[, "rotation"],
function(i)
text(bb[i,2], h - bb[i,1], text[i], cex = cex.text, adj = adj, col = colors[i], srt = - bb[i[1], "rotation"]))
}
}
if(showBoxes) {
txt = getNodeSet(page, ".//text")
bb = getBBox2(txt)
rect(bb[,1], h - bb[,2], bb[,1] + bb[,3], h - bb[,2] - bb[,4], border = "lightgreen")
}
TRUE
}
renderCoords =
function(page, h, coords = getCoords(page), col = "black")
{
# lapply(coords, renderCoord, h)
X = unlist(lapply(coords, function(x) c(x$x, NA)))
Y = unlist(lapply(coords, function(x) c(x$y, NA)))
#XXX add color and lwd.
lines(X, h - Y, col = col)
}
renderCoord =
function(x, h)
{
lines(x$x, h - x$y, col = x$stroke, lwd = x$lineWidth)
}
renderLinesRects =
function(page, h, border = "green")
{
rr = getNodeSet(page, ".//rect ")
if(length(rr)) {
bb = getBBox(rr)
col = sapply(rr, function(x) mkColor(xmlGetAttr(x, "fill.color", "0,0,0"), isFill = TRUE))
lwd = max(1, as.numeric(sapply(rr, function(x) xmlGetAttr(x, "lineWidth", "1.0"))))
rect(bb[,1], h - bb[,2], bb[,3], h - bb[,4], border = border, col = col, lwd = lwd)
#XXXX temp rect(bb[,1], bb[,2], bb[,3], bb[,4], border = border, col = col, lwd = lwd)
}
if(length ( lines <- getNodeSet(page, ".//line "))) {
bb = getBBox(lines)
renderLines(bb, h, lines)
}
}
renderLines =
# See below for vectorized version.
function(bb, h, lines)
{
sapply(1:nrow(bb),
function(i) {
at = xmlAttrs(lines[[i]])
lines(bb[i, c(1,3)], h - bb[i, c(2, 4)], col = mkColor(at["stroke.color"]),
lwd = max(1, as.numeric(at["lineWidth"], na.rm = TRUE)),
lty = 2)
})
# lines(bb[,1], h - bb[,2], bb[,3], h-bb[,4], col = "red")
}
renderLines =
# need to split by lwd and color.
# Add lwd and color from nodes.
function(bb, h, nodes)
{
n = nrow(bb)*3
x = y = rep(as.numeric(NA), n)
i = seq(1, by = 3, length = nrow(bb))
x[i] = bb[,"x0"]
x[i+1] = bb[, "x1"]
y[i] = bb[, "y0"]
y[i+1] = bb[, "y1"]
col = "black"
lines(x, h - y, col = col, lwd = 1, lty = 2)
#XXXX temp lines(x, y, col = col, lwd = 1, lty = 2)
}
getNodeColors =
function(textNodes, fonts = getFontInfo(xmlParent(textNodes[[1]])))
{
f = sapply(textNodes, xmlGetAttr, "font")
fonts$color[ match(f, fonts$id) ]
}
mkColor =
# Convert a triple of RGB values as a string into a color
# assuming a maxColorValue of 2^16
# This could be, but is not vectorized.
function(x, alpha = maxColorValue, maxColorValue = 2^16, isFill = FALSE)
{
if(is.null(x) || x == "" || is.na(x))
return(rgb(0, 0, 0))
els = as.integer(strsplit(x, ",")[[1]])
if(isFill && all(els == 0))
NA
else
rgb(els[1], els[2], els[3], alpha, maxColorValue = maxColorValue)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.