# Disabled scratch example (the if (FALSE) guard means none of this ever runs).
# It reads a sample document, takes its second page, and overlays the lines
# found by getHLines() in red on a plot of that page.
if (FALSE) {
    doc <- readPDFXML("Electric_A2A _DPL_08-19-13.xml")
    page <- doc[[2]]
    # rr <- getNodeSet(page, ".//rect")

    # Horizontal lines: y0 is subtracted from dim(page)[2] before drawing.
    hor <- getHLines(page)
    plot(page)
    abline(h = dim(page)[2] - hor$y0, col = "red")

    # Vertical lines: one rule per distinct x0.
    vert <- getHLines(page, horiz = FALSE)
    abline(v = unique(vert$x0), col = "red")
}
# Find the thin rectangles on a page that act as ruled lines and merge the
# pieces of each line into single segments (via joinLines()).
#
# Arguments:
#   page          the page object. It is used to compute the default values of
#                 `nodes` and `mar` (defaults are evaluated lazily, so they see
#                 this argument, not a global), and dim(page)[2] is used when
#                 horiz = FALSE.
#   nodes         the nodes to examine; by default every ".//rect" under `page`.
#   mar           mar[1] and mar[2] are compared with the smallest and largest
#                 x value of each horizontal line.
#                 NOTE(review): presumably the left and right margin positions
#                 returned by margins() - confirm.
#   bb            bounding boxes of `nodes`; must have columns x0, y0, x1, y1.
#   lhThreshold   a rectangle is treated as a line when its extent across the
#                 line direction (height if horiz, width otherwise) is below
#                 this value.
#   marThreshold  slack allowed when testing whether a horizontal line reaches
#                 mar[1] and mar[2].
#   horiz         TRUE to find horizontal lines, FALSE for vertical ones.
#   threshold, minLineLength
#                 accepted but not used anywhere in the body.
#
# Returns the rbind() of the joinLines() results for the groups that pass the
# filter (columns x0, y0, x1, y1), or NULL when no rectangle qualifies.
getLines <- getHLines <-
function(page, nodes = getNodeSet(page, ".//rect"), mar = margins(page),
         bb = getBBox(nodes, asDataFrame = TRUE),
         threshold = 5, lhThreshold = 6, minLineLength = 20, marThreshold = 10, horiz = TRUE)
{
    # Keep only rectangles that are thin in the direction across the line.
    thin <- if (horiz)
                abs(bb$y1 - bb$y0) < lhThreshold
            else
                abs(bb$x1 - bb$x0) < lhThreshold
    hbb <- bb[thin, ]

    # With no candidates, cut() below would stop with "invalid number of
    # intervals". Return what the function already yields when every group is
    # filtered out, i.e. NULL.
    if (NROW(hbb) == 0L)
        return(NULL)

    # Group the candidates that share the same y1 (horizontal) or x1 (vertical).
    # cut() sorts its breaks, so each interval (a, b] ends at exactly one of the
    # distinct values.
    # NOTE(review): a value of exactly 0 duplicates the leading 0 break and makes
    # cut() fail, and negative values fall outside every interval and are
    # dropped by split() - confirm whether such coordinates can occur.
    var <- if (horiz) "y1" else "x1"
    uvals <- unique(hbb[[var]])
    g <- split(hbb, cut(hbb[[var]], c(0, uvals)))

    keep <- if (horiz) {
        # Row 1 is the smallest x, row 2 the largest x of each group; keep the
        # groups that extend to within marThreshold of both mar[1] and mar[2].
        wd <- vapply(g, function(x) range(x$x0, x$x1), numeric(2))
        wd[1, ] < (mar[1] + marThreshold) & wd[2, ] > (mar[2] - marThreshold)
    } else {
        # Keep the groups whose vertical extent exceeds 10% of dim(page)[2].
        ht <- vapply(g, function(x) diff(range(x$y0, x$y1)), numeric(1))
        ht > dim(page)[2] * .1
    }

    g <- g[keep]
    do.call(rbind, lapply(g, joinLines, horiz = horiz))
}
# Merge collinear segments that overlap or are separated by less than maxGap.
#
# Arguments:
#   xx      data frame of segments with columns x0, y0, x1, y1.
#   horiz   TRUE if the segments run horizontally; only used to pick the
#           default `vars`.
#   maxGap  two consecutive segments are kept apart only when the gap between
#           them is at least this large.
#   vars    names of the start and end columns along the line direction
#           (x0/x1 for horizontal, y0/y1 for vertical).
#
# Returns a data frame with one row per merged run: x0 and y0 come from the
# row with the smallest start in the run, x1 and y1 from the row with the
# largest end.
joinLines <-
function(xx, horiz = TRUE, maxGap = 5, vars = if(horiz) c("x0", "x1") else c("y0", "y1"))
{
    startVar <- vars[1]
    endVar <- vars[2]

    # Sort by start (ties by end) and measure each gap against the furthest end
    # seen so far. Comparing only with the immediately preceding end misses the
    # case where a long segment spans several shorter ones, which would leave
    # overlapping lines in the result.
    xx <- xx[order(xx[[startVar]], xx[[endVar]]), ]
    reach <- cummax(xx[[endVar]])
    gap <- xx[[startVar]][-1] - reach[-nrow(xx)]

    # A new run starts wherever the gap reaches maxGap.
    run <- c(0, cumsum(gap >= maxGap))
    pieces <- split(xx, run)

    do.call(rbind, lapply(pieces, function(x) {
        first <- which.min(x[[startVar]])
        last <- which.max(x[[endVar]])
        cbind(x[first, c("x0", "y0")], x[last, c("x1", "y1")])
    }))
}
#any(diff(uvals) < threshold)
# Collect every <text> node in the document whose content is not blank and
# pass the whole set to nodesByLine() to be grouped by line.
# NOTE(review): the original note says this handles lines split across pages;
# any such handling would live in nodesByLine(), which is not shown here.
byLine <- docByLine <-
function(doc)
{
    nodesByLine(getNodeSet(doc, "//text[normalize-space(.) != '']"))
}
# Turn a collection of nodes, assumed to sit on the same line, into a single
# string: the trimmed text of each node, separated by single spaces.
# The nodes are used in the order given; they are not (currently) sorted by
# their position within the line.
nodes2Line <-
function(x)
{
    pieces <- sapply(x, xmlValue, trim = TRUE)
    line <- paste(pieces, collapse = " ")
    unname(line)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.