cleanText =
#
# This is for removing non-printing characters and other such text in
# XML nodes. Kim's document is an example of this.
#
# extend this.
function(x)
gsub("\\t\\r", " ", gsub("
", "", gsub(" ", " ", x)) )
pdfMinerDoc =
#
# read a PDF or already converted XML document and "tidy" it up
# by collapsing the character nodes into text nodes,
# removing the <page> nodes
# and any running header content
#
function(doc, removePageNodes = FALSE, removeHeader = TRUE, sub = cleanText, removeZeroWidthLines = TRUE)
{
if(is.character(doc)) {
if(!file.exists(doc))
stop("file ", doc, " does not exist")
fname = doc
if(grepl("\\.pdf$", doc))
doc = convertPDF(doc)
doc = xmlParse(doc)
docName(doc) = fname
}
xpathApply(doc, "//textline", collapseTextLine, sub)
if(removeZeroWidthLines)
xpathApply(doc, "//layout | //line[@linewidth = 0]", removeNodes)
pages = getPages(doc)
root = xmlRoot(doc)
xmlAttrs(root) = c(numPages = length(pages))
#XXX before we do this,
# a) do we even need to do it?, and
# b) can we get the header and footer at this time.
# c) renumber the top-level <textbox> within a page to contain
# the page number
# The text may contain page numbers, or these may be in a separate node.
if(removeHeader)
removeHeaderFooter(pages, doc)
# Was thinking we may have to recognize and use different margin values on alternating pages
# margins = sapply(pages, function(x) getMargins(xmlChildren(x)))
# if(!all(apply(margins, 1, function(x) length(unique(x)) ) == 1)) {
# browser()
# }
if(removePageNodes)
removePages(, pages)
# doc
structure(doc, class = c("PDFMinerDoc", class(doc)))
}
`[[.ConvertedPDFDoc` = #`[[.PDFMinerDoc` =
function(x, i, j, ...)
{
getPages(x)[[i]]
}
`[.ConvertedPDFDoc` = # `[.PDFMinerDoc` =
function(x, i, j, ...)
{
getPages(x)[i, ...]
}
lapply.ConvertedPDFDoc = # PDFMinerDoc =
function(X, FUN, ...)
lapply(getPages(X), FUN, ...)
sapply.ConvertedPDFDoc = # PDFMinerDoc =
function(X, FUN, ...)
sapply(getPages(X), FUN, ...)
getPages =
function(doc)
{
p = getNodeSet(doc, "//page")
# Change the class of the page nodes so that we know they are PDF page nodes.
lapply(p, function(x, newClass) {
class(x) = c(newClass, class(x))
x
}, gsub("Doc$", "Page", grep("PDF", class(doc), value = TRUE)))
}
removePages =
function(doc = as(pages[[1]], "XMLInternalDocument"), pages = getNodeSet(doc, "//page"))
{
lapply(pages, replaceNodeWithChildren)
doc
}
removeHeaderFooter =
function(pages, doc, above = 792 - 60, below = 72*.75, ...)
{
# Or use removeNodes( getHeaderByText(pages, doc) )
lapply(pages, function(x) removeNodes(getHeaderByPosition(x, above, ...)))
# remove the footer
lapply(pages, function(x) removeNodes(getFooterByPosition(x, below, ...)))
}
getFooterByPosition =
#
# Find text lower than below. 3/4 of an inch.
#
# We could check for a pattern to make certain it is not just
# additional text, e.g. overflow, footnote, etc.
#
function(page, below = 72*.75)
{
nodes = getNodeSet(page, sprintf(".//textline[@bbox and get-bottom(@bbox) < %f]", below),
xpathFuns = list('get-bottom' = getBottom))
}
getHeaderByPosition =
function(page, above = 792 - 60) # half inch
{
nodes = getNodeSet(page, sprintf(".//textline[@bbox and getBBoxEl(@bbox, 2) > %f]", above),
xpathFuns = list('getBBoxEl' = getBBoxEl))
}
getHeaderByText =
function(pages, doc)
{
# Assumes the header is the first thing!!
header = lapply(pages, `[[`, 1)
htext = sapply(header, xmlValue)
bbox = getBBox(header)
bottom = max(bbox[,2])
# if none duplicated, then we have not identified the
if(any(duplicated(htext))) {
others = getNodeSet(doc, sprintf("//textbox[@bbox and get-bottom(@bbox) >= %f]", bottom),
xpathFuns = list('get-bottom' = getBottom))
# removeNodes(others)
} else
others = list()
c( header[ duplicated(htext) ], others)
}
convertPDF =
#
# If given a pdf, call pdfminer's pdf2txt to create the XML file.
#
#XXX Need to locate the pdfminer/tools/pdf2txt.py script
#
function(filename, pdfminer = getOption("PDF2TXT", "pdf2txt.py"))
{
cmd = sprintf("%s -t xml '%s'", pdfminer, filename)
system(cmd, intern = TRUE)
}
getMargins =
function(doc, m = getIndentations(doc))
{
structure(c(apply(m[, c(1, 2)], 2, min), apply(m[, c(3, 4)], 2, max)) , names = c("left", "bottom", "right", "top"))
}
getSections =
#
#XXX Needs to get much smarter
# Not just bold, but font size different from next lines, indentation
# underlines of text (not getting this in the XML)
#
#
#
#
function(doc, setClass = TRUE, asNodes = TRUE, useIndent = TRUE)
{
if(is.character(doc))
doc = pdfMinerDoc(doc)
ans = if(useIndent)
getSectionsByIndent(doc, getMargins(doc)["left"], asNodes)
if(!length(ans))
ans = getNodeSet(doc, "//text[contains(@font, 'Bold')]")
if(setClass)
markSection(ans)
txt = sapply(ans, xmlValue)
if(asNodes)
structure(ans, names = txt)
else
txt
}
getSectionsByIndent =
#
function(doc, margin = NA, asNodes = TRUE)
{
xpquery = "//textbox[ not(ancestor::layout) and %s get-indent(following-sibling::textbox[1]/@bbox) > get-indent(@bbox) ]"
if(!is.na(margin))
margin = sprintf("get-indent(@bbox) = %f and", margin)
else
margin = ""
xpquery = sprintf(xpquery, margin)
# if(is(doc, "XMLInternalNode"))
# xpquery = paste0(".", xpquery)
getNodeSet(doc, xpquery, xpathFuns = list('get-indent' = getIndent))
}
groupByIndent =
#
# We run into a problem here with cv_amir
# The publications numbered from 9. to 1. have a different indentation than left.
# So we add the condition that if a node has multiple <textline>, then it is self-contained
# regardless of its indentation. But then we end up with 2 publications that
# are split across two groups because the second part of each has multiple <textline>
# If we had the pages, we could do this by page and we wouldn't have the problem.
# However, if a publication spanned 2 pages, we had have an entirely different problem.
#
# If the publications are ordered as a list (with numbers or bullets), we can use that
# to detect errors and regroup.
#
function(nodes, bbox = getIndentations(nodes))
{
left = min(bbox[,1])
hasMultipleLines = sapply(nodes, function(x) sum(names(x) == "textline") > 1)
atLeft = (bbox[,1] == left)
# prevStartsAtLeft = c(TRUE, atLeft[- length(atLeft)])
w = cumsum( atLeft | hasMultipleLines) # & prevStartsAtLeft) )
ans = split(nodes, w)
txt = sapply(ans, getAllText)
firstWord = sapply(strsplit(txt, "[[:space:]]"), `[[`, 1)
isNum = grepl("^[0-9]+[[:punct:]]?$", firstWord)
if(sum(isNum)/length(txt) > .9) {
# we have numbers
if(any(!isNum)) {
i = which(!isNum)
browser()
ans[i - 1L] = mapply(c, ans[i-1L], ans[i], SIMPLIFY = FALSE)
ans = ans[-i]
}
} else {
tt = table(first)
if(max(tt)/length(txt) > .9 && length(tt) > 1) {
# we have a common identifier
browser()
}
}
ans
}
getAllText =
function(nodes, collapse = "\n")
{
if(inherits(nodes, "XMLAbstractNode"))
nodes = xmlChildren(nodes)
paste(sapply(nodes, xmlValue), collapse = collapse)
}
getSectionElements =
function(nodes, convert = getAllText, ...)
{
e = groupByIndent(nodes)
sapply(e, convert, ...)
}
markSection =
# Set the class sectionTitle on the nodes along with the section number
function(nodes)
{
invisible(mapply(function(x, idx)
xmlAttrs(x) = c(class = "sectionTitle", sectionNum = idx),
nodes, seq(along = nodes)))
}
getWithinSection =
#
# Get the nodes within each section identified by getSections() and marked with a class = 'sectionTitle' attribute and a @sectionNum attribute.
# This @class attribute means we can work with the document object here.
#
function(doc, sectionNodes = getNodeSet(doc, "//textbox[@class = 'sectionTitle']"))
{
ans = lapply(sectionNodes, function(x) {
idx = as.integer(xmlGetAttr(x, "sectionNum"))
getNodeSet(x, sprintf("./following-sibling::textbox[ following-sibling::textbox[@sectionNum = %d]]", idx + 1L))
})
names(ans) = sapply(sectionNodes, xmlValue)
ans
}
getIndent =
# This is an R function that we use as a function in an XPath query.
# It is passed a list with one element which is the bbox attribute.
# This could be an empty list so we return -1 for this case.
function(bbox)
{
if(length(bbox))
as.numeric(strsplit(bbox[[1]], ",")[[1]][1])
else
-1
}
getBottom =
# Also called from XPath.
function(bbox)
{
as.numeric(strsplit(bbox[[1]], ",")[[1]][2])
}
getBBoxEl =
# Also called from XPath.
function(bbox, pos)
{
as.numeric(strsplit(bbox[[1]], ",")[[1]][pos])
}
getCurves =
function(doc, nodes = getNodeSet(doc, ".//curve"))
{
lapply(nodes, getCurve)
}
getCurve =
function(node, pts = xmlGetAttr(node, "pts"))
{
matrix(as.numeric(unlist(strsplit(pts, ","))),, 2, byrow = TRUE)
}
getFontSizes =
#
# return a table of the counts for different font sizes
# If name = TRUE, then we get a 2-way table of the font size and description
#
# Font sizes are complicated. The font name may have the size (e.g. mulhearn's CV)
# Otherwise, we use the size attribute which is not the font size, but the box size
# but can be a proxy, but not always a good one.
# The size can be larger for text within a section eventhough the section title/header
# is physically larger. It depends on the content of the box.
# This is where we may want to keep the size of the individual characters.
function(doc, name = FALSE)
{
fonts = getFonts(doc)
if(all(grepl("[0-9]+$$", names(fonts)))) {
vals = unlist(getNodeSet(doc, "//text/@font"))
return( table( gsub(".*[^0-9]", "", vals) ) )
}
if(!name)
table(as.numeric(unlist(getNodeSet(doc, "//text/@size"))))
else {
nodes = getNodeSet(doc, "//text")
info = sapply(nodes, function(x) xmlAttrs(x)[c("font", "size")])
table(info["size",], info[ "font", ])
}
}
getFonts =
#
# Get the table of counts of the font names/descriptions used in the documents
# either by number of <text> nodes or by the number of characters using that font.
# By character allows us to better guess the default font for the document.
#
#
function(doc, byChar = FALSE)
{
if(byChar) {
nodes = getNodeSet(doc, "//text")
font = sapply(nodes, xmlGetAttr, "font")
nchar = nchar(sapply(nodes, xmlValue, trim = TRUE))
tapply(nchar, font, sum)
} else
table(unlist(getNodeSet(doc, "//text/@font")))
}
getSectionsByFont =
#
# This is another approach to try to identify section titles
#
# This examines all the fonts used in the document and then
# for all but the most commonly used font (assumed to be general text)
# it collects all of the lines that are resaonably short that use
# these fonts.
#
function(doc, ..., asDataFrame = TRUE)
{
tt = sort(getFonts(doc, TRUE))
fonts = names(tt)[ - length(tt) ]
ans = lapply(fonts, function(f)
findShortLines(doc, font = f, ...))
names(ans) = fonts
if(asDataFrame) {
tmp = lapply(ans, function(x) sapply(unlist(x), xmlValue))
page = lapply(ans, function(x) rep(seq(along = x), sapply(x, length)))
font = rep(fonts, sapply(tmp, length))
size = lapply(ans, function(x) sapply(unlist(x), getNodeSet, ".//text/@size"))
data.frame(text = unlist(tmp), font = font, size = as.numeric(unlist(size)), page = unlist(page), stringsAsFactors = FALSE, row.names = NULL)
} else
ans
}
getBBox = getIndentations =
#
# a matrix of bboxes for the <textline> nodes
#
function(doc, bbox = sapply(nodes, xmlGetAttr, "bbox", ""),
addNames = FALSE,
nodes = getNodeSet(doc, ".//textline"),
text = sapply(nodes, xmlValue))
{
if(missing(bbox) && (!missing(doc) && is.list(doc)) && all(sapply(doc, inherits, "XMLInternalElementNode"))) {
text = sapply(doc, xmlValue)
bbox = lapply(doc, xmlGetAttr, "bbox", "")
}
if(length(bbox) == 0)
return(NULL)
bbox = unlist(bbox)
bbox = bbox[bbox != ""]
if(length(bbox) == 0)
return(matrix(0, 0, 4, dimnames = list(NULL, c("left", "bottom", "right", "top"))))
m = do.call(rbind, strsplit(bbox, ","))
mode(m) = "numeric"
colnames(m) = c("left", "bottom", "right", "top")
if(addNames)
rownames(m) = text
m
}
collapseTextLine =
#
# Combine the individual nodes containing a single character into a single node within a <textline> node
# and put the resulting text into the first <text> node of that <textline> node, and delete the remaining
# <text> nodes.
#
# If the font changes across characters, we lose that information currently.
#
function(node, sub = NULL, byFont = TRUE)
{
if(byFont)
return(collapseTextLineByFont(node, sub))
txt = xmlValue(node)
if(!is.null(sub))
txt = sub(txt)
xmlValue(node[[1]]) = txt
sizes = sapply(xmlChildren(node), xmlGetAttr, "size", "")
xmlAttrs(node) = c(charcterSizes = paste(sizes, collapse = ","))
removeNodes(xmlChildren(node)[-1])
node
}
collapseTextLineByFont =
function(node, sub = NULL)
{
tnodes = getNodeSet(node, ".//text")
ff = sapply(tnodes, xmlGetAttr, "font", "")
groups = rle(ff)
idx = rep(seq(length = length(groups$lengths)), groups$lengths)
removeNodes(tnodes)
# chars = sapply(tnodes, xmlValue)
# phrases = tapply(chars, idx, paste, collapse = "")
mapply(mkTextNode, split(tnodes, idx), groups$values, MoreArgs = list(parent = node))
}
mkTextNode =
#XXX Add size attribute and the bbox
function(charNodes, font, parent = NULL, ...)
{
text = paste(sapply(charNodes, xmlValue), collapse = "")
charSizes = paste(sapply(charNodes, xmlGetAttr, "size", ""), collapse = ",")
ans = newXMLNode("text", text, attrs = c(font = font, characterSizes = charSizes), parent = parent)
bbox = getBBox(nodes = charNodes)
if(nrow(bbox)) {
bbox = paste(c(min(bbox[,1]), min(bbox[,2]), max(bbox[,3]), max(bbox[,4])), collapse = ",")
xmlAttrs(ans) = c(bbox = bbox)
}
ans
}
plot.PDFMinerDoc = showBoxes =
#
# showBoxes(pdfMinerDoc("SampleCVs/cv_amir.pdf")))
#
# pamir = pdfMinerDoc("SampleCVs/cv_amir.pdf", FALSE)
# showBoxes(getNodeSet(pamir, "/*/page[2]//*[not(ancestor::layout)]"))
# par(mfrow = c(5, 5), mar = c(0, 0, 0, 0))
# invisible(lapply(1:25, function(i) showBoxes(getNodeSet(pamir, sprintf("/*/page[%d]//*[not(ancestor::layout)]", i)), axes = FALSE)))
#!!! We could compute a more sensible value for str.cex knowing the usr coordinates for the plot, or directly from the dimensions of the PDF page.
function(doc, bbox = getIndentations(doc), margins = getMargins(, bbox), showText = TRUE, str.cex = .3, ...)
{
if(!missing(doc)) {
if(is.character(doc))
doc = pdfMinerDoc(doc, removePageNodes = FALSE, removeHeader = FALSE)
if(is(doc, "XMLInternalDocument")) {
pages = getNodeSet(doc, "//page")
if(length(pages)) {
# Show all the pages separately
r = ceiling(sqrt(length(pages)))
prev = par("mfrow", "mar")
on.exit(par(prev))
par(mfrow = c(r, ceiling(length(pages)/r)), mar = c(0, 0, 0, 0))
lapply(pages, function(p) showBoxes(getNodeSet(p, ".//*[not(ancestor::layout) and @bbox]"), axes = FALSE))
return(NULL)
}
}
}
bbox = upsideDown(bbox, margins)
plot(0, xlim = margins[c("left", "right")], ylim = margins[c("bottom", "top")], type = "n", xlab = "", ylab = "", ...)
rect(bbox[, 1], bbox[, 2], bbox[, 3], bbox[, 4])
if(showText)
text((bbox[, "left"] + bbox[, "right"])/2, (bbox[, "bottom"] + bbox[, "top"])/2, rownames(bbox), cex = str.cex)
if(!missing(doc)) {
if(xmlName(doc) == "page")
title(sprintf("%s page %s", docName(doc), xmlGetAttr(doc, "id", xmlGetAttr(doc, "number", "?"))))
r = getLines(doc)
if(nrow(r))
rect(r[,1], r[,2], r[,3], r[,4], col = "lightgray", border = "green", lty = 1)
cv = getCurves(doc)
if(length(cv))
lapply(cv, function(x)
lines(x[,1], x[,2], col = "blue") )
}
invisible(TRUE)
}
getLines =
function(doc, ...)
UseMethod("getLines")
getLines.ConvertedPDFPage =
function(doc, ...)
{
bb = xpathSApply(doc, "./line | ./rect", xmlGetAttr, "bbox")
r = matrix(as.numeric(unlist(strsplit(bb, ","))), , 4, byrow = TRUE)
colnames(r) = c("left", "bottom", "right", "top")
r = upsideDown(r, margins)
}
upsideDown =
function(bbox, margins)
{
return(bbox)
bbox[, "bottom"] = margins["top"] - bbox[, "bottom"]
bbox[, "top"] = margins["top"] - bbox[, "top"]
bbox
}
textWidth =
function(nodes)
{
UseMethod("textWidth")
}
textWidth.XMLInternalDocument =
function(nodes)
{
textWidth(getNodeSet(nodes, "//textline"))
}
textWidth.list = textWidth.XMLNodeSet =
function(nodes)
{
bbox = getBBox(nodes)
widths = bbox[,3] - bbox[,1]
structure(widths, names = sapply(nodes, xmlValue))
}
findUnderlines =
function(doc, ...)
UseMethod("findUnderlines")
findUnderlines.character =
function(doc, ...)
findUnderlines(pdfMinerDoc(doc, removePageNodes = FALSE))
findUnderlines.XMLInternalDocument =
function(doc, ...)
{
findUnderlines(getNodeSet(doc, "//page"))
}
findUnderlines.XMLNodeSet =
function(doc, ...)
{
lapply(doc, findUnderlines)
}
findUnderlines.XMLInternalElementNode =
#
# Add the tests for the line going all the way across, or that there is no other text on that line.
#
#
function(doc, threshold = 4, ...)
{
rects = getNodeSet(doc, ".//rect | .//line[@linewidth > 0]")
if(length(rects) == 0)
return(NULL)
rects = getBBox(rects)
lines = lapply(rects[, "bottom"],
function(y) {
els = getNodeSet(doc, sprintf(".//textline[@bbox and abs(get-bottom(@bbox) - %f) < %f and not(normalize-space(.) = '')]", y, threshold),
xpathFuns = list('get-bottom' = getBottom))
if(length(els)) {
pos = getBBox(els)[, 2]
tmp = lapply(pos, findTextOnSameLine, doc)
i = sapply(tmp, length) == 1
if(!all(i)) browser()
els = els[i]
}
els
})
lines[ sapply(lines, length) > 0 ]
}
findTextOnSameLine =
function(y, doc)
{
lines = getNodeSet(doc, sprintf(".//textline[@bbox and get-bottom(@bbox) = %f and not(normalize-space(.) = '') ]", y),
xpathFuns = list('get-bottom' = getBottom))
}
findNumberedSectionTitle =
function(doc)
{
getNodeSet(doc, "//textbox[ starts-with-a-number(.) ]/preceding-sibling::textbox[ not(starts-with-a-number(.)) ][1]",
xpathFuns = list('starts-with-a-number' = startsWithANumber))
}
startsWithANumber =
function(x)
{
grepl("^[0-9]+[[:punct:]]?", xmlValue(x[[1]]))
}
findBulletedSectionTitle =
function(doc)
{
els = getNodeSet(doc, "//textbox[ starts-with(., '•') ]/preceding-sibling::textbox[ not(starts-with(., '•')) ]")
}
findShortLines =
function(doc, widthPercent = .7, allCaps = FALSE, font = NA)
UseMethod("findShortLines")
findShortLines.character =
function(doc, widthPercent = .7, allCaps = FALSE, font = NA)
findShortLines(pdfMinerDoc(doc, removePageNodes = FALSE), widthPercent, allCaps, font)
findShortLines.XMLInternalDocument =
function(doc, widthPercent = .7, allCaps = FALSE, font = NA)
xpathApply(doc, "//page", findShortLines, widthPercent, allCaps, font)
findShortLines.XMLInternalElementNode =
#
#XXX We have an issue with some <text> nodes have the wrong bbox that are too narrow, (e.g. Mulhearn's "Education and Training")
# and some <textline> nodes that are too big
#
function(doc, widthPercent = .7, allCaps = FALSE, font = NA, marThreshold = 12)
{
#cat("Page", xmlGetAttr(doc, "id"), font, "\n")
mar = getMargins(doc)
fontQuery = if(!is.na(font))
sprintf("text/@font = '%s' and", font)
else
""
xp = sprintf(".//textline[@bbox and string-length(normalize-space(.)) > 1 and %s abs(get-indent(@bbox) - %f) < %f and getBBoxEl(@bbox, 3) < %f]",
fontQuery, mar[1], marThreshold, mar[3]*widthPercent)
nodes = getNodeSet(doc, xp, xpathFuns = list('get-indent' = getIndent, 'getBBoxEl' = getBBoxEl))
if(length(nodes) == 0)
return(NULL)
pos = getBBox(nodes)
ok = sapply(pos[, 2], function(pos) length(findTextOnSameLine(pos, doc)) == 1)
if(allCaps)
nodes[ok][ isUpperCase(sapply(nodes[ok], xmlValue)) ]
else
nodes[ok]
}
isUpperCase =
function(x)
{
x == toupper(x)
}
setOldClass(c("PDFMinerDoc", "ConvertedPDFDoc", "XMLInternalDocument", "XMLAbstractDocument"))
setOldClass(c("PDFMinerPage", "ConvertedPDFPage", "XMLInternalDocument", "XMLAbstractDocument"))
setOldClass(c("ConvertedPDFPage", "XMLInternalElement", "XMLInternalNode", "XMLAbstractNode"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.