getSectionText =
#
#
#
#
function(doc, asNodes = FALSE, secHeaders = findSectionHeaders(doc, ...), maxNumPages = 30, cleanSectionNums = TRUE,
addOmitted = TRUE, separateTables = TRUE, ... )
{
if(is.character(doc))
doc = readPDFXML(doc)
if(getNumPages(doc) > maxNumPages) {
warning("doc pages exceeds maxNumPages")
return(list())
}
if(separateTables) {
tblNodes = getTables(doc)
tbls = sapply(tblNodes, function(x) paste(names(x), collapse = "\n"))
nn = unlist(tblNodes)
if(!is.null(nn))
removeNodes(nn[!sapply(nn, is.null)])
}
if(length(secHeaders) == 0) {
ti = unlist(getDocTitle(doc, asNode = TRUE))
end = getLastRealTextNode(doc)
ans = getNodesBetween(ti[[length(ti)]], end)
return(ans)
}
#if only one node supplied, add last node as end node, flag to return one thing
if(length(secHeaders) == 1) {
secHeaders = c(secHeaders, getLastNode(secHeaders))
pasteTxt = TRUE
} else {pasteTxt = FALSE}
secHeaders = orderNodes(unlist(secHeaders))
secs = lapply(seq(along = secHeaders),
function(i)
getNodesBetween(secHeaders[[i]], if(i == length(secHeaders)) NULL else secHeaders[[i+1]]))
names(secs) = sapply(secHeaders, xmlValue)
if(cleanSectionNums)
names(secs) = removeNumPrefixes(names(secs))
#XXX deal with the tables.
if(asNodes)
return(secs)
txt = sapply(secs, xmlValue)
if(addOmitted) {
start = doc[[1]][["text"]]
if(!is( a <- try(findAbstract(doc), silent = TRUE), 'try-error') && length(a))
start = rev(unlist(a))[[1]]
# try( { start = rev(unlist(findAbstract(doc)))[[1]]})
onodes = getNodesBetween(start, secHeaders[[1]])
txt["<other>"] = paste(sapply(onodes, xmlValue), collapse = " ")
}
if(separateTables && length(tbls))
txt[paste0("Table", seq(along = tbls))] = tbls
if (pasteTxt) {c(txt[[1]],txt[[2]])} else {txt}
}
#
# Find lines that
# is one of the regular section header text
# have a larger font than others on the page
# shorter than most in the same column
# have a larger font than most text nodes in the column
# larger vertical distance between it and the next line
#
# Includes names
# "LatestDocs/PDF/0629421650/Padula-2002-Andes virus and first case report1.xml"
# also doesn't get the subheadings for Patient 1 and Patient 2
#
findSectionHeaders =
#
# Given a node, find the node identifying the section in the paper
# for this node.
#
# Looking for text on its own line, in bold or larger font.
# "../LatestDocs/PDF/2157652992/Calisher-2006-Bats_ Important Reservoir Hosts.xml"
# ""../LatestDocs/PDF/0000708828/Li-2009-Sensitive, qualitative detection of hu.xml"
# "../LatestDocs/PDF/2636860725/Růžek-2010-Omsk haemorrhagic fever.xml"
# "../LatestDocs/PDF/3757461900/Matsuzaki-2002-Antigenic and genetic characte1.xml"
#
#
#
#'@param checkCentered if the nodes we identify as section
# using the "expected" names are centered, then by default
# when we look for other text with the same font, we only
# include centered text. However, if checkCentered = FALSE
# we include all text with the same section header font.
# Checking for centered is currently expensive.
#
# See weaver-2001 for example of main section titles being
# centered but sub-sections are in the same font and not centered.
#
#
# Need to include the continuation text on the next lines if the section title
# spans multiple lines. See, e.g.,
# 2019 CItation/Papers/1-50/11 Bakir art%3A10.1007%2Fs00024-012-0482-8.xml
# in the CIG papers.
#
# Subsections, see CIG_citation/2019 CItation/Papers/1-50/1 aagaard jgrb50217.xml
#
function(doc, sectionName = c('introduction', 'background',
'conclusions', 'discussion', 'materials and methods',
'literature cited', 'references cited', 'the study'),
# For, e.g., Lahm-2007-Morbidity, with these 2 extra section names, we
# match References and Ackno.. and these don't have
# numbers.
# Maybe go for numbered section titles first?
otherSectionNames = c('references', 'acknowledgements', 'acknowledgments', 'results', 'methods'),
checkCentered = TRUE,
discardAfterReferences = TRUE,
allowRotated = FALSE, onlyFirst = FALSE,
order = TRUE, groupByLine = FALSE
)
{
if(is.character(doc))
doc = readPDFXML(doc)
# Find section titles with numbers
hasNum = FALSE
filter = paste(sprintf("(contains(lower-case(normalize-space(.)), '%s') and isNum(normalize-space(.)))", sectionName), collapse = " or ")
xp = sprintf("//text[%s]", filter)
intro = getNodeSet(doc, xp, xpathFuns = list(isNum = isSectionNum))
hasNum = length(intro) > 0
if(!hasNum) {
filter = paste(sprintf("lower-case(normalize-space(.)) = '%s'", sectionName), collapse = " or ")
xp = sprintf("//text[%s]", filter)
intro = getNodeSet(doc, xp)
}
if(onlyFirst)
return(intro)
if(!length(intro)) {
filter = paste(sprintf("lower-case(normalize-space(.)) = '%s'", otherSectionNames), collapse = " or ")
xp = sprintf("//text[%s]", filter)
intro = getNodeSet(doc, xp)
}
if(length(intro))
intro = intro[ ! isLowerCase(sapply(intro, xmlValue)) ]
if(length(intro)) {
if(hasNum)
return(getNodeSet(doc, sprintf("//text[isNum(normalize-space(.)) and (%s)]",
paste( sprintf("@font = '%s'", sapply(intro, xmlGetAttr, "font")), collapse = " or " )),
xpathFuns = list(isNum = isSectionNum)))
i = sapply(intro, function(x) length(getNodeSet(x, "./preceding::text[ . = 'Supporting Online Material']"))) > 0
intro = intro[!i]
if(length(intro) == 0)
return(NULL)
fontID = unique(sapply(intro, xmlGetAttr, "font"))
#XX Check if on line by itself and not just a word.
# Check if these are centered on a column or on the page. If so,
# other nodes we think are titles also better be centered.
secs = getNodesWithFont(doc, fontID = fontID)
# Discard elements that are table of contents, ie. have leaders ..... page number
secs = secs[!grepl("\\.{5,}[[:space:]]*[0-9]+$", xmlValue(secs))]
if(!allowRotated)
secs = secs[as.numeric(sapply(secs, xmlGetAttr, "rotation")) == 0]
# Discard elements that are just numbers
secs = secs[!grepl("^[0-9]+$", xmlValue(secs))]
preRefs = sapply(secs, function(x) getNodeSet(x, ".//preceding::text[ lower-case(normalize-space(.)) = 'references' or lower-case(normalize-space(.)) = 'references cited' or lower-case(normalize-space(.)) = 'supporting online material']"))
w = sapply(preRefs, length) > 0
if(any(w) && hasCoverPage(doc))
w[w] = sapply(preRefs[w], function(x) pageOf(x) != 1)
secs = secs[!w]
# if all the known section headers are all upper case
if(all(isUpperCase(sapply(intro, xmlValue)))) {
txt = sapply(secs, xmlValue)
secs = secs[ i <- isUpperCase(txt) ]
secs = secs[ !grepl("^[[:punct:]]+$", txt[i]) ]
}
# For isCentered and isOnLineBySelf, we should determine which pages
# we are dealing with and compute the getTextByCols() and
# nodesByLine() for each of these just once and pass them to these
# functions
if(checkCentered)
secs = secs[ sapply(secs, isCentered) == isCentered(intro[[1]])]
if(isOnLineBySelf(intro[[1]])) {
i = sapply(secs, isOnLineBySelf)
secs = secs[ i ]
}
if(order) { # Do we need to do this??
o = order(sapply(secs, pageOf), sapply(secs, inColumn))
secs = secs[o]
}
if(groupByLine) {
# XXX This changes the order of the nodes.
# We really should group these by page and within column, except those that span multiple columns.
# We now turn this off. What does this do to getSectionText().
secs = nodesByLine(secs)
}
secs
}
}
isUpperCase =
function(x, hasChars = TRUE)
x == toupper(x) & grepl("[A-Z]", x)
isLowerCase =
function(x)
x == tolower(x)
isSectionNum =
#
# For use in XPath test.
#
function(x)
grepl("^[0-9](\\.[0-9](\\.[0-9])?)?\\. ", x)
getNodesWithFont =
function(doc, fontID)
{
getNodeSet(doc, sprintf("//text[%s]", paste(sprintf("@font = %s", fontID), collapse = " or ")))
}
isOnLineBySelf =
function(node, pos = getColPositions(xmlParent(node)),
textNodes = getNodeSet(xmlParent(node), ".//text"),
bbox = getBBox2(textNodes, TRUE))
# doc = as(node, "XMLInternalDocument"))
{
colNodes = getTextByCols(pageOf(node, TRUE), breaks = pos, asNodes = TRUE)
# determine which column this is in
colNum = inColumn(node, colNodes)
col = colNodes[[colNum]]
# lines = split(col, as.integer(sapply(col, xmlGetAttr, "top")))
h = as.integer(xmlGetAttr(node, "top"))
npos = as.integer(sapply(col, xmlGetAttr, "top"))
sum(npos == h) == 1
}
identicalInColumn =
function(x, node)
{
if(length(x))
any(sapply(x, identical, node))
else
FALSE
}
inColumn =
function(node, cols = getTextByCols(xmlParent(node), asNodes = TRUE))
{
ans = which(sapply(cols, identicalInColumn, node))
if(length(ans))
return(ans)
# We may end up with an empty answer if the node has a different font
# than the docFont and getTextByCols() uses docFont.
# In this case, we'll compute the spans of the columns and then
# determine which one node falls in
sp = lapply(cols,
function(x) {
ll = nodesByLine(x)
le = getLineEnds(ll)
apply(le, 2, median)
})
bb = getBBox2(list(node))
start = bb[1, "left"]
end = bb[1, "left"] + bb[1, "width"]
w = sapply(sp, function(x) start >= x[1] || end >= x[1])
which(w)
}
#XXX give proper name.
f =
function(page, nodes = getNodeSet(page, ".//text"),
bb = getBBox2(nodes, TRUE),
cols = getColPositions(page))
{
b = split(bb, cut(bb$left, c(0, cols[-1], Inf) -2))
k = lapply(b, function(x) x[order(x$top),])
}
getNodesBetween =
function(x = NULL, y = NULL, useLines = TRUE, exclude = FALSE, ...)
{
if(is.null(x) && is.null(y))
stop("need to specify either x or y or both")
if(is.null(x))
x = getFirstTextNode(as(y, "XMLInternalDocument"))
else if(is.list(x))
x = x[[length(x)]]
if(is.null(y))
y = getLastNode(x)
else if(is.list(y))
y = y[[length(y)]]
s = pageOf(x)
e = pageOf(y)
#check if going from same node to same node,
if (identical(x,y) & s == e) {
return(x)
} else {
ans = if(e > s) {
# get all the nodes on each page up to e
p1 = getTextAfter(x, useLines = useLines)
if(e - s > 1) {
pgs = getPages(as(x, "XMLInternalDocument"))[ seq(s + 1, e - 1)]
pgs = lapply(pgs, getTextByCols, asNodes = TRUE)
} else
pgs = NULL
pe = getTextAfter(, y, useLines = useLines)
c(p1, unlist(pgs, recursive = FALSE), pe)
} else {
getTextAfter(x, y, useLines = useLines, ...)
}
if(exclude)
# drop x and y. XXX need to handle if y is null in call and keep then.
ans = ans[-c(1, length(ans))]
ans
}
}
getTextAfter =
#
# This is a somewhat brute-force approach to getting the <text> nodes
# between one start node and an optional end node ON THE SAME <PAGE>!
# getNodesBetween() is for multiple pages and calls this function
# so can handle single pages also.
# The name of this function is not entirely correct. We can
# specify either x OR to so it can get the nodes before the to node.
# One can specify x and not to, x and to, or just to.
#
#XXX FIX THIS TO KEEP THE TEXT BY COLUMN.
function(x = NULL, to = NULL, before = FALSE, useLines = TRUE, ...)
{
page = xmlParent(if(!is.null(x)) x else to)
cols = getTextByCols(page, asNodes = TRUE, order = TRUE, ...)
if(!is.null(x) && !is.null(to) && pageOf(to) < pageOf(x)) {
warning("to node in getTextAfter() is on earlier page (", pageOf(to) , " versus ", pageOf(x), " Ignoring to node")
to = NULL
}
original.to = to
if(useLines) {
# If to is a rect/line, find its location, otherwise find any lines on this page.
if(!is.null(to) && xmlName(to) %in% c('rect', 'line')) {
bb = getBBox(list(to))
bb[1,2] = bb[1,4]
to = NULL
# useLines = FALSE
} else
bb = getBBox(getNodeSet(page, ".//rect | .//line"))
}
if(!is.null(x)) {
# find the column and the index of the node matching x. Not the same as columnOf() as we want the index within the column.
if(xmlName(x) == "text") {
i = lapply(cols, function(n) if(length(n)) which(sapply(n, identical, x)) else integer())
colNum = which(sapply(i, length) > 0)
# colNum = which(sapply(cols, identicalInColumn, x))
} else {
## Force for now!!! Not a <text> element, so presumable a <line> or <rect>
warning("hard coded column for now")
i = 1L
colNum = 1L
}
}
if(!is.null(to)) {
# to.colNum = which(sapply(cols, identicalInColumn, to))
j = lapply(cols, function(n) if(length(n)) which(sapply(n, identical, to)) else integer())
to.colNum = which(sapply(j, length) > 0)
if(is.null(x))
return( c(cols[ seq(1, length = to.colNum - 1) ],
cols[[to.colNum]][ seq(1, length = j[[to.colNum]] - 1) ]))
}
if(is.null(to)) {
nodes = cols[[colNum]][ - (1:(i[[colNum]]-1)) ]
if(colNum < length(cols))
nodes = c(nodes, cols[(colNum+1):length(cols)])
} else {
if(colNum == to.colNum) {
nn = cols[[ colNum ]]
nodes = nn[ seq(i[[colNum]], j[[to.colNum]] - 1) ]
} else {
# nodes in x's column
nodes = cols[[colNum]][ - (1:(i[[colNum]]-1)) ]
# in Becker-2012, Author Contributions is x and References is to
# but References is actually in the 1st column of this page and Author Contributions
# is in the second but slightly above. findSectionHeaders() is ordering them this way
# but not taking into account References should probabl come first.
# Is References a node that is after Author Contributions in document order?
btwn = seq(colNum + 1, length = to.colNum - colNum - 1)
nodes = c(nodes, cols[btwn],
cols[[to.colNum]][ seq(1, length = j[[to.colNum]] - 1) ])
}
}
if(useLines) {
#XXX FIX THIS - x or to is missing?
# Handle the cases where we return earlier.
tmp = list(x)
if(!is.null(to))
tmp[[2]] = to
bb2 = getBBox2(tmp)
###!!!!!
isShape = sapply(tmp, xmlName) != "text"
if(any(isShape)) {
# Convert getBBox to x0, y0, width and height, not x0, y0, x1, y1
# Do this in getBBox() as an option.
vv = getBBox(tmp[isShape], diffs = TRUE)
bb2[isShape,] = vv
}
# bb is for rect/line. So we are looking for lines that span at least half the page
# and are further "down" the page than our x node (which is located at bb2[1,])
# Was > .6 not .53
w = (bb[,3] - bb[,1])/as.numeric(xmlGetAttr(page, "width")) > .53 & bb[,2] > bb2[1,2]
if(any(w)) {
bot = max(bb[w, 4])
f = function(x) {
bb.n = getBBox2(x)
x[ bb.n[,2] + bb.n[,4] <= bot ]
}
#browser()
# Really it is if nodes is a list with all elements being XMLInternalElementNode
# or
nodes = if(length(nodes) != length(cols)) f(unlist(nodes)) else lapply(nodes, f)
} else if(!is.null(original.to)) {
# This is different from w above. This is the which rect/line in bb and then
# append which lines are below the first node.
tmp2 = c((bb[,3] - bb[,1])/as.numeric(xmlGetAttr(page, "width")) > .53, bb[,2] > bb2[1,2])
# if(any(tmp2))
# stop("check the threshold")
#XXX finish this off.
}
}
unlist(nodes, recursive = FALSE)
}
getLastNode =
# get the final node in the document - last node in last page
# Use this when getting the content for the last section
function(node, doc = as(node, "XMLInternalDocument"))
{
# getNodeSet(doc, "//text[last() and ancestor::page]")[[1]] <- returns bad results
ans = getNodeSet(doc, "//page[last()]/text[last()]")[[1]] # above caused errors so I put this back in. Whay was it replaced?
if(pageOf(ans) == pageOf(node)) {
# if on the same page, then we need to check which column node is in
# and ensure that the ans node is in the same column.
page = xmlParent(node)
byCol = getTextByCols(page, asNodes = TRUE)
w = inColumn(node, byCol)
ans = byCol[[w]][[ length(byCol[[w]]) ]]
}
ans
}
getFirstTextNode =
# get the final node in the document - last node in last page
# Use this when getting the content for the last section
function(doc)
{
getNodeSet(doc, "//page[1]/text[1]")[[1]]
}
findShortSectionHeaders =
function(colNodes, lines = nodesByLine(colNodes))
{
short = which(findShortLines(colNodes, lines, asLogical = TRUE))
# Now check if there is a little but more space between this line
# relative to the others and/or is it in a different font/color
tops = sapply(lines, function(x) min(as.numeric(sapply(x, xmlGetAttr, "top"))))
lineskip = median(diff(sort(tops)))
before = diff(tops)[short - 1]
isShort = short[ before > lineskip * 1.1]
lines[ isShort ]
# Check for fonts here or in a separate function.
}
orderNodes =
#
# Take a list of nodes and order them by page and within each page by column
# We'll assume they are ordered correctly within column already.
#
function(nodes, pages = sapply(nodes, pageOf))
{
unlist(tapply(nodes, pages, orderNodesInPage))
}
orderNodesInPage =
function(nodes, columnNum = sapply(nodes, inColumn, colNodes),
colNodes = getTextByCols(page, breaks = colPos, asNodes = TRUE),
colPos = getColPositions(if(colsAcrossPages) as(nodes[[1]], "XMLInternalDocument") else xmlParent(nodes[[1]]), acrossPages = colsAcrossPages),
page = xmlParent(nodes[[1]]),
colsAcrossPages = any(grepl("References", sapply(nodes, xmlValue))))
{
# If this page includes a References section but the number of columns is 1 and the number of
# columns on the previous page is 2, then use the previous page's columns.
if(colsAcrossPages && length(colPos) == 1 && length(x <- getColPositions(getSibling(page, FALSE))))
colPos = x
nodes[order(columnNum)]
}
removeNumPrefixes =
# removeNumPrefixes(c("1.2 abc", " 1.2 abc def", "1.x abc", " abc def") )
function(x)
{
gsub("^[[:space:]]*[0-9.]+ ?", "", x)
}
getLastRealTextNode =
function(doc, docFont = getDocFont(doc), nodes = getNodeSet(doc, xpathQ("//text", doc)), textFonts = getTextFonts(doc, txtNodes = nodes))
{
# Make this smarter by finding the text that comes after the main text of the
# document. Acknowledgements, etc.
# We are currently calling this because we have no section headers.
# So we may look at text that is different from the regular document font.
# and/or look for
#browser()
byPage = split(nodes, sapply(nodes, pageOf))
tmp = byPage[[ length(byPage) ]]
cols = getTextByCols(doc[[ length(byPage) ]], txtNodes = tmp, asNodes = TRUE)
tmp = cols[[length(cols)]]
tmp = orderByLine(tmp)
ans = tmp[[length(tmp)]]
ans
# ans[[length(ans)]]
# bb = getBBox2(tmp)
# o = order(bb[, "top"], bb[, "left"])
# tmp = tmp[o]
# tmp[[ length(tmp) ]]
# textFonts[[length(textFonts)]]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.