# ReadPDF: Extract Information from PDF Documents

#3 column example /Users/duncan/DSIProjects/Zoonotics/NewData_Feb2017/Zoo_02_02_2017 Copy.Data/PDF/2586849323


# Interactive example; the if(FALSE) guard means none of this runs when the
# file is sourced.
if(FALSE) {
doc = xmlParsePDFTOHTML("2ColPaper.xml")

# Deal with the second page.
p = doc[[2]]
renderPage(p, cex = .5)

# Get the text of the page via getTextByCols() and count the characters
# in each element of the result.
cols = getTextByCols(p)
nchar(cols)
}


# S4 generic: getTextNodes(x, ...) dispatches on the class of x.
# The methods defined in this file return the <text> nodes found in x.
setGeneric("getTextNodes",
           function(x, ...)
             standardGeneric("getTextNodes"))

# Document-level method: every <text> node anywhere in the document
# (absolute XPath query).
setMethod(f = "getTextNodes", signature = "PDFToXMLDoc",
          definition = function(x, ...) getNodeSet(x, "//text"))

# Page-level method: only the <text> nodes below this page node
# (XPath query relative to x).
setMethod(f = "getTextNodes", signature = "PDFToXMLPage",
          definition = function(x, ...) getNodeSet(x, ".//text"))
          


##
# Todo
#[Done]  Deal with pages that have some text in a column that spans the entire width of the page
#   and then others in 2 or more columns, e.g., an abstract and then the 2 column text.
#
#
# Fix the fonts so that they are unique. Do this in pdftohtml

#
# Headers and footers on the pages.


pdfText =
    #
    # S3 generic for extracting the text of a document or page.
    # Dispatches on the class of doc; the default values for numPages
    # and docFont are only evaluated if a method actually uses them.
    #
function(doc, numPages = getNumPages(doc), docFont = getDocFont(doc))
{
    UseMethod("pdfText")
}

pdfText.PDFToXMLDoc = pdfText.PDFToXMLDoc.character = pdf_text =
    #
    # Extract the text, column by column, for the first numPages pages of a document.
    #
    #  doc       a parsed document, or a character string which is passed to
    #            xmlParsePDFTOHTML() to obtain one.
    #  numPages  the maximum number of pages to process; capped at the number
    #            of pages in the document.
    #  docFont   passed on to getTextByCols() for each page.
    #
    # Returns a list with one element per processed page, each the result of
    # getTextByCols() for that page.
    #
function(doc, numPages = getNumPages(doc), docFont = getDocFont(doc))
{
   if(is.character(doc))
      doc = xmlParsePDFTOHTML(doc)

      # seq_len() rather than seq(1, n): when n is 0, seq(1, 0) is c(1, 0) and
      # would select the first page instead of no pages.
   n = max(0, min(numPages, getNumPages(doc)))
   lapply(getPages(doc)[seq_len(n)], getTextByCols, docFont = docFont)
}

pdfText.PDFToXMLPage =
    #
    # Page method: the text of a single page is simply what getTextByCols()
    # gives for that page. numPages is accepted for compatibility with the
    # generic but is not used.
    #
function(doc, numPages = getNumPages(doc), docFont = getDocFont(doc))
    getTextByCols(doc, docFont = docFont)

isCenteredMargins =
    #
    # Is the node horizontally centered between the given margins?
    #
    #  node     the node to test.
    #  margins  the positions of the margins; when not supplied they are
    #           computed by calling the margins() function on the node's parent.
    #  bbox     bounding box(es); column 1 and column 3 are combined as
    #           column1 + column3/2 to get the horizontal mid-point
    #           (presumably left and width - confirm against getBBox2()).
    #
    # Returns TRUE when the mid-point is within 20% of the margin span of the
    # mean of the margins.
    #
function(node, margins = margins(xmlParent(node)), bbox = getBBox2(list(node)))
{
       # The default value for margins calls a function with the same name as the
       # parameter. Evaluating that default directly makes R look up `margins` in this
       # call frame, find the (unevaluated) argument itself and fail with
       # "promise already under evaluation". So when the argument is not supplied, find
       # the margins() function starting from the enclosing environment instead.
    if(missing(margins)) {
        marginsFun = get("margins", envir = parent.env(environment()), mode = "function")
        margins = marginsFun(xmlParent(node))
    }

    mid = bbox[,1] + bbox[,3]/2

    abs(mid - mean(margins)) < .2 * diff(margins)
}

isCentered =
    #
    # Determine if the node is centered within a column.
    # This is for use in determining section titles.
    # If we find a title that is centered and then find other text
    # with the same font but that is not centered, then that
    # additional text is not a section title.
    #
    #  node       the text node to test.
    #  cols       the text nodes of the node's page grouped by column
    #             (a list, one element per column).
    #  threshold  tolerance for how far the middle of the text may be from the
    #             middle of the column; it is multiplied by the median right
    #             end of the lines in the column.
    #  colNum     which element of cols the node is in; only the first value is
    #             used. (Presumably inColumn() returns the column index - confirm.)
    #
    # Returns FALSE if there are no columns or the node's line looks like a
    # regular (left-aligned or nearly full-width) line; otherwise a logical
    # indicating whether the text is indented and its middle is close to the
    # middle of the column.
    #
function(node, cols = getTextByCols(xmlParent(node), asNodes = TRUE),
         threshold = .2, colNum = inColumn(node, cols))
{
  if(length(cols) == 0)
      return(FALSE)
    
     # find out which column the node is in and get the nodes in that column and
     # their bounding boxes

  bb = getBBox2(cols[[ colNum[1] ]] )

     # This is now also done in nodesByLine and getLineEnds(). Leave
     # this here for now but eventually use those.
     # Assemble the lines from these nodes (grouping by identical "top" value)
     # and compute the horizontal extent (left, right) of each line. Then take the
     # median of the left ends and the median of the right ends across the lines.
  byLine = by(bb, bb[, "top"], function(x) c(min(x[, "left"]),max(x[,"left"] + x[, "width"])))
  byLine2 = do.call(rbind, byLine)
  pos = apply(byLine2, 2, median)

      # and now the middle of the column, i.e., half way between the median
      # left and median right ends of the lines.
  mid = median(pos)

     # Look up the extent of the line this node is on, using its top attribute
     # as the name of the line.
  top = xmlGetAttr(node, "top")
  lw = byLine[[ top ]]
    # lw could be NULL if we use docFont = TRUE when getting the nodes in getTextByCols().
    # Not centered if the node's line starts within 5 units of the median left end,
    # or if the line is less than 40 units narrower than the median line.
  if(length(lw) && ((lw[1] - pos[1] < 5) || diff(pos) - diff(lw) < 40))
      return(FALSE)
  
     # now compute the middle of the string itself.
  textPos = as.numeric(xmlAttrs(node)[c("left", "width")])
  textMid = textPos[1] + textPos[2]/2
  
     # Centered if the text starts more than 10% of the median line width to the right
     # of the median left end and its middle is close to the middle of the column.
  textPos[1] - pos[1] > .1*diff(pos) & abs(textMid - mid) <  threshold *  median(byLine2[, 2])
}


nodesByLine =
    #
    # Group a collection of nodes in a column by lines, allowing them
    # to have slightly different vertical positions: nodes are binned
    # vertically into intervals whose height is the font size.
    #
    #  nodes     the text nodes, or a single <page> node, in which case all of
    #            the <text> nodes below it are used.
    #  asNodes   if TRUE each line is the list of its nodes ordered left to right;
    #            otherwise each line is its text pasted together.
    #  bbox      data frame of bounding boxes for the nodes (top, height, ... columns).
    #  baseFont  the font information for the document; used only to get the default fontSize.
    #  fontSize  the height of the vertical bins.
    #  addText   not used in the computations here, but passed along when the
    #            nodes are processed page by page.
    #  useBase   if TRUE, bin on the bottom (top + height) of each node rather than its top.
    #  rotate    if TRUE, the columns of bbox are relabelled so that top/left and
    #            height/width are swapped.
    #
    # Returns a list with one element per non-empty line, named by the text of that line.
    # Since cut() uses right-closed intervals starting at 0, a node whose
    # position is exactly 0 does not fall in any bin.
    #
function(nodes, asNodes = TRUE, bbox = getBBox2(nodes, TRUE),
         baseFont = getDocFont(as(nodes[[1]], "XMLInternalDocument")),
         fontSize = if(nrow(baseFont) > 0) baseFont$size else 11,
         addText = TRUE, useBase = TRUE, rotate = FALSE
        )
{
    if(length(nodes) == 1 && xmlName(nodes[[1]]) == "page")
        nodes = getNodeSet(nodes, ".//text")
    
    if(length(nodes) == 0)
        return(list())

       # If the nodes come from more than one page, process each page separately
       # and then concatenate the lines. All of the settings, including useBase,
       # are passed along so that each page is treated the same way as a
       # single-page call would be. bbox is recomputed for each page's nodes.
    pgnum = sapply(nodes, pageOf)
    if(length(unique(pgnum)) > 1) {
       tmp = tapply(nodes, pgnum, nodesByLine, asNodes, baseFont = baseFont, fontSize = fontSize,
                    addText = addText, useBase = useBase, rotate = rotate)
       return(structure(unlist(tmp, recursive = FALSE, use.names = FALSE), names = unlist(lapply(tmp, names))))
    }


    if(rotate)
        bbox = structure(bbox, names = c("top", "left", "height", "width", "text"))
    
      # If useBase, then we work with the bottom position of each character/segment/node, i.e. the baseline.
    if(useBase)
       bbox$top = bbox$top + bbox$height

      # Vertical bins of height fontSize covering all of the positions.
    intv = seq(0, max(bbox$top)+ fontSize - 1, by = fontSize)
    topBins = cut(bbox$top, intv)
    byLine = lapply(split(nodes, topBins), arrangeLineNodes, asNodes)

      # Name each line by its text and discard the bins that have no nodes.
    names(byLine) = sapply(byLine, arrangeLineNodes, FALSE)
    byLine[ sapply(byLine, length) > 0]
}    

arrangeLineNodes =
    #
    # Order the nodes on one line (i.e., in the same top bin) from left to right
    # using their left attribute.
    # Returns the ordered nodes if asNodes is TRUE; otherwise the text of the
    # ordered nodes pasted together, separated by a space.
    #
function(nodes, asNodes = TRUE)
{
    leftPos = as.numeric(sapply(nodes, xmlGetAttr, "left"))
    sorted = nodes[order(leftPos)]

    if(!asNodes)
        return(paste(xmlValue(sorted), collapse = " "))

    sorted
}

getLineEnds =
    # Takes a list with each element a collection of nodes for that line.
    # Returns a matrix with one row per line and two columns: the left-most
    # position and the right-most position (left + width) of the nodes on
    # that line. An empty list gives a matrix with 0 rows and 2 columns.
function(lines)
{
      # vapply() rather than sapply() so that we always get a 2 x n matrix to
      # transpose, even when there are no lines.
   ends = vapply(lines, function(x) {
                            b = getBBox2(x, TRUE)
                            c(min(b$left), max(b$left + b$width))
                        }, numeric(2))
   t(ends)
}


#######
findShortLines =
    #
    # Find the lines whose right end falls short of the right end of most
    # of the lines in the column. This is one criterion that may identify
    # a line as a section or sub-section header, but it is also true of
    # the final line of a paragraph.
    #
    # A line is short if its right end is to the left of the 75th percentile of the
    # right ends by more than 10% of the median line width.
    #
    #  lw         two-column matrix giving the left and right end of each line.
    #  asLogical  if TRUE, return the logical vector rather than the lines.
    #
    # The short lines themselves are returned only if lines was explicitly
    # supplied in the call (and asLogical is FALSE); otherwise the result is
    # the logical vector.
    #
function(nodes, lines = nodesByLine(nodes),
         lw = getLineEnds(lines), asLogical = FALSE)            
{
    rightEnds = lw[, 2]
    typicalWidth = median(rightEnds - lw[, 1])
    isShort = quantile(rightEnds, .75) - rightEnds > .1 * typicalWidth

    if(asLogical || missing(lines))
        return(isShort)

    lines[isShort]
}



##################

getTextByCols =
    #
    # Split the text on a page into columns based on the horizontal position
    # (left) of each text node.
    #
    # Have to remove headers and footers first!
    #
    #  The nodes that are a little further to the right of the majority are indentations of the
    #  first line in a paragraph, like this sentence!
    #
    #  Need to identify blocks of text that span the entire page and those that are columnar.
    #
    #  p          the page.
    #  threshold  passed to getColPositions() when computing the default breaks.
    #  asNodes    if TRUE return the nodes in each column; otherwise the text of
    #             each column as a single string with elements separated by a new line,
    #             ordered by their top position.
    #  txtNodes   the text nodes to process.
    #  bbox       data frame of bounding boxes for txtNodes (left and top columns are used).
    #  breaks     the column positions; the first is dropped and the nodes are
    #             binned by left into the intervals (0, breaks[2]], ..., (breaks[k], Inf].
    #  perPage, docFont   control how the default breaks and txtNodes are computed.
    #  order      for asNodes = TRUE, order the nodes in each column via orderByLine().
    #             Ignored, with a warning, for asNodes = FALSE.
    #
    # Returns character() if there are no text nodes. Otherwise a list of nodes
    # (asNodes = TRUE) or a character vector (asNodes = FALSE) with one element
    # per column. The result is returned visibly in all cases.
    #
function(p, threshold = .1, asNodes = FALSE,
         txtNodes = getNodeSet(p, getXPathDocFontQuery(p, docFont)),
         bbox = getBBox2(txtNodes, TRUE),
         breaks = getColPositions(if(perPage) p else as(p, "XMLInternalDocument"), threshold = threshold, bbox = bbox, perPage = perPage, docFont = docFont, ...),
         perPage = FALSE, docFont = FALSE,
         order = FALSE, ...)         
{
    if(length(txtNodes) == 0)
        return(character())

       # Which column each node falls in.
    colBins = cut(bbox$left, c(0, breaks[-1], Inf))

    if(asNodes) {
        ans = split(txtNodes, colBins)
        if(order)
            ans = lapply(ans, function(x) unlist(orderByLine(x)))
        return(ans)
    }

    if(order)
        warning('ignoring order in getTextByCols for asNodes = FALSE for now')

       # The text is only needed when returning strings, so get it only now.
    bb = bbox
    bb$text = sapply(txtNodes, xmlValue)
    cols = split(bb, colBins)

       # This is the last expression (not an assignment) so that the value is visible.
    sapply(cols, function(x) paste(x$text[ order(x$top) ], collapse = "\n"))
}




bodyLine =
    #
    # Tabulate the y1 positions of the <line> elements in the document that
    # span more than 95% of the horizontal extent of the text. This helps to see
    # whether the same horizontal rule occurs at the same position on each page.
    #
function(doc)
{
   lineNodes = getNodeSet(doc, "//line")
   lineBox = getBBox(lineNodes, asDataFrame = TRUE)
      # the page number for each line, taken from the parent of the line node.
   lineBox$pageNum = sapply(lineNodes, function(x) xmlGetAttr(xmlParent(x), "number"))

      # the bounding boxes of all of the text nodes and from these the
      # left-most and right-most extent of the text, i.e., the margins.
   textBox = getBBox2(getNodeSet(doc, "//text"))
   extent = c(left = min(textBox[, "left"]),
              right = max(textBox[, "left"] + textBox[, "width"]))

      # the length of each line relative to the width of the text;
      # keep the lines that cover more than 95% of it.
   relLength = (lineBox[, "x1"] - lineBox[, "x0"]) / diff(extent)
   wide = lineBox[ relLength > .95, ]

      # how often does each y1 value occur for these lines.
   table(wide[, "y1"])
}




# See ~/Davis/UCDSISR/R/ for getTranscriptCourses2

getPageLines =
    #
    # Split the text on a two-column page into the left and right columns and
    # reassemble the lines of text in each.
    #
    #  page    the page node.
    #  center  the horizontal position separating the two columns; elements whose
    #          left position is less than this are in the first (left) column.
    #  nodes   the text nodes on the page.
    #  bbox    the bounding boxes of the nodes, with left and top columns.
    #          The row names are pasted together to form the text of each line
    #          (presumably the row names are the text of the nodes - confirm
    #          against getBBox2()).
    #
    # Returns a list with an element for each column (left column first when both
    # are present), each giving the text of the lines in that column.
    #
function(page, center = 465, nodes = getNodeSet(page, "./text"), bbox = getBBox2(nodes))
{
       # split() orders the groups FALSE, TRUE so reverse to put the left column first.
    cols = rev(split(as.data.frame(bbox), bbox[,"left"] < center))
    tmp = lapply(cols, reassembleLines)

       # Last expression is not an assignment so that the result is visible.
    lapply(tmp, function(x) sapply(x, function(x) paste(rownames(x), collapse = " ")))
}

reassembleLines =
    #
    # Group the rows of a data frame of bounding boxes by identical top value
    # and arrange the rows within each group with assembleLine().
    # Returns the by object with one element per distinct top value.
    #
function(box)
    by(data = box, INDICES = box[, "top"], FUN = assembleLine)


assembleLine =
    #
    # Arrange the elements of a line from left to right, i.e., reorder the
    # rows of the data frame by increasing value of the left column.
    #
function(els)
    els[order(els$left), ]




xfoo = 
    #
    # Group the text nodes of a page into lines with nodesByLine() and
    # return the left and right end of each line from getLineEnds().
    # The result is returned visibly.
    #
function(page, nodes = getNodeSet(page, ".//text"))
{    
    getLineEnds(nodesByLine(nodes))
}


combineBBoxLines =
    #
    # Group the rows of a data frame of bounding boxes by the values in the
    # column named by `by`, apply combineLines() to each group and stack the
    # results into a single data frame.
    # A data frame with no rows is returned as is.
    #
function(bbox, by = "y1")
{
    if(nrow(bbox) > 0) {
        groups = by(bbox, bbox[[by]], combineLines)
        bbox = do.call(rbind, groups)
    }

    bbox
}

combineLines =
    #
    # Merge a set of line segments into a single segment if each one starts
    # before the previous one ends.
    #
    #  bbox   data frame with columns x0, x1, y0, y1, one row per segment,
    #         with the rows taken in the order given.
    #  sameY  if TRUE, only merge when all of the y0 and y1 values are identical.
    #
    # Returns the first row with x0 and x1 replaced by the overall minimum and
    # maximum when the segments are merged; otherwise bbox unchanged.
    #
function(bbox, sameY = FALSE)
{
    n = nrow(bbox)

       # The gap between the start of each segment and the end of the one before it.
       # Negative means the right end of a segment goes beyond the left end of the next one,
       # i.e., they overlap and do not simply meet at the same point.
       # See Nittapatana-2008 paper.
    gaps = bbox$x0[-1] - bbox$x1[-n]
    overlapping = all(gaps < 0)

    flat = length(unique(unlist(bbox[, c("y0", "y1")]))) == 1

    if(!overlapping || (sameY && !flat))
        return(bbox)

    merged = bbox[1, ]
    merged[c("x0", "x1")] = c(min(bbox$x0), max(bbox$x1))
    merged
}