R/plot.R

Defines functions getNodeColors renderLinesRects renderCoord renderCoords pageTitle plot.PDFToXMLDoc

#library(XML)

# See MarkLundy/readPDF2HTML.R
# Create the class hierarchy for a document generated by our modified pdftohtml.

pdftohtmlDoc =
    #
    # read an XML or a PDF document. If it is XML, it is assumed to be
    # already converted by pdftohtml. If it is a PDF document, we convert it.
    #
function(file)
{
   if(grepl("\\.pdf$", file))
      convertPDF2XML(file)
   else {
      doc = xmlParse(file)
      class(doc) = c("PDFToXMLDoc", "ConvertedPDFDoc", class(doc))
      doc
   }
}

showPage =
    # toplevel function.  Provide a file and a page number, and we render that page.
function(x, pageNum = 1, doc = xmlParse(x), page = getNodeSet(doc, "//page")[[pageNum]], ...)
{
  if(is(x, "XMLInternalDocument") && missing(doc))
      doc = x
  else if(missing(pageNum) && is(x, "XMLInternalElementNode") && xmlName(x) == "page")
      page = x
  
  renderPage(page, ...)
}

plot.PDFToXMLPage = showPage

plot.PDFToXMLDoc =
function(x, y, ...)
{
    np = getNumPages(x)
    r = ceiling(sqrt(np))
    c = np/r
    opar = par(no.readonly = TRUE)
    on.exit(par(opar))
    par(mfrow = c(r, c))
    invisible(sapply(getPages(x), renderPage))
}

pageTitle =
function(page, docname = docName(page), fullName = TRUE)
{
    if(!fullName)
        docname = basename(docname)
    sprintf("%s, page %s", URLdecode(docname), xmlGetAttr(page, "number"))
}
    

renderPage =
    #
    # Give us the page and we draw its elements - line, rect, img, text
    # The text is not proportional so is often much shorter in the horizontal direction
    # than it would be when displayed as PDF.
    #
    # No color information on the text nodes at this point.
    #
function(page, cex.text = .5, adj = c(0, 1), showText = TRUE, showBoxes = FALSE,
         title = pageTitle(page, fullName = fullName), fullName = TRUE) # , showColors = TRUE)
{    
    p = page
    psize = as.integer(xmlAttrs(p)[c("height", "width")])
    h = psize[1]

    plot(0, type = "n", xlab = "", ylab = "", xlim = c(0, psize[2]), ylim = c(0, psize[1]))
    title(title)

    renderLinesRects(page, h = h)
    renderCoords(page, h = h)
    
    imgs = getNodeSet(page, ".//img")
    if(length(imgs)) {
        bb = getBBox2(imgs, attrs = c("x", "y"))

        #XXX??? Should these y values be subtracted from h???
        # And are these widths and heights that need to be added!
        rect(bb[,1], h - bb[,2], bb[,1] + bb[,3], h - bb[,4] - bb[,4], border = "blue", lty = 3)
    }    

    if(showText) {
        txt = getNodeSet(page, ".//text")
        if(length(txt)) {
            bb = t(sapply(txt, xmlAttrs))
            if(!("rotation"  %in% colnames(bb)))
               bb = cbind(bb, rotation = rep(0, nrow(bb)))
            storage.mode(bb) = "double"

            prot = xmlGetAttr(page, "rotation", NA, as.numeric)
            if(!is.na(prot))
                bb[, "rotation"] =  bb[, "rotation"] + prot
            
            colors = getNodeColors(txt)
            text = sapply(txt, xmlValue)
            by(1:nrow(bb), bb[, "rotation"],
                   function(i)
                       text(bb[i,2], h - bb[i,1], text[i], cex = cex.text, adj = adj, col = colors[i], srt = - bb[i[1], "rotation"]))
        }
    }

    if(showBoxes) {
        txt = getNodeSet(page, ".//text")
        bb = getBBox2(txt)

        rect(bb[,1], h - bb[,2], bb[,1] + bb[,3], h - bb[,2] - bb[,4], border = "lightgreen")
    }

    TRUE
}

renderCoords =
function(page, h, coords = getCoords(page), col = "black")
{
    #   lapply(coords, renderCoord, h)
    X = unlist(lapply(coords, function(x) c(x$x, NA)))
    Y = unlist(lapply(coords, function(x) c(x$y, NA)))
    #XXX add color and lwd.
    lines(X, h - Y, col = col)
}

renderCoord =
function(x, h)
{
    lines(x$x, h - x$y, col = x$stroke, lwd = x$lineWidth)
}



renderLinesRects =
function(page, h, border = "green")
{
    rr = getNodeSet(page, ".//rect ")
    if(length(rr)) {
        bb = getBBox(rr)
        col = sapply(rr, function(x) mkColor(xmlGetAttr(x, "fill.color", "0,0,0"), isFill = TRUE))
        lwd = max(1, as.numeric(sapply(rr, function(x) xmlGetAttr(x, "lineWidth", "1.0"))))
        rect(bb[,1], h - bb[,2], bb[,3], h - bb[,4], border = border, col = col, lwd = lwd)
#XXXX temp        rect(bb[,1], bb[,2], bb[,3], bb[,4], border = border, col = col, lwd = lwd)        
    }

    if(length ( lines <- getNodeSet(page, ".//line "))) {
        bb = getBBox(lines)
        renderLines(bb, h, lines)
    }
}



renderLines =
    # See below for vectorized version.
function(bb, h, lines)
{
    sapply(1:nrow(bb),
           function(i) {
               at = xmlAttrs(lines[[i]])
               lines(bb[i, c(1,3)], h - bb[i, c(2, 4)], col = mkColor(at["stroke.color"]),
                     lwd = max(1, as.numeric(at["lineWidth"], na.rm = TRUE)),
                     lty = 2)
           })
        # lines(bb[,1], h - bb[,2], bb[,3], h-bb[,4], col = "red")    
}


renderLines =
    # need to split by lwd and color.
    # Add lwd and color from nodes.
function(bb, h, nodes)
{
    n = nrow(bb)*3
    x = y = rep(as.numeric(NA), n)
    i = seq(1, by = 3, length = nrow(bb))    
    x[i] = bb[,"x0"]
    x[i+1] = bb[, "x1"]
    y[i] = bb[, "y0"]
    y[i+1] = bb[, "y1"]
    col = "black"
    lines(x, h - y, col = col, lwd = 1, lty = 2)
#XXXX temp    lines(x, y, col = col, lwd = 1, lty = 2)
}


getNodeColors =
function(textNodes, fonts = getFontInfo(xmlParent(textNodes[[1]])))
{
    f = sapply(textNodes, xmlGetAttr, "font")
    fonts$color[ match(f, fonts$id) ]
}

mkColor =
    # Convert a triple of RGB values as a string into  a color
    # assuming a maxColorValue of 2^16
    # This could be, but is not vectorized.
function(x, alpha = maxColorValue, maxColorValue = 2^16,  isFill = FALSE)
{
   if(is.null(x) || x == "" || is.na(x))
       return(rgb(0, 0, 0))
    
   els = as.integer(strsplit(x, ",")[[1]])
   if(isFill && all(els == 0))
       NA
   else
       rgb(els[1], els[2], els[3], alpha, maxColorValue = maxColorValue)
}
dsidavis/ReadPDF documentation built on June 12, 2025, 6:39 a.m.