# Supplement17.R

library(XML)
library(ReadPDF)
invisible(sapply(list.files("~/DSIProjects/ReadPDF/R", pattern = "\\.R$", full = TRUE), source))

# Select the <text> nodes of a page that lie vertically inside the tallest
# rectangle on that page.
#
# page      - page node; only used to compute the defaults of the other
#             arguments.
# bbr       - bounding boxes of the page's <rect> nodes; needs y0 and y1.
# textNodes - the candidate <text> nodes.
# bb        - bounding boxes of textNodes (same order); needs top and height.
# asNodes   - if TRUE return the selected nodes; otherwise return
#             nodesByLine(nodes, FALSE).
supp17Text =
function(page, bbr = getBBox(getNodeSet(page, ".//rect"), TRUE),
         textNodes = getNodeSet(page, ".//text"),
         bb = getBBox2(textNodes, TRUE),
         asNodes = TRUE)
{
    #XXX Do rotated.
    # The tallest rectangle is the one we use as the enclosing box.
    i = which.max(bbr$y1 - bbr$y0)

    # Keep the text whose top and bottom both fall inside that rectangle.
    w = bb$top >= bbr$y0[i] & bb$top + bb$height <= bbr$y1[i]
    nodes = textNodes[w]

    if(asNodes)
        nodes
    else
        nodesByLine(nodes, FALSE)
}

# Find the page holding the 'Estimated Supplemental ...' text.
#
# doc - a PDFToXMLDoc, or anything as() can coerce to one.
#
# Returns whatever pageOf(, TRUE) yields for the chosen node, or NULL when
# no <text> node starts with 'Estimated Supplemental'.
# Stops with an error when several candidate pages cannot be told apart.
getSupplementalPage =
function(doc)    
{
    doc = as(doc, "PDFToXMLDoc")
    z = getNodeSet(doc, "//text[starts-with(., 'Estimated Supplemental')]")

    # Nothing to find: return explicitly rather than falling through to z[[1]].
    if(length(z) == 0)
        return(NULL)

    if(length(z) == 1)
        return(pageOf(z[[1]], TRUE))

    # Several matches: prefer the page(s) that contain rectangles.
    pgs = pageOf(z, TRUE)
    w = vapply(pgs, function(x) length(getNodeSet(x, ".//rect")) > 0, logical(1))

    if(sum(w) == 0)
        stop("none of the pages containing 'Estimated Supplemental' has a <rect>")

    if(sum(w) > 1) {
        # Still ambiguous: keep pages having a box taller than 20 units.
        h = vapply(pgs, function(x) { tmp = getBBox(x, TRUE); max(abs(tmp$y1 - tmp$y0))}, numeric(1))
        w = h > 20
        if(sum(w) > 1)
            stop("need to distinguish pages further for supplemental data")
        else if(sum(w) == 0)
            stop("discarded too many nodes")
    }

    pgs[w][[1]]
}

# Extract the values printed level with the 'Estimated Supplemental ...'
# label in an LCAP document.
#
# doc - a PDFToXMLDoc, or anything as() can coerce to one
#       (the default is the name of an XML file).
#
# Returns the text of those <text> nodes on the label's page that sit level
# with the label and contain at least one digit.
# Stops with an error when the label cannot be found or identified.
suppl17 = 
function(doc = "2017_18/Palo_Alto_Unified.xml")
{    
    #    doc = readPDFXML(f)
    doc = as(doc, "PDFToXMLDoc")

    z = getNodeSet(doc, "//text[starts-with(., 'Estimated Supplemental')]")
    if(length(z) == 0)
        stop("can't find the text 'Estimated Supplemental'")

    # For Palo Alto Unified
    # 2 instances, one ends in : The second (page 146) is instructions.
    # We want the one ending in :
    w = grepl(":$", vapply(z, xmlValue, character(1)))

    # No candidate ends in ':' so fall back to the page heuristic.
    if(!any(w))
        w = vapply(z, isOnDemonstrationPage, logical(1))

    if(!any(w))
        stop("can't identify which 'Estimated Supplemental' text is the data label")

    # Work with the first candidate only, so that the bounding box and the
    # page refer to the same node and the comparisons below never recycle
    # over several labels.
    label = z[w][1]
    bb = getBBox2(label, TRUE)
    p = pageOf(label, TRUE)[[1]]

    pos = getBBox2(getNodeSet(p, ".//text"), TRUE)

    # Rotation can be on the page or on the <text>, e.g.
    #    xmlGetAttr(label[[1]], "rotation", 0)
    # Rotation handling is switched off for now, so the unrotated
    # comparison below is always the one used.
    rot = 0 # temporary.

    # We can find the colored box that is "level" with the text.
    if(rot == 0)
        level = abs(pos$top - bb$top) < bb$height
    else {
        # 14 comes from Denair doc where the width of the rotated text is 0!
        level = abs(pos$left - bb$left) < max(bb$width, 14)
    }

    # Only the entries containing a digit are values.
    i = grep("[0-9]", pos$text[level])
    pos$text[level][i]
}


# Is this node on a page that contains at least one <rect>?
#
# node - an XML node; its parent is searched for <rect> descendants
#        (presumably the parent is the <page> - TODO confirm).
#
# Returns a single TRUE or FALSE.
#
# An alternative test that was never reached in the earlier version looked
# for text starting with 'Demonstration':
#    length(getNodeSet(p, ".//text[starts-with(., 'Demonstration')]")) > 0
isOnDemonstrationPage =
function(node)
{
    p = xmlParent(node)
    length(getNodeSet(p, ".//rect")) > 0
}

# Interactive scratch code, never run when the file is sourced.
# It applies suppl17() to every 2017_18 XML file, tabulates the failures,
# and cleans up the extracted amounts and percentages.
# The commented tables are output recorded from earlier runs.
if(FALSE) {
    xx = list.files("2017_18", pattern = "xml$", full.names = TRUE)
    xx.ans = sapply(xx, function(f) try(suppl17(f)))
    b = sapply(xx.ans, inherits, "try-error")
    table(b)
#FALSE  TRUE 
#  735    59 

    scan = sapply(xx[b], function(x) tryCatch(isScanned(x), error = function(...) NA))
    table(scan, useNA = "always")
    # The ones we fail on that are not scanned and not a pdftohtml error (NA) are the wrong format
    # from 2016.
#FALSE  TRUE  <NA> 
#   30    27     2     


    # Now for the ones where we got something: how many values did we get?
    nvals = sapply(xx.ans[!b], length)
    table(nvals)
#  0   1   2   3   4   5 
# 17   3 707   5   2   1 

    ans = xx.ans[!b]
    amt = XML:::trim(sapply(ans[nvals == 2], `[`, 1))
      # remove spaces after ,
    amt = gsub(", +", ",", amt)
    amt = gsub("\\$Supplemental: +", "", amt)
    # convert trailing million to e6 - scientific notation (1 million = 1e6).
    amt = gsub(" million", "e6", amt)
    table(grepl("^\\$", amt))

    amt[!(grepl("^\\$", amt))]  #

    cur = as(amt, "Currency")  # doesn't convert $2.1 million
    amt[is.na(cur)]
# Clean up ', ', million $Supplemental: 
    
    pct = XML:::trim(sapply(ans[nvals == 2], `[`, 2))
    table(grepl("%", pct))
    pct[!grepl("%", pct)] # no %
    # Tiny bit of work to clean pct before converting    
    pct = as(pct, "Percent")


    
     # The files that did not give us 2 values (5 in an earlier run, 13 listed below).
    names(ans)[nvals != 2]
#[1] "2017_18/Alpine_County_Unified_and_County_Office_of_Education.xml"
#[2] "2017_18/Bangor_Union_Elementary.xml"                             
#[3] "2017_18/Borrego_Springs_Unified.xml"                             
#[4] "2017_18/Cloverdale_Unified.xml"                                  
#[5] "2017_18/Denair_Unified.xml"

#[6] "2017_18/Durham _Unified.xml"                                     
#[7] "2017_18/Franklin-McKinley_Elementary.xml"                        
#[8] "2017_18/Gridley _Unified.xml"                                    
#[9] "2017_18/Jamul-Dulzura_Union_Elementary.xml"                      
#10] "2017_18/Kashia_Elementary.xml"                                   
#11] "2017_18/Laguna_Beach_Unified.xml"                                
#12] "2017_18/Larkspur-Corte_Madera.xml"                            rotated
#13] "2017_18/Los_Alamitos_Unified.xml"                       narrow text means on a different line than value.

    # Alpine Values in box are on 2 lines. Mid point rather than top.
    # Bangor - page is narrower and the text is taller. Need to work with mid point, not top.
    # Borrego - rotated (p43). But we cannot find the whole string in a text. So will have to recombine them.
    # Cloverdale - right of page is cropped and value is not present.
    # Denair - page 55 is rotated. Actually, it is each text element. Handled now.
#2017_18/Alpine_County_Unified_and_County_Office_of_Education.xml 
#                                                              4 
#                            2017_18/Bangor_Union_Elementary.xml 
#                                                              0 
#                            2017_18/Borrego_Springs_Unified.xml 
#                                                              0 
#                                 2017_18/Cloverdale_Unified.xml 
#                                                              1 
#                                     2017_18/Denair_Unified.xml 
#                                                              3     


    # Overall
    # Not counting the xml files we didn't generate. At least 2.
    status = c(num = length(xx),
               numFailed = sum(b),
               numScanned = sum(!is.na(scan) & scan),
               numProc = length(nvals),
               pctProc = length(nvals)/length(xx),
               ok = sum(nvals == 2),
               notOk = sum(nvals != 2),
               pctOk = sum(nvals == 2)/length(nvals),
               # scanned documents are excluded from the denominator
               overallOk = sum(nvals == 2)/(length(xx) - sum(!is.na(scan) & scan))
              )
# 92%
}
# dsidavis/LCAP documentation built on May 15, 2019, 4:19 p.m.