R/pdf2eco.R

# Functions to go from a PDF/XML document to resolved keywords, via the ecoextract.py (EpiTator) JSON output

##' Run the term extractor on a document
##' 
##' 
##' This function will run the term extractor (based on EpiTator \url{https://github.com/ecohealthalliance/EpiTator})
##' on a document. The document can be either XML generated by pdftohtml or a PDF document,
##' which will be converted internally to an XML document. Alternatively, the raw section text
##' can be supplied directly via \code{section.text}. Results and the intermediate text, split
##' by section, can optionally be saved.
##' 
##' @title Document to Resolved Keywords
##' @param doc.file a file to parse, either XML or PDF
##' @param ecoextract file path to the ecoextract.py script
##' @param results.dir optional, directory to store the results as an rds file.
##'     If not specified, no results will be saved. If the directory does not
##'     currently exist, it will be created.
##' @param results.file optional, file name to use for the results; defaults to the
##'     basename of \code{doc.file} with the extension replaced by .rds
##' @param cache.dir optional, directory to cache the intermediate section text from
##'     \code{ReadPDF::getSectionText}. If not specified, no caching is performed.
##' @param cache.file optional, file name to use for the cached section text
##' @param section.text a list, with one element per section to be processed.
##'     If not supplied, it is read from \code{doc.file} (or from the cache, if present).
##' @return a list with one element per processed section, each containing the
##'     resolved keywords arranged in a nested list.
##' @examples
##' \dontrun{
##' txt = "This mentions China"
##' ans = doc2keywords(section.text = list(body = txt))
##' getLocation(ans)
##' }
##' 
##' @author Matt Espe and Duncan Temple Lang
doc2keywords =
    function(doc.file, ecoextract = getEcoExtractPyScript(),
             results.dir = character(), results.file = file.path(results.dir, gsub("\\.(xml|pdf)$", ".rds", basename(doc.file))),
             cache.dir = character(), cache.file = file.path(cache.dir, gsub("\\.(xml|pdf)$", ".rds", basename(doc.file))),
             section.text = load_text(doc.file, cache.file, cache.dir))

{    
    # Clean up special characters
    section.text = lapply(section.text, function(x) gsub('Â|"', "", x))

    # Drop references, etc.
    i = grep("references?|cited|acknowledgements?|funding|interests|disclosure", names(section.text),
             ignore.case = TRUE, invert = TRUE)
    if(length(i) > 0)
        section.text = section.text[i]

    ans = lapply(section.text, function(x) try(sect2eco(x, ecoextract)))

    if(length(results.dir) && !is.na(results.dir)) {
        if(!dir.exists(results.dir))
            dir.create(results.dir)
        
        saveRDS(ans, results.file)
    }
    ans
}
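
# Usage sketch (not run): assumes "paper.xml" was produced by pdftohtml and that
# python3 with the EpiTator dependencies is available.
#   kw = doc2keywords("paper.xml", results.dir = "results", cache.dir = "cache")
#   names(kw)   # one element per retained section, e.g. "title", "abstract", ...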

load_text = function(doc.file, cache.file,
                     cache.dir = dirname(cache.file))
{
    if(length(cache.file) && file.exists(cache.file)) {
        section.text = readRDS(cache.file)
    } else {
        section.text = readXMLSections(doc.file)
        if(length(cache.file) && !is.na(cache.file)){
            if(!dir.exists(cache.dir))
                dir.create(cache.dir)
            
            saveRDS(section.text, file = cache.file)
        }
    }
    
    section.text
}
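
# Example (not run): the first call parses the sections from the XML and caches them as rds;
# subsequent calls with the same cache.file read the cached copy instead of re-parsing.
#   txt = load_text("paper.xml", cache.file = "cache/paper.rds")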

    
sect2eco = function(section, ecoextract = getEcoExtractPyScript())
{
    # Write the section text to a temporary file, run the EpiTator-based
    # extractor on it with python3, and read the JSON results back in.
    f = tempfile()
    f2 = tempfile()
    on.exit(unlink(c(f, f2)))

    cat(section, file = f)
    #    system2(ecoextract, args = c(f, f2))
    system2("python3", args = c(ecoextract, f, f2))
    res = fromJSON(f2)
    # Keep the original section text alongside the extractor's results.
    res$txt = section
    res
}
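
# Example (not run; assumes python3 and the EpiTator-based ecoextract.py are installed):
#   res = sect2eco("Samples were collected from bats in Yunnan, China.")
#   res$txt   # the original section text is kept alongside the extractor's results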

fixSectionNames = function(sect_names)
{
    tmp = sect_names

    tmp = XML:::trim(tmp)
    
    # Find and relabel author lines that were picked up as section names - might need to be fixed later
    auth_regex = "^[A-Z][[:alpha:]]+[ *-]([A-Za-z]\\.?[[:alpha:]]*[ *-]?)?([A-Za-z][[:alpha:]]+[ *-]?)?,"
    i = grep(auth_regex, tmp)
    if(length(i) > 0)
        warning("Replacing: ", paste(tmp[i], collapse = ":"))
    
    tmp[i] = "authors"

    # Remove leading section numbers, e.g. "1. " or "2.1. "
    tmp = gsub("^([0-9]\\.)+ ?", "", tmp)

    tmp = tolower(tmp)
    return(tmp)
}
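
# Example (not run; needs the XML package for XML:::trim):
#   fixSectionNames(c("1. Introduction", "2. Methods", "Smith J., Doe A.,"))
#   # -> "introduction" "methods" "authors"   (with a warning about the replacement)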


readXMLSections =
function(XML)
{    
    doc = readPDFXML(XML)

    title = getDocTitleString(doc)
    abst = findAbstract(doc)
        
    abst = paste(names(abst), collapse = " ")
        
    allsect = try(lapply(getSectionText(doc), function(x)
                                                  paste(unlist(x), collapse = " ")))
       
    # If section detection failed or produced an implausibly large number of
    # sections, fall back to the full document text as a single "body" section.
    if(is(allsect, "try-error") || length(allsect) > 20)
        allsect = list(body = getDocText(doc))
        
    c(title = title, abstract = abst, allsect)
}
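
# Example (not run; assumes the ReadPDF helpers (readPDFXML, getSectionText, etc.)
# are available and that paper.xml came from pdftohtml):
#   sects = readXMLSections("paper.xml")
#   names(sects)   # "title", "abstract", then one entry per detected section (or "body")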