# Functions to go from a PDF file to JSON
##' Run the term extractor on a document
##'
##' This function will run the term extractor (based on EpiTator \url{https://github.com/ecohealthalliance/EpiTator})
##' on a document. The document can be either XML generated by pdftohtml or a PDF document,
##' which will be converted internally to an XML document. Alternatively, the raw section text
##' can be supplied directly via \code{section.text}.
##' The results and the intermediate text, split by section, can optionally be saved.
##'
##' @title Document to Resolved Keywords
##' @param doc.file a file to parse, either XML or PDF
##' @param ecoextract file path to the ecoextract.py script
##' @param results.dir optional, directory in which to store the results as an .rds file.
##' If not specified, no results will be saved. If the directory does not
##' currently exist, it will be created.
##' @param results.file optional, file name to use for the results; defaults to the basename of \code{doc.file} with an .rds extension
##' @param cache.dir optional, directory in which to cache the intermediate text results from \code{ReadPDF::getSectionText}.
##' If not specified, no caching will be performed
##' @param cache.file optional, file name to use for the cached section text
##' @param section.text a list of text, with one element per section to be processed; by default, read from \code{doc.file} (with caching, if enabled)
##' @return a list, with one element per section with all resolved keywords arranged in a nested list.
##' @examples
##' txt = "This mentions China"
##' ans = doc2keywords(section.text = list(body = txt))
##' getLocation(ans)
##'
##' @author Matt Espe and Duncan Temple Lang
doc2keywords =
function(doc.file, ecoextract = getEcoExtractPyScript(),
         results.dir = character(),
         results.file = file.path(results.dir, gsub("(xml|pdf)$", "rds", basename(doc.file))),
         cache.dir = character(),
         cache.file = file.path(cache.dir, gsub("(xml|pdf)$", "rds", basename(doc.file))),
         section.text = load_text(doc.file, cache.file, cache.dir))
{
    # Strip mis-encoded characters (e.g. stray Â) and curly quotes left over from the PDF conversion
    section.text = lapply(section.text, function(x) gsub('Â|"', "", x))
    # Drop non-content sections: references, acknowledgements, funding statements, etc.
    i = grep("references?|cited|acknowledgements?|funding|interests|disclosure", names(section.text),
             ignore.case = TRUE, invert = TRUE)
    if(length(i) > 0)
        section.text = section.text[i]

    # Run the extractor on each section, capturing errors per section
    ans = lapply(section.text, function(x) try(sect2eco(x, ecoextract)))

    if(length(results.dir) && !is.na(results.dir)) {
        if(!dir.exists(results.dir))
            dir.create(results.dir, recursive = TRUE)
        saveRDS(ans, results.file)
    }
    ans
}
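# A minimal usage sketch (assumptions: "paper.xml" is a pdftohtml-generated
# file in the working directory, and ecoextract.py plus its Python/EpiTator
# dependencies are installed; the file and directory names here are hypothetical):
if(FALSE) {
    kw = doc2keywords("paper.xml",
                      results.dir = "results",  # results saved to results/paper.rds
                      cache.dir = "cache")      # section text cached to cache/paper.rds
    names(kw)                                   # one element per retained section
}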
load_text = function(doc.file, cache.file,
                     cache.dir = dirname(cache.file))
{
    # file.exists(), not exists(): we are checking for a file on disk,
    # not for an R object in the workspace.
    if(length(cache.file) && !is.na(cache.file) && file.exists(cache.file)) {
        section.text = readRDS(cache.file)
    } else {
        section.text = readXMLSections(doc.file)
        if(length(cache.file) && !is.na(cache.file)) {
            if(!dir.exists(cache.dir))
                dir.create(cache.dir, recursive = TRUE)
            saveRDS(section.text, file = cache.file)
        }
    }
    section.text
}
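# Sketch of the caching behaviour (hypothetical file names): the first call
# parses the XML and writes the cache file; subsequent calls read it back.
if(FALSE) {
    s1 = load_text("paper.xml", "cache/paper.rds")  # parses, then caches
    s2 = load_text("paper.xml", "cache/paper.rds")  # read from cache/paper.rds
    identical(s1, s2)                               # TRUE
}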
sect2eco = function(section, ecoextract = getEcoExtractPyScript())
{
    # Write the section text to a temporary file, run the Python extractor
    # on it, and read the resulting JSON back in.
    # fromJSON() is assumed to come from a JSON package such as jsonlite or RJSONIO.
    f = tempfile()
    f2 = tempfile()
    on.exit(unlink(c(f, f2)))
    cat(section, file = f)
    system2("python3", args = c(ecoextract, f, f2))
    res = fromJSON(f2)
    # Keep the original text alongside the resolved keywords
    res$txt = section
    res
}
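# The per-section round trip, sketched: the text goes out as a temporary file,
# ecoextract.py annotates it with EpiTator, and the JSON comes back as a list.
# The exact fields depend on the ecoextract.py script; only $txt is guaranteed
# by the code above.
if(FALSE) {
    res = sect2eco("Cases were reported near Wuhan, China in 2019.")
    res$txt       # the input text, attached by sect2eco()
    names(res)    # remaining elements are whatever ecoextract.py emitted
}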
fixSectionNames = function(sect_names)
{
    # trimws() avoids relying on the unexported XML:::trim
    tmp = trimws(sect_names)
    # Find and relabel author lines mistaken for section names - might need to be fixed later.
    # Note [A-Za-z] rather than [A-z]: the latter also matches punctuation such as [ and ^.
    auth_regex = "^[A-Z][[:alpha:]]+[ -*]([A-Za-z]\\.?[[:alpha:]]*[ -*]?)?([A-Za-z][[:alpha:]]+[ -*]?)?,"
    i = grep(auth_regex, tmp)
    if(length(i) > 0) {
        warning("Replacing: ", paste(tmp[i], collapse = ":"))
        tmp[i] = "authors"
    }
    # Remove leading section numbers, e.g. "2." or "2.10."
    tmp = gsub("^([0-9]+\\.)+ ?", "", tmp)
    tolower(tmp)
}
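# A quick check of the relabelling rules (runs with base R only):
if(FALSE) {
    fixSectionNames(c(" 1. Introduction", "2.10. Study Area", "John A. Smith, Jane Doe"))
    # -> "introduction" "study area" "authors"   (with a warning about the replacement)
}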
readXMLSections =
function(doc.file)
{
    doc = readPDFXML(doc.file)
    title = getDocTitleString(doc)
    abst = findAbstract(doc)
    abst = paste(names(abst), collapse = " ")
    # Collapse each section's text into a single string
    allsect = try(lapply(getSectionText(doc), function(x)
                             paste(unlist(x), collapse = " ")))
    # If section detection fails or yields an implausible number of sections,
    # fall back to treating the whole document as one body of text.
    if(is(allsect, "try-error") || length(allsect) > 20)
        allsect = list(body = getDocText(doc))
    c(title = title, abstract = abst, allsect)
}
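# Expected shape of the result (a sketch; "paper.xml" is hypothetical and the
# section names depend on what ReadPDF::getSectionText finds):
if(FALSE) {
    s = readXMLSections("paper.xml")
    names(s)   # "title", "abstract", then one element per detected section,
               # or just "body" if section detection fell back
}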