# Ryan found problems with the following files:
#   J Infect Dis.-2015-Ogawa-infdis-jiv063,
#   Holsomback-2009-Bayou virus detected in non-Or.pdf,
#   Frances-2004-Occurrence of Ross River virus an.pdf.
# Does work: 0043026620/Kaba-2010-Detection of hepatitis E virus in w1.pdf -
#   the Article History is on the left side of the page: "Accepted 4 August 2009".
# "../0857285937/Vasconcelos-2003-%5BYellow%20Fever%5D.xml" is in Portuguese;
#   its date is "mar-abr, 2003" (March-April 2003).
library(XML)   # xmlParse(), getNodeSet(), xpathSApply(), xmlValue(), xmlGetAttr()
# readPDFXML(), isScanned(), findAbstract(), getDocTitle(), getDocTitleString(),
# getSubmissionDateInfo(), getPageFooter(), getPageHeader(), getBBox2() and pdfText()
# are assumed to come from the ReadPDF package.

if(FALSE) {
# Which document directories have XML files that yield no publication date?
i = file.info(list.files("..", full.names = TRUE))
dirs = rownames(i)[i$isdir]
pubs = lapply(dirs, function(x) try(getPublicationDate(list.files(x, pattern = "xml", full = TRUE))))
err = sapply(pubs, is, "try-error")
dirs[!err][ sapply(pubs[!err], length) == 0 ]
}
if(FALSE) {
# After running matching.R.
# Map the internal-pdf:/ references (as used by EndNote) to local XML file names.
# PDFDir is expected to already be defined, e.g. by matching.R or the "Older" section below.
hasPDF = (!is.na(ms$PDF) & ms$PDF != "")
pdfs = unique(ms$PDF[hasPDF])
cleanPdfs = gsub("\\.pdf(;|$)", ".xml\\1", gsub("internal-pdf:/", path.expand(PDFDir), pdfs))
docs = strsplit(cleanPdfs, ";")
#docs = sapply(e, function(x) gsub("internal-pdf://", "", x))
#tt = table(unlist(docs))
ff = unique(unlist(docs))
ex = file.exists(ff)
table(ex)
dates = lapply(ff[ex], function(x) try(getPublicationDate(x)))
table(sapply(dates, is, "try-error")) # None.
# The XML files for which no date was found, and their PDF counterparts.
b = ff[ex][ sapply(dates, length) == 0 ]
bpdf = gsub("\\.xml$", ".pdf", b)
################
# Older
PDFDir = "NewData_Feb2017/Zoo_02_02_2017 Copy.Data/PDF"
dirs = dirname(unlist(docs))
docDirs = list.files(PDFDir)
all(dirs %in% docDirs)
fdirs = sprintf("%s/%s", PDFDir, dirs)
xmls = sapply(fdirs, list.files, pattern = "xml$", full.names = TRUE)
xdocs = lapply(xmls, function(x) try(xmlParse(x)))
w = sapply(xdocs, is, 'try-error')
#xdocs[!w]
pd = lapply(xdocs[!w], getPublicationDate)
table(sapply(pd, length) == 0)
# The ones that didn't match.
nodate = sapply(pd, length) == 0
xmls[!w][nodate]
# Re-run on the failures (useful after modifying getPublicationDate()).
tmp = lapply(xdocs[!w][nodate], getPublicationDate)
i = (sapply(tmp, length) == 0)
f = xmls[!w][nodate][i]
gsub("\\.xml$", ".pdf", f)
}
getPublicationDate1 =
# Old version - see getPublicationDate() below.
# The words include the Portuguese 'recebido'/'aceito' (received/accepted) for, e.g., the Vasconcelos paper.
function(doc, page = 1, words = c("accepted", "received", "volume", "copyright", "published", "submitted", "recebido", "aceito"))
{
if(is.character(doc))
doc = xmlParse(doc)
# Look for the words that often accompany a year. translate() lower-cases the text,
# since libxml2 supports only XPath 1.0, which has no lower-case() function.
cond = paste(sprintf("contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '%s')", words), collapse = " or ")
ans = xpathSApply(doc, sprintf("//page[@number='%d']//text()[%s]", page, cond), xmlValue)
if(length(ans) == 0) {
tmp = getNodeSet(doc, sprintf("//page[@number = '%d']//text()[contains(., 'Emerging Infectious Disease')]", page))
if(length(tmp))
ans = gsub(".* Vol\\. .*, (.*)", "\\1", xmlValue(tmp[[1]]))
}
if(length(ans) == 0) {
tmp = getNodeSet(doc, sprintf("//page[@number = '%d']//text()[contains(., 'Journal of ')]", page))
if(length(tmp))
ans = xmlValue(tmp[[1]])
}
# Exploratory: collect all of the text on the page. We don't currently do anything with txt.
if(length(ans) == 0) {
tmp = getNodeSet(doc, sprintf("//page[@number = '%d']//text()", page))
if(length(tmp))
txt = sapply(tmp, xmlValue)
}
if(length(ans) == 0 && page == 1) {
# Go to the last page
npages = length(getNodeSet(doc, "//page"))
if(npages > 1) {
ans = getPublicationDate1(doc, npages, words)
if(length(ans) == 0 && npages > 2)
# try second page
return(getPublicationDate1(doc, 2, words))
}
}
ans
}
findVol =
function(doc)
{
# translate() lower-cases the letters V, O and L so we match 'vol', 'Vol', 'VOL', etc.
# (libxml2 supports only XPath 1.0, which has no lower-case() function.)
unlist(xpathSApply(doc, "//text()[contains(translate(., 'VOL', 'vol'), 'vol')]", getVolume))
}
getVolume =
function(node)
{
txt = xmlValue(node)
grep("Volume|Vol\\.?[[:space:]]?[0-9]+", txt, value = TRUE)
}
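# A quick sketch of findVol() on a parsed document; the file name here is hypothetical.
if(FALSE) {
doc = readPDFXML("some-article.xml")
findVol(doc)   # e.g. lines such as "Vol. 12, No. 3", depending on the document's text
}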
hasCoverPage =
# isResearchGate() is assumed to be defined elsewhere in this project.
function(doc)
{
isBioOne(doc) || isMBio(doc) || isResearchGate(doc)
}
isMBio =
function(doc)
{
length(getNodeSet(doc, "//page[1]//text[contains(., 'mBio')]")) > 0 &&
  length(getNodeSet(doc, "//page[1]//ulink[starts-with(@url, 'http://mbio.asm.org')]")) > 0
}
isBioOne =
#
# Some documents from BioOne are scanned, but have a generated (non-scanned) cover page.
# isScanned() only looks at the first page, so we detect these documents explicitly.
#
function(doc)
length(getNodeSet(doc, "//text[starts-with(., 'BioOne sees sustainable scholarly ')]")) > 0
getPublicationDate =
#
# New version.
#
# @return a named character vector. The values contain a date, possibly with other text content.
#  The names indicate which step/method was used to identify the date. These will all be the same,
#  as we stop as soon as any step yields a date.
#
# The steps, in order, are
#   abstract          (a year found in the abstract)
#   AboveTitle        (for BioOne documents, the text above the title on page 2)
#   Scanned/filename  (scanned document with no date; fall back to a year in the file name)
#   NIH Public Access (a particular format in which we can find the date)
#   OIE               (the 'Date of start of the event' field in www.oie.int reports)
#   Title             (a date in the title)
#   Received          (information about when the paper was received/accepted/published)
#   footer            (taken from the footer of the first page)
#   header            (taken from the header of the first page)
#   copyright         (the copyright symbol and the date after it)
#   MonthNameYear.TextRegEx (a date of the form [day] MonthName[,] Year anywhere in the text)
#   TextDate          (a date of the form Year MonthName [day] anywhere in the text)
#   filename          (a year embedded in the file name, as a last resort)
#
# (See the usage sketch after this function.)
#
function(doc, checkAbstract = TRUE)
{
if(is.character(doc))
doc = readPDFXML(doc)
if(checkAbstract) {
abstract = try(findAbstract(doc, FALSE))
if(!is(abstract, 'try-error') && length(abstract)) {
txt = paste(abstract, collapse = "\n")
m = gregexpr("\\b(19|20)[0-9]{2}\\b", txt)
if(any(m[[1]] > -1)) {
y = unique(regmatches(txt, m)[[1]])
return(structure(y, names = rep("abstract", length(y))))
}
}
}
if(isBioOne(doc)) {
tmp = textAboveTitle(doc, 2)
# textAboveTitle() returns NA or a character vector, so avoid a length > 1 condition in if().
if(length(tmp) && !anyNA(tmp))
return(tmp)
}
if(isScanned(doc)) { # was isScanned2()
y = getYearFromFileName(basename(docName(doc)))
if(length(y))
return(c(filename = y))
else
return(structure(NA, names = "Scanned"))
}
nih = getNodeSet(doc, "//text[. = 'NIH Public Access']")
if(length(nih) > 0) {
txt = unique(xpathSApply(nih[[1]], "./following-sibling::text[contains(., 'doi:')]", xmlValue))
if(length(txt))
return(structure(txt, names = "NIH Public Access")) # , journal = xmlValue(getSibling(nih[[1]]))))
}
if(length(getNodeSet(doc, "//text[starts-with(., 'www.oie.int/')]")) > 0) {
tmp = getNodeSet(doc, "//text[. = 'Date of start of the event']/following-sibling::text[1]")
if(length(tmp))
return(c(OIE = xmlValue(tmp[[1]])))
}
title = getDocTitleString(doc)
if(hasYear(title))
return(structure(title, names = "Title"))
rec = getSubmissionDateInfo(doc)
if(length(rec) > 0) {
txt = xmlValue(rec[[1]])
if(!hasYear(txt)) {
top = xmlGetAttr(rec[[1]], "top")
txt = paste(xpathSApply(rec[[1]], sprintf("./following-sibling::text[@top = '%s']", top), xmlValue), collapse = " ")
}
if(hasYear(txt))
return(structure(txt, names = rep('Received', length(txt))))
}
# Same idea as above, but for documents that say 'received for review'.
rec = getNodeSet(doc, "//text[contains(., 'received for review')]")
if(length(rec) > 0) {
txt = xmlValue(rec[[1]])
if(!hasYear(txt)) {
top = xmlGetAttr(rec[[1]], "top")
txt = paste(xpathSApply(rec[[1]], sprintf("./following-sibling::text[@top = '%s']", top), xmlValue), collapse = " ")
}
if(hasYear(txt))
return(structure(txt, names = rep('Received', length(txt))))
}
p1 = getNodeSet(doc, "//page")[[1]]
footer = getPageFooter(p1)
if(!grepl("Downloaded", footer) && any(w <- hasYear(footer)))
return(structure(footer[w], names = rep("footer", sum(w))))
footer = getPageHeader(p1)
if(!grepl("Downloaded", footer) && any(w <- hasYear(footer)))
return(structure(footer[w], names = rep("header", sum(w))))
#XXX non-ASCII symbol
cr = getNodeSet(doc, "//text[contains(., '©')]")
if(length(cr)) {
tt = sapply(cr, xmlValue)
if(any(w <- hasYear(tt)))
return(structure(tt[w], names = rep("copyright", sum(w))))
}
tt = textAboveTitle(doc, 1)
if(any(w <- hasYear(tt)))
return(structure(tt[w], names = rep("AboveTitle", sum(w))))
txt = getDocText(doc)
rx = sprintf("([0-9]{1,2} )?(%s),? (19|20)[0-9]{2}", paste(getMonthNames(), collapse = "|"))
g = gregexpr(rx, txt, ignore.case = TRUE)
if(g[[1]][1] > 0) {
tt = unique(regmatches(txt, g)[[1]])
return(structure(tt, names = rep("MonthNameYear.TextRegEx", length(tt))))
}
# Could go to the second page and start over with the headers, etc.
tt = getNodeSet(doc, "//text[isDate(string(.))]",
xpathFuns = list(isDate = containsDate))
if(length(tt)) #XXX Put the match type here.
return(c(TextDate = unique(extractDate(sapply(tt, xmlValue)))))
fname = basename(docName(doc))
y = getYearFromFileName(fname)
if(length(y))
return(c(filename = y))
NA
}
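# A usage sketch for getPublicationDate(); the Kaba file is the one noted at the top of
# this file, where the page says "Accepted 4 August 2009" (the .xml counterpart of the
# PDF is assumed). The exact value and the name attached to it depend on which step matches first.
if(FALSE) {
doc = readPDFXML("0043026620/Kaba-2010-Detection of hepatitis E virus in w1.xml")
getPublicationDate(doc)
# e.g. a vector like c(Received = "... Accepted 4 August 2009 ...")
}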
getYearFromFileName = getYearFromString =
# getYearFromFileName("Kohl 1996.xml")                                       # "1996"
# getYearFromFileName("Smithburn-1949-The susceptibility of African w.xml")  # "1949"
function(fname)
{
# A simpler pattern would be "(^|[^0-9])[0-9]{4}([^0-9]|$)", but then we would have to
# strip the surrounding non-digit characters from the match ourselves.
# The year range is restricted to 1900-2019.
m = gregexpr("(\\b|_)(19[0-9]{2}|20[01][0-9])(\\b|_)", fname, perl = TRUE)
if(any(m[[1]] > -1))
gsub("(^_|_$)", "", regmatches(fname, m)[[1]])
else
character()
}
getMonthNames =
function(format = c("%b.", "%b", "%B"))
{
unlist(lapply(format, function(f) format(ISOdate(2017, 1:12, 1), f)))
}
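# In an English locale, getMonthNames() returns c("Jan.", ..., "Dec.", "Jan", ..., "Dec",
# "January", ..., "December"), i.e. abbreviated names with and without a period plus full
# names, so the regular expressions below match any of these forms. The result is locale-dependent.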
containsDate =
function(str)
{
grepl(mkDateRegexp(), str)
}
mkDateRegexp =
# Matches dates of the form "2015 March 10" or "2015 March": a year, a month name, and an optional day.
function()
{
sprintf("[0-9]{4} (%s)( [0-9]{1,2})?", paste(getMonthNames(), collapse = "|"))
}
extractDate =
function(str)
{
unlist(regmatches(str, gregexpr(mkDateRegexp(), str)))
}
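# A small sketch of these helpers on a made-up string:
if(FALSE) {
containsDate("Published 2015 March 10 in PNAS")  # TRUE
extractDate("Published 2015 March 10 in PNAS")   # "2015 March 10" (TRE's leftmost-longest matching)
}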
getDocText =
# Too simple. See the one in ReadPDF
function(doc)
{
if(is.character(doc))
doc = readPDFXML(doc)
paste(xpathSApply(doc, "//text", xmlValue), collapse = " ")
}
textAboveTitle =
# Finds the title and then gets the text above it. This is useful when there is header material
# in a sequence of lines above the title.
# Could find it in other ways also.
function(doc, page = 1)
{
titleNodes = getDocTitle(doc, page)
if(length(titleNodes) == 0 || is.character(titleNodes))
return(NA)
pos = min(as.integer(sapply(titleNodes, xmlGetAttr, "top")))
page = getNodeSet(titleNodes[[1]], "./ancestor::page")[[1]]
bbox = getBBox2(getNodeSet(page, ".//text"))
# The rownames of the bounding-box matrix are the text strings themselves.
rownames(bbox)[ bbox[, "top"] < pos ]
}
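# Sketch: the return value is the set of text strings (the rownames of getBBox2()'s
# bounding-box matrix) that sit above the title, e.g. journal name, volume and date lines.
# The file name here is hypothetical.
if(FALSE) {
doc = readPDFXML("some-article.xml")
textAboveTitle(doc)
}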
hasYear =
function(txt)
{
# grepl("(^| )(19|20)[0-9]{2}( |$)", txt)
grepl("\\b(19|20)[0-9]{2}\\b", txt)
}
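# hasYear() is vectorized over its input, e.g.
if(FALSE) {
hasYear(c("Accepted 4 August 2009", "pp. 1883-1888", "no date here"))   # TRUE FALSE FALSE
}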
firstIsolated =
# Find the lines of text mentioning 'first isolated', named by the page they occur on.
function(doc, text = pdfText(doc))
{
# text's default is evaluated lazily, so converting doc here still works.
if(missing(text) && is.character(doc))
doc = readPDFXML(doc)
pageNum = rep(seq(along = text), sapply(text, length))
text = unlist(text)
i = grep("first[[:space:]]+isolated", text, ignore.case = TRUE)
structure(text[i], names = pageNum[i])
}
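# Sketch of firstIsolated(): the values are the matching lines, named by page number.
# The file name here is hypothetical.
if(FALSE) {
firstIsolated("some-article.xml")
# e.g. c(`3` = "The virus was first isolated in ...")
}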