#' Processing PubMed Search
#'
#' @description Wrapper around RISmed that runs a PubMed search and adds additional article metadata.
#' @param query query string for the PubMed search
#' @param ncbi_key an NCBI API key (optional)
#' @param demoversion if TRUE, hard-limits retrieved PubMed records to 1000. Mainly used for easy transition to the demo app version of Adjutant [Default: FALSE]
#' @param forceGet whether to try to retrieve abstracts that have gone missing from the efetch results (default: TRUE)
#' @param ... arguments passed to RISmed::EUtilsSummary method
#'
#' @return corpus - a data frame of article metadata, one row per PMID
#'
#' @import RISmed
#' @import dplyr
#' @importFrom jsonlite fromJSON
#' @import stringr
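#' @examples
#' \dontrun{
#' # minimal usage sketch; the query string is illustrative
#' corpus <- processSearch(query = "text mining AND visualization")
#' }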
#' @export
processSearch<-function(query=NULL,ncbi_key=NA,demoversion=FALSE, forceGet = TRUE, ...){
addedParam<- list(...)
#Run the query on PubMed - this first pass mainly retrieves PMIDs
#resQ1<-EUtilsSummary(query=query, type='esearch', db='pubmed', ...)
if(!is.na(ncbi_key)){
ncbi_key<-gsub("\\s+","",ncbi_key)
query<-sprintf("%s&api_key=%s",gsub(" ","+",query),ncbi_key)
}
resQ1 <- tryCatch({EUtilsSummary(query=query, type='esearch', db='pubmed', ...)},
error = function(err){
print("Could not connect to PubMed")
return(NULL)
},
warning = function(war){
print("Could not connect to PubMed")
return(NULL)})
if(is.null(resQ1)){return(NULL)}
# return all records, or if it's the demo version, only return the first 1000
if(!demoversion){
#if no retmax was specified, then re-run the query to retrieve more than 1000 PMIDs
#note: a missing list element is NULL, not NA, so check with is.null
if(is.null(addedParam[['retmax']])){
#resQ1 <- EUtilsSummary(query, type='esearch', db='pubmed',retmax=resQ1@count, ...)
resQ1 <- tryCatch({EUtilsSummary(query=query, type='esearch', db='pubmed',retmax=resQ1@count,...)},
error = function(err){
print("Could not connect to PubMed")
return(NULL)
},
warning = function(war){
print("Could not connect to PubMed")
return(NULL)})
}
}
pmidUnique<-unique(resQ1@PMID)
#guard against an empty result set before batching
if(length(pmidUnique) == 0){
warning("No articles could be retrieved - either there was an error in the search or a connection issue to NCBI")
return(NULL)
}
#Gather the data in batches and convert to a data frame
#100 articles was the magic number before timeouts occur,
#both in RISmed and in the limits of what esummary will return
#unique() also drops the duplicate boundary that appears when the PMID count is an exact multiple of 100
numVals<-unique(c(seq(from=0,to=length(pmidUnique),by=100),length(pmidUnique)))
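#worked example: 250 unique PMIDs -> boundaries 0,100,200,250 -> batches 1:100, 101:200, 201:250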
corpus<-c()
for(i in 1:(length(numVals)-1)){
start = numVals[i]+1
end = numVals[i + 1]
#Sometimes the connection drops out
#handle this by retrying up to three times with an increasing back-off
attempt = 1
tmpDat<-NULL
while(attempt<=3){
tmpDat<-formatData(pmidUnique[start:end],ncbi_key,forceGet)
if(!is.null(tmpDat)){
#no connection issues, jump out of loop
break;
}
attempt = attempt + 1;
#sleep longer, then try again
Sys.sleep(10*attempt)
}
#add the retrieved articles to the growing corpus
if(!is.null(tmpDat)){
corpus <- rbind(corpus,tmpDat)
}
}
#if nothing could be added, warn and bail out
#use || so a NULL corpus short-circuits before nrow(corpus) == 0 is evaluated
if(is.null(nrow(corpus)) || nrow(corpus) == 0){
warning("No articles could be retrieved - either there was an error in the search or a connection issue to NCBI")
return(NULL)
}
corpus<- dplyr::distinct(corpus)
Sys.sleep(1) #slow down the connection, avoids timeouts
return(corpus)
}
#Retrieve and format the PubMed data. Helper function for processSearch.
formatData<-function(ids = NULL, ncbi_key = NA, forceGet=TRUE){
#Queries are made in small, manageable chunks so they don't time out.
# TO DO: Switch from RISmed to just parsing JSON. Right now, esummary produces valid JSON
# that is nice and easy to parse, but efetch does not, and the most practical way to call
# efetch from R right now is via RISmed. Apparently efetch is the only part of the EUtils
# suite that DOESN'T yet properly return JSON; eSearch and eSummary both do.
tmpids<-ids #sanity check for when results suddenly seem to get dropped
if(!is.na(ncbi_key)){
query<-sprintf('%s&api_key=%s',paste0(tmpids,collapse = ","),ncbi_key)
}else{
query<-paste0(tmpids,collapse = ",")
}
pubResults<-tryCatch({EUtilsGet(query,type="efetch",db="pubmed")},
error = function(err){
return(NULL)
})
if(is.null(pubResults)){
#connection issue, do not continue
return(NULL)
}
#make sure the results returned match the PMIDs requested.
#it's an odd fringe case, but EUtils does occasionally drop records,
#and that needs to be dealt with gracefully
notThere<-which(!(tmpids %in% pubResults@PMID))
if(length(notThere)>0){
missingPMID<-tmpids[notThere]
print("These are missing")
print(missingPMID)
tmpids<-setdiff(tmpids,missingPMID)
}
#convert the author list from a list object to a string with author names separated by ;
authors<-sapply(pubResults@Author,function(x){
apply(x,1,function(y){sprintf("%s, %s",y[1],y[2])}) %>% paste0(.,collapse=";")
})
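#e.g. each authors entry becomes "Smith, John;Doe, Anne" - LastName, ForeName pairs separated by ; (names illustrative)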
#also collapse the MeSH terms from a list into a ;-separated string
meshTerms<-sapply(pubResults@Mesh, function(x){
if(length(x) == 1){
return(NA)
}
x<-x %>%
filter(Type == "Descriptor") %>%
summarise(meshTerms=paste0(Heading,collapse=";"))
return(x$meshTerms)
})
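#e.g. a meshTerms entry looks like "Humans;Algorithms;Data Mining" (headings illustrative)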
#get some article metadata that doesn't ship with RISmed's EUtilsGet
# an important note: article citation counts from PubMed summarize
# what *other PubMed articles* have referenced this work. This number *does not*
# match what a Google search provides, because the number provided here relies on
# PubMed Central (open access). So it's a decent heuristic, but it's not perfect.
if(!is.na(ncbi_key)){
url<-sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%s&retmode=json&api_key=%s",
paste0(tmpids,collapse="+"),
ncbi_key)
}else{
url<-sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%s&retmode=json",
paste0(tmpids,collapse="+"))
}
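#the assembled URL looks like this (PMIDs illustrative):
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=26502977+26502978&retmode=json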
#connection errors here should also return NULL so the caller's retry loop kicks in
tmp<-tryCatch({fromJSON(url)},
error = function(err){
return(NULL)
})
if(is.null(tmp)){return(NULL)}
#the first element of tmp$result is the "uids" vector, so skip it
metadata<-sapply(tmp$result[2:length(tmp$result)],function(x){
processMetaJSON(x)
}) %>% t() %>%
data.frame(stringsAsFactors=FALSE) %>%
mutate(X1=as.character(X1)) #keep the PMID as character so the join with risResults doesn't mix factor and character
colnames(metadata)<-c("PMID","articleType","language","pmcCitationCount","pmcID","doi","Title")
#putting all the data together into a data frame
risResults<-data.frame(PMID=pubResults@PMID,
YearPub=pubResults@YearPubmed,
Journal=pubResults@Title,
Authors=authors,
Title=pubResults@ArticleTitle,
Abstract=pubResults@AbstractText,
stringsAsFactors = FALSE)
allData<-dplyr::inner_join(risResults,dplyr::select(metadata,-contains("Title")),by="PMID")
### FIND AND CLEAN UP MISSING DATA
#find missing titles and replace them with the metadata version
#really need to eventually replace RISmed in this pipeline
missingInfo<-filter(allData,is.na(Title))
if(nrow(missingInfo)>0){
idxMatch<-match(missingInfo$PMID,metadata$PMID)
idxdf<-match(missingInfo$PMID,allData$PMID)
allData[idxdf,]$Title<-metadata[idxMatch,]$Title
}
#find missing abstracts and add them properly to the data
missingInfo<-filter(allData,Abstract == "")
if(nrow(missingInfo)>0 && forceGet){
idxdf<-match(missingInfo$PMID,allData$PMID)
missingAbs<-sapply(missingInfo$PMID,function(x){
#sometimes there are connection errors
#use tryCatch to fail a little more gracefully
#res<-getMissingAbstract(x)
res<-tryCatch({getMissingAbstract(x,ncbi_key)},
error = function(err){
print("Could not retrieve missing abstract")
return(NA)
},
warning = function(war){
print("Could not retrieve missing abstract")
return(NA)
})
res
})
allData[idxdf,]$Abstract<-missingAbs
}
#add the MeSH terms
#note: this assumes allData kept the row order and length of pubResults,
#which holds whenever the inner join above didn't drop any records
allData$meshTerms<-meshTerms
#finally, clean up the pmcCitationCount
#remove anything that didn't have a title or an abstract
allData<-allData %>%
dplyr::mutate(pmcCitationCount = ifelse(is.na(pmcCitationCount),0,pmcCitationCount)) %>%
dplyr::filter(!is.na(Title)) %>%
dplyr::filter(!is.na(Abstract))
return(allData)
}
#process the per-article JSON returned by esummary into a vector of additional metadata
processMetaJSON<-function(tmp=NULL){
if("error" %in% names(tmp)){
#There is no document summary information available; just return NA
#for all seven fields (PMID, articleType, language, pmcCitationCount, pmcID, doi, Title)
return(c(NA,NA,NA,NA,NA,NA,NA))
}else{
#get DOI & PMC
altID<-tmp$articleids %>%
filter(idtype %in% c("pmc","doi","pubmed")) %>%
select(idtype,value)%>%
tidyr::spread(idtype,value,NA)
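# after the spread, altID is a one-row data frame, e.g. (values illustrative):
#   doi            pmc        pubmed
#   10.1000/xyz    PMC12345   26502977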
# if there are missing values, fill them in
# since stuff might not be consistently returned
if(is.null(altID$doi)){altID$doi<-NA}
if(is.null(altID$pmc)){altID$pmc<-NA}
if(is.null(altID$pubmed)){altID$pubmed<-NA}
#store the title as a backup, because sometimes RISmed doesn't properly parse the XML
# due to HTML elements existing in the title (sigh...)
articleTitle<-tmp$title
articleTitle<-gsub("</?i>","",articleTitle) #strip italics - <i> is the most common HTML tag in titles
#pmc ref count
pmcrefCount<-tmp$pmcrefcount
if(pmcrefCount=="" | is.null(pmcrefCount)){pmcrefCount<-NA}
# add the publication type (i.e. journal article)
pubtype<-tmp$pubtype
if(is.null(pubtype) || is.list(pubtype)){
pubtype<-NA
}
#also add the subtype if it's there (as I've interpreted it,
#the second entry is sometimes a more specific subtype, like a review)
if(length(pubtype)>1){ pubtype<-pubtype[2]}
# finally, the language
lang<-tmp$lang #use the full field name rather than relying on R's partial $ matching
if(is.null(lang)){
lang<-NA
}else if (length(lang) > 1){
lang<-paste0(lang,collapse=";")
}
#sanity check: only return the record if a PMID was actually recovered
if(!is.na(altID$pubmed)){
return(c(altID$pubmed,pubtype,lang,pmcrefCount,altID$pmc,altID$doi,articleTitle))
}else{
return(c(NA,NA,NA,NA,NA,NA,NA))
}
}
}
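#fetch a single abstract directly from efetch when RISmed returned an empty one
#e.g. getMissingAbstract("26502977") returns that article's abstract as a string (PMID illustrative)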
getMissingAbstract<-function(PMID=NULL,ncbi_key=NA){
if(!is.na(ncbi_key)){
test<-paste(readLines(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&api_key=%s",PMID,ncbi_key)),collapse="\n")
}else{
test<-paste(readLines(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s",PMID)),collapse="\n")
}
#the raw efetch response holds the abstract as: abstract "...text..."
#extract that quoted string, then strip the field name, quotes, and newlines
test<-str_extract(test,'abstract\\s+("([^"]|"")*")') %>% gsub("abstract","",.) %>% gsub('\\"',"",.) %>% gsub("\n","",.)
return(test)
}