inst/script/getPvalues.R

########################################################################################
# Functions to scrape P-values from Pubmed abstracts
# Date: 7-1-12
# Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html) and Leah R. Jager
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
#
#
#    Note: These functions were written on a Mac and may have difficulties when
#          read on windows machines. 
#
########################################################################################

library(RCurl)
library(XML)
library(tm)

# A function to get all abstracts and pubmed ids for papers from the journal "journaltitle" in the year "year"
# by scraping the Pubmed API.

getAbstractsPmids = function(journaltitle,year){
  # esearch
  url <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
  q = paste("db=pubmed&term=",gsub(" ","+",journaltitle),"[ta]+AND+",year,"[dp]&usehistory=y",sep="")
  esearch <- xmlTreeParse(getURL(paste(url, q, sep="")), useInternal = T)
  webenv  <- xmlValue(getNodeSet(esearch, "//WebEnv")[[1]])
  key     <- xmlValue(getNodeSet(esearch, "//QueryKey")[[1]])
  
  # efetch
  url <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  q   <- "db=pubmed&retmode=xml&rettype=abstract"
  efetch <- xmlTreeParse(getURL(paste(url, q, "&WebEnv=", webenv, "&query_key=", key, sep="")), useInternal = T)
  r = xmlRoot(efetch)
  
  n = xmlSize(r)
  abstracts = pmid = titles = rep(NA,n)
  for(i in 1:n){abstracts[i] =  xmlValue(r[[i]][[1]][["Article"]][["Abstract"]]); pmid[i] = xmlValue(r[[i]][[1]][["PMID"]]); titles[i] = xmlValue(r[[i]][[1]][["Article"]][["ArticleTitle"]]) }
  return(list(abstracts=abstracts,pmid=pmid,titles=titles))
}


#A function to remove trailing zeros from the P-value strings
removeTrailing = function(string){
  while(length(grep("[0-9]",strsplit(string,"")[[1]][nchar(string)])) == 0){
    string = substr(string,1,(nchar(string)-1))
  }
  return(string)
}


# A function to convert the scientific notation used by journals into
# numeric values that can be analyzed. 
convertScientific = function(string){
  if(length(grep("[[:punct:]][[:space:]]",string))>0){
    string = strsplit(string,"[[:punct:]][[:space:]]")[[1]][1]
    string = removeTrailing(string)
  }
  if(length(grep("[×x]",string))>0){
    string = gsub("[:space:]","",string)
    tmp1 = as.numeric(strsplit(string,"[×x]")[[1]][1])
    tmp2 = as.numeric(strsplit(strsplit(string,"[×x]")[[1]][2],"[???-]")[[1]][2])
    return(tmp1*10^(-tmp2))
  }else{
    return(as.numeric(string))
  }
}


# A function to scrape the P-values from a vector of abstracts with corresponding
# pubmed ids

getPvalues = function(abstract,pmid){
  pvalues = numeric(0)
  trunc = numeric(0)
  ids = numeric(0)
  
  # Get the truncated p-values
  ind = grep("[Pp][[:space:]]?[<???]",abstract)
  for(i in 1:length(ind)){
    tmp = strsplit(abstract[ind[i]],"[[:space:](][Pp][[:space:]]?[<???]")[[1]]
    n = length(tmp)
    for(j in 1:n){
      if(length(grep("[.0123456789]",substr(tmp[j],1,2))) > 0){
        if(length(grep("[A-Z]",substr(tmp[j],1,1)))>0){next;}
        tmp2 = strsplit(tmp[j],"[^[:punct:][:digit:]x[:space:]]")[[1]][1]
        tmp2 = removeTrailing(tmp2)
        tmp2 = gsub(" ","",tmp2)
        tmp2 = convertScientific(tmp2)
        pvalues = c(pvalues,as.numeric(tmp2))
        trunc = c(trunc,1)
        ids = c(ids,pmid[ind[i]])
      }
    }
  }
  
  # Get the truncated p-values
  ind = grep("[Pp][[:space:]]?=",abstract)
  for(i in 1:length(ind)){
    tmp = strsplit(abstract[ind[i]],"[[:space:](][Pp][[:space:]]?=")[[1]]
    n = length(tmp)
    for(j in 1:n){
      if(length(grep("[.0123456789]",substr(tmp[j],1,2))) > 0){
        if(length(grep("[A-Z]",substr(tmp[j],1,1)))>0){next;}
        tmp2 = strsplit(tmp[j],"[^[:punct:][:space:][:digit:]x[:space:]]")[[1]][1]
        tmp2 = removeTrailing(tmp2)
        tmp2 = gsub(" ","",tmp2)
        tmp2 = convertScientific(tmp2)
        pvalues = c(pvalues,as.numeric(tmp2))
        trunc = c(trunc,0)
        ids = c(ids,pmid[ind[i]])
      }
    }
  }
  return(list(pvalues=pvalues,ids=ids,trunc=trunc))
}


## Run the above functions over a specified set of journals and years
## to obtain P-values


journals = c("JAMA","New England Journal of Medicine","BMJ","American Journal of Epidemiology","Lancet")
years = 2000:2010

journals_pVals = matrix(NA,nrow=1,ncol=6)
colnames(journals_pVals) = c("pvalue","pvalueTruncated","pubmedID","year","abstract","title")

npapers = matrix(NA,nrow=length(journals),ncol=length(years))

for(i in 1:length(journals)){
  for(j in 1:length(years)){
    cat(journals[i]); cat(" "); cat(years[j]); cat(" "); 
    tmpData = getAbstractsPmids(journals[i],years[j])
    while(length(tmpData$abstracts) ==1 & is.na(tmpData$abstracts[1])){tmpData = getAbstractsPmids(journals[i],years[j])}
    cat("Downloaded"); cat(" ");
    npapers[i,j] = length(tmpData$abstracts)
    tmpOut = getPvalues(tmpData$abstracts,tmpData$pmid)
    
    nPvalues = length(tmpOut$pvalues)
    aa = match(tmpOut$ids,tmpData$pmid)
    
    tmpMatrix = cbind(tmpOut$pvalues,tmpOut$trunc,as.numeric(tmpOut$ids),rep(years[j],nPvalues),tmpData$abstracts[aa],tmpData$titles[aa])
    rownames(tmpMatrix) = rep(journals[i],nPvalues)
    journals_pVals = rbind(journals_pVals,tmpMatrix)
    cat("Done\n")
  }
}


## These come from the paper Pubmed ID = 20876667 and are reports from
## a GWAS. Update some P-values that are incorrectly scraped because of
## variation in scientific notation.

journals_pVals[journals_pVals[,1] == 10,1] = 10e-7
journals_pVals = journals_pVals[-1,]

## These come from the Lancet, with missed periods in the P-values

journals_pVals[13392,1] = 0.003
journals_pVals[13413,1] = 0.014
journals_pVals[13414,1] = 0.004

## This one comes from the Lancet too, where an incorrect P-value is grabbed.

journals_pVals = journals_pVals[-14674,]


# Remove rows with P-values that are 
journals_pVals = journals_pVals[!is.na(journals_pVals[,1]),]

# This one for some reason replaced a period with "small middle dot"

journals_pVals[which(journals_pVals[,3] == 11943262),1] = 1e-4

# This one has a <<< in the P-value definition
journals_pVals = journals_pVals[-which(journals_pVals[,3]=="16421237"),] 

save(journals_pVals,npapers,file="journals_pVals.RData")
leekgroup/swfdr documentation built on Dec. 11, 2020, 11:40 a.m.