# R/easyPubMed_scr.R

#' @title Extract Data from a PubMed Record
#'
#' @description Extract publication-specific information from a PubMed record driven by XML tags. 
#' The input record is a string (character-class vector of length 1) and includes 
#' PubMed-specific XML tags. Data are returned as a data frame where each row corresponds 
#' to one of the authors of the PubMed article.
#'
#' @usage article_to_df(pubmedArticle, autofill = FALSE, 
#'                      max_chars = 500, getKeywords = FALSE, 
#'                      getAuthors = TRUE)
#'                     
#' @param pubmedArticle String including one PubMed record.
#' @param autofill Logical. If TRUE, missing affiliations are automatically imputed based on other non-NA 
#' addresses from the same record.
#' @param max_chars Numeric (integer). Maximum number of characters to be extracted from the Article 
#' Abstract field. Set max_chars to -1 for extracting the full-length abstract. Set max_chars to 0 to 
#' extract no abstract.
#' @param getKeywords Logical. If TRUE, an attempt to extract article Keywords will be made.
#' @param getAuthors Logical. If FALSE, author information won't be extracted. This will considerably 
#' speed up the operation.
#'
#' @details 
#' Given one PubMed Article record, this function will automatically extract a set of features. 
#' Extracted information includes: PMID, DOI, article title, article abstract, publication date (year, month, day), 
#' journal name (title, abbreviation), keywords, and a set of author-specific details (names, affiliation, email address). 
#' Each row of the output data frame corresponds to one of the authors of the PubMed record. Author-independent info 
#' (publication ID, title, journal, date) is identical across all rows. If author information is not required, 
#' set 'getAuthors' = FALSE.
#'
#' @return Data frame including the extracted features. Each row corresponds to a different author.
#'
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## Display some contents
#'   data("EPMsamples")
#'   # Display the query string used for collecting the data
#'   print(EPMsamples$NUBL_1618$qry_st)
#'   # Get records
#'   BL_list <- EPMsamples$NUBL_1618$rec_lst
#'   cat(BL_list[[1]])
#'   # Cast a PubMed record to data.frame
#'   BL_df <- article_to_df(BL_list[[1]], max_chars = 0)
#'   print(BL_df)
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Query PubMed, retrieve a selected citation and format it as a data frame
#' dami_query <- "Damiano Fantini[AU] AND 2017[PDAT]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query)
#' dami_abstracts_xml <- fetch_pubmed_data(dami_on_pubmed)
#' dami_abstracts_list <- articles_to_list(dami_abstracts_xml)
#' article_to_df(pubmedArticle = dami_abstracts_list[[1]], autofill = FALSE)
#' article_to_df(pubmedArticle = dami_abstracts_list[[2]], autofill = TRUE, max_chars = 300)[1:2,]
#' }
#'
#' @export
article_to_df <- function(pubmedArticle, 
                          autofill = FALSE, 
                          max_chars = 500, 
                          getKeywords = FALSE,
                          getAuthors = TRUE) 
{
  # Suppress warnings while parsing; the previous setting is restored in 'finally'
  old_warn <- options()$warn
  options(warn = -1)
  
  # initial check
  # expected cols = 14
  # "pmid", "doi", "title", "abstract", "year", "month", "day", "jabbrv", "journal", 
  # "keywords", "lastname", "firstname", "address", "email" 
  
  # Global Check!
  if (!is.character(pubmedArticle) ||
      regexpr("(<PubmedArticle)(.+)(\\/PubmedArticle>)", pubmedArticle) < 0 )
  {
    message("An error occurred")
    return(NULL)
  }
  
  # max_chars Check
  if (!is.numeric(max_chars)) {
    max_chars <- 500
  } else if (max_chars < 0) {
    max_chars <- -1  
  }
  
  # Get started; initialize the result so 'finally' can return safely on error
  final.mat <- NULL
  tryCatch({
    
    tmp.article <- custom_grep(xml_data = pubmedArticle, tag = "PubmedArticle", format = "char")
    if (is.null(tmp.article)) 
    {
      message("An error occurred")
      return(NULL)
    }
    
    # Title
    tmp.title <- custom_grep(xml_data = tmp.article, tag = "ArticleTitle", format = "char")
    if (length(tmp.title) > 1){
      tmp.title <- paste(tmp.title, collapse = " ", sep = " ")
    } else if (length(tmp.title) < 1) {
      tmp.title <- NA
    }
    
    # Abstract
    tmp.abstract <- custom_grep(xml_data = tmp.article, tag = "AbstractText", format = "char")
    if (length(tmp.abstract) < 1) {
      tmp.abstract <- NA
    } else {
      if (length(tmp.abstract) > 1) {
        tmp.abstract <- paste(tmp.abstract, collapse = " ")
      }
      if (max_chars >= 0) {
        # Strip simple inline markup tags before trimming to max_chars
        tmp.abstract <- gsub("</{0,1}i>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}b>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}sub>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}exp>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- substr(tmp.abstract, 0, max_chars)
      }
    }
    
    # Dates, if any
    my.dateType <- c("DateCompleted",  "DateCreated",  "DateRevised",  "PubDate")
    sel.dateType <- which(sapply(my.dateType, (function(xi) {
      regexpr(xi, tmp.article) > 0
    })))
    if (length(sel.dateType) < 1) {
      tmp.date <- c(Year=NA, Month=NA, Day=NA)
    } else {
      sel.dateType <- sel.dateType[1]
      tmp.date <- custom_grep(xml_data = tmp.article, tag = my.dateType[sel.dateType], format = "char")
      tmp.date <- sapply(c("Year", "Month", "Day"), (function(tt){
        tdat.el <- custom_grep(xml_data = tmp.date, tag = tt, format = "char")
        ifelse(is.null(tdat.el), NA, tdat.el[1])
      }))
    }
    
    # Fetch ID string
    tmp.paperID  <- custom_grep(xml_data = tmp.article, tag = "ArticleIdList", format = "char")
    if (is.null(tmp.paperID)) 
    {
      message("An error occurred")
      return(NULL)
    } else {
      tmp.paperID <- gsub("[[:space:]]", "", tmp.paperID[1])
    }
    
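    # NOTE: whitespace was stripped above, so the original XML attribute
    # '<ArticleId IdType="pubmed">' now reads 'ArticleIdIdType="pubmed"',
    # which is what the two regexes below match against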
    # Get PMID
    tmp.PMID <- gsub("^(.*ArticleIdIdType=\\\"pubmed\\\")([[:space:]]|[[:alnum:]]){0,20}>", "", tmp.paperID)
    tmp.PMID <- gsub("<.*$", "", tmp.PMID)
    
    # Get DOI
    tmp.DOI <- gsub("^(.*ArticleIdIdType=\\\"doi\\\")([[:space:]]|[[:alnum:]]){0,20}>", "", tmp.paperID)
    tmp.DOI <- gsub("<.*$", "", tmp.DOI)
    
    # Get Journal Abbrv
    tmp.jabbrv  <- custom_grep(xml_data = tmp.article, tag = "ISOAbbreviation", format = "char")
    tmp.jabbrv <- ifelse(is.null(tmp.jabbrv), NA, tmp.jabbrv)
    
    # Get Title
    tmp.journal <- custom_grep(xml_data = tmp.article, tag = "Title", format = "char")
    tmp.journal <- ifelse(is.null(tmp.journal), NA, tmp.journal)
    
    # Fetch Keywords and MeSH headings
    tmp.keys <- tryCatch({
      if (getKeywords) {
        tmp.keys <- custom_grep(xml_data = tmp.article, 
                                tag = "Keyword", 
                                format = "char")
        
        tmp.mesh <- custom_grep(xml_data = tmp.article, 
                                tag = "MeshHeading", 
                                format = "char")
        
        if (length(tmp.mesh) > 0) {
          tmp.mesh <- sapply(tmp.mesh, function(xxm) {
            custom_grep(xml_data = xxm, 
                        tag = "DescriptorName", 
                        format = "char")
          })
        }
        
        tmp.keys <- c(tmp.keys, tmp.mesh)
        
        if (length(tmp.keys) > 1) {
          tmp.keys <- paste(tmp.keys, collapse = "; ")
        } else if (length(tmp.keys) < 1) {
          tmp.keys <- NA
        }
      } else {
        NA
      }
    }, error = function(e) {NA})
    
    # Vector with all author-independent fields extracted so far
    tmp.resout <- c(pmid=tmp.PMID, 
                    doi=tmp.DOI, 
                    title=tmp.title,
                    abstract=tmp.abstract,
                    year = as.vector(tmp.date[1]),
                    month = as.vector(tmp.date[2]),
                    day = as.vector(tmp.date[3]),
                    jabbrv=tmp.jabbrv,
                    journal=tmp.journal,
                    keywords=tmp.keys)
    
    # Slow part - authors
    tmp.authors <- custom_grep(xml_data = tmp.article, tag = "AuthorList", format = "char")
    
    if (length(tmp.authors) < 1 | !getAuthors) {
      # Set every placeholder with NA
      final.mat <- data.frame(rbind(c(tmp.resout, 
                                      lastname=NA, 
                                      firstname=NA, 
                                      address=NA, 
                                      email=NA)), stringsAsFactors = FALSE)
    } else {
      author.list <- custom_grep(xml_data = tmp.authors, tag = "Author", format = "char")
      final.mat <- do.call(rbind, lapply(author.list, (function(al) {
        tmp.lastnm <- custom_grep(xml_data = al, tag = "LastName", format = "char")
        tmp.firstnm <- custom_grep(xml_data = al, tag = "ForeName", format = "char")
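        # Heuristic e-mail pattern: local part, "@", domain, and a 2-6 char TLD;
        # a pragmatic match for PubMed affiliation strings, not an RFC-exact one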
        tmp.email <- regexpr("([[:alnum:]]|\\.|\\-|\\_){3,200}@([[:alnum:]]|\\.|\\-|\\_){3,200}(\\.)([[:alnum:]]){2,6}", al)
        if (tmp.email > 0) {
          tmp.email <- substr(al, tmp.email, tmp.email + attributes(tmp.email)$match.length -1 )
        } else {
          tmp.email <- NA
        }
        #
        if (regexpr("Affiliation", al) > 0) {
          tmp.add <- custom_grep(al, "Affiliation", format = "char")[1]
          tmp.add <- trim_address(tmp.add)
        } else {
          tmp.add <- NA
        }
        c(tmp.resout, 
          lastname=tmp.lastnm, 
          firstname=tmp.firstnm, 
          address=tmp.add, 
          email=tmp.email)
        #
      })))
      rownames(final.mat) <- NULL
      
      final.mat <- data.frame(final.mat, stringsAsFactors = FALSE)
      DESELECT <- is.na(final.mat$lastname) | is.na(final.mat$firstname)
      if (length(DESELECT) > 0 & sum(DESELECT) > 0)
        final.mat <- final.mat[!DESELECT, ]
      #
      if (autofill){
        tmp.address <- final.mat[,"address"]
        na.pos <- is.na(tmp.address)
        if (sum(na.pos) != length(tmp.address)) {
          tmp.list <- lapply(tmp.address, function(x) {x} ) 
          cur.add <-  tmp.list[[(which(!na.pos)[1])]]
          for (i in 1:length(na.pos)){
            if(na.pos[i]){
              tmp.list[[i]] <- cur.add
            } else {
              cur.add <- tmp.list[[i]]
            }
          }
          final.mat[,"address"] <- do.call(c, tmp.list)
        }
      }
    }
    
    # Final check and return
    if (ncol(final.mat) != 14) {
      final.mat <- NULL
    }
  }, error = function(e) {NULL}, 
  finally = {
    options(warn = old_warn)
    return(final.mat)
  })
}
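
# A minimal offline sketch of article_to_df() output, based on the bundled
# EPMsamples dataset (documented below); guarded with if (FALSE) so it never
# runs when this file is sourced. The column names follow the 14 expected
# fields listed at the top of the function.
if (FALSE) {
  data("EPMsamples")
  rec <- EPMsamples$NUBL_1618$rec_lst[[1]]
  df <- article_to_df(rec, autofill = TRUE, max_chars = 100)
  names(df)   # "pmid" "doi" "title" ... "address" "email"
  nrow(df)    # one row per author of the record
}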
 
 
 
#' @title Cast PubMed Data into a List of Articles
#'
#' @description Convert an XML object of PubMed records into a list of strings 
#' (character vector of length 1) corresponding to individual PubMed articles. 
#' PubMed records are identified by a "/PubmedArticle" XML tag. This automatically casts 
#' all the content of each PubMed record to a character-class object without removing XML tags.
#' 
#' @usage articles_to_list(pubmed_data, encoding = "UTF8", simplify = TRUE)
#' 
#' @param pubmed_data String corresponding to the name of an XML file (typically, 
#' the result of a batch_pubmed_download() call). Alternatively, a string including 
#' PubMed records with XML tags, such as the object returned by a fetch_pubmed_data() call.
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the 
#' function base::iconv(). See the iconv() help page to find out more about the encodings 
#' that can be used on your platform. "UTF-8" is recommended.
#' @param simplify Logical: should the result be simplified to a character vector? 
#' If FALSE, results are returned as a list.
#' 
#' @details 
#' The input is an XML object or a string including PubMed records (with XML tags). These are the
#' output of easyPubMed functions: fetch_pubmed_data() or batch_pubmed_download(). 
#' The function returns a list or a character vector where each element is a different PubMed record.
#' 
#' @return 
#' List or character vector including all the records from the original XML object in text format. 
#' Elements in the list are not named and are only accessible via their numeric index.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## Retrieve PubMed data and return a list of articles
#'   my_query <- "Damiano Fantini[AU]"
#'   my_query <- get_pubmed_ids(pubmed_query_string = my_query)
#'   my_data <- fetch_pubmed_data(my_query, encoding = "ASCII")
#'   listed_articles <- articles_to_list(my_data)
#'   custom_grep(listed_articles[[2]], "ArticleTitle", "char")
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Download PubMed data and return a list of articles
#' dami_query <- "Damiano Fantini[AU] AND 2018[PDAT]"
#' outfile <- batch_pubmed_download(dami_query, dest_file_prefix = "easyPM_ex001_")
#' listed_articles <- articles_to_list(pubmed_data = outfile)
#' custom_grep(listed_articles[[2]], "ArticleTitle", "char")
#' }
#' 
#' @export
articles_to_list <-
  function(pubmed_data, encoding = "UTF8", simplify = TRUE) 
{
    
  # Define a nested (core) function handling standardized input
  # This assumes a string as input
    easyPM_exec_art_to_lst <- function(pm_dataa, simply = TRUE) 
    {
      # it's a string to process
      pm_datab <- strsplit(pm_dataa, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
      pm_datab <- sapply(pm_datab, function(x) {
        # Trim extra content at the end of the record
        if (!grepl("</PubmedArticle>$", x))
          x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) 
        
        # Rebuild the XML structure and proceed
        x <- paste("<PubmedArticle>", x)
        gsub("[[:space:]]{2,}", " ", x)}, 
        USE.NAMES = FALSE, simplify = simply)
      
      pm_datab
    }
    
  # Execute f(x)
  # Handle inputs of different type
  # check if it is a XMLAbstractDocument or a file
  TMP <- substr(pubmed_data[1], 1, 1000)
  if (grepl("<PubmedArticle", TMP)) {
    
    # it's a string to process
    out <- easyPM_exec_art_to_lst(pubmed_data[1], simply = simplify)
  
  } else if (file.exists(pubmed_data[1])) {
    
    # it's a file
    con1 <- file(pubmed_data[1], encoding = "UTF8")
    on.exit(close(con1))
    myXML <- readLines(con = con1, 
                       n = -1, ok = TRUE, encoding = "UTF8") 
    
    if (encoding != "UTF8")
      myXML <- base::iconv(myXML, from = "UTF8", to = encoding, sub = ".")

    myXML <- paste(myXML, collapse = "")
    
    # Run as above
    out <- easyPM_exec_art_to_lst(myXML, simply = simplify)
    
  } else {
    message("An error occurred")
    return(NULL)  
  }

  return(out)
}  
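
# Self-contained sketch (toy input, guarded with if (FALSE)): records are
# split on the <PubmedArticle> tag and each element is re-wrapped in its
# own <PubmedArticle> tag.
if (FALSE) {
  toy <- paste0("<PubmedArticleSet>",
                "<PubmedArticle><PMID>1</PMID></PubmedArticle>",
                "<PubmedArticle><PMID>2</PMID></PubmedArticle>",
                "</PubmedArticleSet>")
  recs <- articles_to_list(toy)
  length(recs)                             # 2
  custom_grep(recs[[1]], "PMID", "char")   # "1"
}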
    
 
 
#' @title Download PubMed Records in XML or TXT Format
#'
#' @description Performs a PubMed Query (via the get_pubmed_ids() function), downloads the 
#' resulting data (via multiple fetch_pubmed_data() calls) and then saves data in a series of 
#' xml or txt files on the local drive. The function is suitable for downloading 
#' a very large number of records.
#' 
#' @usage batch_pubmed_download(pubmed_query_string, dest_dir = NULL, 
#'                              dest_file_prefix = "easyPubMed_data_", 
#'                              format = "xml", api_key = NULL, 
#'                              batch_size = 400, res_cn = 1, 
#'                              encoding = "UTF8")
#' 
#' @param pubmed_query_string String (character-vector of length 1): this is the string 
#' used for querying PubMed (the standard PubMed Query syntax applies).
#' @param dest_dir String (character-vector of length 1): this string corresponds to the name 
#' of the existing folder where files will be saved. Existing files will be overwritten. 
#' If NULL, the current working directory will be used.
#' @param dest_file_prefix String (character-vector of length 1): this string is used as 
#' prefix for the files that are written locally. 
#' @param format String (character-vector of length 1): data will be requested from Entrez 
#' in this format. Acceptable values are: c("medline","uilist","abstract","asn.1", "xml"). 
#' When format != "xml", data will be saved as text files (txt).
#' @param api_key String (character vector of length 1): user-specific API key to increase 
#' the limit of queries per second. You can obtain your key from NCBI.
#' @param batch_size Integer (1 < batch_size < 5000): maximum number of records 
#' to be saved in a single xml or txt file.
#' @param res_cn Integer (> 0): numeric index of the data batch to start downloading from. 
#' This parameter is useful to resume an incomplete download job after a system crash.
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the 
#' function base::iconv(). See the iconv() help page to find out more about the encodings 
#' that can be used on your platform. Here, we recommend using "UTF-8".
#' 
#' @details 
#' Download large number of PubMed records as a set of xml or txt files that are saved in the 
#' folder specified by the user. This function enforces data integrity. If a batch of downloaded 
#' data is corrupted, it is discarded and downloaded again. Each download cycle is monitored until 
#' the download job is successfully completed. This function should make it possible to download a whole 
#' copy of PubMed, if desired. The function informs the user about the current progress by constantly 
#' printing to the console the number of batches still in queue for download. pubmed_query_string 
#' accepts standard PubMed syntax. The function will query PubMed multiple times using the same 
#' query string. Therefore, it is recommended to include an [EDAT] or a [PDAT] filter in the query 
#' if you want to ensure reproducible results.
#' 
#' @return 
#' Character vector including the names of the files downloaded to the local system.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Example 01: retrieve data from PubMed and save as XML file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out1 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' readLines(out1[1])[1:30]
#' ##
#' ## Example 02: retrieve data from PubMed and save as TXT file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out2 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180, format = "medline")
#' readLines(out2[1])[1:30]
#' }
#' 
#' @export
batch_pubmed_download <-
  function (pubmed_query_string, 
            dest_dir = NULL,
            dest_file_prefix = "easyPubMed_data_",
            format = "xml",
            api_key = NULL,
            batch_size = 400, 
            res_cn = 1, 
            encoding = "UTF8") 
    
{
  baseDir <- getwd()
  if (!is.null(dest_dir)) {
    setwd(as.character(dest_dir))
  }
  fileName.collector <- list()
  myQuery <- NULL
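  # NCBI Entrez allows up to 3 requests/sec without an API key and up to
  # 10 requests/sec with one; hence the 0.34 s and 0.11 s minimum wait times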
  my_rtime <- ifelse(is.null(api_key), 0.34, 0.11)
  
  cur_time <- Sys.time()
  while (is.null(myQuery)) {
    diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
    if (diff_time > 0) {
      Sys.sleep(diff_time)
    }
    cur_time <- Sys.time()
    myQuery <- tryCatch(get_pubmed_ids(pubmed_query_string, api_key = api_key), 
                        error = function(e) NULL)
  }
  pubsNum <- as.numeric(myQuery$Count)
  tmpPapers <- NULL
  myRetstart <- 0
  myRetmax <- batch_size
  j <- 1
  expTot <- ceiling(pubsNum / batch_size)
  while (myRetstart < pubsNum) {
    if (j < res_cn) {
      message(paste("cycle", j, "/", expTot, "skipped...", 
                    sep = " "))
    } else {
      cur_time <- Sys.time()
      while (is.null(myQuery) | is.null(tmpPapers)) {
        
        diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
        if (diff_time > 0) {
          Sys.sleep(diff_time)
        }
        cur_time <- Sys.time()
        
        myQuery <- tryCatch(get_pubmed_ids(pubmed_query_string, api_key = api_key), 
                            error = function(e) NULL)
        
        diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
        
        if (diff_time > 0) {
          Sys.sleep(diff_time)
        }
        cur_time <- Sys.time()
        
        # Force download as XML, but without collapsing strings
        if (format[1] == "xml") {
          format <- "batchxml"
        }
        
        tmpPapers <- tryCatch(fetch_pubmed_data(pubmed_id_list = myQuery, 
                                                retstart = myRetstart, 
                                                retmax = myRetmax,
                                                format = format, 
                                                encoding = encoding),
                              
                              error = function(e) NULL, 
                              finally = print(paste("PubMed data batch", 
                                                    j, "/", 
                                                    expTot, "downloaded...",
                                                    sep = " ")))
        if (is.null(tmpPapers)) {
          message("Data retrieval error. Retrying...")
        }
      }
      totDigits <- nchar(as.character(expTot)) + 1
      myExt <- paste(rep(0, totDigits - nchar(as.character(j))), 
                     collapse = "")
  
      tmp.dest.file <- paste(dest_file_prefix, myExt, j, ".txt", sep = "")
      con1 <- file(tmp.dest.file, encoding = encoding)
      doSaveData <- tryCatch(write(tmpPapers, tmp.dest.file), 
                             error = function(e) {"ERROR"}, 
                             finally = {close(con1)})
      if(is.null(doSaveData))
        doSaveData <- tmp.dest.file
      
      myQuery <- NULL
      tmpPapers <- NULL
      if (doSaveData == "ERROR") {
        myRetstart <- myRetstart - myRetmax
        j <- j - 1
        message("An error occurred... Trying to download data from PubMed again...")
      } else {
        fileName.collector[[1+length(fileName.collector)]] <- doSaveData
      }
    }
    myRetstart <- myRetstart + myRetmax
    j <- j + 1
  }
  setwd(baseDir)
  tryCatch(do.call(c, fileName.collector), error = function(e){NULL})
}
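
# File-naming sketch (illustrative numbers, guarded with if (FALSE)): this
# mirrors the expTot / totDigits / myExt logic used above.
if (FALSE) {
  pubsNum <- 1000; batch_size <- 400
  expTot <- ceiling(pubsNum / batch_size)         # 3 batches expected
  totDigits <- nchar(as.character(expTot)) + 1    # zero-pad to 2 digits
  sapply(seq_len(expTot), function(j) {
    myExt <- paste(rep(0, totDigits - nchar(as.character(j))), collapse = "")
    paste0("easyPubMed_data_", myExt, j, ".txt")
  })
  # "easyPubMed_data_01.txt" "easyPubMed_data_02.txt" "easyPubMed_data_03.txt"
}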
 
 
#' @title Retrieve Text Between XML Tags
#'
#' @description Extract text from a string containing XML or HTML tags. Text 
#' included between tags of interest will be returned. If multiple tagged substrings are found, 
#' they will be returned as different elements of a list or character vector.
#' 
#' @usage custom_grep(xml_data, tag, format = "list")
#' 
#' @param xml_data String (of class character and length 1): corresponds to the PubMed 
#' record or any string including XML/HTML tags.
#' @param tag String (of class character and length 1): the tag of interest (does NOT include < > chars).
#' @param format c("list", "char"): specifies the format for the output.
#' 
#' @details 
#' The input string has to be a character string (length 1) containing tags (HTML or XML format). 
#' If an XML Document is provided as input, the function will raise an error.
#' 
#' @return 
#' List or vector where each element corresponds to an in-tag substring.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## extract substrings based on regular expressions
#'   string_01 <- "I can't wait to watch the <strong>Late Night Show with" 
#'   string_01 <- paste(string_01, "Seth Meyers</strong> tonight at <strong>11:30</strong>pm CT!")
#'   print(string_01)
#'   custom_grep(xml_data = string_01, tag = "strong", format = "char")
#'   custom_grep(xml_data = string_01, tag = "strong", format = "list")
#' }, silent = TRUE)
#' 
#' @export
custom_grep <-
  function(xml_data, 
           tag, 
           format = "list")
  {
    x <- xml_data[[1]]
    tag.op <- paste("\\<", tag, "((\\>)|([[:space:]]([^[<]]*)\\>))", sep = "")
    tag.cl <- paste("(<\\/)", tag, "(\\>)", sep = "")
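    # NOTE: in R's default (TRE) regex engine, "\\<" and "\\>" are word-boundary
    # anchors, so tag.op matches the tag name whether or not XML attributes
    # follow; the "- 1" below then backs up one character to include the "<"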
    #
    out.result <- list()
    i <- 1
    while (!is.null(x) &&
           !is.na(x) &&
           x != "" &&
           nchar(x) > 0 &&
           regexpr(tag.op, x) > 0 &&
           regexpr(tag.cl, x) > 0){
      tag.op.pos <- regexpr(tag.op, x)
      nu.x <- substr(x, (tag.op.pos - 1), nchar(x))
      inner.trim <- regexpr(">", nu.x, fixed = TRUE)
      nu.x <- substr(nu.x, (inner.trim + 1), nchar(nu.x))
      #
      tag.cl.pos <- regexpr(tag.cl, nu.x)
      tag.cl.full <- tag.cl.pos + attributes(tag.cl.pos)$match.length + 1
      x <- substr(nu.x, tag.cl.full, nchar(nu.x))
      nu.x <- substr(nu.x, 1, (tag.cl.pos - 1))
      #
      out.result[[i]] <- nu.x
      i <- i + 1
    }
    if (format != "list") {
      out.result <- do.call(c, out.result)
    }
    return(out.result)
  }
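
# Extra offline sketch (illustrative input, guarded with if (FALSE)):
# custom_grep() also handles tags that carry XML attributes, as in the
# Keyword fields of PubMed records.
if (FALSE) {
  x <- '<Keyword MajorTopicYN="N">bladder</Keyword><Keyword>cancer</Keyword>'
  custom_grep(x, "Keyword", format = "char")   # "bladder" "cancer"
}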

 
#' @title Retrieve All PubMed Record Identifiers Returned by a Query
#'
#' @description Retrieve PubMed record identifiers from Entrez following a search performed 
#' via the get_pubmed_ids() function. Identifiers are returned as a character vector.
#' 
#' @usage fetch_all_pubmed_ids(pubmed_id_list)
#' 
#' @param pubmed_id_list List: the result of a get_pubmed_ids() call.
#' 
#' @details 
#' Retrieve PubMed identifiers, without any other information (such as article title, 
#' authors, publication date, and so on). The PubMed IDs can be stored or used with other software.
#' 
#' @return 
#' Character vector including all PMID (PubMed Identifiers) returned by the current query.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Fetch only PubMed Record IDs (PMIDs)
#' dami_query_string <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_pmids <- fetch_all_pubmed_ids(dami_on_pubmed)
#' print(dami_pmids)
#' 
#' }
#' 
#' @export
fetch_all_pubmed_ids <-
  function(pubmed_id_list)
{
  # expected records, set retmax
  exp_num <- as.numeric(pubmed_id_list$Count)
  if (is.numeric(exp_num) && exp_num > 0) {
    my_retmax <- exp_num + 1
  } else {
    my_retmax <- 100000
  }
  
  # query, and then extract IDs
  myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", 
                       "db=pubmed&retmax=", my_retmax, "&term=", pubmed_id_list$OriginalQuery, "&usehistory=n", sep = "")
  IDconnect <- url(myPubmedURL, open = "rb", encoding = "UTF8")
  on.exit(close(IDconnect))
  idXML <- readLines(IDconnect, warn = FALSE, encoding = "UTF8") 

  collect_ids <- list()
  for (i in seq_along(idXML)) {
    if (grepl("^<Id>", idXML[i])) {
      xx <- custom_grep(idXML[i], tag = "Id", format = "char")
      collect_ids[[length(collect_ids) + 1]] <- as.character(xx[1])
    }
  }
  myIDlist <- as.character(do.call(c, collect_ids))
  
  # final check and return
  if(length(myIDlist) != exp_num)
    message(paste("Note that only ", length(myIDlist), " PubMed IDs were retrieved (", 
                  exp_num, " were expected).", sep = ""))
  
  return(myIDlist)
}
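
# Consistency sketch (guarded with if (FALSE)): the number of retrieved IDs
# should match the Count field of the original query; a mismatch triggers
# the message above.
if (FALSE) {
  ids <- get_pubmed_ids("Damiano Fantini[AU]")
  pmids <- fetch_all_pubmed_ids(ids)
  length(pmids) == as.numeric(ids$Count)   # TRUE when retrieval is complete
}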
 
 
#' @title Retrieve PubMed Data in XML or TXT Format
#'
#' @description Retrieve PubMed records from Entrez following a search performed via the 
#' get_pubmed_ids() function. Data are downloaded in the XML or TXT format and are 
#' retrieved in batches of up to 5000 records.
#' 
#' @usage fetch_pubmed_data(pubmed_id_list, 
#'                          retstart = 0, 
#'                          retmax = 500, 
#'                          format = "xml", 
#'                          encoding = "UTF8")
#' 
#' @param pubmed_id_list List: the result of a get_pubmed_ids() call.
#' @param retstart Integer (>=0): index of the first UID in the retrieved PubMed Search Result set 
#' to be included in the output (default=0, corresponding to the first record of the entire set).
#' @param retmax Integer (>=1): size of the batch of PubMed records to be retrieved at one time.
#' @param format Character: element specifying the output format. The following values are allowed: 
#' c("asn.1", "xml", "medline", "uilist", "abstract").
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the function base::iconv(). 
#' See the iconv() help page to find out more about the encodings that can be used on your platform. 
#' Here, we recommend using "UTF-8".
#' 
#' @details 
#' Retrieve PubMed records based on the results of a get_pubmed_ids() query. 
#' Records are retrieved from Entrez via the PubMed API efetch function. The first entry to be retrieved 
#' may be adjusted via the retstart parameter (this allows the user to download large batches of PubMed 
#' data in multiple runs). The maximum number of entries to be retrieved can also be set by adjusting the 
#' retmax parameter (1 <= retmax <= 5000). Data will be downloaded on the fly (no files are saved 
#' locally).
#' 
#' @return 
#' An object (vector) of class "character". If format is set to "xml" (default), a single string including all 
#' PubMed records (with XML tags embedded) is returned. If a different format is selected, a vector of strings 
#' is returned, where each element corresponds to a line of the output document.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references 
#' \url{https://www.data-pulse.com/dev_site/easypubmed/}
#' \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/}
#'
#' @examples
#' try({ 
#'   ## Example 01: retrieve data in TXT format
#'   library("easyPubMed")
#'   dami_query_string <- "Damiano Fantini[AU] AND 2018[PDAT]"
#'   dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#'   Sys.sleep(1) # avoid server timeout
#'   dami_papers <- fetch_pubmed_data(dami_on_pubmed, format = "abstract")
#'   dami_papers[dami_papers == ""] <- "\n"
#'   cat(paste(dami_papers[1:65], collapse = ""))
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Example 02: retrieve data in XML format
#' library("easyPubMed")
#' dami_query_string <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_papers <- fetch_pubmed_data(dami_on_pubmed)
#' titles <- custom_grep(dami_papers, "ArticleTitle", "char")
#' print(titles)
#' }
#' 
#' @importFrom utils head
#' @export
fetch_pubmed_data <-
  function (pubmed_id_list,
            retstart = 0,
            retmax = 500,
            format = "xml", 
            encoding = "UTF8") 
{
  myIDlist <- pubmed_id_list
  if ((!is.list(myIDlist)) || is.na(myIDlist$WebEnv) || is.na(myIDlist$QueryKey) || 
      is.na(myIDlist$Count) || !is.integer(as.integer(retstart)) || 
      !is.integer(as.integer(retmax))) {
    message("There is an issue with the PubMed ID list you supplied. Please, call the function again and supply the result of a <get_pubmed_ids()> call as argument. Thank you.")
    return(NULL)
  } else {
    myWebEnv <- myIDlist$WebEnv
    myKey <- myIDlist$QueryKey
    myCount <- as.numeric(as.character(myIDlist$Count))
    myRetstart <- as.integer(retstart)
    if (myRetstart < 0) {
      myRetstart <- 0
    }
    myRetmax <- as.integer(retmax)
    if (myRetmax > 5000) {
      myRetmax <- 5000
    }
    if (myRetmax < 1) {
      myRetmax <- 1
    }
    if (format[1] %in% c("medline","uilist","abstract","asn.1", "xml")) {
      myFormat <- format[1]
    } else {
      myFormat <- "xml"
    }
    typeMode <- switch(myFormat, 
                       "asn.1" = c("null", "asn.1"),
                       "xml" = c("null", "xml"),
                       "medline" = c("medline", "text"),
                       "uilist" = c("uilist", "text"),
                       "abstract" = c("abstract", "text"))
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", 
                       "db=pubmed&WebEnv=", myWebEnv, "&query_key=", myKey, 
                       "&retstart=", myRetstart, "&retmax=", myRetmax, 
                       "&rettype=", typeMode[1],"&retmode=", typeMode[2], 
                       sep = "")
    
    # api_key retrieval
    api_key <- pubmed_id_list$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }
    
    # initialize output
    out.data <- NULL
    
    # initialize extra params
    try_num <- 1
    t_0 <- Sys.time()
    
    # Try to fetch results
    while(is.null(out.data)) {
      
      # Timing check: kill the request after 5 minutes
      if (try_num > 1)
        Sys.sleep(time = 2)
      
      t_1 <- Sys.time()
      
      if(as.numeric(difftime(t_1, t_0, units = "mins")) > 5){
        message("Killing the request! Something is not working. Please, try again later")
        return(NULL)
      }
      
      # ENTREZ server connect
      out.data <- tryCatch({    
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        # message(e)
        NULL
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })  
      
      # Check if error
      if (!is.null(out.data) && 
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
          # message(out.data)
        out.data <- NULL
      }
      try_num <- try_num + 1
    }
    
    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later")
      return(NULL)
    }
    
    if (encoding != "UTF8")
      out.data <- base::iconv(out.data, from = "UTF8", to = encoding, sub = ".")

    if (format[1] == "xml") {
      out.data <- paste(out.data, collapse = "")
    }
                          
    return(out.data)
  }
}
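
# Paging sketch (hypothetical hit count, guarded with if (FALSE)): retstart
# and retmax walk through a large result set in consecutive chunks.
if (FALSE) {
  ids <- get_pubmed_ids("Machine Learning[TI] AND 2016[PD]")
  chunk1 <- fetch_pubmed_data(ids, retstart = 0,    retmax = 500)
  chunk2 <- fetch_pubmed_data(ids, retstart = 500,  retmax = 500)
  chunk3 <- fetch_pubmed_data(ids, retstart = 1000, retmax = 500)
}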
 
 
#' @title Simple PubMed Record Search by Full-length Title
#'
#' @description Query PubMed (Entrez) in a simple way via the PubMed API eSearch function. 
#' This function is designed to query PubMed using a full-length publication title as query string. 
#' It performs stopword removal from the query string before querying the PubMed server. 
#' Calling this function results in posting the results on the PubMed History Server. 
#' This allows later access to the resulting data via the fetch_pubmed_data() function, 
#' or other easyPubMed functions.
#' 
#' @usage get_pubmed_ids_by_fulltitle(fulltitle, field = "[Title]", api_key = NULL)
#' 
#' @param fulltitle String (character vector of length 1) that corresponds to the full-length 
#' publication title used for querying PubMed (titles should be used as is, without 
#' adding extra filters/tags).
#' @param field String (character vector of length 1) with a tag indicating the PubMed 
#' record field where the full-length string (fulltitle) should be searched in. By default, 
#' this points to the 'Title' field. This field can be changed (use fields supported by PubMed) 
#' as required by the user (for example, to attempt an exact-match query using a specific sentence 
#' included in the abstract of a record).
#' @param api_key String (character vector of length 1): user-specific API key to increase 
#' the limit of queries per second. You can obtain your key from NCBI.
#' 
#' @details 
#' This function will use the String provided as argument for querying PubMed via the eSearch 
#' function of the PubMed API. The Query Term should include a full-length publication title, 
#' without other PubMed operators (AND, OR, NOT) nor tags (i.e., [AU], [PDAT], 
#' [Affiliation], and so on). ESearch will post the UIDs resulting from the search operation 
#' onto the History server so that they can be used directly in a subsequent fetch_pubmed_data() call.
#' 
#' @return 
#' The function returns a list. The list includes the number of records found on PubMed and the first 
#' 20 PubMed IDs (UID) retrieved by the query. The list also includes QueryKey and WebEnv that are 
#' required for a subsequent fetch_pubmed_data() call.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ##  Search for a scientific article matching a full-length title
#' my_query <- "Body mass index and cancer risk among Chinese patients with type 2 diabetes mellitus"
#' my_field <- "[Title]"
#' # Full-length title query (designed to query titles)
#' res0 <- get_pubmed_ids(my_query)
#' print(as.numeric(res0$Count))
#' # Weird count!
#' res <- get_pubmed_ids_by_fulltitle(my_query, field = my_field)
#' # Num results = 1 as expected
#' print(as.numeric(res$Count))
#' 
#' }
#' 
#' @export
get_pubmed_ids_by_fulltitle <- 
  function(fulltitle, field = "[Title]", api_key = NULL) 
{
  out <- get_pubmed_ids(paste("\"", fulltitle, "\"", field, sep = ""), api_key = api_key)
  if (as.numeric(out$Count) > 0) {
    return (out)
  }
  
  stopwords <- easyPubMed::PubMed_stopwords
  keys <- strsplit(fulltitle, split = "[[:space:]]")[[1]]
  keys <- tolower(keys)
  keys <- keys[!keys %in% stopwords]
  Sys.sleep(0.34)
  new_query <- paste(keys, field, sep = "", collapse = " AND ")
  return(get_pubmed_ids(new_query, api_key = api_key))
}
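
# Fallback-query sketch (illustrative title, guarded with if (FALSE)): when
# the exact-phrase query returns no hits, the title is tokenized, stopwords
# are removed, and the remaining terms are AND-ed with the field tag
# (assuming "the" and "of" are among the bundled stopwords).
if (FALSE) {
  keys <- tolower(strsplit("The biology of the bladder", "[[:space:]]")[[1]])
  keys <- keys[!keys %in% easyPubMed::PubMed_stopwords]
  paste(keys, "[Title]", sep = "", collapse = " AND ")
  # e.g. "biology[Title] AND bladder[Title]"
}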
 
 
#' @title Simple PubMed Record Search
#'
#' @description Query PubMed (Entrez) in a simple way via the PubMed API eSearch function. 
#' Calling this function results in posting the query results on the PubMed History Server. 
#' This allows later access to the resulting data via the fetch_pubmed_data() function, 
#' or other easyPubMed functions.
#' 
#' @usage get_pubmed_ids(pubmed_query_string, api_key = NULL)
#' 
#' @param pubmed_query_string is a string (character vector of length 1) that is used 
#' for querying PubMed (standard PubMed syntax; see reference for details).
#' @param api_key String (character vector of length 1): user-specific API key to 
#' increase the limit of queries per second. You can obtain your key from NCBI.
#' 
#' @details 
#' This function will use the String provided as argument for querying PubMed via the eSearch 
#' function of the PubMed API. The Query Term can include one or multiple words, as well as the standard 
#' PubMed operators (AND, OR, NOT) and tags (i.e., [AU], [PDAT], [Affiliation], and so on). ESearch will 
#' post the UIDs resulting from the search operation onto the History server so that they can be used directly 
#' in a subsequent fetch_pubmed_data() call.
#' 
#' @return 
#' The function returns a list. The list includes the number of records found on PubMed and 
#' the first 20 PubMed IDs (UID) retrieved by the query. The list also includes QueryKey and WebEnv 
#' that are required for a subsequent fetch_pubmed_data() call.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references 
#' \url{https://www.data-pulse.com/dev_site/easypubmed/}
#' \url{https://www.ncbi.nlm.nih.gov/books/NBK3827/#_pubmedhelp_Search_Field_Descriptions_and_}
#'
#' @examples 
#' try({
#'   ##  Search for scientific articles written by Damiano Fantini
#'   ##  and print the number of retrieved records to screen.
#'   ##  Also print the retrieved UIDs to screen.
#'   ##
#'   dami_on_pubmed <- get_pubmed_ids("Damiano Fantini[AU]")
#'   print(dami_on_pubmed$Count)
#'   print(unlist(dami_on_pubmed$IdList))
#' }, silent = TRUE)
#' 
#' @export
get_pubmed_ids <- function (pubmed_query_string, 
                            api_key = NULL) 
{
  # Silence warnings
  old_warn <- options()$warn
  options(warn = -1)
  
  # Timing
  t_0 <- Sys.time()
  
  myQuery <- as.character(pubmed_query_string)
  myQuery <- gsub(" ", "+", myQuery, fixed = TRUE)
  myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", 
                       "db=pubmed&term=", myQuery, "&usehistory=y", sep = "")
  if (!is.null(api_key)) {
    myPubmedURL <- paste(myPubmedURL, "&api_key=", api_key, sep = "")
  }
  
  idXML <- NULL
  try_num <- 1
  while(is.null(idXML)) {
    
    # Timing check: kill at 2 min
    if (try_num > 1)
      Sys.sleep(time = 2)
    
    t_1 <- Sys.time()
    
    if(as.numeric(difftime(t_1, t_0, units = "mins")) > 2){
      message("Killing the request! Something is not working. Please, try again later")
      options(warn = old_warn)
      return(NULL)
    }
    
    # ENTREZ server connect
    idXML <- tryCatch({    
      IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
      idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
      idXML <- paste(idXML, collapse = "")
      if (grepl("<ERROR>", substr(idXML, 1, 250))) {
        # message(idXML)
        NULL
      } else {
        idXML
      }
    }, error = function(e) {
      # message(e)
      NULL
    }, finally = {
      try(suppressWarnings(close(IDconnect)), silent = TRUE)
    })  
    
    # Data processing (if result not null)
    myIDlist <- NULL
    
    if (!is.null(idXML)) {
      tryCatch({
        
        # Initialize collector
        myIDlist <- list()
        
        my_tags <- c("Count", "RetMax", "RetStart", 
                     "QueryKey", "WebEnv", "IdList", 
                     "TranslationSet", "QueryTranslation")
        
        # First pass
        for (j in 1:length(my_tags)) {
          ttag <- my_tags[j]
          xx <- custom_grep(idXML, tag = ttag, "char")
          myIDlist[[ttag]] <- xx[1]
        }
        
        # Try to expand IdList
        xx <- myIDlist[["IdList"]]
        xx <- custom_grep(xx, "Id", format = "list")
        names(xx) <- rep("Id", length(xx))
        myIDlist[["IdList"]] <- xx
        
        # Try to expand TranslationSet
        xx <- myIDlist[["TranslationSet"]]
        myIDlist[["TranslationSet"]] <- list()
        nutag <- c("From", "To")
        for (z in nutag) {
          yy <- custom_grep(xx, z, format = "char")
          myIDlist[["TranslationSet"]][[z]] <- yy[1]
        }
        
      }, error = function(e) {
        # Flag the failure so the while() loop retries the request
        idXML <<- NULL
      })
    }
    
    # Final check!
    if(!is.list(myIDlist)) {
      idXML <- NULL  
    }
  
  try_num <- try_num + 1
  }
  
  # Wrap up and return
  myIDlist[['OriginalQuery']] <- myQuery
  myIDlist[['APIkey']] <- api_key
  
  # Restore warnings
  options(warn = old_warn)
  
  return(myIDlist)
}
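
# Shape of the returned list (field names taken from the parsing code above;
# values come from the live Entrez server). Guarded with if (FALSE). Note that
# assigning a NULL api_key drops the APIkey element from the list.
if (FALSE) {
  res <- get_pubmed_ids("Damiano Fantini[AU]")
  names(res)
  # "Count" "RetMax" "RetStart" "QueryKey" "WebEnv" "IdList"
  # "TranslationSet" "QueryTranslation" "OriginalQuery" ("APIkey")
}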
 
 
#' @title Extract Publication and Affiliation Data from PubMed Records
#'
#' @description Extract Publication Info from PubMed records and cast data into a 
#' data.frame where each row corresponds to a different author. It is possible to limit
#' data extraction to first authors or last authors only, or get information about 
#' all authors of each PubMed record.
#' 
#' @usage table_articles_byAuth(pubmed_data, 
#'                              included_authors = "all", 
#'                              max_chars = 500, 
#'                              autofill = TRUE, 
#'                              dest_file = NULL, 
#'                              getKeywords = TRUE, 
#'                              encoding = "UTF8")
#' 
#' @param pubmed_data PubMed Data in XML format: typically, an XML file resulting from a 
#' batch_pubmed_download() call or an XML object, result of a fetch_pubmed_data() call.
#' @param included_authors Character: c("first", "last", "all"). Only includes information 
#' from the first, the last or all authors of a PubMed record.
#' @param max_chars Numeric: maximum number of chars to extract from the AbstractText field.
#' @param autofill Logical. If TRUE, missing affiliations are imputed according to the available 
#' values (from the same article).
#' @param dest_file String (character of length 1). Name of the file that will be written for 
#' storing the output. If NULL, no file will be saved.
#' @param getKeywords Logical. If TRUE, the operation will attempt to extract PubMed record 
#' keywords (MeSH terms and author keywords).
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the function 
#' base::iconv(). See the iconv() help page to find out more about the encodings that can be 
#' used on your platform. Here, we recommend using "UTF-8".
#' 
#' @details 
#' Retrieve publication and author information from PubMed data, and cast them as a data.frame.
#' 
#' @return 
#' Data frame including the following fields: c("pmid", "doi", "title", "abstract", 
#' "year", "month", "day", "jabbrv", "journal", "keywords", "lastname", "firstname", 
#' "address", "email").
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Cast PubMed record info into a data.frame
#'
#' ## Auto-fill enabled
#' dami_query <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query)
#' dami_abstracts_xml <- fetch_pubmed_data(dami_on_pubmed, encoding = "ASCII")
#' xx <- table_articles_byAuth(pubmed_data = dami_abstracts_xml, 
#'                             included_authors = "first", 
#'                             max_chars = 100, 
#'                             autofill = TRUE)
#' 
#' print(xx[1:5, c("pmid", "lastname", "jabbrv")])
#' #
#' ## Download records first
#' ## Also, auto-fill disabled
#' dami_query <- "Damiano Fantini[AU]"
#' curr.file <- batch_pubmed_download(dami_query, dest_file_prefix = "test_bpd_", encoding = "ASCII")
#' xx <- table_articles_byAuth(pubmed_data = curr.file[1], 
#'                             included_authors = "all", 
#'                             max_chars = 20, 
#'                             autofill = FALSE)
#' print(xx[1:5, c("pmid", "lastname", "jabbrv")])
#' 
#' }
#' @importFrom utils write.table
#' 
#' @export
table_articles_byAuth <-
  function (pubmed_data, 
            included_authors = "all", 
            max_chars = 500, 
            autofill = TRUE, 
            dest_file = NULL, 
            getKeywords = TRUE, 
            encoding = "UTF8") 
{
  if (!included_authors %in% c("all", "first", "last")) 
    stop("Method is not supported!")
  message("Processing PubMed data ", appendLF = FALSE)
  
  paper.data <- articles_to_list(pubmed_data = pubmed_data, encoding = encoding)
  
  expFields <- c("pmid", "doi", "title", "abstract", "year", "month", "day", "jabbrv", 
                 "journal", "keywords", "lastname", "firstname", "address", "email")
  papers.authors.list <- lapply(1:length(paper.data), (function(i) {
    if (length(paper.data) > 50) {
      rep.dot <- as.integer(seq(1, length(paper.data), 
                                length.out = 50))
      if (i %in% rep.dot) 
        message(".", appendLF = FALSE)
    } else {
      message(".", appendLF = FALSE)
    }
    art <- paper.data[[i]]
    out <- tryCatch({article_to_df(pubmedArticle = art, 
                                  autofill = autofill, 
                                  max_chars = max_chars, 
                                  getKeywords = getKeywords, 
                                  getAuthors = TRUE)}, 
                    error = function(e) { NULL })
    
    if (is.null(out)) {
      out <- data.frame(pmid = NA, doi = NA, title = NA, 
                        abstract = NA, year = NA, month = NA, day = NA, 
                        jabbrv = NA, journal = NA, keywords = NA, lastname = NA, firstname = NA, 
                        address = NA, email = NA)
    }
    if (included_authors == "first") {
      out <- out[1, ]
    } else if (included_authors == "last") {
      out <- out[nrow(out), ]
    } 
    
    # Handle missing fields exception
    out2 <- data.frame(rebuild = (1:nrow(out))) 
    for (jj in 1:length(expFields)) {
      if (expFields[jj] %in% names(out)) {
        out2[,expFields[jj]] <- out[,expFields[jj]]
      } else {
        out2[,expFields[jj]] <- NA
      }
    }
    out2[,-1]
  }))
  message(" done!")
  
  
  papers.authors.df <- do.call(rbind, papers.authors.list)
  keep.rw <- apply(papers.authors.df, 1, (function(rw) {
    sum(is.na(rw)) < length(rw)
  }))
  papers.authors.df <- papers.authors.df[keep.rw, ]
  if (!is.null(dest_file)) {
    if (is.character(dest_file) && length(dest_file) == 1) {
      tryCatch(utils::write.table(papers.authors.df, dest_file, fileEncoding = encoding), 
               error = function(e) {
                 NULL
               })
    }
  }
  return(papers.authors.df)
}
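
# Round-trip sketch (hypothetical file name, guarded with if (FALSE)): when
# dest_file is set, the table is written via utils::write.table() and can be
# read back with read.table().
if (FALSE) {
  xml_data <- fetch_pubmed_data(get_pubmed_ids("Damiano Fantini[AU]"))
  xx <- table_articles_byAuth(xml_data, dest_file = "pm_table.txt")
  yy <- read.table("pm_table.txt", stringsAsFactors = FALSE)
  identical(dim(xx), dim(yy))
}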
 
 
#' @title Trim and Format Address Information
#'
#' @description Set of rules for trimming and standardizing the format of address information 
#' retrieved from PubMed records. Affiliations including more than one address will be trimmed, 
#' and only the first address will be returned.
#' 
#' @usage trim_address(addr)
#' 
#' @param addr Character string including an address as extracted from PubMed records.
#' 
#' @return 
#' Character string including a formatted and trimmed address (if available).
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' addr_string <- " 2 Dept of Urology, Feinberg School of Medicine," 
#' addr_string <- paste(addr_string, "Chicago, US; Dept of Mol Bio as well...")
#' print(addr_string)
#' print(trim_address(addr = addr_string))
#' 
#' @export
trim_address <-
function(addr) 
{
  out.addr <- gsub("(^([[:space:]]{0,5}[[:digit:]]{1,2}[[:space:]]{0,2}))", "", addr)
  out.addr <- gsub("(\\.).{0,5}$", "", out.addr)
  out.addr <- gsub("(\\;).*$", "", out.addr)
  return (out.addr)
}
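
# Step-by-step trace of the three substitutions above, using the string from
# the @examples block (" 2 Dept of Urology, ... Chicago, US; Dept of Mol Bio
# as well..."):
#   1. the leading list-index digits (" 2 ") are stripped;
#   2. a trailing period plus up to 5 characters is removed;
#   3. everything after the first ";" (any secondary address) is dropped,
#      returning "Dept of Urology, ..., Chicago, US".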
 
 
 
#' @title PubMed Records downloaded and analyzed via easyPubMed
#'
#' @description This dataset includes a collection of 4 examples showing how to download and analyze records 
#' from PubMed by using easyPubMed. Each element in the EPMsamples list corresponds to a different query 
#' and/or analysis. Also, each element of EPMsamples is a list including intermediates and notes about the analysis.
#'
#' @usage data("EPMsamples")
#' 
#' @format The dataset is formatted as a list including 4 elements: 
#' 
#' * `DF_papers_abs`: List of 4
#' 
#' * `DF_papers_std`: List of 4
#' 
#' * `NUBL_dw18`: List of 3
#' 
#' * `NUBL_1618`: List of 5
#' 
#'   
#' @details The dataset was built as described in this vignette: \url{https://www.data-pulse.com/projects/Rlibs/vignettes/building_the_easyPubMed_EPMsamples_dataset.html}
#'
#' @examples 
#' ## Display some contents
#' data("EPMsamples")
#' # The following examples are focused on example query #4 (i.e., NUBL_1618)
#' # Display Query String used for collecting the data
#' print(EPMsamples$NUBL_1618$qry_st)
#' # Show one PubMed record from the record list
#' NU_records <- EPMsamples$NUBL_1618$rec_lst
#' cat(NU_records[[1]])
#' # Cast a PubMed record to data.frame
#' BL_df <- article_to_df(NU_records[[6]], max_chars = 0)
#' print(BL_df)
"EPMsamples"

#' @title PubMed Stopwords
#'
#' @description This dataset includes a collection of PubMed stopwords, i.e., common 
#' words that are typically removed from a query string before querying PubMed 
#' (for example, by the get_pubmed_ids_by_fulltitle() function).
#'
#' @usage data("PubMed_stopwords")
#' 
#' @format A character vector including all PubMed stopwords that are typically filtered out from queries. 
#'   
#' @details Number of stopwords included: n = 133.
#'
#' @examples 
#' ## Display some contents
#' data("PubMed_stopwords")
#' head(PubMed_stopwords)
"PubMed_stopwords"

#' @title Retrieve and Process Scientific Publication Records from Pubmed
#' 
#' @description Query NCBI Entrez and retrieve PubMed records in XML or TXT format. PubMed records 
#' can be downloaded and saved as XML or text files. Data integrity is enforced during data download, 
#' making it possible to retrieve and save a very large number of records effortlessly. PubMed records 
#' can be processed to extract publication- and author-specific information.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples
#' try({
#'   ## Example 01: retrieve data in TXT format
#'   dami_query_string <- "Damiano Fantini[AU]"
#'   dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#'   dami_papers <- fetch_pubmed_data(dami_on_pubmed, format = "abstract")
#'   dami_papers[dami_papers == ""] <- "\n"
#'   cat(paste(dami_papers[1:65], collapse = ""))
#'   #
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Example 02: retrieve data in XML format
#' library("easyPubMed")
#' dami_query_string <- "Damiano Fantini[AU] AND 2018[PDAT]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_papers <- fetch_pubmed_data(dami_on_pubmed)
#' titles <- sapply(dami_papers, custom_grep, tag = "ArticleTitle", format = "char", USE.NAMES = FALSE)
#' print(titles)
#' #
#' ## Example 03: retrieve data from PubMed and save as XML file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out1 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' x <- paste(readLines(out1[1], n = 10), collapse = "\n")
#' cat(x)
#' #
#' ## Example 04: retrieve data from PubMed and save as TXT file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out2 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180, format = "medline")
#' x <- paste(readLines(out2[1], n = 30), collapse = "\n")
#' cat(x)
#' #
#' ## Example 05: extract information from a single PubMed record 
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out3 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' PM_data <- articles_to_list(out3[1])
#' PM_record_df <- article_to_df(PM_data[[80]])
#' print(PM_record_df[1,])
#' print(PM_record_df[,"address"])
#' #
#' ## Example 06: query PubMed and extract information from multiple records in one step 
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out4 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' PM_tab <- table_articles_byAuth(out4[1], autofill = TRUE, included_authors = "last")
#' PM_tab$address <- substr(PM_tab$address, 1, 12)
#' PM_tab[50:70,c("pmid", "jabbrv", "year", "lastname", "address")]
#' }
#' 
#' @keywords internal
"_PACKAGE"
