# R/easyPubMed_scr.R

#' @title Extract Data from a PubMed Record
#'
#' @description Extract publication-specific information from a PubMed record driven by XML tags. 
#' The input record is a string (character-class vector of length 1) and includes 
#' PubMed-specific XML tags. Data are returned as a data frame where each row corresponds 
#' to one of the authors of the PubMed article.
#'
#' @usage article_to_df(pubmedArticle, autofill = FALSE, 
#'                      max_chars = 500, getKeywords = FALSE, 
#'                      getAuthors = TRUE)
#'                     
#' @param pubmedArticle String including one PubMed record.
#' @param autofill Logical. If TRUE, missing affiliations are automatically imputed based on other non-NA 
#' addresses from the same record.
#' @param max_chars Numeric (integer). Maximum number of characters to be extracted from the Article 
#' Abstract field. Set max_chars to -1 for extracting the full-length abstract. Set max_chars to 0 to 
#' extract no abstract.
#' @param getKeywords Logical. If TRUE, an attempt to extract article Keywords will be made.
#' @param getAuthors Logical. If FALSE, author information won't be extracted. This will considerably 
#' speed up the operation.
#'
#' @details 
#' Given one PubMed Article record, this function will automatically extract a set of features. 
#' Extracted information includes: PMID, DOI, article title, article abstract, publication date (year, month, day), 
#' journal name (title, abbreviation), keywords, and a set of author-specific details (names, affiliation, email address). 
#' Each row of the output data frame corresponds to one of the authors of the PubMed record. Author-independent info 
#' (publication ID, title, journal, date) is identical across all rows. If author information is not required, 
#' set 'getAuthors' = FALSE.
#'
#' @return Data frame including the extracted features. Each row corresponds to a different author.
#'
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## Display some contents
#'   data("EPMsamples")
#'   # Display the query string used for collecting the data
#'   print(EPMsamples$NUBL_1618$qry_st)
#'   # Get records
#'   BL_list <- EPMsamples$NUBL_1618$rec_lst
#'   cat(BL_list[[1]])
#'   # Cast a PubMed record to data.frame
#'   BL_df <- article_to_df(BL_list[[1]], max_chars = 0)
#'   print(BL_df)
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Query PubMed, retrieve a selected citation and format it as a data frame
#' dami_query <- "Damiano Fantini[AU] AND 2017[PDAT]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query)
#' dami_abstracts_xml <- fetch_pubmed_data(dami_on_pubmed)
#' dami_abstracts_list <- articles_to_list(dami_abstracts_xml)
#' article_to_df(pubmedArticle = dami_abstracts_list[[1]], autofill = FALSE)
#' article_to_df(pubmedArticle = dami_abstracts_list[[2]], autofill = TRUE, max_chars = 300)[1:2,]
#' }
#'
#' @export
article_to_df <- function(pubmedArticle, 
                          autofill = FALSE, 
                          max_chars = 500, 
                          getKeywords = FALSE,
                          getAuthors = TRUE) 
{
  # Suppress warnings while parsing; the previous setting is restored in 'finally'
  old_warn <- options()$warn
  options(warn = -1)
  
  # initial check
  # expected cols = 14
  # "pmid", "doi", "title", "abstract", "year", "month", "day", "jabbrv", "journal", 
  # "keywords", "lastname", "firstname", "address", "email" 
  
  # Global Check!
  if (!is.character(pubmedArticle) ||
      regexpr("(<PubmedArticle)(.+)(\\/PubmedArticle>)", pubmedArticle) < 0 )
  {
    message("An error occurred")
    return(NULL)
  }
  
  # max_chars Check
  if (!is.numeric(max_chars)) {
    max_chars <- 500
  } else if (max_chars < 0) {
    max_chars <- -1  
  }
  
  # Get started; initialize the result so 'finally' can return safely on error
  final.mat <- NULL
  tryCatch({
    
    tmp.article <- custom_grep(xml_data = pubmedArticle, tag = "PubmedArticle", format = "char")
    if (is.null(tmp.article)) 
    {
      message("An error occurred")
      return(NULL)
    }
    
    # Title
    tmp.title <- custom_grep(xml_data = tmp.article, tag = "ArticleTitle", format = "char")
    if (length(tmp.title) > 1){
      tmp.title <- paste(tmp.title, collapse = " ", sep = " ")
    } else if (length(tmp.title) < 1) {
      tmp.title <- NA
    }
    
    # Abstract
    tmp.abstract <- custom_grep(xml_data = tmp.article, tag = "AbstractText", format = "char")
    if (length(tmp.abstract) < 1) {
      tmp.abstract <- NA
    } else {
      if (length(tmp.abstract) > 1) {
        tmp.abstract <- paste(tmp.abstract, collapse = " ")
      }
      if (max_chars >= 0) {
        # Strip simple inline markup tags before trimming to max_chars
        tmp.abstract <- gsub("</{0,1}i>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}b>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}sub>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- gsub("</{0,1}exp>", "", tmp.abstract, ignore.case = TRUE)
        tmp.abstract <- substr(tmp.abstract, 0, max_chars)
      }
    }
    
    # Dates, if any
    my.dateType <- c("DateCompleted",  "DateCreated",  "DateRevised",  "PubDate")
    sel.dateType <- which(sapply(my.dateType, (function(xi) {
      regexpr(xi, tmp.article) > 0
    })))
    if (length(sel.dateType) < 1) {
      tmp.date <- c(Year=NA, Month=NA, Day=NA)
    } else {
      sel.dateType <- sel.dateType[1]
      tmp.date <- custom_grep(xml_data = tmp.article, tag = my.dateType[sel.dateType], format = "char")
      tmp.date <- sapply(c("Year", "Month", "Day"), (function(tt){
        tdat.el <- custom_grep(xml_data = tmp.date, tag = tt, format = "char")
        ifelse(is.null(tdat.el), NA, tdat.el[1])
      }))
    }
    
    # Fetch ID string
    tmp.paperID  <- custom_grep(xml_data = tmp.article, tag = "ArticleIdList", format = "char")
    if (is.null(tmp.paperID)) 
    {
      message("An error occurred")
      return(NULL)
    } else {
      tmp.paperID <- gsub("[[:space:]]", "", tmp.paperID[1])
    }
    
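    # NOTE: whitespace was stripped above, so the original XML attribute
    # '<ArticleId IdType="pubmed">' now reads 'ArticleIdIdType="pubmed"',
    # which is what the two regexes below match against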
    # Get PMID
    tmp.PMID <- gsub("^(.*ArticleIdIdType=\\\"pubmed\\\")([[:space:]]|[[:alnum:]]){0,20}>", "", tmp.paperID)
    tmp.PMID <- gsub("<.*$", "", tmp.PMID)
    
    # Get DOI
    tmp.DOI <- gsub("^(.*ArticleIdIdType=\\\"doi\\\")([[:space:]]|[[:alnum:]]){0,20}>", "", tmp.paperID)
    tmp.DOI <- gsub("<.*$", "", tmp.DOI)
    
    # Get Journal Abbrv
    tmp.jabbrv  <- custom_grep(xml_data = tmp.article, tag = "ISOAbbreviation", format = "char")
    tmp.jabbrv <- ifelse(is.null(tmp.jabbrv), NA, tmp.jabbrv)
    
    # Get Title
    tmp.journal <- custom_grep(xml_data = tmp.article, tag = "Title", format = "char")
    tmp.journal <- ifelse(is.null(tmp.journal), NA, tmp.journal)
    
    # Fetch Keywords and MeSH headings
    tmp.keys <- tryCatch({
      if (getKeywords) {
        tmp.keys <- custom_grep(xml_data = tmp.article, 
                                tag = "Keyword", 
                                format = "char")
        
        tmp.mesh <- custom_grep(xml_data = tmp.article, 
                                tag = "MeshHeading", 
                                format = "char")
        
        if (length(tmp.mesh) > 0) {
          tmp.mesh <- sapply(tmp.mesh, function(xxm) {
            custom_grep(xml_data = xxm, 
                        tag = "DescriptorName", 
                        format = "char")
          })
        }
        
        tmp.keys <- c(tmp.keys, tmp.mesh)
        
        if (length(tmp.keys) > 1) {
          tmp.keys <- paste(tmp.keys, collapse = "; ")
        } else if (length(tmp.keys) < 1) {
          tmp.keys <- NA
        }
      } else {
        NA
      }
    }, error = function(e) {NA})
    
    # Vector with all author-independent fields extracted so far
    tmp.resout <- c(pmid=tmp.PMID, 
                    doi=tmp.DOI, 
                    title=tmp.title,
                    abstract=tmp.abstract,
                    year = as.vector(tmp.date[1]),
                    month = as.vector(tmp.date[2]),
                    day = as.vector(tmp.date[3]),
                    jabbrv=tmp.jabbrv,
                    journal=tmp.journal,
                    keywords=tmp.keys)
    
    # Slow part - authors
    tmp.authors <- custom_grep(xml_data = tmp.article, tag = "AuthorList", format = "char")
    
    if (length(tmp.authors) < 1 | !getAuthors) {
      # Set every placeholder with NA
      final.mat <- data.frame(rbind(c(tmp.resout, 
                                      lastname=NA, 
                                      firstname=NA, 
                                      address=NA, 
                                      email=NA)), stringsAsFactors = FALSE)
    } else {
      author.list <- custom_grep(xml_data = tmp.authors, tag = "Author", format = "char")
      final.mat <- do.call(rbind, lapply(author.list, (function(al) {
        tmp.lastnm <- custom_grep(xml_data = al, tag = "LastName", format = "char")
        tmp.firstnm <- custom_grep(xml_data = al, tag = "ForeName", format = "char")
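        # Heuristic e-mail pattern: local part, "@", domain, and a 2-6 char TLD;
        # a pragmatic match for PubMed affiliation strings, not an RFC-exact one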
        tmp.email <- regexpr("([[:alnum:]]|\\.|\\-|\\_){3,200}@([[:alnum:]]|\\.|\\-|\\_){3,200}(\\.)([[:alnum:]]){2,6}", al)
        if (tmp.email > 0) {
          tmp.email <- substr(al, tmp.email, tmp.email + attributes(tmp.email)$match.length -1 )
        } else {
          tmp.email <- NA
        }
        #
        if (regexpr("Affiliation", al) > 0) {
          tmp.add <- custom_grep(al, "Affiliation", format = "char")[1]
          tmp.add <- trim_address(tmp.add)
        } else {
          tmp.add <- NA
        }
        c(tmp.resout, 
          lastname=tmp.lastnm, 
          firstname=tmp.firstnm, 
          address=tmp.add, 
          email=tmp.email)
        #
      })))
      rownames(final.mat) <- NULL
      
      final.mat <- data.frame(final.mat, stringsAsFactors = FALSE)
      DESELECT <- is.na(final.mat$lastname) | is.na(final.mat$firstname)
      if (length(DESELECT) > 0 & sum(DESELECT) > 0)
        final.mat <- final.mat[!DESELECT, ]
      #
      if (autofill){
        tmp.address <- final.mat[,"address"]
        na.pos <- is.na(tmp.address)
        if (sum(na.pos) != length(tmp.address)) {
          tmp.list <- lapply(tmp.address, function(x) {x} ) 
          cur.add <-  tmp.list[[(which(!na.pos)[1])]]
          for (i in 1:length(na.pos)){
            if(na.pos[i]){
              tmp.list[[i]] <- cur.add
            } else {
              cur.add <- tmp.list[[i]]
            }
          }
          final.mat[,"address"] <- do.call(c, tmp.list)
        }
      }
    }
    
    # Final check and return
    if (ncol(final.mat) != 14) {
      final.mat <- NULL
    }
  }, error = function(e) {NULL}, 
  finally = {
    options(warn = old_warn)
    return(final.mat)
  })
}
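
# A minimal offline sketch of article_to_df() output, based on the bundled
# EPMsamples dataset (documented below); guarded with if (FALSE) so it never
# runs when this file is sourced. The column names follow the 14 expected
# fields listed at the top of the function.
if (FALSE) {
  data("EPMsamples")
  rec <- EPMsamples$NUBL_1618$rec_lst[[1]]
  df <- article_to_df(rec, autofill = TRUE, max_chars = 100)
  names(df)   # "pmid" "doi" "title" ... "address" "email"
  nrow(df)    # one row per author of the record
}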
 
 
 
#' @title Cast PubMed Data into a List of Articles
#'
#' @description Convert an XML object of PubMed records into a list of strings 
#' (character vector of length 1) corresponding to individual PubMed articles. 
#' PubMed records are identified by a "/PubmedArticle" XML tag. This automatically casts 
#' all the content of each PubMed record to a character-class object without removing XML tags.
#' 
#' @usage articles_to_list(pubmed_data, encoding = "UTF8", simplify = TRUE)
#' 
#' @param pubmed_data String corresponding to the name of an XML file (typically, 
#' the result of a batch_pubmed_download() call). Alternatively, a string including 
#' PubMed records with XML tags, such as the object returned by a fetch_pubmed_data() call.
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the 
#' function base::iconv(). See the iconv() help page to find out more about the encodings 
#' that can be used on your platform. "UTF-8" is recommended.
#' @param simplify Logical: should the result be simplified to a character vector? 
#' If FALSE, results are returned as a list.
#' 
#' @details 
#' The input is an XML object or a string including PubMed records (with XML tags). These are the
#' output of easyPubMed functions: fetch_pubmed_data() or batch_pubmed_download(). 
#' The function returns a list or a character vector where each element is a different PubMed record.
#' 
#' @return 
#' List or character vector including all the records from the original XML object in text format. 
#' Elements in the list are not named and are only accessible via their numeric index.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## Retrieve PubMed data and return a list of articles
#'   my_query <- "Damiano Fantini[AU]"
#'   my_query <- get_pubmed_ids(pubmed_query_string = my_query)
#'   my_data <- fetch_pubmed_data(my_query, encoding = "ASCII")
#'   listed_articles <- articles_to_list(my_data)
#'   custom_grep(listed_articles[[2]], "ArticleTitle", "char")
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Download PubMed data and return a list of articles
#' dami_query <- "Damiano Fantini[AU] AND 2018[PDAT]"
#' outfile <- batch_pubmed_download(dami_query, dest_file_prefix = "easyPM_ex001_")
#' listed_articles <- articles_to_list(pubmed_data = outfile)
#' custom_grep(listed_articles[[2]], "ArticleTitle", "char")
#' }
#' 
#' @export
articles_to_list <-
  function(pubmed_data, encoding = "UTF8", simplify = TRUE) 
{
    
  # Define a nested (core) function handling standardized input
  # This assumes a string as input
    easyPM_exec_art_to_lst <- function(pm_dataa, simply = TRUE) 
    {
      # it's a string to process
      pm_datab <- strsplit(pm_dataa, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
      pm_datab <- sapply(pm_datab, function(x) {
        # Trim extra content at the end of the record
        if (!grepl("</PubmedArticle>$", x))
          x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) 
        
        # Rebuild the XML structure and proceed
        x <- paste("<PubmedArticle>", x)
        gsub("[[:space:]]{2,}", " ", x)}, 
        USE.NAMES = FALSE, simplify = simply)
      
      pm_datab
    }
    
  # Execute f(x)
  # Handle inputs of different type
  # check if it is a XMLAbstractDocument or a file
  TMP <- substr(pubmed_data[1], 1, 1000)
  if (grepl("<PubmedArticle", TMP)) {
    
    # it's a string to process
    out <- easyPM_exec_art_to_lst(pubmed_data[1], simply = simplify)
  
  } else if (file.exists(pubmed_data[1])) {
    
    # it's a file
    con1 <- file(pubmed_data[1], encoding = "UTF8")
    on.exit(close(con1))
    myXML <- readLines(con = con1, 
                       n = -1, ok = TRUE, encoding = "UTF8") 
    
    if (encoding != "UTF8")
      myXML <- base::iconv(myXML, from = "UTF8", to = encoding, sub = ".")

    myXML <- paste(myXML, collapse = "")
    
    # Run as above
    out <- easyPM_exec_art_to_lst(myXML, simply = simplify)
    
  } else {
    message("An error occurred")
    return(NULL)  
  }

  return(out)
}  
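
# Self-contained sketch (toy input, guarded with if (FALSE)): records are
# split on the <PubmedArticle> tag and each element is re-wrapped in its
# own <PubmedArticle> tag.
if (FALSE) {
  toy <- paste0("<PubmedArticleSet>",
                "<PubmedArticle><PMID>1</PMID></PubmedArticle>",
                "<PubmedArticle><PMID>2</PMID></PubmedArticle>",
                "</PubmedArticleSet>")
  recs <- articles_to_list(toy)
  length(recs)                             # 2
  custom_grep(recs[[1]], "PMID", "char")   # "1"
}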
    
 
 
#' @title Download PubMed Records in XML or TXT Format
#'
#' @description Performs a PubMed Query (via the get_pubmed_ids() function), downloads the 
#' resulting data (via multiple fetch_pubmed_data() calls) and then saves data in a series of 
#' xml or txt files on the local drive. The function is suitable for downloading 
#' a very large number of records.
#' 
#' @usage batch_pubmed_download(pubmed_query_string, dest_dir = NULL, 
#'                              dest_file_prefix = "easyPubMed_data_", 
#'                              format = "xml", api_key = NULL, 
#'                              batch_size = 400, res_cn = 1, 
#'                              encoding = "UTF8")
#' 
#' @param pubmed_query_string String (character-vector of length 1): this is the string 
#' used for querying PubMed (the standard PubMed Query syntax applies).
#' @param dest_dir String (character-vector of length 1): this string corresponds to the name 
#' of the existing folder where files will be saved. Existing files will be overwritten. 
#' If NULL, the current working directory will be used.
#' @param dest_file_prefix String (character-vector of length 1): this string is used as 
#' prefix for the files that are written locally. 
#' @param format String (character-vector of length 1): data will be requested from Entrez 
#' in this format. Acceptable values are: c("medline","uilist","abstract","asn.1", "xml"). 
#' When format != "xml", data will be saved as text files (txt).
#' @param api_key String (character vector of length 1): user-specific API key to increase 
#' the limit of queries per second. You can obtain your key from NCBI.
#' @param batch_size Integer (1 < batch_size < 5000): maximum number of records 
#' to be saved in a single xml or txt file.
#' @param res_cn Integer (> 0): numeric index of the data batch to start downloading from. 
#' This parameter is useful to resume an incomplete download job after a system crash.
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the 
#' function base::iconv(). See the iconv() help page to find out more about the encodings 
#' that can be used on your platform. Here, we recommend using "UTF-8".
#' 
#' @details 
#' Download large number of PubMed records as a set of xml or txt files that are saved in the 
#' folder specified by the user. This function enforces data integrity. If a batch of downloaded 
#' data is corrupted, it is discarded and downloaded again. Each download cycle is monitored until 
#' the download job is successfully completed. This function should make it possible to download a whole 
#' copy of PubMed, if desired. The function informs the user about the current progress by constantly 
#' printing to the console the number of batches still in queue for download. pubmed_query_string 
#' accepts standard PubMed syntax. The function will query PubMed multiple times using the same 
#' query string. Therefore, it is recommended to include an [EDAT] or a [PDAT] filter in the query 
#' if you want to ensure reproducible results.
#' 
#' @return 
#' Character vector including the names of the files downloaded to the local system.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Example 01: retrieve data from PubMed and save as XML file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out1 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' readLines(out1[1])[1:30]
#' ##
#' ## Example 02: retrieve data from PubMed and save as TXT file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out2 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180, format = "medline")
#' readLines(out2[1])[1:30]
#' }
#' 
#' @export
batch_pubmed_download <-
  function (pubmed_query_string, 
            dest_dir = NULL,
            dest_file_prefix = "easyPubMed_data_",
            format = "xml",
            api_key = NULL,
            batch_size = 400, 
            res_cn = 1, 
            encoding = "UTF8") 
    
{
  baseDir <- getwd()
  if (!is.null(dest_dir)) {
    setwd(as.character(dest_dir))
  }
  fileName.collector <- list()
  myQuery <- NULL
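  # NCBI Entrez allows up to 3 requests/sec without an API key and up to
  # 10 requests/sec with one; hence the 0.34 s and 0.11 s minimum wait times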
  my_rtime <- ifelse(is.null(api_key), 0.34, 0.11)
  
  cur_time <- Sys.time()
  while (is.null(myQuery)) {
    diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
    if (diff_time > 0) {
      Sys.sleep(diff_time)
    }
    cur_time <- Sys.time()
    myQuery <- tryCatch(get_pubmed_ids(pubmed_query_string, api_key = api_key), 
                        error = function(e) NULL)
  }
  pubsNum <- as.numeric(myQuery$Count)
  tmpPapers <- NULL
  myRetstart <- 0
  myRetmax <- batch_size
  j <- 1
  expTot <- ceiling(pubsNum / batch_size)
  while (myRetstart < pubsNum) {
    if (j < res_cn) {
      message(paste("cycle", j, "/", expTot, "skipped...", 
                    sep = " "))
    } else {
      cur_time <- Sys.time()
      while (is.null(myQuery) | is.null(tmpPapers)) {
        
        diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
        if (diff_time > 0) {
          Sys.sleep(diff_time)
        }
        cur_time <- Sys.time()
        
        myQuery <- tryCatch(get_pubmed_ids(pubmed_query_string, api_key = api_key), 
                            error = function(e) NULL)
        
        diff_time <- my_rtime - as.numeric(difftime(Sys.time(), cur_time, units = "secs"))
        
        if (diff_time > 0) {
          Sys.sleep(diff_time)
        }
        cur_time <- Sys.time()
        
        # Force download as XML, but without collapsing strings
        if (format[1] == "xml") {
          format <- "batchxml"
        }
        
        tmpPapers <- tryCatch(fetch_pubmed_data(pubmed_id_list = myQuery, 
                                                retstart = myRetstart, 
                                                retmax = myRetmax,
                                                format = format, 
                                                encoding = encoding),
                              
                              error = function(e) NULL, 
                              finally = print(paste("PubMed data batch", 
                                                    j, "/", 
                                                    expTot, "downloaded...",
                                                    sep = " ")))
        if (is.null(tmpPapers)) {
          message("Data retrieval error. Retrying...")
        }
      }
      totDigits <- nchar(as.character(expTot)) + 1
      myExt <- paste(rep(0, totDigits - nchar(as.character(j))), 
                     collapse = "")
  
      tmp.dest.file <- paste(dest_file_prefix, myExt, j, ".txt", sep = "")
      con1 <- file(tmp.dest.file, encoding = encoding)
      doSaveData <- tryCatch(write(tmpPapers, tmp.dest.file), 
                             error = function(e) {"ERROR"}, 
                             finally = {close(con1)})
      if(is.null(doSaveData))
        doSaveData <- tmp.dest.file
      
      myQuery <- NULL
      tmpPapers <- NULL
      if (doSaveData == "ERROR") {
        myRetstart <- myRetstart - myRetmax
        j <- j - 1
        message("An error occurred... Trying to download data from PubMed again...")
      } else {
        fileName.collector[[1+length(fileName.collector)]] <- doSaveData
      }
    }
    myRetstart <- myRetstart + myRetmax
    j <- j + 1
  }
  setwd(baseDir)
  tryCatch(do.call(c, fileName.collector), error = function(e){NULL})
}
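
# File-naming sketch (illustrative numbers, guarded with if (FALSE)): this
# mirrors the expTot / totDigits / myExt logic used above.
if (FALSE) {
  pubsNum <- 1000; batch_size <- 400
  expTot <- ceiling(pubsNum / batch_size)         # 3 batches expected
  totDigits <- nchar(as.character(expTot)) + 1    # zero-pad to 2 digits
  sapply(seq_len(expTot), function(j) {
    myExt <- paste(rep(0, totDigits - nchar(as.character(j))), collapse = "")
    paste0("easyPubMed_data_", myExt, j, ".txt")
  })
  # "easyPubMed_data_01.txt" "easyPubMed_data_02.txt" "easyPubMed_data_03.txt"
}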
 
 
#' @title Retrieve Text Between XML Tags
#'
#' @description Extract text from a string containing XML or HTML tags. Text 
#' included between tags of interest will be returned. If multiple tagged substrings are found, 
#' they will be returned as different elements of a list or character vector.
#' 
#' @usage custom_grep(xml_data, tag, format = "list")
#' 
#' @param xml_data String (of class character and length 1): corresponds to the PubMed 
#' record or any string including XML/HTML tags.
#' @param tag String (of class character and length 1): the tag of interest (does NOT include < > chars).
#' @param format c("list", "char"): specifies the format for the output.
#' 
#' @details 
#' The input string has to be a character string (length 1) containing tags (HTML or XML format). 
#' If an XML Document is provided as input, the function will raise an error.
#' 
#' @return 
#' List or vector where each element corresponds to an in-tag substring.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' try({
#'   ## extract substrings based on regular expressions
#'   string_01 <- "I can't wait to watch the <strong>Late Night Show with" 
#'   string_01 <- paste(string_01, "Seth Meyers</strong> tonight at <strong>11:30</strong>pm CT!")
#'   print(string_01)
#'   custom_grep(xml_data = string_01, tag = "strong", format = "char")
#'   custom_grep(xml_data = string_01, tag = "strong", format = "list")
#' }, silent = TRUE)
#' 
#' @export
custom_grep <-
  function(xml_data, 
           tag, 
           format = "list")
  {
    x <- xml_data[[1]]
    tag.op <- paste("\\<", tag, "((\\>)|([[:space:]]([^[<]]*)\\>))", sep = "")
    tag.cl <- paste("(<\\/)", tag, "(\\>)", sep = "")
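    # NOTE: in R's default (TRE) regex engine, "\\<" and "\\>" are word-boundary
    # anchors, so tag.op matches the tag name whether or not XML attributes
    # follow; the "- 1" below then backs up one character to include the "<"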
    #
    out.result <- list()
    i <- 1
    while (!is.null(x) &&
           !is.na(x) &&
           x != "" &&
           nchar(x) > 0 &&
           regexpr(tag.op, x) > 0 &&
           regexpr(tag.cl, x) > 0){
      tag.op.pos <- regexpr(tag.op, x)
      nu.x <- substr(x, (tag.op.pos - 1), nchar(x))
      inner.trim <- regexpr(">", nu.x, fixed = TRUE)
      nu.x <- substr(nu.x, (inner.trim + 1), nchar(nu.x))
      #
      tag.cl.pos <- regexpr(tag.cl, nu.x)
      tag.cl.full <- tag.cl.pos + attributes(tag.cl.pos)$match.length + 1
      x <- substr(nu.x, tag.cl.full, nchar(nu.x))
      nu.x <- substr(nu.x, 1, (tag.cl.pos - 1))
      #
      out.result[[i]] <- nu.x
      i <- i + 1
    }
    if (format != "list") {
      out.result <- do.call(c, out.result)
    }
    return(out.result)
  }
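
# Extra offline sketch (illustrative input, guarded with if (FALSE)):
# custom_grep() also handles tags that carry XML attributes, as in the
# Keyword fields of PubMed records.
if (FALSE) {
  x <- '<Keyword MajorTopicYN="N">bladder</Keyword><Keyword>cancer</Keyword>'
  custom_grep(x, "Keyword", format = "char")   # "bladder" "cancer"
}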

 
#' @title Retrieve All PubMed Record Identifiers Returned by a Query
#'
#' @description Retrieve PubMed record identifiers from Entrez following a search performed 
#' via the get_pubmed_ids() function. Identifiers are returned as a character vector.
#' 
#' @usage fetch_all_pubmed_ids(pubmed_id_list)
#' 
#' @param pubmed_id_list List: the result of a get_pubmed_ids() call.
#' 
#' @details 
#' Retrieve PubMed identifiers, without any other information (such as article title, 
#' authors, publication date, and so on). The PubMed IDs can be stored or used with other software.
#' 
#' @return 
#' Character vector including all PMID (PubMed Identifiers) returned by the current query.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Fetch only PubMed Record IDs (PMIDs)
#' dami_query_string <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_pmids <- fetch_all_pubmed_ids(dami_on_pubmed)
#' print(dami_pmids)
#' 
#' }
#' 
#' @export
fetch_all_pubmed_ids <-
  function(pubmed_id_list)
{
  # expected records, set retmax
  exp_num <- as.numeric(pubmed_id_list$Count)
  if (is.numeric(exp_num) && exp_num > 0) {
    my_retmax <- exp_num + 1
  } else {
    my_retmax <- 100000
  }
  
  # query, and then extract IDs
  myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", 
                       "db=pubmed&retmax=", my_retmax, "&term=", pubmed_id_list$OriginalQuery, "&usehistory=n", sep = "")
  IDconnect <- url(myPubmedURL, open = "rb", encoding = "UTF8")
  on.exit(close(IDconnect))
  idXML <- readLines(IDconnect, warn = FALSE, encoding = "UTF8") 

  collect_ids <- list()
  for (i in seq_along(idXML)) {
    if (grepl("^<Id>", idXML[i])) {
      xx <- custom_grep(idXML[i], tag = "Id", format = "char")
      collect_ids[[length(collect_ids) + 1]] <- as.character(xx[1])
    }
  }
  myIDlist <- as.character(do.call(c, collect_ids))
  
  # final check and return
  if(length(myIDlist) != exp_num)
    message(paste("Note that only ", length(myIDlist), " PubMed IDs were retrieved (", 
                  exp_num, " were expected).", sep = ""))
  
  return(myIDlist)
}
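
# Consistency sketch (guarded with if (FALSE)): the number of retrieved IDs
# should match the Count field of the original query; a mismatch triggers
# the message above.
if (FALSE) {
  ids <- get_pubmed_ids("Damiano Fantini[AU]")
  pmids <- fetch_all_pubmed_ids(ids)
  length(pmids) == as.numeric(ids$Count)   # TRUE when retrieval is complete
}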
 
 
#' @title Retrieve PubMed Data in XML or TXT Format
#'
#' @description Retrieve PubMed records from Entrez following a search performed via the 
#' get_pubmed_ids() function. Data are downloaded in the XML or TXT format and are 
#' retrieved in batches of up to 5000 records.
#' 
#' @usage fetch_pubmed_data(pubmed_id_list, 
#'                          retstart = 0, 
#'                          retmax = 500, 
#'                          format = "xml", 
#'                          encoding = "UTF8")
#' 
#' @param pubmed_id_list List: the result of a get_pubmed_ids() call.
#' @param retstart Integer (>=0): index of the first UID in the retrieved PubMed Search Result set 
#' to be included in the output (default=0, corresponding to the first record of the entire set).
#' @param retmax Integer (>=1): size of the batch of PubMed records to be retrieved at one time.
#' @param format Character: element specifying the output format. The following values are allowed: 
#' c("asn.1", "xml", "medline", "uilist", "abstract").
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the function base::iconv(). 
#' See the iconv() help page to find out more about the encodings that can be used on your platform. 
#' Here, we recommend using "UTF-8".
#' 
#' @details 
#' Retrieve PubMed records based on the results of a get_pubmed_ids() query. 
#' Records are retrieved from Entrez via the PubMed API efetch function. The first entry to be retrieved 
#' may be adjusted via the retstart parameter (this allows the user to download large batches of PubMed 
#' data in multiple runs). The maximum number of entries to be retrieved can also be set by adjusting the 
#' retmax parameter (1 <= retmax <= 5000). Data will be downloaded on the fly (no files are saved 
#' locally).
#' 
#' @return 
#' An object (vector) of class "character". If format is set to "xml" (default), a single string including all 
#' PubMed records (with XML tags embedded) is returned. If a different format is selected, a vector of strings 
#' is returned, where each element corresponds to a line of the output document.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references 
#' \url{https://www.data-pulse.com/dev_site/easypubmed/}
#' \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/}
#'
#' @examples
#' try({ 
#'   ## Example 01: retrieve data in TXT format
#'   library("easyPubMed")
#'   dami_query_string <- "Damiano Fantini[AU] AND 2018[PDAT]"
#'   dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#'   Sys.sleep(1) # avoid server timeout
#'   dami_papers <- fetch_pubmed_data(dami_on_pubmed, format = "abstract")
#'   dami_papers[dami_papers == ""] <- "\n"
#'   cat(paste(dami_papers[1:65], collapse = ""))
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Example 02: retrieve data in XML format
#' library("easyPubMed")
#' dami_query_string <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_papers <- fetch_pubmed_data(dami_on_pubmed)
#' titles <- custom_grep(dami_papers, "ArticleTitle", "char")
#' print(titles)
#' }
#' 
#' @importFrom utils head
#' @export
fetch_pubmed_data <-
  function (pubmed_id_list,
            retstart = 0,
            retmax = 500,
            format = "xml", 
            encoding = "UTF8") 
{
  myIDlist <- pubmed_id_list
  if ((!is.list(myIDlist)) || is.na(myIDlist$WebEnv) || is.na(myIDlist$QueryKey) || 
      is.na(myIDlist$Count) || !is.integer(as.integer(retstart)) || 
      !is.integer(as.integer(retmax))) {
    message("There is an issue with the PubMed ID list you supplied. Please, call the function again and supply the result of a <get_pubmed_ids()> call as argument. Thank you.")
    return(NULL)
  } else {
    myWebEnv <- myIDlist$WebEnv
    myKey <- myIDlist$QueryKey
    myCount <- as.numeric(as.character(myIDlist$Count))
    myRetstart <- as.integer(retstart)
    if (myRetstart < 0) {
      myRetstart <- 0
    }
    myRetmax <- as.integer(retmax)
    if (myRetmax > 5000) {
      myRetmax <- 5000
    }
    if (myRetmax < 1) {
      myRetmax <- 1
    }
    if (format[1] %in% c("medline","uilist","abstract","asn.1", "xml")) {
      myFormat <- format[1]
    } else {
      myFormat <- "xml"
    }
    typeMode <- switch(myFormat, 
                       "asn.1" = c("null", "asn.1"),
                       "xml" = c("null", "xml"),
                       "medline" = c("medline", "text"),
                       "uilist" = c("uilist", "text"),
                       "abstract" = c("abstract", "text"))
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", 
                       "db=pubmed&WebEnv=", myWebEnv, "&query_key=", myKey, 
                       "&retstart=", myRetstart, "&retmax=", myRetmax, 
                       "&rettype=", typeMode[1],"&retmode=", typeMode[2], 
                       sep = "")
    
    # api_key retrieval
    api_key <- pubmed_id_list$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }
    
    # initialize output
    out.data <- NULL
    
    # initialize extra params
    try_num <- 1
    t_0 <- Sys.time()
    
    # Try to fetch results
    while(is.null(out.data)) {
      
      # Timing check: kill the request after 5 minutes
      if (try_num > 1)
        Sys.sleep(time = 2)
      
      t_1 <- Sys.time()
      
      if(as.numeric(difftime(t_1, t_0, units = "mins")) > 5){
        message("Killing the request! Something is not working. Please, try again later")
        return(NULL)
      }
      
      # ENTREZ server connect
      out.data <- tryCatch({    
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        # message(e)
        NULL
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })  
      
      # Check if error
      if (!is.null(out.data) && 
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
          # message(out.data)
        out.data <- NULL
      }
      try_num <- try_num + 1
    }
    
    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later")
      return(NULL)
    }
    
    if (encoding != "UTF8")
      out.data <- base::iconv(out.data, from = "UTF8", to = encoding, sub = ".")

    if (format[1] == "xml") {
      out.data <- paste(out.data, collapse = "")
    }
                          
    return(out.data)
  }
}
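
# Paging sketch (hypothetical hit count, guarded with if (FALSE)): retstart
# and retmax walk through a large result set in consecutive chunks.
if (FALSE) {
  ids <- get_pubmed_ids("Machine Learning[TI] AND 2016[PD]")
  chunk1 <- fetch_pubmed_data(ids, retstart = 0,    retmax = 500)
  chunk2 <- fetch_pubmed_data(ids, retstart = 500,  retmax = 500)
  chunk3 <- fetch_pubmed_data(ids, retstart = 1000, retmax = 500)
}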
 
 
#' @title Simple PubMed Record Search by Full-length Title
#'
#' @description Query PubMed (Entrez) in a simple way via the PubMed API eSearch function. 
#' This function is designed to query PubMed using a full-length publication title as query string. 
#' It performs stopword removal from the query string before querying the PubMed server. 
#' Calling this function results in posting the results on the PubMed History Server. 
#' This allows later access to the resulting data via the fetch_pubmed_data() function, 
#' or other easyPubMed functions.
#' 
#' @usage get_pubmed_ids_by_fulltitle(fulltitle, field = "[Title]", api_key = NULL)
#' 
#' @param fulltitle String (character vector of length 1) that corresponds to the full-length 
#' publication title used for querying PubMed (titles should be used as is, without 
#' adding extra filters/tags).
#' @param field String (character vector of length 1) with a tag indicating the PubMed 
#' record field where the full-length string (fulltitle) should be searched in. By default, 
#' this points to the 'Title' field. This field can be changed (use fields supported by PubMed) 
#' as required by the user (for example, to attempt an exact-match query using a specific sentence 
#' included in the abstract of a record).
#' @param api_key String (character vector of length 1): user-specific API key to increase 
#' the limit of queries per second. You can obtain your key from NCBI.
#' 
#' @details 
#' This function will use the String provided as argument for querying PubMed via the eSearch 
#' function of the PubMed API. The Query Term should include a full-length publication title, 
#' without other PubMed operators (AND, OR, NOT) nor tags (i.e., [AU], [PDAT], 
#' [Affiliation], and so on). ESearch will post the UIDs resulting from the search operation 
#' onto the History server so that they can be used directly in a subsequent fetch_pubmed_data() call.
#' 
#' @return 
#' The function returns a list. The list includes the number of records found on PubMed and the first 
#' 20 PubMed IDs (UID) retrieved by the query. The list also includes QueryKey and WebEnv that are 
#' required for a subsequent fetch_pubmed_data() call.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ##  Search for a scientific article matching a full-length title
#' my_query <- "Body mass index and cancer risk among Chinese patients with type 2 diabetes mellitus"
#' my_field <- "[Title]"
#' # Full-length title query (designed to query titles)
#' res0 <- get_pubmed_ids(my_query)
#' print(as.numeric(res0$Count))
#' # Weird count!
#' res <- get_pubmed_ids_by_fulltitle(my_query, field = my_field)
#' # Num results = 1 as expected
#' print(as.numeric(res$Count))
#' 
#' }
#' 
#' @export
get_pubmed_ids_by_fulltitle <- 
  function(fulltitle, field = "[Title]", api_key = NULL) 
{
  out <- get_pubmed_ids(paste("\"", fulltitle, "\"", field, sep = ""), api_key = api_key)
  if (as.numeric(out$Count) > 0) {
    return (out)
  }
  
  stopwords <- easyPubMed::PubMed_stopwords
  keys <- strsplit(fulltitle, split = "[[:space:]]")[[1]]
  keys <- tolower(keys)
  keys <- keys[!keys %in% stopwords]
  Sys.sleep(0.34)
  new_query <- paste(keys, field, sep = "", collapse = " AND ")
  return(get_pubmed_ids(new_query, api_key = api_key))
}
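
# Fallback-query sketch (illustrative title, guarded with if (FALSE)): when
# the exact-phrase query returns no hits, the title is tokenized, stopwords
# are removed, and the remaining terms are AND-ed with the field tag
# (assuming "the" and "of" are among the bundled stopwords).
if (FALSE) {
  keys <- tolower(strsplit("The biology of the bladder", "[[:space:]]")[[1]])
  keys <- keys[!keys %in% easyPubMed::PubMed_stopwords]
  paste(keys, "[Title]", sep = "", collapse = " AND ")
  # e.g. "biology[Title] AND bladder[Title]"
}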
 
 
#' @title Simple PubMed Record Search
#'
#' @description Query PubMed (Entrez) in a simple way via the PubMed API eSearch function. 
#' Calling this function results in posting the query results on the PubMed History Server. 
#' This allows later access to the resulting data via the fetch_pubmed_data() function, 
#' or other easyPubMed functions.
#' 
#' @usage get_pubmed_ids(pubmed_query_string, api_key = NULL)
#' 
#' @param pubmed_query_string is a string (character vector of length 1) that is used 
#' for querying PubMed (standard PubMed syntax; see reference for details).
#' @param api_key String (character vector of length 1): user-specific API key to 
#' increase the limit of queries per second. You can obtain your key from NCBI.
#' 
#' @details 
#' This function will use the String provided as argument for querying PubMed via the eSearch 
#' function of the PubMed API. The Query Term can include one or multiple words, as well as the standard 
#' PubMed operators (AND, OR, NOT) and tags (i.e., [AU], [PDAT], [Affiliation], and so on). ESearch will 
#' post the UIDs resulting from the search operation onto the History server so that they can be used directly 
#' in a subsequent fetch_pubmed_data() call.
#' 
#' @return 
#' The function returns a list. The list includes the number of records found on PubMed and 
#' the first 20 PubMed IDs (UID) retrieved by the query. The list also includes QueryKey and WebEnv 
#' that are required for a subsequent fetch_pubmed_data() call.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references 
#' \url{https://www.data-pulse.com/dev_site/easypubmed/}
#' \url{https://www.ncbi.nlm.nih.gov/books/NBK3827/#_pubmedhelp_Search_Field_Descriptions_and_}
#'
#' @examples 
#' try({
#'   ##  Search for scientific articles written by Damiano Fantini
#'   ##  and print the number of retrieved records to screen.
#'   ##  Also print the retrieved UIDs to screen.
#'   ##
#'   dami_on_pubmed <- get_pubmed_ids("Damiano Fantini[AU]")
#'   print(dami_on_pubmed$Count)
#'   print(unlist(dami_on_pubmed$IdList))
#' }, silent = TRUE)
#' 
#' @export
get_pubmed_ids <- function (pubmed_query_string, 
                            api_key = NULL) 
{
  # Silence warnings
  old_warn <- options()$warn
  options(warn = -1)
  
  # Timing
  t_0 <- Sys.time()
  
  myQuery <- as.character(pubmed_query_string)
  myQuery <- gsub(" ", "+", myQuery, fixed = TRUE)
  myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", 
                       "db=pubmed&term=", myQuery, "&usehistory=y", sep = "")
  if (!is.null(api_key)) {
    myPubmedURL <- paste(myPubmedURL, "&api_key=", api_key, sep = "")
  }
  
  idXML <- NULL
  try_num <- 1
  while(is.null(idXML)) {
    
    # Timing check: kill at 2 min
    if (try_num > 1)
      Sys.sleep(time = 2)
    
    t_1 <- Sys.time()
    
    if(as.numeric(difftime(t_1, t_0, units = "mins")) > 2){
      message("Killing the request! Something is not working. Please, try again later")
      options(warn = old_warn)
      return(NULL)
    }
    
    # ENTREZ server connect
    idXML <- tryCatch({    
      IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
      idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
      idXML <- paste(idXML, collapse = "")
      if (grepl("<ERROR>", substr(idXML, 1, 250))) {
        # message(idXML)
        NULL
      } else {
        idXML
      }
    }, error = function(e) {
      # message(e)
      NULL
    }, finally = {
      try(suppressWarnings(close(IDconnect)), silent = TRUE)
    })  
    
    # Data processing (if result not null)
    myIDlist <- NULL
    
    if (!is.null(idXML)) {
      tryCatch({
        
        # Initialize collector
        myIDlist <- list()
        
        my_tags <- c("Count", "RetMax", "RetStart", 
                     "QueryKey", "WebEnv", "IdList", 
                     "TranslationSet", "QueryTranslation")
        
        # First pass
        for (j in 1:length(my_tags)) {
          ttag <- my_tags[j]
          xx <- custom_grep(idXML, tag = ttag, "char")
          myIDlist[[ttag]] <- xx[1]
        }
        
        # Try to expand IdList
        xx <- myIDlist[["IdList"]]
        xx <- custom_grep(xx, "Id", format = "list")
        names(xx) <- rep("Id", length(xx))
        myIDlist[["IdList"]] <- xx
        
        # Try to expand TranslationSet
        xx <- myIDlist[["TranslationSet"]]
        myIDlist[["TranslationSet"]] <- list()
        nutag <- c("From", "To")
        for (z in nutag) {
          yy <- custom_grep(xx, z, format = "char")
          myIDlist[["TranslationSet"]][[z]] <- yy[1]
        }
        
      }, error = function(e) {
        # Flag the failure so the while() loop retries the request
        idXML <<- NULL
      })
    }
    
    # Final check!
    if(!is.list(myIDlist)) {
      idXML <- NULL  
    }
  
  try_num <- try_num + 1
  }
  
  # Wrap up and return
  myIDlist[['OriginalQuery']] <- myQuery
  myIDlist[['APIkey']] <- api_key
  
  # Restore warnings
  options(warn = old_warn)
  
  return(myIDlist)
}
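
# Shape of the returned list (field names taken from the parsing code above;
# values come from the live Entrez server). Guarded with if (FALSE). Note that
# assigning a NULL api_key drops the APIkey element from the list.
if (FALSE) {
  res <- get_pubmed_ids("Damiano Fantini[AU]")
  names(res)
  # "Count" "RetMax" "RetStart" "QueryKey" "WebEnv" "IdList"
  # "TranslationSet" "QueryTranslation" "OriginalQuery" ("APIkey")
}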
 
 
#' @title Extract Publication and Affiliation Data from PubMed Records
#'
#' @description Extract Publication Info from PubMed records and cast data into a 
#' data.frame where each row corresponds to a different author. It is possible to limit
#' data extraction to first authors or last authors only, or get information about 
#' all authors of each PubMed record.
#' 
#' @usage table_articles_byAuth(pubmed_data, 
#'                              included_authors = "all", 
#'                              max_chars = 500, 
#'                              autofill = TRUE, 
#'                              dest_file = NULL, 
#'                              getKeywords = TRUE, 
#'                              encoding = "UTF8")
#' 
#' @param pubmed_data PubMed Data in XML format: typically, an XML file resulting from a 
#' batch_pubmed_download() call or an XML object, result of a fetch_pubmed_data() call.
#' @param included_authors Character: c("first", "last", "all"). Only includes information 
#' from the first, the last or all authors of a PubMed record.
#' @param max_chars Numeric: maximum number of chars to extract from the AbstractText field.
#' @param autofill Logical. If TRUE, missing affiliations are imputed according to the available 
#' values (from the same article).
#' @param dest_file String (character of length 1). Name of the file that will be written for 
#' storing the output. If NULL, no file will be saved.
#' @param getKeywords Logical. If TRUE, the operation will attempt to extract PubMed record 
#' keywords (MeSH terms and author keywords).
#' @param encoding The encoding of an input/output connection can be specified by name 
#' (for example, "ASCII", or "UTF-8"), in the same way as it would be given to the function 
#' base::iconv(). See the iconv() help page to find out more about the encodings that can be 
#' used on your platform. Here, we recommend using "UTF-8".
#' 
#' @details 
#' Retrieve publication and author information from PubMed data, and cast them as a data.frame.
#' 
#' @return 
#' Data frame including the following fields: c("pmid", "doi", "title", "abstract", 
#' "year", "month", "day", "jabbrv", "journal", "keywords", "lastname", "firstname", 
#' "address", "email").
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' \dontrun{
#' ## Cast PubMed record info into a data.frame
#'
#' ## Auto-fill enabled
#' dami_query <- "Damiano Fantini[AU]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query)
#' dami_abstracts_xml <- fetch_pubmed_data(dami_on_pubmed, encoding = "ASCII")
#' xx <- table_articles_byAuth(pubmed_data = dami_abstracts_xml, 
#'                             included_authors = "first", 
#'                             max_chars = 100, 
#'                             autofill = TRUE)
#' 
#' print(xx[1:5, c("pmid", "lastname", "jabbrv")])
#' #
#' ## Download records first
#' ## Also, auto-fill disabled
#' dami_query <- "Damiano Fantini[AU]"
#' curr.file <- batch_pubmed_download(dami_query, dest_file_prefix = "test_bpd_", encoding = "ASCII")
#' xx <- table_articles_byAuth(pubmed_data = curr.file[1], 
#'                             included_authors = "all", 
#'                             max_chars = 20, 
#'                             autofill = FALSE)
#' print(xx[1:5, c("pmid", "lastname", "jabbrv")])
#' 
#' }
#' @importFrom utils write.table
#' 
#' @export
table_articles_byAuth <-
  function (pubmed_data, 
            included_authors = "all", 
            max_chars = 500, 
            autofill = TRUE, 
            dest_file = NULL, 
            getKeywords = TRUE, 
            encoding = "UTF8") 
{
  if (!included_authors %in% c("all", "first", "last")) 
    stop("Method is not supported!")
  message("Processing PubMed data ", appendLF = FALSE)
  
  paper.data <- articles_to_list(pubmed_data = pubmed_data, encoding = encoding)
  
  expFields <- c("pmid", "doi", "title", "abstract", "year", "month", "day", "jabbrv", 
                 "journal", "keywords", "lastname", "firstname", "address", "email")
  papers.authors.list <- lapply(1:length(paper.data), (function(i) {
    if (length(paper.data) > 50) {
      rep.dot <- as.integer(seq(1, length(paper.data), 
                                length.out = 50))
      if (i %in% rep.dot) 
        message(".", appendLF = FALSE)
    } else {
      message(".", appendLF = FALSE)
    }
    art <- paper.data[[i]]
    out <- tryCatch({article_to_df(pubmedArticle = art, 
                                  autofill = autofill, 
                                  max_chars = max_chars, 
                                  getKeywords = getKeywords, 
                                  getAuthors = TRUE)}, 
                    error = function(e) { NULL })
    
    if (is.null(out)) {
      out <- data.frame(pmid = NA, doi = NA, title = NA, 
                        abstract = NA, year = NA, month = NA, day = NA, 
                        jabbrv = NA, journal = NA, keywords = NA, lastname = NA, firstname = NA, 
                        address = NA, email = NA)
    }
    if (included_authors == "first") {
      out <- out[1, ]
    } else if (included_authors == "last") {
      out <- out[nrow(out), ]
    } 
    
    # Handle missing fields exception
    out2 <- data.frame(rebuild = (1:nrow(out))) 
    for (jj in 1:length(expFields)) {
      if (expFields[jj] %in% names(out)) {
        out2[,expFields[jj]] <- out[,expFields[jj]]
      } else {
        out2[,expFields[jj]] <- NA
      }
    }
    out2[,-1]
  }))
  message(" done!")
  
  
  papers.authors.df <- do.call(rbind, papers.authors.list)
  keep.rw <- apply(papers.authors.df, 1, (function(rw) {
    sum(is.na(rw)) < length(rw)
  }))
  papers.authors.df <- papers.authors.df[keep.rw, ]
  if (!is.null(dest_file)) {
    if (is.character(dest_file) && length(dest_file) == 1) {
      tryCatch(utils::write.table(papers.authors.df, dest_file, fileEncoding = encoding), 
               error = function(e) {
                 NULL
               })
    }
  }
  return(papers.authors.df)
}
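
# Round-trip sketch (hypothetical file name, guarded with if (FALSE)): when
# dest_file is set, the table is written via utils::write.table() and can be
# read back with read.table().
if (FALSE) {
  xml_data <- fetch_pubmed_data(get_pubmed_ids("Damiano Fantini[AU]"))
  xx <- table_articles_byAuth(xml_data, dest_file = "pm_table.txt")
  yy <- read.table("pm_table.txt", stringsAsFactors = FALSE)
  identical(dim(xx), dim(yy))
}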
 
 
#' @title Trim and Format Address Information
#'
#' @description Set of rules for trimming and standardizing the format of address information 
#' retrieved from PubMed records. Affiliations including more than one address will be trimmed, 
#' and only the first address will be returned.
#' 
#' @usage trim_address(addr)
#' 
#' @param addr Character string including an address as extracted from PubMed records.
#' 
#' @return 
#' Character string including a formatted and trimmed address (if available).
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples 
#' addr_string <- " 2 Dept of Urology, Feinberg School of Medicine," 
#' addr_string <- paste(addr_string, "Chicago, US; Dept of Mol Bio as well...")
#' print(addr_string)
#' print(trim_address(addr = addr_string))
#' 
#' @export
trim_address <-
function(addr) 
{
  out.addr <- gsub("(^([[:space:]]{0,5}[[:digit:]]{1,2}[[:space:]]{0,2}))", "", addr)
  out.addr <- gsub("(\\.).{0,5}$", "", out.addr)
  out.addr <- gsub("(\\;).*$", "", out.addr)
  return (out.addr)
}
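
# Step-by-step trace of the three substitutions above, using the string from
# the @examples block (" 2 Dept of Urology, ... Chicago, US; Dept of Mol Bio
# as well..."):
#   1. the leading list-index digits (" 2 ") are stripped;
#   2. a trailing period plus up to 5 characters is removed;
#   3. everything after the first ";" (any secondary address) is dropped,
#      returning "Dept of Urology, ..., Chicago, US".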
 
 
 
#' @title PubMed Records downloaded and analyzed via easyPubMed
#'
#' @description This dataset includes a collection of 4 examples showing how to download and analyze records 
#' from PubMed by using easyPubMed. Each element in the EPMsamples list corresponds to a different query 
#' and/or analysis. Also, each element of EPMsamples is a list including intermediates and notes about the analysis.
#'
#' @usage data("EPMsamples")
#' 
#' @format The dataset is formatted as a list including 4 elements: 
#' 
#' * `DF_papers_abs`: List of 4
#' 
#' * `DF_papers_std`: List of 4
#' 
#' * `NUBL_dw18`: List of 3
#' 
#' * `NUBL_1618`: List of 5
#' 
#'   
#' @details The dataset was built as described in this vignette: \url{https://www.data-pulse.com/projects/Rlibs/vignettes/building_the_easyPubMed_EPMsamples_dataset.html}
#'
#' @examples 
#' ## Display some contents
#' data("EPMsamples")
#' # The following examples are focused on example query #4 (i.e., NUBL_1618)
#' # Display Query String used for collecting the data
#' print(EPMsamples$NUBL_1618$qry_st)
#' # Show one PubMed record from the record list
#' NU_records <- EPMsamples$NUBL_1618$rec_lst
#' cat(NU_records[[1]])
#' # Cast a PubMed record to data.frame
#' BL_df <- article_to_df(NU_records[[6]], max_chars = 0)
#' print(BL_df)
"EPMsamples"

#' @title PubMed Stopwords
#'
#' @description This dataset includes a collection of PubMed stopwords, i.e., common 
#' words that are typically removed from a query string before querying PubMed 
#' (for example, by the get_pubmed_ids_by_fulltitle() function).
#'
#' @usage data("PubMed_stopwords")
#' 
#' @format A character vector including all PubMed stopwords that are typically filtered out from queries. 
#'   
#' @details Number of stopwords included: n = 133.
#'
#' @examples 
#' ## Display some contents
#' data("PubMed_stopwords")
#' head(PubMed_stopwords)
"PubMed_stopwords"

#' @title Retrieve and Process Scientific Publication Records from Pubmed
#' 
#' @description Query NCBI Entrez and retrieve PubMed records in XML or TXT format. PubMed records 
#' can be downloaded and saved as XML or text files. Data integrity is enforced during data download, 
#' making it possible to retrieve and save a very large number of records effortlessly. PubMed records 
#' can be processed to extract publication- and author-specific information.
#' 
#' @author Damiano Fantini \email{damiano.fantini@@gmail.com}
#'
#' @references \url{https://www.data-pulse.com/dev_site/easypubmed/}
#'
#' @examples
#' try({
#'   ## Example 01: retrieve data in TXT format
#'   dami_query_string <- "Damiano Fantini[AU]"
#'   dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#'   dami_papers <- fetch_pubmed_data(dami_on_pubmed, format = "abstract")
#'   dami_papers[dami_papers == ""] <- "\n"
#'   cat(paste(dami_papers[1:65], collapse = ""))
#'   #
#' }, silent = TRUE)
#' 
#' \dontrun{
#' ## Example 02: retrieve data in XML format
#' library("easyPubMed")
#' dami_query_string <- "Damiano Fantini[AU] AND 2018[PDAT]"
#' dami_on_pubmed <- get_pubmed_ids(dami_query_string)
#' dami_papers <- fetch_pubmed_data(dami_on_pubmed)
#' titles <- sapply(dami_papers, custom_grep, tag = "ArticleTitle", format = "char", USE.NAMES = FALSE)
#' print(titles)
#' #
#' ## Example 03: retrieve data from PubMed and save as XML file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out1 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' x <- paste(readLines(out1[1], n = 10), collapse = "\n")
#' cat(x)
#' #
#' ## Example 04: retrieve data from PubMed and save as TXT file
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out2 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180, format = "medline")
#' x <- paste(readLines(out2[1], n = 30), collapse = "\n")
#' cat(x)
#' #
#' ## Example 05: extract information from a single PubMed record 
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out3 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' PM_data <- articles_to_list(out3[1])
#' PM_record_df <- article_to_df(PM_data[[80]])
#' print(PM_record_df[1,])
#' print(PM_record_df[,"address"])
#' #
#' ## Example 06: query PubMed and extract information from multiple records in one step 
#' ml_query <- "Machine Learning[TI] AND 2016[PD]"
#' out4 <- batch_pubmed_download(pubmed_query_string = ml_query, batch_size = 180)
#' PM_tab <- table_articles_byAuth(out4[1], autofill = TRUE, included_authors = "last")
#' PM_tab$address <- substr(PM_tab$address, 1, 12)
#' PM_tab[50:70,c("pmid", "jabbrv", "year", "lastname", "address")]
#' }
#' 
#' @keywords internal
"_PACKAGE"
