R/download_medline.R

Defines functions download_medline

Documented in download_medline

#' Download MEDLINE format articles from PubMed
#'
#' Searches PubMed through the NCBI E-utilities `esearch` endpoint, fetches
#' the matching records in MEDLINE text format through the `efetch` endpoint
#' (in batches of 100 PMIDs), writes them to a .txt file in the current
#' working directory and returns them as a character vector. At most 10000
#' PMIDs are requested from the search (`retmax=10000`).
#'
#' @param keyword1 A character string indicating the keyword to be used for
#'   searching in PubMed.
#' @param keyword2 The second keyword to search for. It is combined with
#'   `keyword1` using `AND`. Use `NULL` to search for `keyword1` only.
#'
#' @return A character vector holding the lines of the retrieved MEDLINE
#'   records (`character(0)` when the search matches nothing). As a side
#'   effect the same lines are written to `<keyword1>_<keyword2>.txt`, or to
#'   `<keyword1>.txt` when `keyword2` is `NULL`.
#'
#' @importFrom httr GET content stop_for_status
#' @importFrom purrr map safely transpose
#' @importFrom XML xmlParse xpathSApply xmlValue
#' @importFrom utils URLencode
#'
#' @examples
#' \dontrun{
#' medline_text <- download_medline("cancer", "genetics")
#' }
#'
#' @rdname download_medline
#' @export download_medline
download_medline <- function(keyword1, keyword2 = "gene") {
  stopifnot(
    is.character(keyword1), length(keyword1) == 1L, !is.na(keyword1),
    is.null(keyword2) ||
      (is.character(keyword2) && length(keyword2) == 1L && !is.na(keyword2))
  )

  search_endpoint <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
  fetch_endpoint <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  batch_size <- 100L
  has_keyword2 <- !is.null(keyword2)

  # Percent-encode characters that are not valid in a URL (e.g. spaces).
  # Reserved characters such as "+" are left alone so keywords that callers
  # already wrote in URL form ("breast+cancer") keep working.
  term <- utils::URLencode(keyword1)
  if (has_keyword2) {
    term <- paste0(term, "+AND+", utils::URLencode(keyword2))
  }

  # The output file name is built from the raw keywords.
  out_file <- if (has_keyword2) {
    paste0(keyword1, "_", keyword2, ".txt")
  } else {
    paste0(keyword1, ".txt")
  }

  # Search PubMed and pull the PMIDs out of the XML answer.
  response <- httr::GET(
    paste0(search_endpoint, "db=pubmed&term=", term, "&retmax=10000")
  )
  httr::stop_for_status(response)
  search_xml <- XML::xmlParse(
    httr::content(response, as = "text", encoding = "UTF-8"),
    asText = TRUE
  )
  # xpathSApply() returns an empty list when nothing matches; normalise to a
  # character vector so length checks and paste() behave.
  pmids <- as.character(
    unlist(XML::xpathSApply(search_xml, "//Id", XML::xmlValue))
  )

  request_lines <- character(0)

  if (length(pmids) == 0L) {
    warning("The PubMed search returned no PMIDs.", call. = FALSE)
  } else {
    # One efetch URL per batch of `batch_size` consecutive PMIDs. The upper
    # bound is clamped so the last batch never indexes past the end.
    batch_starts <- seq(1L, length(pmids), by = batch_size)
    fetch_urls <- vapply(
      batch_starts,
      function(start) {
        end <- min(start + batch_size - 1L, length(pmids))
        paste0(
          fetch_endpoint,
          "db=pubmed&id=", paste(pmids[start:end], collapse = ","),
          "&rettype=medline&retmode=text"
        )
      },
      character(1)
    )

    # Download every batch; a failing batch must not abort the others, but
    # the caller is told when the output is incomplete.
    attempts <- purrr::map(fetch_urls, purrr::safely(readLines))
    failed <- !vapply(
      attempts,
      function(attempt) is.null(attempt[["error"]]),
      logical(1)
    )
    if (any(failed)) {
      warning(
        sum(failed), " of ", length(failed),
        " efetch request(s) failed; the downloaded records are incomplete.",
        call. = FALSE
      )
    }

    request_lines <- as.character(
      unlist(purrr::transpose(attempts)[["result"]])
    )

    # The first line used to be dropped unconditionally (presumably the blank
    # line that precedes the first record). Only drop it when it really is
    # blank so a record line is never lost.
    if (length(request_lines) > 0L && !nzchar(trimws(request_lines[1L]))) {
      request_lines <- request_lines[-1L]
    }
  }

  # Save the response to a .txt file
  write(request_lines, file = out_file)

  # No closeAllConnections() here: readLines() on a URL string closes the
  # connection it opens, and closing every connection of the session would
  # break sinks and connections owned by the caller.
  return(request_lines)
}
Erickcufe/textCells documentation built on May 20, 2023, 11:45 p.m.