R/import_pubmed.R

Defines functions import_author_pubmed import_xml_pubmed make_publication_list_url get_publication_xml make_pubmed_query make_affiliation_query get_article_title get_article_authors_count get_article_author_rank cleanString get_article_year get_article_pmid get_article_data get_article_author_affiliation get_article_journaliso get_article_authors get_articles_data

Documented in get_articles_data import_author_pubmed import_xml_pubmed make_publication_list_url make_pubmed_query

#' Import pubmed summary for a given author
#'
#'
#' @inheritDotParams make_pubmed_query
#' @import xml2
#' @import dplyr
#' @export
import_author_pubmed <- function(lastname, firstname, ...) {

  pubmed_query <- make_pubmed_query(author_lastname = lastname, author_firstname = firstname, ...)


  publication_id <- xml2::read_xml(make_publication_list_url(pubmed_query))
  publications_xml <- get_publication_xml(publication_id)
  articles_df <- get_articles_data(publications_xml, lastname)
  articles_df
}


#' Import an XML file from Pubmed/Medline
#'
#' @details The xml file imported must be produced by Pubmed and have to focus on the particular author or list of publication the user wants to compute the sigaps score.
#' @param xml_path A file path or an URL to the xml file
#' @param lastname Lastname of the targeted author
#' @param firstname Firstname of hte targeted author
#' @inheritDotParams xml2::read_xml
#' @import xml2
#' @export
import_xml_pubmed <- function(xml_path, lastname, firstname, ...) {
  # There is a bug in the xml produced in pubmed : there is no
  # global node. Then to have a proper XML file, add a big parent node
  pubmed_xml <- readLines(xml_path)
  pubmed_valid_xml <- c("<ArticlesCollection>", pubmed_xml, "</ArticlesCollection>")
  publications_xml <- xml2::read_xml(paste(pubmed_valid_xml, collapse = "\n"))
  get_articles_data(papers_xml = publications_xml, lastname = lastname)
}

#' Make a valid URL for medline API
#'
#' @param pubmed_query A valid pubmed query
#' @return A charactery vector with a valid URL
make_publication_list_url <- function(pubmed_query) {

  base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&usehistory=y&term="

  # Add plus signs on the pubmed query to produce a valid URL
  plus_pubmed_query <- gsub(pattern = " ", replacement = "+", x = pubmed_query, fixed = T)

  paste0(base_url, plus_pubmed_query)
}

get_publication_xml <- function(publication_id) {
  webenv <- xml_text(xml_find_all(publication_id, "WebEnv"))
  query_url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=", webenv, "&query_key=1&rettype=null&retmode=xml")
  read_xml(query_url)
}

#' Make a Pubmed's query
#'
#' @param author_lastname The lastname of the author
#' @param author_firstname The firstname or the first letter of the firstname of the author
#' @param affiliaion A character vector with the affiliations of the author. By default NA (not used).
#' @param year_end Last year of search, by default the current year
#' @param year_start The first year of search, by default 2 years before the year_end
#' @param other_query Any arbitrary query for pubmed
#'
#' @return A character vector of length one with a query suitable for pubmed
#' @export
make_pubmed_query <- function(
  author_lastname,
  author_firstname = NULL,
  affiliation = NULL,
  year_end = NULL,
  year_start = NULL,
  other_query = NULL,
  publication_type = c("Journal Article", "Review", "Editorial")
) {

  author_name <- author_lastname

  # add author firstname if provided
  if (!is.null(author_firstname))
    author_name <- paste(author_name, author_firstname)

  # q is for pubmed Query
  q <- paste0(author_name, "[AU]")

  if (!is.null(affiliation))
    q <- paste( q, "AND", make_affiliation_query(affiliation))

  # Add year
  if (!is.null(year_end) & !is.null(year_start))
    q <- paste0(q, " AND (", year_start,"[DP] : ", year_end, "[DP])")

  # Add any arbitrary query
  if (!is.null(other_query))
    q <- paste(q, other_query)

  # Add publication type
  if (!is.null(publication_type)) {
    pubtype_query <- publication_type %>%
      paste0("[Publication Type]") %>%
      paste(collapse = " OR ") %>%
      paste0(" AND (", ., ")")
    q <- paste0(q, pubtype_query)
  }

  q
}

make_affiliation_query <- function(affiliations) {
  key <- "[Affiliation]"
  aff_key <- paste0(affiliations, key)
  aff_key_or <- paste(aff_key, collapse = " OR ")
  full_query <- paste0("(", aff_key_or, ")")
  full_query
}



get_article_title <- function(paper_xml) {
  article_title_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/ArticleTitle")
  xml2::xml_text(article_title_node)
}

get_article_authors_count <- function(paper_xml) {
  authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
  xml2::xml_length(authors_node)
}

get_article_author_rank <- function(paper_xml, lastname) {

  authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
  authors_lastnames_nodes <- xml2::xml_find_all(authors_node, "Author/LastName")
  authors_lastnames <- xml2::xml_text(authors_lastnames_nodes)
  normalized_lastnames <- cleanString(authors_lastnames)
  normalized_lastname_arg <- cleanString(lastname)
  rank <- grep(pattern = normalized_lastname_arg, x = normalized_lastnames)

  if (length(rank) == 0) {
    rank <- NA
    warning("The author rank was not found for ", get_article_title(paper_xml))
  } else if (length(rank) > 1) {
    rank <- rank[1]
    warning("More than one possible rank was found for ", get_article_title(paper_xml))
  }

  rank
}

cleanString <- function(x){
  tmp <- iconv(x, from = "UTF8", to = "ASCII//TRANSLIT")
  gsub("[^[:alpha:]]", "", tmp)
}

get_article_year <- function(paper_xml) {
  year_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/Journal/JournalIssue/PubDate/Year")
  xml2::xml_integer(year_node)
}

get_article_pmid <- function(paper_xml) {
  pmid_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/PMID")
  xml2::xml_text(pmid_node)
}

get_article_data <- function(paper_xml, lastname) {
  cumul <- list(
    pmid = get_article_pmid(paper_xml),
    title = get_article_title(paper_xml),
    year = get_article_year(paper_xml),
    journal_title_iso = get_article_journaliso(paper_xml),
    authors = get_article_authors(paper_xml),
    n_authors = get_article_authors_count(paper_xml),
    author_rank = get_article_author_rank(paper_xml, lastname)
  )

  cumul$affiliation <- get_article_author_affiliation(paper_xml, cumul$author_rank)

  cumul
}

get_article_author_affiliation <- function(paper_xml, author_rank) {
  authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
  author_node <- xml2::xml_child(authors_node, search = author_rank)
  affiliation_node <- xml2::xml_find_all(author_node, "AffiliationInfo/Affiliation")
  paste(xml_text(affiliation_node), collapse = " ; ")
}

get_article_journaliso <- function(paper_xml) {
  journaliso_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/Journal/ISOAbbreviation")
  xml2::xml_text(journaliso_node)
}

get_article_authors <- function(paper_xml) {
  authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
  n_author <- xml_length(authors_node)

  format_author <- function(author_node) {
    LastName_node <- xml2::xml_find_first(author_node, "LastName")
    FirtName_node <- xml2::xml_find_first(author_node, "ForeName")
    paste(xml_text(LastName_node), xml_text(FirtName_node))
  }

  authors_vector <- sapply(1:n_author, function(x) format_author(xml_child(authors_node, x)))
  paste(authors_vector, collapse = ", ")
}

#' Get data from a serie of article from a Pubmed XML
#'
#' @return A data.frame with
#' @import dplyr
#' @import xml2
#' @return A data.frame
get_articles_data <- function(papers_xml, lastname) {

  # Count the number of articles in order to apply n times the import function
  nb_articles <- xml2::xml_length(papers_xml)

  # Apply n times the function get_article_data
  articles_list <- lapply(1:nb_articles,
                          function(x) get_article_data(
                            xml_child(papers_xml, search = x), lastname)
                          )

  # Make a data.frame with the list
  dplyr::bind_rows(articles_list)
}
jomuller/rigaps documentation built on May 29, 2019, 12:39 p.m.