#' Import pubmed summary for a given author
#'
#'
#' @inheritDotParams make_pubmed_query
#' @import xml2
#' @import dplyr
#' @export
import_author_pubmed <- function(lastname, firstname, ...) {
pubmed_query <- make_pubmed_query(author_lastname = lastname, author_firstname = firstname, ...)
publication_id <- xml2::read_xml(make_publication_list_url(pubmed_query))
publications_xml <- get_publication_xml(publication_id)
articles_df <- get_articles_data(publications_xml, lastname)
articles_df
}
#' Import an XML file from Pubmed/Medline
#'
#' @details The xml file imported must be produced by Pubmed and have to focus on the particular author or list of publication the user wants to compute the sigaps score.
#' @param xml_path A file path or an URL to the xml file
#' @param lastname Lastname of the targeted author
#' @param firstname Firstname of hte targeted author
#' @inheritDotParams xml2::read_xml
#' @import xml2
#' @export
import_xml_pubmed <- function(xml_path, lastname, firstname, ...) {
# There is a bug in the xml produced in pubmed : there is no
# global node. Then to have a proper XML file, add a big parent node
pubmed_xml <- readLines(xml_path)
pubmed_valid_xml <- c("<ArticlesCollection>", pubmed_xml, "</ArticlesCollection>")
publications_xml <- xml2::read_xml(paste(pubmed_valid_xml, collapse = "\n"))
get_articles_data(papers_xml = publications_xml, lastname = lastname)
}
#' Make a valid URL for medline API
#'
#' @param pubmed_query A valid pubmed query
#' @return A charactery vector with a valid URL
make_publication_list_url <- function(pubmed_query) {
base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&usehistory=y&term="
# Add plus signs on the pubmed query to produce a valid URL
plus_pubmed_query <- gsub(pattern = " ", replacement = "+", x = pubmed_query, fixed = T)
paste0(base_url, plus_pubmed_query)
}
get_publication_xml <- function(publication_id) {
webenv <- xml_text(xml_find_all(publication_id, "WebEnv"))
query_url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=", webenv, "&query_key=1&rettype=null&retmode=xml")
read_xml(query_url)
}
#' Make a Pubmed's query
#'
#' @param author_lastname The lastname of the author
#' @param author_firstname The firstname or the first letter of the firstname of the author
#' @param affiliaion A character vector with the affiliations of the author. By default NA (not used).
#' @param year_end Last year of search, by default the current year
#' @param year_start The first year of search, by default 2 years before the year_end
#' @param other_query Any arbitrary query for pubmed
#'
#' @return A character vector of length one with a query suitable for pubmed
#' @export
make_pubmed_query <- function(
author_lastname,
author_firstname = NULL,
affiliation = NULL,
year_end = NULL,
year_start = NULL,
other_query = NULL,
publication_type = c("Journal Article", "Review", "Editorial")
) {
author_name <- author_lastname
# add author firstname if provided
if (!is.null(author_firstname))
author_name <- paste(author_name, author_firstname)
# q is for pubmed Query
q <- paste0(author_name, "[AU]")
if (!is.null(affiliation))
q <- paste( q, "AND", make_affiliation_query(affiliation))
# Add year
if (!is.null(year_end) & !is.null(year_start))
q <- paste0(q, " AND (", year_start,"[DP] : ", year_end, "[DP])")
# Add any arbitrary query
if (!is.null(other_query))
q <- paste(q, other_query)
# Add publication type
if (!is.null(publication_type)) {
pubtype_query <- publication_type %>%
paste0("[Publication Type]") %>%
paste(collapse = " OR ") %>%
paste0(" AND (", ., ")")
q <- paste0(q, pubtype_query)
}
q
}
make_affiliation_query <- function(affiliations) {
key <- "[Affiliation]"
aff_key <- paste0(affiliations, key)
aff_key_or <- paste(aff_key, collapse = " OR ")
full_query <- paste0("(", aff_key_or, ")")
full_query
}
get_article_title <- function(paper_xml) {
article_title_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/ArticleTitle")
xml2::xml_text(article_title_node)
}
get_article_authors_count <- function(paper_xml) {
authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
xml2::xml_length(authors_node)
}
get_article_author_rank <- function(paper_xml, lastname) {
authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
authors_lastnames_nodes <- xml2::xml_find_all(authors_node, "Author/LastName")
authors_lastnames <- xml2::xml_text(authors_lastnames_nodes)
normalized_lastnames <- cleanString(authors_lastnames)
normalized_lastname_arg <- cleanString(lastname)
rank <- grep(pattern = normalized_lastname_arg, x = normalized_lastnames)
if (length(rank) == 0) {
rank <- NA
warning("The author rank was not found for ", get_article_title(paper_xml))
} else if (length(rank) > 1) {
rank <- rank[1]
warning("More than one possible rank was found for ", get_article_title(paper_xml))
}
rank
}
cleanString <- function(x){
tmp <- iconv(x, from = "UTF8", to = "ASCII//TRANSLIT")
gsub("[^[:alpha:]]", "", tmp)
}
get_article_year <- function(paper_xml) {
year_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/Journal/JournalIssue/PubDate/Year")
xml2::xml_integer(year_node)
}
get_article_pmid <- function(paper_xml) {
pmid_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/PMID")
xml2::xml_text(pmid_node)
}
get_article_data <- function(paper_xml, lastname) {
cumul <- list(
pmid = get_article_pmid(paper_xml),
title = get_article_title(paper_xml),
year = get_article_year(paper_xml),
journal_title_iso = get_article_journaliso(paper_xml),
authors = get_article_authors(paper_xml),
n_authors = get_article_authors_count(paper_xml),
author_rank = get_article_author_rank(paper_xml, lastname)
)
cumul$affiliation <- get_article_author_affiliation(paper_xml, cumul$author_rank)
cumul
}
get_article_author_affiliation <- function(paper_xml, author_rank) {
authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
author_node <- xml2::xml_child(authors_node, search = author_rank)
affiliation_node <- xml2::xml_find_all(author_node, "AffiliationInfo/Affiliation")
paste(xml_text(affiliation_node), collapse = " ; ")
}
get_article_journaliso <- function(paper_xml) {
journaliso_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/Journal/ISOAbbreviation")
xml2::xml_text(journaliso_node)
}
get_article_authors <- function(paper_xml) {
authors_node <- xml2::xml_find_first(paper_xml, "MedlineCitation/Article/AuthorList")
n_author <- xml_length(authors_node)
format_author <- function(author_node) {
LastName_node <- xml2::xml_find_first(author_node, "LastName")
FirtName_node <- xml2::xml_find_first(author_node, "ForeName")
paste(xml_text(LastName_node), xml_text(FirtName_node))
}
authors_vector <- sapply(1:n_author, function(x) format_author(xml_child(authors_node, x)))
paste(authors_vector, collapse = ", ")
}
#' Get data from a serie of article from a Pubmed XML
#'
#' @return A data.frame with
#' @import dplyr
#' @import xml2
#' @return A data.frame
get_articles_data <- function(papers_xml, lastname) {
# Count the number of articles in order to apply n times the import function
nb_articles <- xml2::xml_length(papers_xml)
# Apply n times the function get_article_data
articles_list <- lapply(1:nb_articles,
function(x) get_article_data(
xml_child(papers_xml, search = x), lastname)
)
# Make a data.frame with the list
dplyr::bind_rows(articles_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.