R/mt_extract_pubmed.R

Defines functions .extract_xml .extract_fields mt_extract_pubmed

Documented in .extract_fields .extract_xml mt_extract_pubmed

#' Extract fields of interest from PubMed results
#'
#' Locates every `PubmedArticle` node in the supplied PubMed XML and extracts
#' the fields of interest from each of them with `.extract_fields()`.
#'
#' @param pubmed_xml The PubMed XML results. Either an object already parsed
#'   by `xml2` (e.g. the result of `xml2::read_xml()`), or a string, in which
#'   case it is parsed with `xml2::read_xml()` first.
#' @param cl The number of CPU cores as a number to be passed to `pblapply`.
#' @return A tibble with a leading `rowid` column followed by the extracted
#'   fields, one row per article.
#' @export
mt_extract_pubmed <- function(pubmed_xml, cl = 7) {

  # `xml_find_all()` needs parsed XML; accept the documented string input by
  # parsing it here. Already-parsed input is passed through untouched.
  if (is.character(pubmed_xml)) {
    pubmed_xml <- xml2::read_xml(pubmed_xml)
  }

  # Move into the nodeset level of each PubmedArticle
  article_xml <- xml2::xml_find_all(pubmed_xml, "PubmedArticle")

  # Extract fields of interest, one tibble per article
  article_fields <- pbapply::pblapply(article_xml, .extract_fields, cl = cl)

  # Stack the per-article tibbles and number the rows
  tibble::rowid_to_column(dplyr::bind_rows(article_fields))
}


#' Extract fields of interest from a single PubMed article
#'
#' Runs a fixed set of XPath queries against one article node and collects
#' the text found for each query into a single row.
#'
#' @param xml_doc The XML node of a single `PubmedArticle`, as returned by
#'   `xml2`.
#' @return A one-row tibble with one character column per field. Fields for
#'   which nothing was found in the article are `NA`.
.extract_fields <- function(xml_doc) {

  # Define the XPath for the fields of interest
  xpath_list <- list(
    pmid = "MedlineCitation/PMID",
    doi = "PubmedData/ArticleIdList/ArticleId[@IdType='doi']",
    pmcid = "PubmedData/ArticleIdList/ArticleId[@IdType='pmc']",
    pii = "PubmedData/ArticleIdList/ArticleId[@IdType='pii']",
    title = "MedlineCitation/Article/ArticleTitle",
    abstract = "MedlineCitation/Article/Abstract",
    publication_date_year = "MedlineCitation/Article/Journal//PubDate/Year",
    publication_date_month = "MedlineCitation/Article/Journal//PubDate/Month",
    publication_date_day = "MedlineCitation/Article/Journal//PubDate/Day",
    journal_name = "MedlineCitation/Article/Journal/Title",
    journal_name_iso = "MedlineCitation/Article/Journal/ISOAbbreviation",
    journal_volume = "MedlineCitation/Article/Journal/JournalIssue/Volume",
    journal_issue = "MedlineCitation/Article/Journal/JournalIssue/Issue",
    publication_type = "MedlineCitation/Article/PublicationTypeList/PublicationType",
    mesh_terms = "MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName",
    reference_pmids = "PubmedData/ReferenceList//ArticleId[@IdType='pubmed']"
  )

  # Extract fields of interest
  field_list <- lapply(xpath_list, .extract_xml, xml_doc = xml_doc)

  # Turn missing fields into NA. `.extract_xml()` collapses its matches with
  # `paste(collapse = )`, which yields "" (length 1) when nothing matched, so
  # a length check alone never detects a missing field; test for empty text
  # as well. `NA_character_` keeps every column of type character.
  is_missing <- vapply(
    field_list,
    function(field) length(field) == 0 || !any(nzchar(field)),
    logical(1)
  )
  field_list[is_missing] <- NA_character_

  # Export as tibble
  tibble::as_tibble(field_list)
}


#' Extract text of interest from an XML node
#'
#' Finds all nodes matching an XPath expression, takes the child nodes of
#' those matches and joins their text into one string.
#'
#' @param xml_doc The XML document or node as it was returned from xml2.
#' @param xpath The XPath to the item of interest, evaluated against
#'   `xml_doc`.
#' @return A single string: the text of every child node of the matches,
#'   separated by "; ".
.extract_xml <- function(xml_doc, xpath) {

  # Nodes selected by the XPath expression
  matched_nodes <- xml2::xml_find_all(xml_doc, xpath = xpath)

  # Text is read from the children of the matches, not the matches themselves
  child_nodes <- xml2::xml_contents(matched_nodes)

  # Join all pieces of text into one string
  paste(xml2::xml_text(child_nodes), collapse = "; ")
}
serghiou/metareadr documentation built on Aug. 21, 2023, 2:33 a.m.