R/pmc_reference.R

Defines functions pmc_reference

Documented in pmc_reference

#' Format references cited
#'
#' @param doc \code{xml_document} from PubMed Central
#'
#' @return a tibble with id, pmid, authors, year, title, journal, volume, pages,
#' and doi.
#'
#' @author Chris Stubben
#'
#' @note Mixed citations without any child tags are added to the author column.
#'
#' @examples
#' # doc <- pmc_xml("PMC2231364")
#' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
#'   package = "tidypmc"
#' ))
#' x <- pmc_reference(doc)
#' x
#' @export

pmc_reference <- function(doc) {
  if (class(doc)[1] != "xml_document") {
    stop("doc should be an xml_document from PubMed Central")
  }
  z <- xml2::xml_find_all(doc, "//ref")
  # cat(as.character(z[[1]]))
  if (length(z) > 0) {
    n <- lapply(z, function(x) xml2::xml_name(xml2::xml_find_all(x, "./*")))
    x <- as.vector(unlist(n))
    x <- table(x[!x %in% c("label", "note")])
    message("Found ", paste(x, names(x), collapse = " and "), " tags")
    ## xml2::xml_find_first returns NA for missing values
    pmid <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//pub-id[@pub-id-type='pmid']"),
        trim = TRUE
      ), character(1))
    doi <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//pub-id[@pub-id-type='doi']"),
        trim = TRUE
      ), character(1))
    a1 <- lapply(z, function(x) xml2::xml_text(
        xml2::xml_find_all(x, ".//surname"),
        trim = TRUE
      ))
    a2 <- lapply(z, function(x) xml2::xml_text(
        xml2::xml_find_all(x, ".//given-names"),
        trim = TRUE
      ))
    # if all references have same number of authors, use SIMPLIFY=FALSE,
    # see PMC6369050
    authors <- vapply(
      mapply(paste, a1, a2, SIMPLIFY = FALSE),
      function(x) paste(x, collapse = ", "), character(1)
    )
    authors[authors == ""] <- NA
    # use character for same authors published twice in same year, 2012a 2012b
    year <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//year"),
        trim = TRUE
      ), character(1))
    if (all(grepl("^[0-9]+$", year))) year <- as.integer(year)
    title <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//article-title"),
        trim = TRUE
      ), character(1))
    # new lines in title PMC4909105
    title <- gsub("\n *", " ", title)
    journal <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//source"),
        trim = TRUE
      ), character(1))
    volume <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//volume"),
        trim = TRUE
      ), character(1))
    p1 <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//fpage"),
        trim = TRUE
      ), character(1))
    p2 <- vapply(z, function(x) xml2::xml_text(
        xml2::xml_find_first(x, ".//lpage"),
        trim = TRUE
      ), character(1))
    pages <- paste(p1, p2, sep = "-")
    pages <- gsub("-NA", "", pages)
    x <- tibble::tibble(
      id = seq_along(pmid), pmid, authors, year, title, journal,
      volume, pages, doi
    )
    # add mixed citation to title??
    n <- which(is.na(x$authors) & is.na(x$title))
    if (length(n) > 0) {
      if (nrow(x) == length(n)) {
        message(" References are missing author and title tags")
      } else {
        message(" ", length(n), " references are missing author and title tags")
      }
      message(" Adding /ref string to author column")
      x$authors[n] <- vapply(z[n], xml2::xml_text, character(1))
    }
  } else {
    message("No /ref tags")
    x <- NULL
  }
  x
}
ropensci/tidypmc documentation built on Dec. 14, 2019, 11:42 p.m.