R/rt_novelty_pmc.R

Defines functions rt_novelty_pmc .rt_novelty_pmc

Documented in rt_novelty_pmc

#' Identify novelty claims in PMC XML files.
#'
#' Takes a PMC XML file as a list of article sections and returns data related
#'     to the presence of novelty claims. This is the internal version designed
#'     for integration with \code{rt_all_pmc}.
#'
#' @param article_ls A PMC XML as a list of strings (from \code{.get_article_txt}).
#' @return A named list of results.
#' @noRd
.rt_novelty_pmc <- function(article_ls) {

  index_any <- list(
    novelty_first_time_1 = NA,
    novelty_first_time_2 = NA,
    novelty_first_to_1   = NA,
    novelty_previously_1 = NA,
    novelty_novel_1       = NA,
    novelty_knowledge_1   = NA
  )

  out <- list(
    is_novelty_pred = FALSE,
    novelty_text    = ""
  )

  # Search abstract and full body. The external XML validation found many
  # explicit first-time claims in results and conclusion paragraphs, not only
  # in the introduction/discussion zones.
  abstract <- unlist(article_ls$abstract)
  body     <- unlist(if (!is.null(article_ls$body_all)) article_ls$body_all else article_ls$body)
  article <- c(abstract, body)

  if (!length(article)) {
    return(c(out, index_any))
  }

  # Relevance gate: a cheap superset of every cue the pattern functions below can
  # match. Precision is enforced by those functions and .negate_novelty_1, not
  # here, so this only needs to admit anything potentially relevant.
  rel_regex <- paste(
    "first", "novel", "innovativ", "unprecedent", "previously un",
    "not been", "to our knowledge", "to the best of",
    sep = "|"
  )
  is_relevant <- any(grepl(rel_regex, article, ignore.case = TRUE))

  if (!is_relevant) {
    return(c(out, index_any))
  }

  # Preprocess
  article_processed <- .preprocess_txt(article)

  index_any$novelty_first_time_1 <- .which_novelty_first_time_1(article_processed)
  index_any$novelty_first_time_2 <- .which_novelty_first_time_2(article_processed)
  index_any$novelty_first_to_1   <- .which_novelty_first_to_1(article_processed)
  index_any$novelty_previously_1 <- .which_novelty_previously_1(article_processed)
  index_any$novelty_novel_1       <- .which_novelty_novel_1(article_processed)
  index_any$novelty_knowledge_1   <- .which_novelty_knowledge_1(article_processed)

  index <- unlist(index_any) %>% unique() %>% sort()

  # Drop cues attributed to a cited study or ordinal/temporal "first".
  if (!!length(index)) {
    is_negated <- .negate_novelty_1(article_processed[index])
    index <- index[!is_negated]
  }

  out$is_novelty_pred <- !!length(index)
  out$novelty_text    <- article[index] %>% paste(collapse = " ")

  index_any %<>% purrr::map(function(x) !!length(x))

  return(c(out, index_any))
}


#' Identify and extract novelty claims in PMC XML files.
#'
#' Takes a PMC XML file and returns data related to the presence of novelty
#'     claims, including whether such claims exist and the relevant text.
#'     Novelty is defined as the study claiming to report something
#'     "for the first time."
#'
#' @param filename The name of the PMC XML as a string.
#' @param remove_ns TRUE if an XML namespace exists, else FALSE (default).
#' @return A tibble of results. It returns the unique identifiers of the
#'     article, whether a novelty claim was found, the relevant text and
#'     whether each pattern-matching function identified relevant text.
#' @examples
#' \donttest{
#' # Path to a bundled example PMC XML file.
#' filepath <- system.file(
#'   "extdata", "PMID32171256-PMC7071725.xml", package = "rtransparency"
#' )
#'
#' # Identify and extract novelty claims.
#' results_table <- rt_novelty_pmc(filepath, remove_ns = TRUE)
#' }
#' @export
rt_novelty_pmc <- function(filename, remove_ns = FALSE) {

  # Identifier columns only; the prediction, extracted text and per-pattern flags
  # are supplied by .rt_novelty_pmc() below and must not be duplicated here.
  out <- list(
    pmid      = NA,
    pmcid_pmc = NA,
    pmcid_uid = NA,
    doi       = NA
  )

  # Parse XML
  article_xml <- tryCatch(.get_xml(filename, remove_ns), error = function(e) e)

  if (inherits(article_xml, "error")) {
    return(tibble::tibble(filename, is_success = FALSE))
  }

  # Extract IDs
  xpath <- c(
    "front/article-meta/article-id[@pub-id-type = 'pmid']",
    "front/article-meta/article-id[@pub-id-type = 'pmc']",
    "front/article-meta/article-id[@pub-id-type = 'pmc-uid']",
    "front/article-meta/article-id[@pub-id-type = 'doi']"
  )

  out %<>% purrr::list_modify(!!!purrr::map(xpath, ~ .get_text(article_xml, .x, TRUE)))

  # Extract text
  article_ls <- .get_article_txt(article_xml)
  novelty_ls <- .rt_novelty_pmc(article_ls)

  tibble::as_tibble(c(out, novelty_ls))
}

Try the rtransparency package in your browser

Any scripts or data that you put into this service are public.

rtransparency documentation built on July 1, 2026, 9:07 a.m.