R/collect_xml.R
In rfair: Assess the FAIRness of Research Data Objects and Software

# XML metadata collectors (DataCite XML, Dublin Core), ported from the
# XML_MAPPING_* tables in metadata_mapper.py and metadata_collector_xml.py.
# Uses xml2 with namespaces stripped, so the fuji "{*}local-name" paths become
# plain XPath.

#' Translate a fuji XML mapping path to namespace-stripped XPath.
#' @noRd
xml_translate <- function(p) {
  p <- gsub("\\{\\*\\}", "", p)   # namespace wildcard -> none (ns stripped)
  p <- gsub("@@", "/@", p)         # attribute marker
  gsub("xlink:", "", p)            # attribute namespace prefix
}

#' Get trimmed text for one or more XPath paths under a node.
#' @noRd
xml_get <- function(node, paths) {
  vals <- character(0)
  for (p in paths) {
    found <- tryCatch(xml2::xml_find_all(node, xml_translate(p)), error = function(e) NULL)
    if (!is.null(found) && length(found)) vals <- c(vals, trimws(xml2::xml_text(found)))
  }
  vals[nzchar(vals)]
}

#' Map a DataCite XML <resource> node to reference-schema keys.
#' @noRd
map_datacite_xml <- function(node) {
  out <- list()
  out$title <- xml_get(node, "./{*}titles/{*}title")[1]
  cr <- xml_get(node, "./{*}creators/{*}creator/{*}creatorName"); if (length(cr)) out$creator <- as.list(cr)
  ct <- xml_get(node, "./{*}contributors/{*}contributor/{*}contributorName"); if (length(ct)) out$contributor <- as.list(ct)
  out$publication_date <- xml_get(node, "./{*}publicationYear")[1]
  kw <- xml_get(node, "./{*}subjects/{*}subject"); if (length(kw)) out$keywords <- as.list(kw)
  out$object_identifier <- xml_get(node, "./{*}identifier")[1]
  pubn <- xml_get(node, "./{*}publisher")[1]; if (!is.na(pubn) && nzchar(pubn %||% "")) out$publisher <- pubn
  out$summary <- xml_get(node, "./{*}descriptions/{*}description")[1]
  out$object_type <- xml_get(node, "./{*}resourceType@@resourceTypeGeneral")[1]
  sz <- xml_get(node, "./{*}sizes/{*}size"); if (length(sz)) out$object_size <- sz[1]
  fmt <- xml_get(node, "./{*}formats/{*}format"); if (length(fmt)) out$object_format <- fmt[1]
  lic <- c(xml_get(node, "./{*}rightsList/{*}rights@@rightsURI"),
           xml_get(node, "./{*}rightsList/{*}rights"))
  if (length(lic)) { out$license <- as.list(unique(lic)); out$access_level <- as.list(unique(lic)) }
  out$language <- xml_get(node, "./{*}language")[1]
  # related identifiers (+ relation type), zipped
  rid <- xml_get(node, "./{*}relatedIdentifiers/{*}relatedIdentifier")
  rtype <- xml_get(node, "./{*}relatedIdentifiers/{*}relatedIdentifier@@relationType")
  if (length(rid)) {
    out$related_resources <- lapply(seq_along(rid), function(i) list(
      related_resource = rid[i], relation_type = if (i <= length(rtype)) rtype[i] else NA_character_))
  }
  compact(out)
}

#' Map a Dublin Core document (any wrapper) to reference-schema keys.
#' @noRd
map_dc_xml <- function(root) {
  g <- function(name) xml_get(root, sprintf(".//{*}%s", name))
  out <- list()
  out$title <- g("title")[1]
  cr <- g("creator"); if (length(cr)) out$creator <- as.list(cr)
  ct <- g("contributor"); if (length(ct)) out$contributor <- as.list(ct)
  kw <- g("subject"); if (length(kw)) out$keywords <- as.list(kw)
  out$summary <- (c(g("description"), g("abstract")))[1]
  out$publisher <- g("publisher")[1]
  out$publication_date <- (c(g("date"), g("issued"), g("available")))[1]
  out$object_identifier <- g("identifier")[1]
  out$object_type <- g("type")[1]
  out$object_format <- g("format")[1]
  out$language <- g("language")[1]
  lic <- g("license"); if (length(lic)) out$license <- as.list(lic)
  acc <- c(g("rights"), g("accessRights")); if (length(acc)) out$access_level <- as.list(acc)
  rel <- c(g("relation"), g("references"), g("source"), g("isPartOf"), g("hasVersion"),
           g("isVersionOf"), g("isReferencedBy"))
  if (length(rel)) out$related_resources <- lapply(rel, function(r) list(related_resource = r))
  compact(out)
}

#' Map a MODS document to reference-schema keys.
#' @noRd
map_mods_xml <- function(root) {
  g <- function(p) xml_get(root, p)
  compact(list(
    title = g(".//{*}titleInfo/{*}title")[1],
    creator = { x <- g(".//{*}name/{*}namePart"); if (length(x)) as.list(x) else NULL },
    publisher = g(".//{*}originInfo/{*}publisher")[1],
    object_identifier = g(".//{*}identifier")[1],
    publication_date = g(".//{*}originInfo/{*}dateCreated")[1],
    keywords = { x <- g(".//{*}subject/{*}topic"); if (length(x)) as.list(x) else NULL },
    summary = g(".//{*}abstract")[1],
    object_type = g(".//{*}typeOfResource")[1],
    language = g(".//{*}language/{*}languageTerm")[1]))
}

#' Map an EML (Ecological Metadata Language) document.
#' @noRd
map_eml_xml <- function(root) {
  g <- function(p) xml_get(root, p)
  compact(list(
    title = g(".//{*}dataset/{*}title")[1],
    object_identifier = (c(g(".//{*}dataset/{*}alternateIdentifier"), xml_get(root, "./@packageId")))[1],
    creator = { x <- g(".//{*}dataset/{*}creator/{*}individualName/{*}surName"); if (length(x)) as.list(x) else NULL },
    publication_date = g(".//{*}dataset/{*}pubDate")[1],
    keywords = { x <- g(".//{*}dataset/{*}keywordSet/{*}keyword"); if (length(x)) as.list(x) else NULL },
    summary = (c(g(".//{*}dataset/{*}abstract/{*}para"), g(".//{*}dataset/{*}abstract")))[1],
    publisher = g(".//{*}dataset/{*}publisher/{*}organizationName")[1],
    measured_variable = { x <- g(".//{*}attributeName"); if (length(x)) as.list(x) else NULL }))
}

#' Map an ISO 19139 (geographic) document (core fields only).
#' @noRd
map_iso_xml <- function(root) {
  g <- function(name) xml_get(root, sprintf(".//{*}%s", name))
  compact(list(
    title = g("title")[1],
    summary = g("abstract")[1],
    keywords = { x <- g("keyword"); if (length(x)) as.list(x) else NULL },
    object_identifier = (c(g("code"), g("fileIdentifier")))[1],
    publication_date = g("date")[1],
    language = g("language")[1]))
}

#' Parse an XML string, detect its schema, map it, and merge.
#' @noRd
collect_xml_doc <- function(ctx, content, url, mimetype = "application/xml") {
  doc <- tryCatch(xml2::read_xml(content), error = function(e) NULL)
  if (is.null(doc)) return(invisible(FALSE))
  xml2::xml_ns_strip(doc)
  resource <- xml2::xml_find_first(doc, "//resource")
  found_node <- function(name) xml2::xml_find_first(doc, sprintf("//*[local-name()='%s']", name))
  has <- function(node) !inherits(node, "xml_missing")
  if (has(resource) && length(xml2::xml_find_all(resource, "./titles"))) {
    md <- map_datacite_xml(resource); src <- "datacite_xml"; schema <- "http://datacite.org/schema/kernel-4"
  } else if (has(found_node("mods"))) {
    md <- map_mods_xml(found_node("mods")); src <- "mods_xml"; schema <- "http://www.loc.gov/mods/v3"
  } else if (has(found_node("eml"))) {
    md <- map_eml_xml(found_node("eml")); src <- "eml_xml"; schema <- "https://eml.ecoinformatics.org"
  } else if (has(found_node("MD_Metadata"))) {
    md <- map_iso_xml(found_node("MD_Metadata")); src <- "iso19139_xml"; schema <- "http://www.isotc211.org/2005/gmd"
  } else if ((length(xml2::xml_find_all(doc, ".//*[local-name()='title']")) ||
              length(xml2::xml_find_all(doc, ".//*[local-name()='identifier']"))) &&
             !identical(tolower(xml2::xml_name(xml2::xml_root(doc))), "html")) {
    # an (X)HTML landing page also has a <title>; do not mine it as Dublin Core
    md <- map_dc_xml(xml2::xml_root(doc)); src <- "dublincore_xml"; schema <- "http://purl.org/dc/elements/1.1/"
  } else {
    return(invisible(FALSE))
  }
  if (!length(md)) return(invisible(FALSE))
  merge_metadata(ctx, md, url = url, method = src, format = "xml",
                 mimetype = mimetype, schema = schema)
  ctx$metadata_sources[[length(ctx$metadata_sources) + 1L]] <- list(source = src, method = "content_negotiation")
  ctx_log(ctx, "FsF-I1-01M", "info", paste("Harvested", src, "metadata"))
  invisible(TRUE)
}

#' Fetch a URL expected to hold XML metadata and harvest it.
#' @noRd
collect_xml_from_url <- function(ctx, url, timeout = 15) {
  resp <- tryCatch(content_negotiate(url, accept = "xml", timeout = timeout), error = function(e) NULL)
  if (is.null(resp) || !isTRUE(resp$ok) || is.null(resp$content)) return(invisible())
  collect_xml_doc(ctx, resp$content, url = resp$redirect_url, mimetype = resp$content_type)
}

#' Harvest XML metadata via content negotiation (DataCite XML, generic XML).
#' @noRd
collect_xml <- function(ctx, timeout = 15) {
  resp <- tryCatch(content_negotiate(ctx$pid_url, accept = "datacite_xml", timeout = timeout),
                   error = function(e) NULL)
  if (!is.null(resp) && isTRUE(resp$ok) && !is.null(resp$content) &&
      grepl("xml", resp$content_type %||% "", ignore.case = TRUE)) {
    collect_xml_doc(ctx, resp$content, url = resp$redirect_url, mimetype = resp$content_type)
  }
  invisible()
}

Any scripts or data that you put into this service are public.

rfair documentation built on July 1, 2026, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rfair
Assess the FAIRness of Research Data Objects and Software

R/collect_xml.R
In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions collect_xml collect_xml_from_url collect_xml_doc map_iso_xml map_eml_xml map_mods_xml map_dc_xml map_datacite_xml xml_get xml_translate

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair Assess the FAIRness of Research Data Objects and Software

R/collect_xml.R In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions collect_xml collect_xml_from_url collect_xml_doc map_iso_xml map_eml_xml map_mods_xml map_dc_xml map_datacite_xml xml_get xml_translate

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair
Assess the FAIRness of Research Data Objects and Software

R/collect_xml.R
In rfair: Assess the FAIRness of Research Data Objects and Software