R/identifier.R
In rfair: Assess the FAIRness of Research Data Objects and Software

Documented in id_parse

# Persistent-identifier parsing and resolution, ported from
# fuji_server/helper/identifier_helper.py.

# PIDs treated as persistent (identifier_helper.py VALID_PIDS, :19-39).
.VALID_PIDS <- c(
  "ark", "arxiv", "bioproject", "biosample", "doi", "ensembl", "genome", "gnd",
  "handle", "lsid", "pmid", "pmcid", "purl", "refseq", "sra", "uniprot", "urn",
  "identifiers.org", "w3id"
)

# URN resolver hosts (identifier_helper.py URN_RESOLVER, :54-65).
.URN_RESOLVER <- c(
  "urn:doi:"  = "dx.doi.org/",        "urn:lex:br" = "www.lexml.gov.br/",
  "urn:nbn:de" = "nbn-resolving.org/", "urn:nbn:se" = "urn.kb.se/resolve?urn=",
  "urn:nbn:at" = "resolver.obvsg.at/", "urn:nbn:hr" = "urn.nsk.hr/",
  "urn:nbn:no" = "urn.nb.no/",         "urn:nbn:fi" = "urn.fi/",
  "urn:nbn:it" = "nbn.depositolegale.it/", "urn:nbn:nl" = "www.persistent-identifier.nl/"
)

#' Extract the bare DOI (10.x/...) from a string, or NA.
#' @noRd
extract_doi <- function(x) {
  m <- regmatches(x, regexpr("(?i)(?:doi:\\s*|(?:https?://)?(?:dx\\.)?doi\\.org/)?(10\\.\\d+(?:\\.\\d+)*/\\S+)$",
                             x, perl = TRUE))
  if (length(m) == 0L) return(NA_character_)
  sub("(?i)^(?:doi:\\s*|(?:https?://)?(?:dx\\.)?doi\\.org/)?", "", m, perl = TRUE)
}

#' Extract a Handle (prefix/suffix) from a string, or NA.
#' @noRd
extract_handle <- function(x) {
  m <- regmatches(x, regexpr("(?i)(?:hdl:\\s*|(?:https?://)?hdl\\.handle\\.net/)?([0-9]+(?:\\.[0-9]+)*/.+)$",
                             x, perl = TRUE))
  if (length(m) == 0L) return(NA_character_)
  sub("(?i)^(?:hdl:\\s*|(?:https?://)?hdl\\.handle\\.net/)?", "", m, perl = TRUE)
}

#' @noRd
is_uuid_string <- function(x) {
  grepl("^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", x, ignore.case = TRUE)
}

#' @noRd
is_hash_string <- function(x) {
  grepl("^[0-9a-f]{32}$|^[0-9a-f]{40}$|^[0-9a-f]{64}$|^[0-9a-f]{128}$", x, ignore.case = TRUE)
}

#' @noRd
is_url_string <- function(x) grepl("^https?://[^\\s/$.?#].[^\\s]*$", x, perl = TRUE)

#' Detect identifier schemes for a string (idutils.detect_identifier_schemes).
#' @return Character vector of schemes (most specific first), possibly empty.
#' @noRd
detect_schemes <- function(x) {
  schemes <- character(0)
  if (!is.na(extract_doi(x)))           schemes <- c(schemes, "doi")
  if (grepl("ark:/", x, fixed = TRUE))  schemes <- c(schemes, "ark")
  if (grepl("^urn:[a-z0-9][a-z0-9-]{0,31}:", x, ignore.case = TRUE)) schemes <- c(schemes, "urn")
  if (grepl("(?i)arxiv", x, perl = TRUE) && grepl("\\d{4}\\.\\d{4,5}", x)) schemes <- c(schemes, "arxiv")
  if (!("doi" %in% schemes) && !is.na(extract_handle(x))) schemes <- c(schemes, "handle")
  if (is_url_string(x)) schemes <- c(schemes, "url")
  unique(schemes)
}

#' Build a resolver URL for an identifier given its scheme (IdentifierHelper.to_url).
#' @noRd
to_url <- function(id, schema, normalized = id) {
  url <- switch(schema,
    doi    = paste0("https://doi.org/", normalized),
    handle = paste0("https://hdl.handle.net/", normalized),
    arxiv  = paste0("https://arxiv.org/abs/", sub("(?i)^arxiv:", "", normalized, perl = TRUE)),
    ark    = if (is_url_string(id)) id else paste0("https://n2t.net/", normalized),
    w3id   = id,
    url    = id,
    NA_character_
  )
  url
}

#' Parse a persistent identifier or URL.
#'
#' Resolves the identifier scheme, normalizes it, and constructs its resolver
#' URL, mirroring `IdentifierHelper` in F-UJI.
#'
#' @param idstring A DOI, Handle, ARK, URN, UUID, identifiers.org PID, or URL.
#' @return A list with `identifier`, `normalized_id`, `identifier_url`,
#'   `preferred_schema`, `identifier_schemes`, and `is_persistent`.
#' @export
#' @examples
#' id_parse("https://doi.org/10.5281/zenodo.8347772")$preferred_schema
id_parse <- function(idstring) {
  empty <- list(identifier = idstring, normalized_id = idstring,
                identifier_url = NA_character_, preferred_schema = NA_character_,
                identifier_schemes = character(0), is_persistent = FALSE)
  if (!is_nonempty_string(idstring)) return(empty)
  id <- trimws(idstring)
  if (nchar(id) <= 4L || grepl("^[0-9]+$", id)) return(empty)

  # workarounds (identifier_helper.py:100-106)
  id <- sub("/purl.archive.org/", "/purl.org/", id, fixed = TRUE)
  if (grepl("https://purl.", id, fixed = TRUE) || grepl("/ark:", id, fixed = TRUE)) {
    id <- sub("https:", "http:", id, fixed = TRUE)
  }
  id <- sub("/ark:", "/ark:/", id, fixed = TRUE)
  id <- sub("/ark://", "/ark:/", id, fixed = TRUE)

  schemes <- character(0); preferred <- NA_character_
  normalized <- NA_character_; id_url <- NA_character_; persistent <- FALSE

  if (is_uuid_string(id)) { schemes <- "uuid"; preferred <- "uuid" }
  else if (is_hash_string(id)) { schemes <- "hash"; preferred <- "hash" }

  if (length(schemes) == 0L || identical(schemes, "url")) {
    parts <- tryCatch(httr2::url_parse(id), error = function(e) NULL)
    netloc <- if (!is.null(parts)) parts$hostname %||% "" else ""
    # w3id
    if (!is.null(parts) && identical(parts$scheme, "https") &&
        netloc %in% c("w3id.org", "www.w3id.org") && nzchar(parts$path %||% "")) {
      schemes <- c("w3id", "url"); preferred <- "w3id"
      id_url <- id; normalized <- id
    } else {
      # identifiers.org -> reformat to prefix:accession
      if (identical(netloc, "identifiers.org")) {
        seg <- strsplit(sub("^/", "", parts$path %||% ""), "/", fixed = TRUE)[[1]]
        if (length(seg) == 2L) id <- paste0(seg[1], ":", seg[2])
      }
      m <- regmatches(id, regexpr("^([a-z0-9._]+):(.+)", id, perl = TRUE))
      if (length(m) == 1L) {
        prefix <- sub("^([a-z0-9._]+):(.+)", "\\1", m, perl = TRUE)
        suffix <- sub("^([a-z0-9._]+):(.+)", "\\2", m, perl = TRUE)
        idorg <- ref_data("identifiers_org")
        if (prefix != "doi" && !is.null(idorg[[prefix]])) {
          patt <- idorg[[prefix]]$pattern
          if (is_nonempty_string(patt) && grepl(patt, suffix, perl = TRUE)) {
            schemes <- c("identifiers.org", prefix); preferred <- prefix
            id_url <- paste0("https://identifiers.org/", id)
            normalized <- paste0(tolower(prefix), ":", suffix)
          }
        }
      }
    }
  }

  if (length(schemes) == 0L) {
    schemes <- detect_schemes(id)
  }

  if (length(schemes) > 0L) {
    # move "url" to the end so a more specific scheme is preferred
    if (length(schemes) > 1L && "url" %in% schemes) {
      schemes <- c(setdiff(schemes, "url"), "url")
    }
    if (is.na(preferred)) preferred <- schemes[1]
    if (is.na(normalized)) {
      normalized <- switch(preferred,
        doi    = extract_doi(id) %||% id,
        handle = extract_handle(id) %||% id,
        id)
      if (is.na(normalized)) normalized <- id
    }
    if (is.na(id_url)) id_url <- to_url(id, preferred, normalized)
  }

  if (!is.na(preferred) &&
      (preferred %in% .VALID_PIDS || !is.null(ref_data("identifiers_org")[[preferred]]))) {
    persistent <- TRUE
  }
  if (is.na(normalized)) normalized <- id

  list(
    identifier = id,
    normalized_id = normalized,
    identifier_url = id_url %||% NA_character_,
    preferred_schema = preferred,
    identifier_schemes = schemes,
    is_persistent = persistent
  )
}

Any scripts or data that you put into this service are public.

rfair documentation built on July 1, 2026, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rfair
Assess the FAIRness of Research Data Objects and Software

R/identifier.R
In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions id_parse to_url detect_schemes is_url_string is_hash_string is_uuid_string extract_handle extract_doi

Documented in id_parse

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair Assess the FAIRness of Research Data Objects and Software

R/identifier.R In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions id_parse to_url detect_schemes is_url_string is_hash_string is_uuid_string extract_handle extract_doi

Documented in id_parse

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair
Assess the FAIRness of Research Data Objects and Software

R/identifier.R
In rfair: Assess the FAIRness of Research Data Objects and Software