R/collect_github.R
In rfair: Assess the FAIRness of Research Data Objects and Software

# GitHub repository metadata harvester. When the assessed object is a GitHub
# repository, enriches the metadata record from the GitHub REST API (license,
# description, topics, dates). Ported in spirit from github_harvester.py.
# Set GITHUB_TOKEN to raise the API rate limit.

#' Detect an owner/repo pair from candidate URLs.
#' @noRd
github_repo_of <- function(urls) {
  for (u in as_chr(urls)) {
    m <- regmatches(u, regexec("github\\.com/([^/]+)/([^/?#]+)", u))[[1]]
    if (length(m) == 3L) return(list(owner = m[2], name = sub("\\.git$", "", m[3])))
  }
  NULL
}

#' Harvest GitHub repository metadata into the engine state.
#' @noRd
collect_github <- function(ctx, timeout = 15) {
  repo <- github_repo_of(c(ctx$pid_url, ctx$landing_url, ctx$id))
  if (is.null(repo)) return(invisible())

  api <- sprintf("https://api.github.com/repos/%s/%s", repo$owner, repo$name)
  req <- httr2::request(api)
  req <- httr2::req_headers(req, Accept = "application/vnd.github+json",
                            `X-GitHub-Api-Version` = "2022-11-28")
  req <- httr2::req_user_agent(req, "rfair R package")
  req <- httr2::req_timeout(req, timeout)
  req <- httr2::req_error(req, is_error = function(resp) FALSE)
  token <- Sys.getenv("GITHUB_TOKEN", "")
  if (nzchar(token)) req <- httr2::req_auth_bearer_token(req, token)

  resp <- tryCatch(httr2::req_perform(req), error = function(e) NULL)
  if (is.null(resp) || httr2::resp_status(resp) >= 400) return(invisible())
  j <- tryCatch(httr2::resp_body_json(resp), error = function(e) NULL)
  if (is.null(j)) return(invisible())

  spdx <- jget(j, "license", "spdx_id")
  if (identical(spdx, "NOASSERTION")) spdx <- NULL
  md <- compact(list(
    object_identifier = j$html_url,
    title = j$name,
    summary = j$description,
    object_type = "Software",
    keywords = if (length(j$topics)) as.list(unlist(j$topics)) else NULL,
    license = spdx %||% jget(j, "license", "url"),
    publisher = jget(j, "owner", "login"),
    created_date = j$created_at,
    modified_date = j$updated_at,
    language = j$language
  ))
  if (length(md)) {
    merge_metadata(ctx, md, url = j$html_url, method = "github", format = "json",
                   mimetype = "application/json", schema = "https://docs.github.com/rest")
    ctx$metadata_sources[[length(ctx$metadata_sources) + 1L]] <-
      list(source = "github", method = "content_negotiation")
    ctx$github_data <- j
    ctx_log(ctx, "FsF-R1.1-01M", "info", "Harvested GitHub repository metadata")
  }

  # deeper software metadata: latest release version + codemeta.json + CITATION.cff
  branch <- j$default_branch %||% "main"
  ver <- tryCatch(github_json(paste0(api, "/releases/latest"), token, timeout)$tag_name,
                  error = function(e) NULL)
  cm <- github_software_files(ctx, repo, branch, token, timeout)
  sw <- compact(c(cm, list(version = ver %||% cm$version,
                           programming_language = j$language)))
  if (length(sw)) {
    merge_metadata(ctx, sw, url = j$html_url, method = "github", format = "json",
                   mimetype = "application/json", schema = "https://codemeta.github.io")
  }

  # software FAIR signals (for the FRSM software metrics) from the repo file tree
  ctx$software <- harvest_software_signals(api, repo, branch, j, ver, cm, token, timeout)
  invisible()
}

#' Detect software FAIR signals from the repository file tree + API.
#' @noRd
harvest_software_signals <- function(api, repo, branch, j, ver, cm, token = "", timeout = 15) {
  tree <- tryCatch(
    github_json(sprintf("%s/git/trees/%s?recursive=1", api, branch), token, timeout)$tree,
    error = function(e) NULL)
  paths <- tolower(as_chr(lapply(tree %||% list(), function(t) t$path)))
  any_match <- function(re) any(grepl(re, paths, perl = TRUE))

  contributors <- tryCatch(
    length(github_json(paste0(api, "/contributors?per_page=100"), token, timeout) %||% list()),
    error = function(e) 0L)
  github_license_refs <- software_license_refs(list(
    jget(j, "license", "spdx_id"),
    jget(j, "license", "url")
  ))
  metadata_license_refs <- software_license_refs(cm$license)
  license_ids <- software_spdx_ids(c(github_license_refs, metadata_license_refs))
  metadata_license_ids <- software_spdx_ids(metadata_license_refs)
  spdx_licenses <- tolower(vapply(ref_data("spdx"), function(x) x$licenseId %||% "", character(1)))
  has_spdx_license <- any(tolower(license_ids) %in% spdx_licenses)
  doi_pat <- "10\\.\\d{4,9}/[^\\s\"'<>]+"
  registry_doi <- NULL
  for (v in c(cm$object_identifier, unlist(cm$related_resources))) {
    m <- regmatches(v, regexpr(doi_pat, v %||% "", perl = TRUE))
    if (length(m)) { registry_doi <- m[1]; break }
  }
  path_signals <- software_path_signals(paths, private = isTRUE(j$private))

  # content-based signals: CRediT contributor roles, and whether the software is
  # preserved on more than one archiving infrastructure (a DOI registry plus,
  # e.g., Software Heritage or CRAN). Read the metadata/readme from the branch.
  read_raw <- function(path) tryCatch({
    u <- sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s",
                 repo$owner, repo$name, branch, path)
    r <- content_negotiate(u, accept = "default", timeout = timeout)
    if (isTRUE(r$ok)) as_chr(r$content) else ""
  }, error = function(e) "")
  meta_text <- paste(read_raw("codemeta.json"), read_raw("CITATION.cff"),
                     read_raw("README.md"), collapse = " ")
  has_credit_roles <- grepl("rolename|credit\\.niso\\.org", meta_text, ignore.case = TRUE)
  has_multiple_archives <- is_nonempty_string(registry_doi) &&
    grepl("softwareheritage\\.org|swh:1:|cran\\.r-project\\.org/package",
          meta_text, ignore.case = TRUE)

  list(
    identifier = j$html_url,
    version = ver %||% cm$version,
    registry_doi = registry_doi,
    name = j$name, description = j$description,
    language = j$language, topics = j$topics %||% list(),
    contributors = contributors,
    archived = isTRUE(j$archived),
    has_license = !is.null(j$license) || any_match("^licen[sc]e"),
    has_spdx_license = has_spdx_license,
    has_metadata_spdx_license = any(tolower(metadata_license_ids) %in% spdx_licenses),
    has_readme = any_match("^readme"),
    has_citation = any_match("^citation\\.cff|^codemeta\\.json"),
    has_tests = any_match("(^|/)tests?(/|$)|(^|/)test_|_test\\.|\\.test\\."),
    has_ci = any_match("^\\.github/workflows/|^\\.travis|^\\.circleci|^azure-pipelines|^\\.gitlab-ci"),
    has_requirements = any_match("^(requirements.*\\.txt|setup\\.py|setup\\.cfg|pyproject\\.toml|package\\.json|description|environment\\.ya?ml|renv\\.lock|cargo\\.toml|go\\.mod|pom\\.xml|build\\.gradle)$"),
    has_docs = any_match("^docs?/|readthedocs|mkdocs\\.ya?ml"),
    has_coverage = any_match("codecov|coveralls|(^|/)test-coverage|\\.codecov|(^|/)covr(\\.|/)"),
    is_public = !isTRUE(j$private),
    has_issue_tracker = isTRUE(j$has_issues),
    has_api = path_signals$has_api,
    has_open_api = path_signals$has_open_api,
    has_machine_readable_api = path_signals$has_machine_readable_api,
    has_data_format_docs = path_signals$has_data_format_docs,
    has_open_data_formats = path_signals$has_open_data_formats,
    has_schema_reference = path_signals$has_schema_reference,
    has_bundled_license_info = any_match("(^|/)(notice|copyrights?|authors)(\\.[a-z0-9]+)?$|licen[sc]e\\.note$|(^|/)licen[sc]es?/|third[-_]?party"),
    has_credit_roles = has_credit_roles,
    has_multiple_archives = has_multiple_archives,
    has_provenance_metadata = any_match("ro-crate-metadata\\.json$|(^|/)ro-crate|\\.prov(\\.|$)|(^|/)provenance|(^|/)attestations?/|(^|/)slsa")
  )
}

#' Detect software API, data-format, and schema signals from repository paths.
#' @noRd
software_path_signals <- function(paths, private = FALSE) {
  paths <- tolower(as_chr(paths))
  any_match <- function(re) any(grepl(re, paths, perl = TRUE))
  has_interface_definition <- any_match("openapi|swagger|\\.proto$|graphql")
  has_open_data_format <- any_match("(^|/)(openapi|swagger).*\\.(ya?ml|json)$|jsonld|json-ld|rdf|rdfs|\\.ttl$|\\.turtle$|\\.csv$|\\.tsv$|\\.parquet$|\\.feather$|\\.hdf5?$|\\.nc$|\\.netcdf$|\\.xml$")
  has_schema_reference <- any_match("(^|/)(openapi|swagger).*\\.(ya?ml|json)$|json-schema|schema\\.json|\\.schema\\.json$|\\.xsd$|rdfs|\\.proto$|graphql")
  has_format_docs <- has_open_data_format || has_schema_reference ||
    any_match("as_fuji_json|as_rdf|jsonld|rdf|json-schema|schema\\.json")
  list(
    has_api = has_interface_definition,
    has_open_api = has_interface_definition && !isTRUE(private),
    has_machine_readable_api = has_interface_definition,
    has_data_format_docs = has_format_docs,
    has_open_data_formats = has_open_data_format,
    has_schema_reference = has_schema_reference
  )
}

#' Extract license reference strings from scalar or structured software metadata.
#' @noRd
software_license_refs <- function(x) {
  if (is.null(x)) return(character(0))
  if (is.list(x) && !is.data.frame(x)) {
    fields <- c(x[["@id"]], x$url, x$name, x$identifier, x$licenseId)
    if (length(fields)) return(as_chr(fields))
    return(as_chr(unlist(lapply(x, software_license_refs), use.names = FALSE)))
  }
  as_chr(x)
}

#' Normalize SPDX license references to SPDX identifiers.
#' @noRd
software_spdx_ids <- function(x) {
  refs <- unique(software_license_refs(x))
  refs <- refs[refs != "NOASSERTION"]
  unname(vapply(refs, function(ref) {
    if (grepl("^https?://([^/]+\\.)?spdx\\.org/licenses/", ref, ignore.case = TRUE)) {
      sub("\\.(html|json)$", "", sub(".*/", "", ref), ignore.case = TRUE)
    } else {
      ref
    }
  }, character(1)))
}

#' GET + parse a GitHub API JSON resource.
#' @noRd
github_json <- function(url, token = "", timeout = 15) {
  req <- httr2::request(url)
  req <- httr2::req_headers(req, Accept = "application/vnd.github+json")
  req <- httr2::req_user_agent(req, "rfair R package")
  req <- httr2::req_timeout(req, timeout)
  req <- httr2::req_error(req, is_error = function(resp) FALSE)
  if (nzchar(token)) req <- httr2::req_auth_bearer_token(req, token)
  resp <- httr2::req_perform(req)
  if (httr2::resp_status(resp) >= 400) return(NULL)
  httr2::resp_body_json(resp)
}

#' Harvest codemeta.json / CITATION.cff from a repo's default branch.
#' @noRd
github_software_files <- function(ctx, repo, branch, token = "", timeout = 15) {
  raw <- function(path) sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s",
                                repo$owner, repo$name, branch, path)
  out <- list()
  # codemeta.json
  cm <- tryCatch({
    r <- content_negotiate(raw("codemeta.json"), accept = "json", timeout = timeout)
    if (isTRUE(r$ok)) jsonlite::fromJSON(r$content, simplifyVector = FALSE) else NULL
  }, error = function(e) NULL)
  if (!is.null(cm)) {
    out$title <- cm$name
    out$summary <- cm$description
    out$version <- cm$version %||% cm$softwareVersion
    out$object_identifier <- cm$identifier %||% cm$codeRepository
    lic <- cm$license
    if (is.list(lic)) lic <- lic[["@id"]] %||% lic$url %||% lic$name
    if (!is.null(lic)) out$license <- as_chr(lic)
    if (!is.null(cm$keywords)) out$keywords <- as.list(as_chr(cm$keywords))
    ctx$metadata_sources[[length(ctx$metadata_sources) + 1L]] <-
      list(source = "codemeta", method = "content_negotiation")
  }
  # CITATION.cff (YAML) -- version + doi
  cff <- tryCatch({
    r <- content_negotiate(raw("CITATION.cff"), accept = "default", timeout = timeout)
    if (isTRUE(r$ok)) yaml::yaml.load(r$content) else NULL
  }, error = function(e) NULL)
  if (is.list(cff)) {
    out$version <- out$version %||% cff$version
    if (!is.null(cff$doi)) out$related_resources <- list(list(
      related_resource = paste0("https://doi.org/", cff$doi), relation_type = "isIdenticalTo"))
  }
  compact(out)
}

Any scripts or data that you put into this service are public.

rfair documentation built on July 1, 2026, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rfair
Assess the FAIRness of Research Data Objects and Software

R/collect_github.R
In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions github_software_files github_json software_spdx_ids software_license_refs software_path_signals harvest_software_signals collect_github github_repo_of

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair Assess the FAIRness of Research Data Objects and Software

R/collect_github.R In rfair: Assess the FAIRness of Research Data Objects and Software

Defines functions github_software_files github_json software_spdx_ids software_license_refs software_path_signals harvest_software_signals collect_github github_repo_of

Try the rfair package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfair
Assess the FAIRness of Research Data Objects and Software

R/collect_github.R
In rfair: Assess the FAIRness of Research Data Objects and Software