R/oa_ngrams.R
In openalexR: Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

Documented in ngram2df oa_ngrams

#' Convert a "Work" entity's ngram data from list to data frame
#'
#' @keywords internal
ngram2df <- function(ngram) {
  if (is.null(ngram$meta$doi)) ngram$meta$doi <- NA_character_
  ngram_df <- as.data.frame.list(rev(ngram$meta), col.names = c("id", "doi", "count"))
  ngram_df$ngrams <- if (length(ngram$ngrams) == 0) {
    list(NULL)
  } else {
    list(ngram$ngrams)
  }
  ngram_df
}

#' Get N-grams of works
#'
#' Some work entities in OpenAlex include N-grams (word sequences and their frequencies) of their full text.
#' The N-grams are obtained from Internet Archive, which uses the spaCy parser to index scholarly works.
#' See <https://docs.openalex.org/api-entities/works/get-n-grams> for coverage and more technical details.
#'
#' @note A faster implementation is available for `curl` >= v5.0.0, and `oa_ngrams` will issue a one-time message
#' about this. This can be suppressed with `options("oa_ngrams.message.curlv5" = FALSE)`.
#'
#' @param works_identifier Character. OpenAlex ID(s) of "works" entities as item identifier(s).
#' These IDs start with "W". See more at <https://docs.openalex.org/api-entities/works#id>.
#' @param ... Unused.
#' @inheritParams oa_query
#'
#' @return A dataframe of paper metadatada and a list-column of ngrams.
#' @export
#'
#' @examples
#' \dontrun{
#'
#' ngrams_data <- oa_ngrams(c("W1963991285", "W1964141474"))
#'
#' # 10 most common ngrams in the first work
#' first_paper_ngrams <- ngrams_data$ngrams[[1]]
#' first_paper_ngrams[
#'   order(first_paper_ngrams$ngram_count, decreasing = TRUE),
#' ][
#'   1:10,
#' ]
#'
#' # Missing N-grams are `NULL` in the `ngrams` list-column
#' oa_ngrams("https://openalex.org/W2284876136")
#' }
oa_ngrams <- function(works_identifier, ...,
                      endpoint = "https://api.openalex.org",
                      verbose = FALSE) {

  ngrams_failed_template <- data.frame(id = NA, doi = NA, count = NA, ngrams = I(list(NULL)))

  out <- tryCatch(
    {
      # Check if input is ID of Works entity
      normalized_id <- shorten_oaid(works_identifier)
      if (!all(grepl("^W", normalized_id))) {
        stop(
          "Invalid OpenAlex Work entity ID(s) at `works_identifier`: ",
          paste(which(!grepl("^W", normalized_id)), sep = ", ")
        )
      }

      # Setup for querying
      query_urls <- paste(endpoint, "works", normalized_id, "ngrams", sep = "/")
      n <- length(query_urls)

      #ngrams_failed_template <- data.frame(id = NA, doi = NA, count = NA, ngrams = I(list(NULL)))
      if (verbose) {
        pb <- oa_progress(n)
      }

      # Fetch
      if (utils::packageVersion("curl") >= "5") {
        # Parallel fetch
        ngrams_files <- asNamespace("curl")$multi_download(query_urls, file.path(tempdir(), normalized_id), progress = verbose)
        ngrams_success <- ifelse(ngrams_files$success, ngrams_files$destfile, NA)
        # Convert
        ngrams_dfs <- lapply(ngrams_success, function(x) {
          if (verbose) pb$tick()
          if (is.na(x)) {
            ngrams_failed_template
          } else {
            ngram2df(jsonlite::fromJSON(x))
          }
        })
        file.remove(ngrams_files$destfile)
      } else {
        # One-time message
        if (getOption("oa_ngrams.message.curlv5", TRUE)) {
          message("Use `{curl}` >= v5.0.0 for a faster implementation of `oa_ngrams`")
          options("oa_ngrams.message.curlv5" = FALSE)
        }
        # Serial fetch
        if (verbose) {
          pb_dl <- oa_progress(n, "OpenAlex downloading")
        }

        ngrams_list <- vector("list", n)
        for (i in seq_len(n)) {
          if (verbose) pb_dl$tick()
          ngrams_list[[i]] <- tryCatch(
            jsonlite::fromJSON(query_urls[i]),
            error = function(...) ngrams_failed_template
          )
        }
        # Convert
        ngrams_dfs <- vector("list", n)
        for (i in seq_len(n)) {
          if (verbose) pb$tick()
          ngrams_dfs[[i]] <- ngram2df(ngrams_list[[i]])
        }
      }

      tibble::as_tibble(do.call(rbind.data.frame, ngrams_dfs))
    },
    error=function(cond) {
      message("ngrams not available for this work")
     # message(cond)
      # Choose a return value in case of error
      return(ngrams_failed_template)
    }
  )
  return(out)
}

Any scripts or data that you put into this service are public.

openalexR documentation built on June 8, 2025, 12:47 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

openalexR
Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

R/oa_ngrams.R
In openalexR: Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

Defines functions oa_ngrams ngram2df

Documented in ngram2df oa_ngrams

Try the openalexR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

openalexR Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

R/oa_ngrams.R In openalexR: Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

Defines functions oa_ngrams ngram2df

Documented in ngram2df oa_ngrams

Try the openalexR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

openalexR
Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API

R/oa_ngrams.R
In openalexR: Getting Bibliographic Records from 'OpenAlex' Database Using 'DSL' API