R/oa2df.R

Defines functions snowball2df topics2df publishers2df sources2df funders2df concepts2df institutions2df authors2df abstract_build works2df oa2df

Documented in authors2df concepts2df funders2df institutions2df oa2df publishers2df snowball2df sources2df topics2df works2df

#' Convert OpenAlex collection from list to data frame
#'
#' It converts bibliographic collections gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a collection of records about works, authors, institutions, venues or concepts obtained using \code{oa_request} into a data frame/tibble.
#'
#' @param data List. Output of \code{oa_request}.
#' @param entity Character. Scholarly entity of the search.
#' The argument can be one of
#' c("works", "authors", "institutions", "concepts", "funders", "sources", "publishers", "topics").
#' @param abstract Logical. If TRUE, the function returns also the abstract of each item.
#' Ignored if entity is different from "works". Defaults to TRUE.
#' @param verbose Logical.
#' If TRUE, print information about the dataframe conversion process.
#' Defaults to TRUE.
#'
#' @inheritParams oa_query
#' @inheritParams oa_request
#' @return A tibble/dataframe result of the original OpenAlex result list.
#'
#' @examples
#' \dontrun{
#'
#' # Query to search all works citing the article:
#' #  Aria, M., & Cuccurullo, C. (2017). bibliometrix:
#' #   An R-tool for comprehensive science mapping analysis.
#' #   Journal of informetrics, 11(4), 959-975.
#'
#' #  published in 2021.
#' #  The paper is associated to the OpenAlex id W2755950973.
#'
#' #  Results have to be sorted by relevance score in a descending order.
#'
#' query <- oa_query(
#'   entity = "works",
#'   cites = "W2755950973",
#'   from_publication_date = "2021-01-01",
#'   to_publication_date = "2021-04-30"
#' )
#'
#' res <- oa_request(
#'   query_url = query,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' oa2df(res, entity = "works")
#' }
#'
#' @export
oa2df <- function(data, entity, options = NULL, count_only = FALSE, group_by = NULL, abstract = TRUE, verbose = TRUE) {
  if (length(data) == 0) {
    return(NULL)
  }

  if (!is.null(group_by)) {
    return(do.call(rbind.data.frame, data))
  }

  if (count_only && length(data) < 8) { # assuming less than 8 fields in output$meta
    return(unlist(data))
  }

  if (entity != "snowball") {
    ch <- ifelse(is.null(options$select), "id", options$select[[1]])
    if (!is.null(data[[ch]])) {
      data <- list(data)
    }
  }

  switch(entity,
    works = works2df(data, abstract, verbose),
    authors = authors2df(data, verbose),
    institutions = institutions2df(data, verbose),
    concepts = concepts2df(data, verbose),
    funders = funders2df(data, verbose),
    sources = sources2df(data, verbose),
    publishers = publishers2df(data, verbose),
    topics = topics2df(data, verbose),
    snowball = snowball2df(data)
  )
}


#' Convert OpenAlex collection of works from list format to data frame
#'
#' It converts bibliographic collection of works gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of works obtained using \code{oa_request} into a data frame/tibble.
#'
#' @param abstract Logical. If TRUE, the function returns also the abstract of each item.
#' Defaults to TRUE.
#' @param pb Progress bar object. If verbose, computed from `oa_progress`.
#' NULL otherwise.
#' @inheritParams oa2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#' @importFrom stats setNames
#'
#' @examples
#' \dontrun{
#'
#' # Query to search all works citing the article:
#' #  Aria, M., & Cuccurullo, C. (2017). bibliometrix:
#' #   An R-tool for comprehensive science mapping analysis.
#' #   Journal of informetrics, 11(4), 959-975.
#'
#' #  published in 2021.
#' #  The paper is associated to the OpenAlex id W2755950973.
#'
#' #  Results have to be sorted by relevance score in a descending order.
#'
#' query <- oa_query(
#'   identifier = NULL,
#'   entity = "works",
#'   cites = "W2755950973",
#'   from_publication_date = "2021-01-01",
#'   to_publication_date = "2021-12-31",
#'   search = NULL,
#'   endpoint = "https://api.openalex.org"
#' )
#'
#' res <- oa_request(
#'   query_url = query,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' df <- oa2df(res, entity = "works")
#'
#' df
#' }
#'
#' @export
#'
works2df <- function(data, abstract = TRUE, verbose = TRUE,
                     pb = if (verbose) oa_progress(length(data)) else NULL) {
  col_order <- c(
    "id", "title", "display_name", "author", "ab", "publication_date", "relevance_score",
    "so", "so_id", "host_organization", "issn_l", "url", "pdf_url",
    "license", "version", "first_page", "last_page", "volume", "issue", "is_oa",
    "is_oa_anywhere", "oa_status", "oa_url", "any_repository_has_fulltext",
    "language", "grants", "cited_by_count", "counts_by_year",
    "publication_year", "cited_by_api_url", "ids", "doi", "type",
    "referenced_works", "related_works", "is_paratext", "is_retracted",
    "concepts", "topics"
  )
  works_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "display_name",
    "identical", "title",
    "identical", "publication_date",
    "identical", "doi",
    "identical", "type",
    "identical", "cited_by_count",
    "identical", "publication_year",
    "identical", "cited_by_api_url",
    "identical", "is_paratext",
    "identical", "is_retracted",
    "identical", "relevance_score",
    "identical", "language",
    "flat", "grants",
    "flat", "referenced_works",
    "flat", "related_works",
    "rbind_df", "counts_by_year",
    "rbind_df", "concepts",
    "flat", "ids"
  )

  venue_cols <- c(
    url = "landing_page_url",
    pdf_url = "pdf_url",
    is_oa = "is_oa",
    license = "license",
    version = "version"
  )
  so_cols <- c(
    so_id = "id",
    so = "display_name",
    issn_l = "issn_l",
    host_organization = "host_organization_name"
  )
  inst_cols <- c("id", "display_name", "ror", "country_code", "type", "lineage")
  empty_inst <- empty_list(inst_cols)

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    paper <- data[[i]]

    fields <- works_process[works_process$field %in% names(paper), ]
    sim_fields <- mapply(
      function(x, y) subs_na(paper[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )

    author <- venue <- ab <- NULL

    if (!is.null(paper$primary_location)) {
      so_info <- paper$primary_location["source"]
      so_info <- if (length(so_info[[1]]) == 0) NA else so_info[[1]]
      venue_info <- replace_w_na(paper$primary_location[venue_cols])
      venue <- setNames(
        c(venue_info, so_info[so_cols]),
        c(names(venue_cols), names(so_cols))
      )
    }

    # authorships and affilitation
    if (!is.null(paper$authorships)) {
      author <- subs_na(
        lapply(paper$authorships, function(l) {
          l_inst <- l$institutions
          inst_idx <- lengths(l_inst) > 0
          if (length(inst_idx) > 0 && any(inst_idx)) {
            first_inst <- l_inst[inst_idx][[1]]
            first_inst$lineage <- paste(first_inst$lineage, collapse = ", ")
          } else {
            first_inst <- empty_inst
          }
          first_inst <- prepend(first_inst, "institution")
          aff_raw <- list(
            au_affiliation_raw =
              if (length(l$raw_affiliation_strings)) {
                l$raw_affiliation_strings[[1]]
              } else {
                NA_character_
              }
          )
          l_author <- if (length(l$author) > 0) {
            prepend(replace_w_na(l$author), "au")
          } else {
            empty_list(c("au_id", "au_display_name", "au_orcid"))
          }
          c(l_author, l[c("author_position", "is_corresponding")], aff_raw, first_inst)
        }), "rbind_df"
      )
    }

    # Abstract
    if (!is.null(paper$abstract_inverted_index) && abstract) {
      ab <- abstract_build(paper$abstract_inverted_index)
    }
    paper_biblio <- replace_w_na(paper$biblio)
    open_access <- replace_w_na(paper$open_access)
    if (length(open_access) > 0) {
      names(open_access)[[1]] <- "is_oa_anywhere"
    }

    topics <- process_topics(paper, "score")
    out_ls <- c(
      sim_fields, venue, open_access, paper_biblio,
      list(author = author, ab = ab), topics
    )
    out_ls[sapply(out_ls, is.null)] <- NULL
    list_df[[i]] <- out_ls
  }

  out_df <- rbind_oa_ls(list_df)
  out_df[, intersect(col_order, names(out_df))]
}

abstract_build <- function(ab) {
  if (is.null(ab)) {
    return(NA)
  }
  w <- rep(names(ab), lengths(ab))
  ind <- unlist(ab)
  if (is.null(ind)) {
    return("")
  }

  paste(w[order(ind)], collapse = " ", sep = "")
}


#' Convert OpenAlex collection of authors' records from list format to data frame
#'
#' It converts bibliographic collection of authors' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of authors' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Query to search information about all authors affiliated to the University of Naples Federico II
#' # which have authored at least 100 publications:
#'
#' # University of Naples Federico II is associated to the OpenAlex id I71267560.
#'
#'
#' query_author <- oa_query(
#'   identifier = NULL,
#'   entity = "authors",
#'   last_known_institutions.id = "I71267560",
#'   works_count = ">500"
#' )
#'
#' res <- oa_request(
#'   query_url = query_author,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' df <- oa2df(res, entity = "authors")
#'
#' df
#' }
#'
#' @export
authors2df <- function(data, verbose = TRUE,
                       pb = if (verbose) oa_progress(length(data)) else NULL) {
  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  inst_cols <- c("id", "display_name", "ror", "country_code", "type", "lineage")
  empty_inst <- empty_list(inst_cols)
  empty_inst$affiliations_other <- list(NULL)

  author_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "works_count",
    "identical", "display_name",
    "identical", "orcid",
    "identical", "works_api_url",
    "identical", "cited_by_count",
    "identical", "relevance_score",
    "flat", "display_name_alternatives",
    "rbind_df", "counts_by_year",
    "flat", "ids"
  )

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]

    fields <- author_process[author_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    sub_affiliation <- item$last_known_institutions
    if (!is.null(sub_affiliation) && length(sub_affiliation)) {
      sub_affiliation <- sub_affiliation[[1]]
      if (is.na(sub_affiliation[[1]])) {
        sub_affiliation <- empty_inst
      }
      sub_affiliation$lineage <- paste(sub_affiliation$lineage, collapse = ", ")
      sub_affiliation <- prepend(sub_affiliation, "affiliation")
    }
    sub_affiliation <- replace_w_na(sub_affiliation)

    if (!is.null(item$affiliations)) {
      affiliations_other <- sapply(item$affiliations, function(x) x$institution$id)
      if (!is.null(sub_affiliation$affiliation_id)) {
        affiliations_other <- affiliations_other[affiliations_other != sub_affiliation$affiliation_id]
      }
      sub_affiliation$affiliations_other <- list(affiliations_other)
    }
    topics <- process_topics(item, "count")
    list_df[[i]] <- c(sim_fields, sub_affiliation, topics)
  }

  col_order <- c(
    "id", "display_name", "display_name_alternatives", "relevance_score",
    "ids", "orcid", "works_count", "cited_by_count", "counts_by_year",
    "affiliation_display_name", "affiliation_id", "affiliation_ror",
    "affiliation_country_code", "affiliation_type", "affiliation_lineage",
    "affiliations_other",
    "topics", "works_api_url"
  )

  out_df <- rbind_oa_ls(list_df)
  out_df[, intersect(col_order, names(out_df))]
}


#' Convert OpenAlex collection of institutions' records from list format to data frame
#'
#' It converts bibliographic collection of institutions' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of institutions' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Query to search information about all Italian educational institutions
#'
#' query_inst <- oa_query(
#'   entity = "institutions",
#'   country_code = "it",
#'   type = "education"
#' )
#'
#' res <- oa_request(
#'   query_url = query_inst,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' oa2df(res, entity = "institutions")
#' }
#'
#' @export
institutions2df <- function(data, verbose = TRUE,
                            pb = if (verbose) oa_progress(length(data)) else NULL) {
  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  institution_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "ror",
    "identical", "works_api_url",
    "identical", "type",
    "identical", "works_count",
    "identical", "display_name",
    "identical", "country_code",
    "identical", "homepage_url",
    "identical", "image_url",
    "identical", "image_thumbnail_url",
    "identical", "cited_by_count",
    "identical", "updated_date",
    "identical", "created_date",
    "identical", "relevance_score",
    "flat", "display_name_alternatives",
    "flat", "display_name_acronyms",
    "row_df", "geo",
    "rbind_df", "counts_by_year",
    "rbind_df", "associated_institutions",
    "flat", "ids"
  )

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- institution_process[institution_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    interna <- NULL
    if (!is.null(item$international)) {
      interna <- list(
        display_name_international = subs_na(
          item$international$display_name,
          type = "flat"
        )
      )
    }
    topics <- process_topics(item, "count")
    list_df[[i]] <- c(sim_fields, interna, topics)
  }


  col_order <- c(
    "id", "display_name", "display_name_alternatives", "display_name_acronyms",
    "display_name_international", "ror", "ids", "country_code", "geo", "type",
    "homepage_url", "image_url", "image_thumbnail_url",
    "associated_institutions", "relevance_score", "works_count",
    "cited_by_count", "counts_by_year",
    "works_api_url", "topics", "updated_date", "created_date"
  )

  out_df <- rbind_oa_ls(list_df)
  out_df[, intersect(col_order, names(out_df))]
}


#' Convert OpenAlex collection of concepts' records from list format to data frame
#'
#' It converts bibliographic collection of concepts' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of concepts' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Query to search information about all Italian educational institutions
#'
#'
#' query_inst <- oa_query(
#'   entity = "concepts",
#'   display_name.search = "electrodynamics"
#' )
#'
#' res <- oa_request(
#'   query_url = query_inst,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' df <- oa2df(res, entity = "concepts")
#'
#' df
#' }
#'
#' @export
concepts2df <- function(data, verbose = TRUE,
                        pb = if (verbose) oa_progress(length(data)) else NULL) {
  concept_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "display_name",
    "identical", "wikidata",
    "identical", "level",
    "identical", "description",
    "identical", "image_url",
    "identical", "image_thumbnail_url",
    "identical", "works_count",
    "identical", "cited_by_count",
    "identical", "works_api_url",
    "identical", "relevance_score",
    "rbind_df", "counts_by_year",
    "rbind_df", "ancestors",
    "rbind_df", "related_concepts",
    "flat", "ids"
  )

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- concept_process[concept_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )

    intern_fields <- NULL
    if (!is.null(item$international)) {
      intern_fields <- lapply(
        item$international[c("display_name", "description")],
        subs_na,
        type = "flat"
      )
      names(intern_fields) <- paste(names(intern_fields), "international", sep = "_")
    }

    list_df[[i]] <- c(sim_fields, intern_fields)
  }

  col_order <- c(
    "id", "display_name", "display_name_international", "description",
    "description_international", "wikidata", "level", "ids",
    "image_url", "image_thumbnail_url", "ancestors",
    "related_concepts", "relevance_score", "works_count",
    "cited_by_count", "counts_by_year",
    "works_api_url"
  )

  out_df <- rbind_oa_ls(list_df)
  out_df[, intersect(col_order, names(out_df))]
}


#' Convert OpenAlex collection of funders' records from list format to data frame
#'
#' It converts bibliographic collection of funders' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of funders' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Get funders located in Canada with more than 100,000 citations
#'
#' res <- oa_request(
#'   "https://api.openalex.org/funders?filter=country_code:ca,cited_by_count:>100000"
#' )
#'
#' df <- oa2df(res, entity = "funders")
#'
#' df
#' }
#'
#' @export
funders2df <- function(data, verbose = TRUE,
                       pb = if (verbose) oa_progress(length(data)) else NULL) {
  funder_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "display_name",
    "flat", "alternate_titles",
    "identical", "country_code",
    "identical", "description",
    "identical", "homepage_url",
    "identical", "image_url",
    "identical", "image_thumbnail_url",
    "identical", "grants_count",
    "identical", "works_count",
    "identical", "cited_by_count",
    "flat", "summary_stats",
    "flat", "ids",
    "rbind_df", "counts_by_year",
    "rbind_df", "roles",
    "identical", "updated_date",
    "identical", "created_date"
  )

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- funder_process[funder_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    list_df[[i]] <- sim_fields
  }

  out_df <- rbind_oa_ls(list_df)
  out_df
}



#' Convert OpenAlex collection of sources' records from list format to data frame
#'
#' It converts bibliographic collection of sources' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of sources' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Get sources from Nature
#'
#' res <- oa_request(
#'   "https://api.openalex.org/sources?search=nature"
#' )
#'
#' df <- oa2df(res, entity = "sources")
#'
#' df
#' }
#'
#' @export
sources2df <- function(data, verbose = TRUE,
                       pb = if (verbose) oa_progress(length(data)) else NULL) {
  source_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "issn_l",
    "flat", "issn",
    "identical", "display_name",
    "identical", "host_organization",
    "identical", "host_organization_name",
    "flat", "host_organization_lineage",
    "identical", "relevance_score",
    "identical", "works_count",
    "identical", "cited_by_count",
    "flat", "summary_stats",
    "identical", "is_oa",
    "identical", "is_in_doaj",
    "flat", "ids",
    "identical", "homepage_url",
    "rbind_df", "apc_prices",
    "identical", "apc_usd",
    "identical", "country_code",
    "flat", "societies",
    "flat", "alternate_titles",
    "identical", "abbreviated_title",
    "identical", "type",
    "rbind_df", "counts_by_year",
    "identical", "works_api_url",
    "identical", "updated_date",
    "identical", "created_date"
  )

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- source_process[source_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    topics <- process_topics(item, "count")
    list_df[[i]] <- c(sim_fields, topics)
  }

  out_df <- rbind_oa_ls(list_df)
  out_df
}



#' Convert OpenAlex collection of publishers' records from list format to data frame
#'
#' It converts bibliographic collection of publishers' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
#' The function converts a list of publishers' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Get publishers located in Canada with more than 100,000 citations
#'
#' res <- oa_request(
#'   "https://api.openalex.org/publishers?filter=country_codes:ca"
#' )
#'
#' df <- oa2df(res, entity = "publishers")
#'
#' df
#' }
#'
#' @export
publishers2df <- function(data, verbose = TRUE,
                          pb = if (verbose) oa_progress(length(data)) else NULL) {
  publisher_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "display_name",
    "flat", "alternate_titles",
    "identical", "hierarchy_level",
    "flat", "parent_publisher",
    "flat", "lineage",
    "identical", "country_codes",
    "identical", "homepage_url",
    "identical", "image_url",
    "identical", "image_thumbnail_url",
    "identical", "works_count",
    "identical", "cited_by_count",
    "flat", "summary_stats",
    "flat", "ids",
    "rbind_df", "counts_by_year",
    "rbind_df", "roles",
    "identical", "sources_api_url",
    "identical", "updated_date",
    "identical", "created_date"
  )

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- publisher_process[publisher_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    list_df[[i]] <- sim_fields
  }

  out_df <- rbind_oa_ls(list_df)
  out_df
}


#' Convert OpenAlex collection of topics' records from list format to data frame
#'
#' It converts collection of topics' records gathered from the OpenAlex database.
#' The function converts a list of topics' records obtained using \code{oa_request} into a data frame/tibble.
#'
#' @inheritParams works2df
#'
#' @return a data.frame.
#'
#' For more extensive information about OpenAlex API, please visit: <https://docs.openalex.org>
#'
#'
#' @examples
#' \dontrun{
#'
#' # Query to search information about all Italian educational institutions
#'
#'
#' query_inst <- oa_query(
#'   entity = "topics",
#'   display_name.search = "electrodynamics"
#' )
#'
#' res <- oa_request(
#'   query_url = query_inst,
#'   count_only = FALSE,
#'   verbose = FALSE
#' )
#'
#' df <- oa2df(res, entity = "topics")
#'
#' df
#' }
#'
#' @export
topics2df <- function(data, verbose = TRUE,
                        pb = if (verbose) oa_progress(length(data)) else NULL) {
  topic_process <- tibble::tribble(
    ~type, ~field,
    "identical", "id",
    "identical", "display_name",
    "identical", "description",
    "flat", "ids",
    "identical", "relevance_score",
    "identical", "works_count",
    "identical", "cited_by_count",
    "identical", "updated_date",
    "identical", "created_date",
    "rbind_df", "siblings",
    "flat", "keywords"
  )

  n <- length(data)
  list_df <- vector(mode = "list", length = n)

  for (i in seq.int(n)) {
    if (verbose) pb$tick()

    item <- data[[i]]
    fields <- topic_process[topic_process$field %in% names(item), ]
    sim_fields <- mapply(
      function(x, y) subs_na(item[[x]], type = y),
      fields$field,
      fields$type,
      SIMPLIFY = FALSE
    )
    domains <- unlist(item[c("subfield", "field", "domain")], recursive = FALSE)
    domains <- as.data.frame(do.call(cbind, domains))
    names(domains) <- gsub("\\.", "_", names(domains))
    list_df[[i]] <- c(sim_fields, domains)
  }

  col_order <- c(
    "id", "display_name", "description", "keywords", "ids",
    "subfield_id", "subfield_display_name", "field_id", "field_display_name",
    "domain_id", "domain_display_name", "siblings", "relevance_score",
    "works_count", "cited_by_count", "updated_date", "created_date"
  )

  out_df <- rbind_oa_ls(list_df)
  out_df[, intersect(col_order, names(out_df))]
}



#' Flatten snowball result
#'
#' |  id|title |...|cited_by_count| referenced_works   |cited_by |...|
#' | 100|foo   |...|             1| 98, 99             |101      |...|
#' | 200|bar   |...|             2| 198, 199           |201, 202 |...|
#' | 300|wug   |...|             2| 296, 297, 298, 299 |301, 302 |...|
#'
#' @param data List result from `oa_snowball`.
#' @param verbose Logical. If TRUE, print information on wrangling process.
#'
#' @return Tibble/data.frame of works with additional columns:
#' append `citing`, `backward_count`, `cited_by`, `forward_count`, `connection`,
#' and `connection_count.` For each work/row, these counts are WITHIN one
#' data search, and so `forward_count` <= `cited_by_count`.
#'
#' Consider the universe of all works linked to a set of starting works, (`oa_input = TRUE`)
#' for each work/row i:
#' - citing: works in the universe that i cites
#' - backward_count: number of works in the universe that i cites
#' - cited_by: works that i is cited by
#' - forward_count: number of works in the universe that i is cited by
#' - connection: works in the universe linked to i
#' - connection_count: number of works in the universe linked to i (degree of i)
#'
#' @export
#'
#' @examples
#' \dontrun{
#' flat_snow <- snowball2df(oa_snowball(
#'   identifier = "W1516819724",
#'   verbose = TRUE
#' ))
#'
#' flat_snow[, c("id", "connection", "connection_count")]
#' }
snowball2df <- function(data, verbose = FALSE) {
  nodes <- data$nodes
  ids <- nodes$id[nodes$oa_input]
  edges_df <- data$edges

  citing <- do.call(rbind.data.frame, by(
    edges_df, list(edges_df$from),
    function(x) {
      list(
        id = unique(x$from),
        citing = paste(x$to, collapse = ";"),
        backward_count = nrow(x)
      )
    }
  ))

  cited_by <- do.call(rbind.data.frame, by(
    edges_df, list(edges_df$to),
    function(x) {
      list(
        id = unique(x$to),
        cited_by = paste(x$from, collapse = ";"),
        forward_count = nrow(x)
      )
    }
  ))

  if (verbose) message("Appending new columns...")

  nodes_augmented <- merge(
    merge(nodes, citing, all.x = TRUE),
    cited_by,
    all.x = TRUE
  )

  nodes_augmented$connection <- apply(
    nodes_augmented[, c("citing", "cited_by")], 1,
    function(x) paste(x[!is.na(x)], collapse = ";")
  )

  nodes_augmented[is.na(nodes_augmented$backward_count), "backward_count"] <- 0
  nodes_augmented[is.na(nodes_augmented$forward_count), "forward_count"] <- 0
  nodes_augmented$connection_count <-
    nodes_augmented$backward_count + nodes_augmented$forward_count

  nodes_augmented
}

Try the openalexR package in your browser

Any scripts or data that you put into this service are public.

openalexR documentation built on Sept. 11, 2024, 8:40 p.m.