gbfetch: Fetch sequences from GenBank into R

Documented in fetch_metadata fetch_metadata_chunked

# Helper functions ----

# Extract name of selected rank from taxonomic classification dataframe
name_from_classification <- function (taxonomy, rank_select) {
  tibble::as_tibble(taxonomy) %>%
    dplyr::filter(rank == rank_select) %>%
    dplyr::pull(name)
}

#' Fetch metadata for a GenBank sequence
#'
#' Metadata is obtained in two steps, first with rentrez::entrez_summary(),
#' then with taxize::classification() to get the species name.
#'
#' To see other possible values to use for `col_select`, run
#' rentrez::entrez_summary() with `db = nucleotide` for a valid ID,
#' e.g. rentrez::entrez_summary(db = "nucleotide", id = "383212727")
#'
#' @param id Character vector of IDs for GenBank records.
#' Output of rentrez::entrez_search().
#' @param col_select Character vector of metadata columns to include
#' in results. Must include at least `taxid` so that species
#' can be included. Default values include:
#' \itemize{
#'   \item{gi}{Genbank GI number}
#'   \item{caption}{Genbank accession number}
#'   \item{taxid}{Taxon ID (can use to query with taxize)}
#'   \item{title}{Sequence title}
#'   \item{slen}{Sequence length}
#'   \item{subname}{Misc. data (specimen, collection country, etc), separated
#'   by |}
#'   \item{subtype}{Column names of misc. data, separated by |}
#' }
#' @param higher_taxa Logical; should higher taxonomic ranks (family and order)
#' be included in the results?
#' @param .pb Internal agument used for setting the progress bar; don't change this.
#'
#' @return Tibble
#'
#' @examples
#' \dontrun{
#' fetch_metadata_from_id("383212727", higher_taxa = TRUE)
#' fetch_metadata_from_id(c("383212727", "383212725"))
#' }
fetch_metadata_from_id <- function (
  id,
  col_select = c("gi", "caption", "taxid", "title", "slen", "subtype", "subname"),
  higher_taxa = FALSE,
  .pb = NULL) {

  # Check input
  assertthat::assert_that(is.character(id))
  assertthat::assert_that(is.character(col_select))
  assertthat::assert_that("taxid" %in% col_select)

  # Set progress bar when this function is looped
  if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) .pb$tick()$print()

  # Fetch metadata and extract selected columns
  if (length(id) == 1) {
    rentrez_results <- rentrez::entrez_summary(db = "nucleotide", id = id) %>%
      magrittr::extract(col_select) %>%
      tibble::as_tibble()
  } else {
    rentrez_results <- rentrez::entrez_summary(db = "nucleotide", id = id) %>%
      purrr::map_dfr(magrittr::extract, col_select)
  }

  # The metadata from rentrez::entrez_summary() don't include proper taxonomy.
  # There is a taxid field though, which we can use to lookup taxonomy with
  # taxize.
  results <-
    dplyr::mutate(
      rentrez_results,
      taxonomy = purrr::map(taxid, ~taxize::classification(., db = "ncbi") %>% magrittr::extract2(1)),
      species = purrr::map_chr(taxonomy, ~name_from_classification(., rank_select = "species"))
    )

  if (isTRUE(higher_taxa)) {
    results <-
      dplyr::mutate(
        results,
        family = purrr::map_chr(taxonomy, ~name_from_classification(., rank_select = "family")),
        order = purrr::map_chr(taxonomy, ~name_from_classification(., rank_select = "order"))
      )
  }

  dplyr::select(results, -taxonomy)

}

#' Fetch metadata from GenBank sequences in chunks
#'
#' fetch_metadata() queries the NCBI API online, so we should do this
#' in smaller chunks to avoid errors.
#'
#' @param id Character vector of IDs for GenBank records.
#' Output of rentrez::entrez_search().
#' @param col_select Vector of column names to return.
#' @param chunk_size Number of rows to use for each chunk.
#' @param higher_taxa Logical; should higher taxonomic ranks (family and order)
#' be included in the results?
#'
#' @return A list of two items, `results` and `error`.
#'
#' @examples
#' \dontrun{
#' fetch_metadata_chunked(c("383212727", "383212725"))
#' }
fetch_metadata_chunked <- function(
  id,
  col_select = c("gi", "caption", "taxid", "title", "slen", "subtype", "subname"),
  higher_taxa = FALSE,
  chunk_size = 100) {

  # Check input
  assertthat::assert_that(is.character(id))
  assertthat::assert_that(is.character(col_select))
  assertthat::assert_that("taxid" %in% col_select)

  # Split input vector into chunks
  n <- length(id)
  r <- rep(1:ceiling(n/chunk_size), each=chunk_size)[1:n]
  id_list <- split(id, r) %>% magrittr::set_names(NULL)

  # Set progress bar
  pb <- dplyr::progress_estimated(length(id_list))

  fetch_metadata_from_id_safely <- purrr::safely(fetch_metadata_from_id)

  # Loop over vector chunks
  purrr::map(
    id_list,
    ~fetch_metadata_from_id_safely(
      id = .,
      higher_taxa = higher_taxa,
      .pb = pb),
    )

}

# Main function ----

#' Fetch metadata from GenBank
#'
#' Sequences downloaded from GenBank with \code{\link{fetch_sequences}} only include
#' the title and/or accession number. Use \code{fetch_metadata} to obtain other
#' useful metadata associated with the sequences.
#'
#'  \code{\link[rentrez]{entrez_search}} is used to obtain a vector of IDs from the
#' `query`, then \code{\link[rentrez]{entrez_summary}} is used to download metadata
#' from the IDs. However, \code{\link[rentrez]{entrez_summary}} will fail if too many
#' IDs are used as input (more than 200-300 or so). Therefore, \code{fetch_metadata}
#' splits the IDs into chunks (a list of vectors), and loops over the list.
#'
#' Sometimes errors are encountered during the loop due to the API rejecting
#' the request, internet connectivity, etc. To avoid this, the loop repeats
#' until it finishes or the number of repeats reaches `max_tries`, upon
#' which it quits with an error.
#'
#' @param query String used to query NCBI GenBank. For more about the NCBI
#' query format see
#' https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options
#' @param chunk_size Number of ids to use for each chunk. Changing this
#' doesn't tend to affect the results, but lower values have more accurate
#' progress bars.
#' @param max_tries Maximum number of times to attempt the loop.
#' @param verbose Logical; should information about number of loops attempted
#' be printed to the screen
#' @param higher_taxa Logical; should higher taxonomic ranks (family and order)
#' be included in the results?
#'
#' @return Tibble of metadata resulting from Genbank query.
#' Columns include: \describe{
#'   \item{gi}{Genbank GI number}
#'   \item{accession}{Genbank accession number}
#'   \item{taxid}{Taxon ID (can use to query with taxize)}
#'   \item{title}{Sequence title}
#'   \item{slen}{Sequence length}
#'   \item{subname}{Misc. data (specimen, collection country, etc), separated
#'   by |}
#'   \item{subtype}{Column names of misc. data, separated by |}
#'   \item{species}{Species name}
#' }
#' @examples
#' \dontrun{
#' fetch_metadata("rbcl[Gene] AND Crepidomanes[ORGN]")
#' }
#' @export
fetch_metadata <- function(
  query,
  chunk_size = 10,
  max_tries = 10,
  verbose = FALSE,
  higher_taxa = FALSE) {

  # Check input
  assertthat::assert_that(assertthat::is.string(query))
  assertthat::assert_that(assertthat::is.number(chunk_size))
  assertthat::assert_that(assertthat::is.number(max_tries))
  assertthat::assert_that(chunk_size < 200)
  assertthat::assert_that(is.logical(verbose))

  # Do an initial search without downloading any IDs to see how many hits
  # we get.
  initial_genbank_results <- rentrez::entrez_search(
    db = "nucleotide",
    term = query,
    use_history = TRUE
  )

  assertthat::assert_that(
    initial_genbank_results$count > 0,
    msg = "Query resulted in no hits")

  # Download IDs with maximum set to 1 more than the total number of hits.
  genbank_results <- rentrez::entrez_search(
    db = "nucleotide",
    term = query,
    use_history=FALSE,
    retmax = initial_genbank_results$count + 1
  )

  # Extract vector of genbank IDs
  working_id <- genbank_results$id

  # Instantiate empty tibble to store results
  results <- tibble::tibble()

  # Instatiate counter to keep track of attempts
  # to fetch metadata
  this_try <- 0

  while(length(working_id) > 0) {

    this_try <- this_try + 1

    if(isTRUE(verbose)) {
      print(glue::glue("Fetching metadata, attempt {this_try} of {max_tries}"))
    }

    if(this_try > max_tries) {
      print(glue::glue("Exceeded maximum number of tries ({max_tries}), quitting."))
      return(results)
    }

    this_result <-
      fetch_metadata_chunked(
        id = working_id,
        chunk_size = chunk_size,
        higher_taxa = higher_taxa) %>%
      purrr::transpose() %>%
      magrittr::extract("result") %>%
      purrr::flatten() %>%
      dplyr::bind_rows()

    results <- dplyr::bind_rows(results, this_result)

    working_id <- setdiff(working_id, results$gi)
  }

  # Confusingly, the entrez list actually uses "caption" to mean
  # accession, so swap this before selecting final columns.
  if ("caption" %in% colnames(results))
    results <- dplyr::rename(results, accession = caption)

  results

}