R/get_toplevel_sequence_info.R

Defines functions get_toplevel_sequence_info parse_toplevel_sequence_info toplevel_sequence_info_tbl

Documented in get_toplevel_sequence_info

toplevel_sequence_info_tbl <- function(species_name = character(),
                                       toplevel_sequence = character(),
                                       is_chromosome = logical(),
                                       coordinate_system = character(),
                                       assembly_exception_type = character(),
                                       is_circular = logical(),
                                       assembly_name = character(),
                                       length = double()) {
  tibble::tibble(species_name = species_name,
                 toplevel_sequence = toplevel_sequence,
                 is_chromosome = is_chromosome,
                 coordinate_system = coordinate_system,
                 assembly_exception_type = assembly_exception_type,
                 is_circular = is_circular,
                 assembly_name = assembly_name,
                 length = length)
}

parse_toplevel_sequence_info <- function(species_name, toplevel_sequence, lst) {

  toplevel_sequence_info_tbl(species_name = species_name,
                             toplevel_sequence = toplevel_sequence,
                             is_chromosome = as.logical(lst$is_chromosome),
                             coordinate_system = lst$coordinate_system,
                             assembly_exception_type = lst$assembly_exception_type,
                             is_circular = as.logical(lst$is_circular),
                             assembly_name = lst$assembly_name,
                             length = lst$length)
}

#' Get toplevel sequences details
#'
#' This function retrieves a few extra details about a toplevel sequence. These
#' sequences correspond to genomic regions in the genome assembly that are not a
#' component of another sequence region. Thus, toplevel sequences will be
#' chromosomes and any unlocalised or unplaced scaffolds.
#'
#' @param species_name The species name, i.e., the scientific name, all letters
#'   lowercase and space replaced by underscore. Examples: \code{'homo_sapiens'}
#'   (human), \code{'ovis_aries'} (Domestic sheep) or \code{'capra_hircus'}
#'   (Goat).
#' @param toplevel_sequence A toplevel sequence name, e.g. chromosome names such
#'   as `"1"`, `"X"`, or `"Y"`, or a non-chromosome sequence, e.g., a scaffold
#'   such as `"KI270757.1"`.
#' @param verbose Whether to be chatty.
#' @param warnings Whether to print warnings.
#' @param progress_bar Whether to show a progress bar.
#'
#' @return A \code{\link[tibble]{tibble}}, each row being a toplevel sequence,
#'   of 8 variables:
#' \describe{
#' \item{`species_name`}{Ensembl species name: this is the name used internally
#' by Ensembl to uniquely identify a species by name. It is the scientific name
#' but formatted without capitalisation and spacing converted with an
#' underscore, e.g., \code{'homo_sapiens'}.}
#' \item{`toplevel_sequence`}{Name of the toplevel sequence.}
#' \item{`is_chromosome`}{A logical indicating whether the toplevel sequence is
#' a chromosome (`TRUE`) or not (`FALSE`).}
#' \item{`coord_system`}{Coordinate system type.}
#' \item{`assembly_exception_type`}{Coordinate system type.}
#' \item{`is_circular`}{A logical indicating whether the toplevel sequence is a
#' circular sequence (`TRUE`) or not (`FALSE`).}
#' \item{`assembly_name`}{Assembly name.}
#' \item{`length`}{Genomic length toplevel sequence in base pairs.}
#' }
#'
#'
#' @md
#' @examples
#' # Get details about human chromosomes (default)
#' get_toplevel_sequence_info()
#'
#' # Get details about a scaffold
#' # (To find available toplevel sequences to query use the function
#' # `get_toplevel_sequences()`)
#' get_toplevel_sequence_info(species_name = 'homo_sapiens', toplevel_sequence = 'KI270757.1')
#'
#' @seealso [get_toplevel_sequences()]
#'
#' @export
get_toplevel_sequence_info <- function(species_name = 'homo_sapiens',
                                       toplevel_sequence = c(1:22, 'X', 'Y', 'MT'),
                                       verbose = FALSE,
                                       warnings = TRUE,
                                       progress_bar = TRUE) {

  error_msg <- glue::glue(
    'All arguments must have consistent lengths, ',
    'only values of length one are recycled:\n',
    '* Length of `species_name`: {length(species_name)}\n',
    '* Length of `toplevel_sequence`: {length(toplevel_sequence)}\n'
  )

  if (!are_vec_recyclable(species_name,
                          toplevel_sequence)) {
    rlang::abort(error_msg)
  }

  recycled_args <- vctrs::vec_recycle_common(species_name,
                                             toplevel_sequence)

  # The order of names here should be same as passed to
  # vctrs::vec_recycle_common()
  names(recycled_args) <-
    c(
      'species_name',
      'toplevel_sequence'
    )

  resource_urls <- glue::glue('/info/assembly/',
                              '{recycled_args$species_name}/{recycled_args$toplevel_sequence}')

  responses <-
    request_parallel(
      resource_urls,
      verbose = verbose,
      warnings = warnings,
      progress_bar = progress_bar
    )

  # Only keep those responses that responded successfully, i.e. with status == "OK".
  responses_ok <-
    purrr::keep(responses,
                ~ identical(.x$status, 'OK') &&
                  !rlang::is_empty(.x$content))
  if (rlang::is_empty(responses_ok))
    return(toplevel_sequence_info_tbl())

  return(purrr::imap_dfr(
    .x = responses_ok,
    .f = ~ parse_toplevel_sequence_info(
      species_name = recycled_args$species_name[.y],
      toplevel_sequence = recycled_args$toplevel_sequence[.y],
      lst = .x$content
    )
  ))

}
ramiromagno/ensemblr documentation built on Oct. 19, 2023, 11:12 a.m.