NCI Genomic Data Commons Access

Documented in slicing

#' Query GDC for data slices
#'
#' This function returns a BAM file representing reads overlapping
#' regions specified either as chromosomal regions or as gencode gene
#' symbols.
#'
#' @param uuid character(1) identifying the BAM file resource
#'
#' @param regions character() vector describing chromosomal regions,
#'     e.g., \code{c("chr1", "chr2:10000", "chr3:10000-20000")} (all
#'     of chromosome 1, chromosome 2 from position 10000 to the end,
#'     chromosome 3 from 10000 to 20000).
#'
#' @param symbols character() vector of gencode gene symbols, e.g.,
#'     \code{c("BRCA1", "PTEN")}
#'
#' @param destination character(1) default \code{tempfile()} file path
#'     for BAM file slice
#'
#' @param overwrite logical(1) default FALSE can destination be
#'     overwritten?
#'
#' @param progress logical(1) default \code{interactive()} should a
#'     progress bar be used?
#'
#' @param token character(1) security token allowing access to
#'     restricted data. Almost all BAM data is restricted, so a token is
#'     usually required. See
#'     \url{https://docs.gdc.cancer.gov/Data/Data_Security/Data_Security/#authentication-tokens}.
#'
#' @details This function uses the Genomic Data Commons "slicing" API
#'     to get portions of a BAM file specified either using "regions"
#'     or using HGNC gene symbols. 
#' 
#' @return character(1) destination to the downloaded BAM file
#'
#' @importFrom httr progress
#' @importFrom jsonlite toJSON
#' 
#' @examples
#' \dontrun{
#'  slicing("df80679e-c4d3-487b-934c-fcc782e5d46e",
#'         regions="chr17:75000000-76000000",
#'         token=gdc_token())
#' 
#' # Get 10 BAM files.
#' bamfiles = files() |> 
#'            filter(data_format=='BAM') |>
#'            results(size=10) |> ids()
#' 
#' # Current alignments at the GDC are to GRCh38
#' library('TxDb.Hsapiens.UCSC.hg38.knownGene')
#' all_genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
#' 
#' first3genes = all_genes[1:3]
#' # remove strand info
#' strand(first3genes) = '*'
#' 
#' # We can get our regions easily now
#' as.character(first3genes)
#' 
#' # Use parallel downloads to speed processing
#' library(BiocParallel)
#' register(MulticoreParam())
#' 
#' fnames = bplapply(bamfiles, slicing, overwrite = TRUE,
#'                 regions=as.character(first3genes))
#' 
#' # 10 BAM files
#' fnames
#' 
#' library(GenomicAlignments)
#' lapply(unlist(fnames), readGAlignments)
#' 
#' }
#' @export
slicing <- function(uuid, regions, symbols, destination=file.path(tempdir(), paste0(uuid, '.bam')),
                    overwrite=FALSE, progress=interactive(), token=gdc_token())
{
    stopifnot(is.character(uuid), length(uuid) == 1L)
    stopifnot(missing(regions) || missing(symbols),
              !(missing(regions) && missing(symbols)))
    stopifnot(is.character(destination), length(destination) == 1L,
              (overwrite && file.exists(destination)) || !file.exists(destination))

    if (!missing(symbols))
        body <- list(gencode=I(symbols))
    else
        ## FIXME: validate regions
        body <- list(regions=regions)

    response <- .gdc_post(
        endpoint=sprintf("slicing/view/%s", uuid),
        add_headers('Content-type'='application/json'),
        write_disk(destination, overwrite),
        if (progress) progress() else NULL,
        body=toJSON(body), token=token)
    if (progress)
        cat("\n")

    destination
}