# R/get_file_info.R

## Copyright 2016 Christian Diener <mail[at]cdiener.com>
##
## MIT license. See LICENSE for more information.

# Base URL of the GDC files endpoint. Query parameters are appended directly
# after the trailing "?".
GDC_BASE <- "https://gdc-api.nci.nih.gov/files?"

# Metadata fields requested for every file by the default query.
fields <- c(
    # the file itself
    "file_id",
    "file_name",
    "md5sum",
    # the case (patient) and project the file belongs to
    "cases.case_id",
    "cases.submitter_id",
    "cases.project.project_id",
    # the sample the file was derived from
    "cases.samples.sample_id",
    "cases.samples.submitter_id",
    # what kind of data the file contains
    "data_category",
    "data_type",
    "experimental_strategy",
    "updated_datetime"
)

# Register the names that are used unquoted inside data.table expressions so
# that R CMD check does not report them as undefined global variables.
utils::globalVariables(c("cases", "case_id", "submitter_id", "."))

# Count the cases and samples attached to a single file entry.
#
# `case` is the `cases` element of one file: either NULL or a table with one
# row per case. Its optional `samples` element is a list of tables which are
# stacked with rbindlist() to count the samples over all cases.
#
# Returns a named vector c(n_cases=..., n_samples=...); a missing `case` or
# missing `samples` is counted as 0.
count_ids <- function(case) {
    nc <- 0
    if (!is.null(case)) {
        nc <- nrow(case)
    }

    samples <- case$samples
    ns <- if (is.null(samples)) 0 else nrow(rbindlist(samples))

    c(n_cases=nc, n_samples=ns)
}

# Download and parse JSON from `url`, retrying on failure.
#
# `fromJSON(url)` is attempted up to `max_retries` times. An attempt that
# raises an error or yields NULL counts as a failure and triggers a retry.
# The first non-NULL result is returned. If every attempt fails (or
# `max_retries` is 0) an error is raised that names the URL and, if
# available, the message of the last failed attempt.
safe_json <- function(url, max_retries=10) {
    last_error <- NULL
    # seq_len() makes max_retries=0 mean "no attempts"; 1:0 would loop twice.
    for (i in seq_len(max_retries)) {
        json <- tryCatch(fromJSON(url), error=function(e) e)
        if (inherits(json, "error")) {
            # Remember why the attempt failed so it can be reported below.
            last_error <- conditionMessage(json)
            next
        }
        if (!is.null(json)) {
            return(json)
        }
    }

    msg <- paste0("Maximum retries reached but could not get data :(\nURL: ",
                  url)
    if (!is.null(last_error)) {
        msg <- paste0(msg, "\nLast error: ", last_error)
    }
    stop(msg)
}

# First element of `x` as a character string, or NA if `x` is NULL or empty.
first_or_na <- function(x) {
    if (length(x) > 0) as.character(x[1]) else NA_character_
}

#' Obtains information about the files in GDC.
#'
#' This function downloads a list of all files in GDC together with additional
#' metadata. Here the most important metadata is the mapping of individual
#' experimental data to the corresponding patient.
#'
#' @param query Optional. The GDC query to be used. By default downloads
#' detailed information for all files in the GDC.
#' @param unique optional boolean. Whether only to return file info
#' for files that can be mapped to a unique patient and sample (if applicable).
#' @param chunk optional number. File info is requested in chunks of this size.
#' Avoids server errors when requesting all files at once.
#' @param max_size Maximum number of entries to download. At most this many
#' files are requested and kept (before any filtering by \code{unique}).
#' @return A data table mapping files to various ids and the patient. If the
#' query yields no files the empty table of hits is returned unchanged.
#' @examples
#'  NULL
#'
#' @export
#' @importFrom jsonlite fromJSON
#' @importFrom data.table as.data.table tstrsplit
#' @importFrom curl curl
list_files <- function(query="default", unique=TRUE, chunk=10000,
                       max_size=Inf) {
    if (query == "default")
        query <- sprintf("%sfields=%s&pretty=true",
            GDC_BASE, paste(fields, collapse=","))

    # Never request more entries per page than may be downloaded in total.
    chunk <- min(chunk, max_size)

    json <- safe_json(sprintf("%s&size=%d", query, chunk))
    fi <- as.data.table(json$data$hits)
    # Nothing to annotate if the query did not yield any file.
    if (nrow(fi) == 0) return(fi)

    pages <- json$data$pagination$pages
    if (chunk > 1000) cat(sprintf("Getting chunk %d/%d...", 1, pages))
    if (pages > 1) {
        for (i in 2:pages) {
            if ((i-1)*chunk >= max_size) break
            if (chunk > 1000) {
                cat(sprintf("\rGetting chunk %d/%d...", i, pages))
            }
            json <- safe_json(sprintf("%s&size=%d&from=%d",
                              query, chunk, (i-1)*chunk))
            fi <- rbind(fi, as.data.table(json$data$hits))
        }
    }
    # The last chunk may overshoot `max_size`, so cut off the surplus.
    if (nrow(fi) > max_size) fi <- fi[seq_len(max_size)]

    # vapply() guarantees one string per file, even when a file has no case.
    pid <- vapply(fi$cases, function(x) first_or_na(x$project$project_id),
                  character(1))
    fi[, c("project", "panel") := tstrsplit(pid, "-")]
    if (unique) {
        # Row 1 holds the number of cases, row 2 the number of samples.
        counts <- vapply(fi$cases, count_ids, numeric(2))
        fi <- fi[counts[1, ] == 1 & counts[2, ] <= 1]

        # Look up ids by name and yield exactly one value per file (NA if
        # absent) so the new columns always line up with the rows of `fi`.
        sample_field <- function(ca, field) {
            samples <- ca[["samples"]]
            first <- if (length(samples) > 0) samples[[1]] else NULL
            first_or_na(first[[field]])
        }
        case_field <- function(ca, field) first_or_na(ca[[field]])

        cases_list <- fi$cases
        fi[, c("sample_uuid", "sample_barcode") := list(
            vapply(cases_list, sample_field, character(1), field="sample_id"),
            vapply(cases_list, sample_field, character(1),
                   field="submitter_id"))]
        fi[, c("patient_uuid", "patient_barcode") := list(
            vapply(cases_list, case_field, character(1), field="case_id"),
            vapply(cases_list, case_field, character(1),
                   field="submitter_id"))]

        fi[, "cases" := NULL]

        # Characters 14-15 of the sample barcode are read as a numeric code;
        # codes below 10 are flagged as tumor. Barcodes that are missing or
        # not numeric at that position yield NA.
        code <- suppressWarnings(
            as.numeric(substr(fi$sample_barcode, 14, 15)))
        fi[, "tumor" := code < 10]
    }

    return(fi)
}

#' File infos for the GDC release 9.0 from 11-2017
#'
#' This data set contains detailed information about all files in the GDC
#' release that have a corresponding read method in `tcgar`. This data set
#' was generated by the \code{\link{list_files}} function contained in this
#' package and filtering for the data types "Gene Expression Quantification"
#' and "Clinical Supplement".
#'
#' @format A data frame with the following columns.
#' \describe{
#'   \item{data_type}{The type of the file.}
#'   \item{updated_datetime}{Last update of the file.}
#'   \item{file_name}{The filename of the file.}
#'   \item{md5sum}{md5 hash for the file.}
#'   \item{id}{Internal GDC id for the data. Usually the same as `file_id`.}
#'   \item{file_id}{The GDC UUID for the file.}
#'   \item{project}{The project the sample came from.}
#'   \item{panel}{The panel the sample came from.}
#'   \item{data_category}{Category for the file.}
#'   \item{experimental_strategy}{Type of experiment used to obtain the data.}
#'   \item{sample_uuid}{Unique ID for the sample the data came from.}
#'   \item{sample_barcode}{Barcode for the sample the data came from.}
#'   \item{patient_uuid}{Unique ID for the patient the data came from.}
#'   \item{patient_barcode}{Barcode for the patient the data came from.}
#'   \item{tumor}{Whether the file describes a tumor sample (normal sample
#'    otherwise).}
#' }
"gdc_files"
# cdiener/tcgar documentation built on May 13, 2019, 2:41 p.m.