## Copyright 2016 Christian Diener <mail[at]cdiener.com>
##
## MIT license. See LICENSE for more information.
# Base URL of the GDC `files` endpoint; query parameters are appended to it.
# NOTE(review): the GDC API may have moved to a newer host since this was
# written -- confirm that this address still resolves.
GDC_BASE <- "https://gdc-api.nci.nih.gov/files?"

# Metadata fields requested for every file by the default query.
fields <- c(
    # file level
    "file_id",
    "file_name",
    "md5sum",
    # case (patient) level
    "cases.case_id",
    "cases.submitter_id",
    "cases.project.project_id",
    # sample level
    "cases.samples.sample_id",
    "cases.samples.submitter_id",
    # data description
    "data_category",
    "data_type",
    "experimental_strategy",
    "updated_datetime"
)

# Names used via non-standard evaluation inside data.table calls; declaring
# them here avoids "no visible binding" notes from R CMD check.
utils::globalVariables(
    c("cases", "case_id", "submitter_id", ".")
)
# Count the cases and samples attached to a single file entry.
#
# `case` is the `cases` element of one file (or NULL when the file has none).
# Returns a named vector `c(n_cases, n_samples)`: the number of rows of `case`
# and the number of rows obtained by binding all of its `samples` tables.
# Either count is 0 when the corresponding element is missing.
count_ids <- function(case) {
    n_cases <- 0
    if (!is.null(case)) {
        n_cases <- nrow(case)
    }

    n_samples <- 0
    if (!is.null(case$samples)) {
        n_samples <- nrow(rbindlist(case$samples))
    }

    c(n_cases=n_cases, n_samples=n_samples)
}
# Fetch and parse JSON from `url`, retrying when the request fails.
#
# Calls `fromJSON(url)` at most `max_retries` times and returns the first
# non-NULL result. An attempt counts as failed when `fromJSON` raises an error
# or returns NULL. If no attempt succeeds an error is raised that names the URL
# and, when available, the message of the last underlying error.
#
# url          The URL to read from.
# max_retries  Maximum number of attempts (default 10). A value of 0 makes no
#              attempt at all and fails immediately.
safe_json <- function(url, max_retries=10) {
    last_error <- NULL

    # seq_len() yields an empty sequence for 0, whereas 1:0 would iterate twice.
    for (attempt in seq_len(max_retries)) {
        json <- tryCatch(fromJSON(url), error=function(e) e)
        if (inherits(json, "error")) {
            # Remember why the attempt failed so it can be reported later.
            last_error <- conditionMessage(json)
        } else if (!is.null(json)) {
            return(json)
        }
    }

    msg <- paste0("Maximum retries reached but could not get data :(\nURL: ",
                  url)
    if (!is.null(last_error)) {
        msg <- paste0(msg, "\nLast error: ", last_error)
    }
    stop(msg)
}
#' Obtains information about the files in GDC.
#'
#' This function downloads a list of all files in GDC together with additional
#' metadata. Here the most important metadata is the mapping of individual
#' experimental data to the corresponding patient.
#'
#' The project id of each file is split at its first hyphen into a `project`
#' and a `panel` column, so ids containing several hyphens keep everything
#' after the first hyphen as the panel. Files without a project id get `NA`
#' in both columns.
#'
#' @param query Optional. The GDC query to be used. By default downloads
#'  detailed information for all files in the GDC.
#' @param unique optional boolean. Whether only to return file info
#'  for file that can be mapped to a unique patient and sample (if applicable).
#'  If `TRUE` the nested `cases` column is replaced by the columns
#'  `sample_uuid`, `sample_barcode`, `patient_uuid`, `patient_barcode` and
#'  `tumor`.
#' @param chunk optional number. File info is requested in chunks of this size.
#'  Avoids server errors when requesting all files at once.
#' @param max_size Maximum number of entries to return. Downloading stops
#'  after the first chunk that reaches this limit and the result is truncated
#'  to at most `max_size` rows (before the `unique` filter is applied).
#' @return A data table mapping files to various ids and the patient. If the
#'  query yields no files the (empty) table is returned without any of the
#'  derived columns.
#' @examples
#'  NULL
#'
#' @export
#' @importFrom jsonlite fromJSON
#' @importFrom data.table as.data.table tstrsplit
#' @importFrom curl curl
list_files <- function(query="default", unique=TRUE, chunk=10000,
                       max_size=Inf) {
    if (query == "default") {
        query <- sprintf("%sfields=%s&pretty=true",
                         GDC_BASE, paste(fields, collapse=","))
    }

    json <- safe_json(sprintf("%s&size=%d", query, chunk))
    pages <- json$data$pagination$pages
    show_progress <- chunk > 1000
    if (show_progress) cat(sprintf("Getting chunk %d/%d...", 1, pages))

    # Collect the pages in a list and bind them once at the end instead of
    # growing the table with rbind() on every iteration.
    chunks <- list(as.data.table(json$data$hits))
    if (pages > 1) {
        for (i in 2:pages) {
            if ((i-1)*chunk >= max_size) break
            if (show_progress) {
                cat(sprintf("\rGetting chunk %d/%d...", i, pages))
            }
            json <- safe_json(sprintf("%s&size=%d&from=%d",
                                      query, chunk, (i-1)*chunk))
            chunks[[length(chunks) + 1]] <- as.data.table(json$data$hits)
        }
    }
    # use.names=TRUE matches columns by name, as rbind() on data tables does.
    fi <- rbindlist(chunks, use.names=TRUE)

    # Whole chunks are downloaded, so the last one may overshoot max_size.
    if (is.finite(max_size) && nrow(fi) > max_size) {
        fi <- fi[seq_len(max(0, floor(max_size)))]
    }

    # Nothing to annotate; the id handling below needs at least one row.
    if (nrow(fi) == 0) return(fi)

    # First project id of every file; NA when the file has no case/project.
    pid <- vapply(fi$cases, function(x) {
        id <- x$project$project_id[1]
        if (is.null(id)) NA_character_ else as.character(id)
    }, character(1))

    # Split at the first hyphen only so that ids with more than one hyphen
    # still give exactly two columns.
    project_part <- sub("-.*$", "", pid)
    panel_part <- ifelse(grepl("-", pid, fixed=TRUE),
                         sub("^[^-]*-", "", pid), NA_character_)
    fi[, c("project", "panel") := list(project_part, panel_part)]

    if (unique) {
        # 2 x n matrix: row 1 = number of cases, row 2 = number of samples.
        counts <- vapply(fi$cases, count_ids, numeric(2))
        fi <- fi[counts[1, ] == 1 & counts[2, ] <= 1]

        # NOTE(review): sample columns are matched by position, which assumes
        # the samples table lists the sample id before the submitter id --
        # confirm against the API response.
        sids <- lapply(fi$cases, function(ca) {
            if (is.null(ca$samples)) return(data.table(NA, NA))
            else return(ca$samples[[1]])
        })
        fi[, c("sample_uuid", "sample_barcode") := rbindlist(sids)]
        fi[, c("patient_uuid", "patient_barcode") :=
            rbindlist(cases, fill=TRUE)[, .(case_id, submitter_id)]]
        fi[, cases := NULL]

        # Characters 14-15 of the sample barcode are read as a number; values
        # below 10 are flagged as tumor. Barcodes that do not parse give NA,
        # hence the suppressed coercion warning.
        sample_code <- suppressWarnings(
            as.numeric(substr(fi$sample_barcode, 14, 15)))
        fi[, c("tumor") := sample_code < 10]
    }

    return(fi)
}
#' File information for the GDC release 9.0 from 11-2017
#'
#' This data set contains detailed information about all files in the GDC
#' release that have a corresponding read method in `tcgar`. This data set
#' was generated by the \code{\link{list_files}} function contained in this
#' package and filtering for the data types "Gene Expression Quantification"
#' and "Clinical Supplement".
#'
#' @format A data frame with the following columns.
#' \describe{
#' \item{data_type}{The type of the file.}
#' \item{updated_datetime}{Last update of the file.}
#' \item{file_name}{The filename of the file.}
#' \item{md5sum}{md5 hash for the file.}
#' \item{id}{Internal GDC id for the data. Usually the same as `file_id`.}
#' \item{file_id}{The GDC UUID for the file.}
#' \item{project}{The project the sample came from.}
#' \item{panel}{The panel the sample came from.}
#' \item{data_category}{Category for the file.}
#' \item{experimental_strategy}{Type of experiment used to obtain the data.}
#' \item{sample_uuid}{Unique ID for the sample the data came from.}
#' \item{sample_barcode}{Barcode for the sample the data came from.}
#' \item{patient_uuid}{Unique ID for the patient the data came from.}
#' \item{patient_barcode}{Barcode for the patient the data came from.}
#' \item{tumor}{Whether the file describes a tumor sample (normal sample
#' otherwise).}
#' }
"gdc_files"
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.