#' Fetch Data from UCSC Xena Hosts
#'
#' When you want to query just data for several genes/samples from UCSC Xena datasets, a better way
#' is to use these `fetch_` functions instead of downloading a whole dataset. Details about functions
#' please see the following sections.
#'
#' There are three primary data types: dense matrix (samples by probes (or say identifiers)),
#' sparse (sample, position, variant), and segmented (sample, position, value).
#'
#' Dense matrices can be genotypic or phenotypic, it is a sample-by-identifiers matrix.
#' Phenotypic matrices have associated field metadata (descriptive names, codes, etc.).
#' Genotypic matricies may have an associated probeMap, which maps probes to genomic locations.
#' If a matrix has hugo probeMap, the probes themselves are gene names. Otherwise, a probeMap is
#' used to map a gene location to a set of probes.
#'
#' @param host a UCSC Xena host, like "https://toil.xenahubs.net".
#' All available hosts can be printed by [xena_default_hosts()].
#' @param dataset a UCSC Xena dataset, like "tcga_RSEM_gene_tpm".
#' All available datasets can be printed by running `XenaData$XenaDatasets` or
#' obtained from [UCSC Xena datapages](https://xenabrowser.net/datapages/).
#' @param identifiers Identifiers could be probe (like "ENSG00000000419.12"),
#' gene (like "TP53") etc.. If it is `NULL`, all identifiers in the dataset will be used.
#' @param samples ID of samples, like "TCGA-02-0047-01".
#' If it is `NULL`, all samples in the dataset will be used. However, it is better to download
#' the whole datasets if you query many samples and genes.
#' @param check if `TRUE`, check whether specified `identifiers` and `samples` exist the dataset
#' (all failed items will be filtered out). However, if `FALSE`, the code is much faster.
#' @param use_probeMap if `TRUE`, will check if the dataset has ProbeMap firstly.
#' When the dataset you want to query has a identifier-to-gene mapping, identifiers can be
#' gene symbols even the identifiers of dataset are probes or others.
#' @param time_limit time limit for getting response in seconds.
#' @return a `matirx` or character vector or a `list`.
#' @examples
#' library(UCSCXenaTools)
#'
#' host <- "https://toil.xenahubs.net"
#' dataset <- "tcga_RSEM_gene_tpm"
#' samples <- c("TCGA-02-0047-01", "TCGA-02-0055-01", "TCGA-02-2483-01", "TCGA-02-2485-01")
#' probes <- c("ENSG00000282740.1", "ENSG00000000005.5", "ENSG00000000419.12")
#' genes <- c("TP53", "RB1", "PIK3CA")
#'
#' \donttest{
#' # Fetch samples
#' fetch_dataset_samples(host, dataset, 2)
#' # Fetch identifiers
#' fetch_dataset_identifiers(host, dataset)
#' # Fetch expression value by probes
#' fetch_dense_values(host, dataset, probes, samples, check = FALSE)
#' # Fetch expression value by gene symbol (if the dataset has probeMap)
#' has_probeMap(host, dataset)
#' fetch_dense_values(host, dataset, genes, samples, check = FALSE, use_probeMap = TRUE)
#' }
#' @export
fetch <- function(host, dataset) {
message("This function is used to build consistent documentation.")
message("Please use a function starts with fetch_")
}
check_hiplot <- function(host) {
use_hiplot <- getOption("use_hiplot", default = FALSE)
if (use_hiplot) {
# Check website status
use_hiplot <- tryCatch(
{
httr::http_error(host)
TRUE
},
error = function(e) {
message("The hiplot server may down, we will not use it for now.")
FALSE
}
)
}
if (use_hiplot) {
if (!grepl("hiplot", host)) {
message("Use hiplot server (China) for mirrored data hubs (set 'options(use_hiplot = FALSE)' to disable it)")
host2 <- as.character(.xena_mirror_map_rv[host])
if (!is.na(host2)) {
host <- host2
}
}
}
return(host)
}
#' @describeIn fetch fetches values from a dense matrix.
#' @export
fetch_dense_values <- function(host, dataset, identifiers = NULL, samples = NULL,
check = TRUE, use_probeMap = FALSE, time_limit = 30) {
stopifnot(
length(host) == 1, length(dataset) == 1,
is.character(host), is.character(dataset),
is.logical(check), is.logical(use_probeMap)
)
.attach_this()
host <- check_hiplot(host)
if (check) {
# obtain all samples
all_samples <- fetch_dataset_samples(host, dataset)
# obtain all identifiers
use_probe <- use_probeMap && has_probeMap(host, dataset)
if (use_probe) {
message("-> Obtaining gene symbols from probeMap...")
url <- has_probeMap(host, dataset, return_url = TRUE)
all_identifiers <- use_cache(url, op = "readr::read_tsv(url,
col_types = readr::cols()
)[[2]]")
} else {
all_identifiers <- fetch_dataset_identifiers(host, dataset)
}
message("-> Checking identifiers...")
if (is.null(identifiers)) {
identifiers <- all_identifiers
} else {
if (!is.character(identifiers)) stop("Bad type for identifiers.")
if (!all(identifiers %in% all_identifiers)) {
which_in <- identifiers %in% all_identifiers
message("The following identifiers have been removed from host ", host, " dataset ", dataset)
print(identifiers[!which_in])
identifiers <- identifiers[which_in]
if (length(identifiers) == 0) {
stop("Bad identifiers, no one left, check input?")
}
}
}
message("-> Done.")
message("-> Checking samples...")
if (is.null(samples)) {
samples <- all_samples
} else {
if (!is.character(samples)) stop("Bad type for samples.")
if (!all(samples %in% all_samples)) {
which_in <- samples %in% all_samples
message("The following samples have been removed from host ", host, " dataset ", dataset)
print(samples[!which_in])
samples <- samples[which_in]
if (length(samples) == 0) {
stop("Bad samples, no one left, check input?")
}
}
}
message("-> Done.")
} else {
if (is.null(samples)) {
# obtain all samples
samples <- fetch_dataset_samples(host, dataset)
}
if (is.null(identifiers)) {
# obtain all identifiers
identifiers <- fetch_dataset_identifiers(host, dataset)
}
}
if (length(samples) == 1) {
samples <- as.list(samples)
}
if (length(identifiers) == 1) {
identifiers <- as.list(identifiers)
}
if (use_probeMap) {
message("-> Checking if the dataset has probeMap...")
if (has_probeMap(host, dataset)) {
message("-> Done. ProbeMap is found.")
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_dataset_gene_probe_avg(host, dataset, samples, identifiers)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.data.frame(res)) {
break()
}
Sys.sleep(1)
}
if (!is.data.frame(res)) {
stop(paste(
"The response times out and still returns an error",
res$error_info$message,
sep = "\n"
))
}
if (!is.null(unlist(res[["scores"]]))) {
res <- t(sapply(res[["scores"]], base::rbind))
rownames(res) <- identifiers
colnames(res) <- samples
return(res)
}
}
message("-> Done. No probeMap found or error happened, use old way...")
}
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_dataset_fetch(host, dataset, samples, identifiers)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.atomic(res)) {
break()
}
Sys.sleep(1)
}
if (!is.atomic(res)) {
stop(paste(
"The response times out and still returns an error",
res$error_info$message,
sep = "\n"
))
}
message("-> Query done.")
rownames(res) <- identifiers
colnames(res) <- samples
res
}
#' @describeIn fetch fetches values from a sparse `data.frame`.
#' @param genes gene names.
#' @export
fetch_sparse_values <- function(host, dataset, genes, samples = NULL,
time_limit = 30) {
# fetch_sparse_values("https://ucscpublic.xenahubs.net", "ccle/CCLE_DepMap_18Q2_maf_20180502", c("TP53", "KRAS")) -> mm
stopifnot(
length(host) == 1, length(dataset) == 1,
is.character(host), is.character(dataset)
)
.attach_this()
host <- check_hiplot(host)
if (is.null(samples)) {
samples <- fetch_dataset_samples(host, dataset)
}
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_sparse_data(host, dataset, samples, genes)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.list(res)) {
break()
}
Sys.sleep(1)
}
res
}
#' @describeIn fetch fetches samples from a dataset
#' @param limit number of samples, if `NULL`, return all samples.
#' @export
fetch_dataset_samples <- function(host, dataset, limit = NULL) {
.attach_this()
host <- check_hiplot(host)
if (is.null(limit)) limit = -1
.p_dataset_samples(host = host, dataset = dataset, limit = limit)
}
#' @describeIn fetch fetches identifies from a dataset.
#' @export
fetch_dataset_identifiers <- function(host, dataset) {
.attach_this()
host <- check_hiplot(host)
.p_dataset_field(host = host, dataset = dataset)
}
#' @describeIn fetch checks if a dataset has ProbeMap.
#' @param return_url if `TRUE`, returns the info of probeMap
#' instead of a logical value when the result exists.
#' @export
has_probeMap <- function(host, dataset, return_url = FALSE) {
.attach_this()
if (! host %in% .xena_mirror_map) {
host <- .xena_mirror_map[host]
}
df <- base::subset(UCSCXenaTools::XenaData, XenaHosts == host & XenaDatasets == dataset)
rv <- !is.na(df[["ProbeMap"]])
if (rv && return_url) {
paste0(
df$XenaHosts, "/download/",
df[["ProbeMap"]]
)
} else rv
}
utils::globalVariables(
c(
".p_dataset_fetch", ".p_dataset_field", ".p_dataset_gene_probe_avg",
".p_dataset_samples", ".p_sparse_data"
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.