# A data set containing features by samples data, or records
#' Functions to access/manage dataset level data
#'
#' A dataset contains a specified type of data, a cohort contains a list of
#' datasets.
#' TODO: ENSEMBL IDs and Hugo Symbol will be supported and the can exchange
#' automatically.
#' @name datasets
#' @rdname datasets
#' @param id A dataset ID.
#' @param cohort A cohort ID.
#' @param select A vector of column names or numbers to keep, drop the rest.
#' @param subset A expression to subset the dataset.
#' @param verbose Whether to print extra information.
#' @param ... other parameters passing to [data.table::fread()].
#' @export
#' @examples
#' datasets("example_TCGA_LAML")
#'
#' dataset_info("example_TCGA_LAML_MAF", "example_TCGA_LAML")
#'
#' dataset_query_idx("example_TCGA_LAML_MAF", "example_TCGA_LAML")
#' dataset_query_idx("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML")
#'
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML")
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML", select = 1:3)
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML",
#' select = c("Ensembl_ID", "TCGA-AB-2918-03A"), subset = Ensembl_ID == "ENSG00000251400"
#' )
datasets = function(id, verbose = FALSE) {
cohort_ls(id, verbose)$dataset.id
}
#' @describeIn datasets Show information of a specified dataset
#' @export
dataset_info = function(id, cohort, verbose = FALSE) {
stopifnot(length(id) == 1L, length(cohort) == 1L)
qq.options(cat_verbose = verbose, LOCAL = TRUE)
ch = cohort_ls(cohort, verbose)
if (is.null(ch)) {
return(NULL)
}
if (!id %in% ch$dataset.id) {
qqcat("bad dataset ID, valid lists are:\n", cat_prefix = cat_prefix_err)
print(ch$dataset.id)
return(NULL)
}
idx = ch$dataset.id %in% id
data = ch[idx]
fls = as.character(fs::dir_ls(data$path, regexp = id))
fls_row = fls[endsWith(fls, "rowidx")]
c(
as.list(data[, startsWith(colnames(data), "dataset"), with = FALSE]),
list(
rowidx = fls_row,
colidx = fls[endsWith(fls, "colidx")],
path = data$path
)
)
}
#' @describeIn datasets Query index information for a dataset
#' @export
dataset_query_idx = function(id, cohort, verbose = FALSE) {
stopifnot(length(id) == 1L, length(cohort) == 1L)
qq.options(cat_verbose = verbose, LOCAL = TRUE)
data = dataset_info(id, cohort, verbose = verbose)
fl_list = data[c("rowidx", "colidx")]
rv = list()
rv$rowidx = list()
for (i in seq_along(fl_list)) {
if (names(fl_list)[i] == "colidx") {
if (verbose) qqcat("query column index data\n")
rv$colidx = data.table::fread(fl_list[[i]], header = FALSE)[[1]]
} else {
if (verbose) qqcat("query row index data\n")
if (length(fl_list[[i]]) > 1) {
for (j in seq_along(fl_list[[i]])) {
rv$rowidx[[j]] = data.table::fread(fl_list[[i]][[j]], header = FALSE)[[1]]
}
new_names = rm_strs(
fs::path_file(fl_list[[i]]),
c(paste0(data$dataset.filename, "."), ".rowidx")
)
names(rv$rowidx) = new_names
} else {
rv$rowidx = data.table::fread(fl_list[[i]], header = FALSE)[[1]]
}
}
}
rv
}
rm_strs = function(x, patterns) {
for (i in patterns) {
x = gsub(i, "", x)
}
x
}
#' @describeIn datasets Load data from specified dataset
#' @export
dataset_load = function(id, cohort, subset = NULL, select = NULL, verbose = FALSE, ...) {
stopifnot(length(id) == 1L, length(cohort) == 1L)
qq.options(cat_verbose = verbose, LOCAL = TRUE)
info = dataset_info(id, cohort, verbose = verbose)
datafile = file.path(info$path, info$dataset.filename)
if (verbose) qqcat("loading dataset file with column subsetting\n")
# colnames_data = colnames(data.table::fread(datafile, nrows = 1L, verbose = verbose))
# if (verbose) qqcat("detected colnames of dataset @{collapse_text(colnames_data)}\n")
data = data.table::fread(datafile, select = select, verbose = verbose, showProgress = verbose)
if (verbose) qqcat("subsetting data")
subset = substitute(subset)
if (is.language(subset)) {
data = subset(data, subset = eval(subset))
}
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.