#' Functions to access/manage cohort level data
#'
#' A cohort is a set of datasets from the same study or the same group of samples;
#' it is the individual data unit in omics database construction.
#' TODO: ENSEMBL IDs and Hugo symbols will be supported and can be converted
#' automatically.
#' @name cohorts
#' @rdname cohorts
#' @param verbose Whether to print extra information.
#' @export
#' @examples
#' cohorts(verbose = TRUE)
cohorts = function(verbose = FALSE) {
qq.options(cat_verbose = verbose, LOCAL = TRUE)
databases = unlist(get_golem_config("coco.databases"))
if (verbose) qqcat("checking databases: @{collapse_text(databases)}\n")
ext_idx = fs::dir_exists(databases)
if (sum(!ext_idx) > 0 && verbose) {
qqcat("non-exist database path(s): @{collapse_text(databases[!ext_idx])}\n",
cat_prefix = cat_prefix_warn
)
}
databases = databases[ext_idx]
paths = fs::dir_ls(databases)
if (length(paths) < 1) {
qqcat("no cohort available, please check your setting\n", cat_prefix = cat_prefix_warn)
return(NULL)
}
cv = data.table(
cohort = fs::path_file(paths),
path = fs::path_expand(paths)
)
dup_ids = anyDuplicated(cv$cohort)
if (dup_ids > 0) {
dup_cohorts = cv$cohort[duplicated(cv$cohort)]
qqcat("duplicated cohort names found, @{collapse_text(dup_cohorts)}\n",
cat_prefix = cat_prefix_err
)
return(NULL)
}
return(cv)
}
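# The `cohort_new()` function below relies on three spec objects shipped with
# the package: `SPEC_ROWINDEX`, `SPEC_SAMPINDEX` and `SPEC_DATASET_OPTIONS`.
# The sketch below only illustrates the structure they are assumed to have,
# inferred from how they are used in this file; the concrete entries (formats,
# platforms, column names) are guesses for illustration, not the real spec.
.spec_sketch = list(
  # named by data format; ".c<N>" means "use column N", otherwise a column name
  rowindex = list(
    Matrix = ".c1",
    PatientInfo = ".c1",
    MAF = "Hugo_Symbol",
    Segment = "Sample"
  ),
  # named by data format; "ncol-1" means "number of columns minus one",
  # otherwise the name of the column holding sample identifiers
  sampindex = list(
    Matrix = "ncol-1",
    PatientInfo = "patient_id",
    MAF = "Tumor_Sample_Barcode",
    Segment = "Sample"
  ),
  # vectors of the values accepted for the corresponding `dataset_list` fields
  dataset_options = list(
    platforms = c("Clinical", "RNA-Seq", "WES"),
    types = c("Phenotype", "Gene expression", "Mutation"),
    formats = c("PatientInfo", "Matrix", "MAF", "Segment")
  )
)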
#' @describeIn cohorts Create a new cohort with data stored in a path
#' @param path A directory path pointing to the new cohort.
#' @param name Description of the cohort.
#' @param cancer_type Cancer type the cohort belongs to.
#' @param data_provider Typically an university or an institute.
#' @param maintainer Maintainer name and email in format `xxx <xxx@xxx.com>`.
#' @param doi DOI to link the reference.
#' @param year The year the cohort was generated; if not set, the current year is used.
#' @param dataset_rowidx Specify which columns are used for generating row index
#' files from dataset files. You can check `SPEC_ROWINDEX` for the implemented
#' specification. For example, ".c1" for the "Matrix" data format means that if a dataset
#' is in "Matrix" format (typically gene expression), the first column is the index.
#' Alternatively, you can directly specify the column name, e.g., "Sample" for the
#' "Segment" data format.
#' @param dataset_ncount Similar to `dataset_rowidx`, but used to specify the
#' column for sample counting. See `SPEC_SAMPINDEX`.
#' @param dataset_options Valid dataset options, see `SPEC_DATASET_OPTIONS`.
#' NOTE: you can extend `SPEC_DATASET_OPTIONS` and `SPEC_ROWINDEX` yourself.
#' @param dataset_list A `data.frame` containing information about the datasets in this cohort.
#' It should have the following fields:
#' - `id`: The file name without the `.txt`/`.maf`(`.gz`) extension, which is used as the identifier of the dataset.
#' For example, if you have a gene expression dataset in cohort `abc` and
#' your dataset file is named `abc_expression.txt`, then the id should be set to `abc_expression`.
#' This way the file is unique across the whole database hosted by `{coco}`.
#' - `name`: Description of the dataset.
#' - `genome_build`: Reference genome version, e.g., "hg38".
#' - `data_platform`: See `SPEC_DATASET_OPTIONS`.
#' - `data_type`: See `SPEC_DATASET_OPTIONS`.
#' - `data_format`: See `SPEC_DATASET_OPTIONS`; used for filtering and searching.
#' For "Segment" format, refer to https://github.com/ShixiangWang/DoAbsolute/blob/master/inst/extdata/SNP6_solid_tumor.seg.txt
#' as an example.
#' - `tags`: Other labels for the dataset to help with searching and filtering.
#' Separate multiple tags with commas.
#' @export
#' @examples
#' \donttest{
#' cohort_new(
#' path = system.file("cohorts/example_TCGA_LAML", package = "coco"),
#' name = "TCGA Acute Myeloid Leukemia (LAML) for examples utils",
#' cancer_type = "LAML",
#' data_provider = "TCGA",
#' maintainer = "Shixiang Wang <wangsx1@sysucc.org.cn>",
#' doi = NA,
#' year = 2016,
#' dataset_list = data.frame(
#' id = c(
#' "example_TCGA_LAML_patient_info",
#' "example_TCGA_LAML_gene_expr_HTSeq_count",
#' "example_TCGA_LAML_MAF"
#' ),
#' name = c(
#' "Patient information", "Gene expression in log2(count)",
#' "Mutation list"
#' ),
#' genome_build = c(NA, "hg19", "hg19"),
#' data_platform = c("Clinical", "RNA-Seq", "WES"),
#' data_type = c("Phenotype", "Gene expression", "Mutation"),
#' data_format = c("PatientInfo", "Matrix", "MAF"),
#' tags = c("survival,FAB", "HTSeq", NA)
#' )
#' )
#' }
cohort_new = function(path, name, cancer_type, data_provider, maintainer,
dataset_list, doi = NA, year = NULL,
dataset_rowidx = SPEC_ROWINDEX,
dataset_options = SPEC_DATASET_OPTIONS,
dataset_ncount = SPEC_SAMPINDEX,
verbose = FALSE) {
# Create a meta.yml file to store all the information
stopifnot(
length(path) == 1L,
is.character(path),
is.character(name),
is.character(cancer_type),
is.character(data_provider),
is.character(maintainer),
is.data.frame(dataset_list),
nrow(dataset_list) > 0
)
qq.options(cat_verbose = verbose, LOCAL = TRUE)
if (!fs::dir_exists(path)) {
qqcat("hey, cohort path @{path} does not exist, please check\n", cat_prefix = cat_prefix_err)
return(NULL)
}
if (verbose) qqcat("recoding information ...\n")
if (is.null(year)) year = format(Sys.Date(), "%Y")
info = list(
id = fs::path_file(path),
name = name,
cancer_type = cancer_type,
data_provider = data_provider,
maintainer = maintainer,
doi = doi,
year = year
)
if (verbose) {
qqcat("checking dataset general metadata\n")
print(info)
}
if (!all(dataset_list$data_platform %in% dataset_options$platforms)) {
qqcat("data platform does not pass the check, valid values are: @{collapse_text(dataset_options$platforms)}\n", cat_prefix = cat_prefix_err)
return(NULL)
}
if (!all(dataset_list$data_type %in% dataset_options$types)) {
qqcat("data type does not pass the check, valid values are: @{collapse_text(dataset_options$types)}\n", cat_prefix = cat_prefix_err)
return(NULL)
}
if (!all(dataset_list$data_format %in% dataset_options$formats)) {
qqcat("data format does not pass the check, valid values are: @{collapse_text(dataset_options$formats)}\n", cat_prefix = cat_prefix_err)
return(NULL)
}
if (verbose) qqcat("parsing/checking input dataset list and generating metadata\n")
if (sum(duplicated(dataset_list$id)) > 0) {
qqcat("duplicated data list ID detected, this is invalid\n", cat_prefix = cat_prefix_err)
return(NULL)
}
if (!all(startsWith(dataset_list$id, info$id))) {
qqcat("all dataset id should use id as prefix, which is: @{info$id}\n", cat_prefix = cat_prefix_err)
return(NULL)
}
nd = nrow(dataset_list)
dataset_list$n = NA_integer_
dataset_list$filename = NA_character_
for (i in seq_len(nd)) {
if (verbose) qqcat("handling dataset `@{dataset_list$id[i]}`\n")
fp = fs::dir_ls(path, regexp = dataset_list$id[i])
fp = fp[!(endsWith(fp, "rowidx") | endsWith(fp, "colidx"))]
if (length(fp) != 1) {
qqcat("no unique dataset file path found, either zero or multiple files matched\n",
cat_prefix = cat_prefix_err
)
qqcat("matched file list: @{collapse_text(fp)}\n", cat_prefix = cat_prefix_err)
return(NULL)
}
# Sample index for sample count
sampidx = dataset_ncount[[dataset_list$data_format[i]]]
if (is.null(sampidx)) {
qqcat("no info specified to count samples in dataset\n",
cat_prefix = cat_prefix_err
)
return(NULL)
}
# Row index for metadata
rowidx = dataset_rowidx[[dataset_list$data_format[i]]]
if (is.null(rowidx)) {
qqcat("no column to specified to generate row index metadata file\n",
cat_prefix = cat_prefix_err
)
return(NULL)
}
rowidx = lapply(rowidx, function(x) {
if (startsWith(x, ".c")) {
# Convert ".c<N>" to an integer column index
x = as.integer(sub(".c", "", x, fixed = TRUE))
}
x
})
if (length(rowidx) == 1) rowidx = rowidx[[1]]
if (verbose) qqcat("reading data file @{fp}\n")
data = data.table::fread(fp)
if (verbose) qqcat("writing index files\n")
# Sample count
if (verbose) qqcat("count samples\n")
nsample = if (sampidx == "ncol-1") {
ncol(data) - 1
} else {
if (sampidx %in% colnames(data)) {
length(unique(data[[sampidx]]))
} else {
NA_integer_
}
}
if (verbose) qqcat("number: @{nsample}\n")
# Column index
if (verbose) qqcat("generating column index files\n")
cx_fp = paste0(fp, ".colidx")
cx_data = colnames(data)
data.table::fwrite(data.table(id = cx_data), file = cx_fp, col.names = FALSE)
# Row index
if (verbose) qqcat("generating row index files\n")
if (length(rowidx) > 1) {
# Multiple row index files may be available for non-matrix data
for (j in rowidx) {
if (is.integer(j)) {
rx_fp = paste0(fp, ".", colnames(data)[j], ".rowidx")
} else {
if (!j %in% colnames(data)) {
qqcat("no column @{j} found in the dataset\n",
cat_prefix = cat_prefix_err
)
return(NULL)
}
rx_fp = paste0(fp, ".", j, ".rowidx")
}
rx_data = unique(data[[j]])
data.table::fwrite(data.table(id = rx_data), file = rx_fp, col.names = FALSE)
}
} else {
rx_fp = paste0(fp, ".rowidx")
rx_data = unique(data[[rowidx]])
data.table::fwrite(data.table(id = rx_data), file = rx_fp, col.names = FALSE)
}
if (verbose) qqcat("done for dataset `@{dataset_list$id[i]}`\n")
dataset_list$n[i] = nsample
dataset_list$filename[i] = fs::path_file(fp)
}
if (verbose) qqcat("generating cohort meta.yml\n")
meta_fp = fs::path_join(c(path, "meta.yml"))
info$dataset = dataset_list
write_yaml(info, file = meta_fp)
message("job finished, you can check `meta.yml` and index files in cohort path")
return(invisible(NULL))
}
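# A minimal, self-contained sketch of the index-file convention used by
# `cohort_new()` above for a "Matrix" dataset: `<file>.colidx` lists the column
# names, `<file>.rowidx` lists the unique values of the index column (".c1",
# i.e. column 1), and the sample count is `ncol(data) - 1`. The toy data and
# the helper name are illustrative only.
.coco_index_sketch = function(dir = tempdir()) {
  fp = fs::path(dir, "toy_expr.txt")
  toy = data.table::data.table(
    gene = c("TP53", "KRAS", "NPM1"),
    S1 = c(10, 3, 8),
    S2 = c(7, 5, 2)
  )
  data.table::fwrite(toy, fp, sep = "\t")
  data = data.table::fread(fp)
  nsample = ncol(data) - 1 # the "ncol-1" rule for the "Matrix" format
  data.table::fwrite(data.table::data.table(id = colnames(data)),
    file = paste0(fp, ".colidx"), col.names = FALSE
  )
  data.table::fwrite(data.table::data.table(id = unique(data[[1]])),
    file = paste0(fp, ".rowidx"), col.names = FALSE
  )
  list(nsample = nsample, files = fs::dir_ls(dir, regexp = "toy_expr"))
}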
#' @describeIn cohorts List datasets (or metadata) in a cohort
#' @param id Cohort ID, which can be obtained from [cohorts()].
#' @export
#' @examples
#' cohort_ls("example_TCGA_LAML")
cohort_ls = function(id, verbose = FALSE) {
# Read meta.yml file and parse it
stopifnot(length(id) == 1L)
qq.options(cat_verbose = verbose, LOCAL = TRUE)
chs = cohorts(verbose)
if (!id %in% chs$cohort) {
qqcat("bad cohort ID, valid lists are:\n", cat_prefix = cat_prefix_err)
print(chs$cohort)
return(NULL)
}
chs = chs[chs$cohort == id]
meta_fl = fs::path_join(c(chs$path, "meta.yml"))
if (!fs::file_exists(meta_fl)) {
qqcat("meta file `@{meta_fl}` does not exist, have you created? If yes, contact developer!",
cat_prefix = cat_prefix_err
)
return(NULL)
}
data = as.data.frame(read_yaml(meta_fl))
data$path = chs$path
return(data.table(data))
}
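# `cohort_ls()` above flattens the parsed `meta.yml` with `as.data.frame()`, so
# nested dataset fields come back as dotted column names (e.g. `dataset.id`).
# A small round-trip sketch of that behaviour using the yaml package directly;
# the helper name and the toy values are illustrative only.
.meta_roundtrip_sketch = function(dir = tempdir()) {
  info = list(
    id = "toy_cohort",
    name = "Toy cohort",
    dataset = data.frame(
      id = "toy_cohort_expr",
      filename = "toy_cohort_expr.txt"
    )
  )
  fp = fs::path(dir, "meta.yml")
  yaml::write_yaml(info, file = fp)
  # columns: id, name, dataset.id, dataset.filename
  as.data.frame(yaml::read_yaml(fp))
}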
cohort_load = function() {
}
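# `cohort_load()` above is still a stub. As a stop-gap, this sketch shows one
# way a dataset could be located and read by combining `cohort_ls()` with
# `data.table::fread()`. The flattened metadata column names used here
# (`dataset.id`, `dataset.filename`) are assumptions about how `meta.yml` is
# read back (see the round-trip sketch above); the helper name is illustrative.
.cohort_load_sketch = function(cohort_id, dataset_id, verbose = FALSE) {
  meta = cohort_ls(cohort_id, verbose = verbose)
  if (is.null(meta)) {
    return(NULL)
  }
  # pick the row describing the requested dataset
  hit = meta[meta$dataset.id == dataset_id, ]
  if (nrow(hit) != 1) {
    message("dataset id not found (or not unique) in cohort ", cohort_id)
    return(NULL)
  }
  fp = fs::path_join(c(hit$path[1], hit$dataset.filename[1]))
  data.table::fread(fp)
}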