R/dataset.R

Defines functions dataset_load rm_strs dataset_query_idx dataset_info datasets

Documented in dataset_info dataset_load dataset_query_idx datasets

# A data set containing features by samples data, or records

#' Functions to access/manage dataset level data
#'
#' A dataset contains a specified type of data, a cohort contains a list of
#' datasets.
#' TODO: ENSEMBL IDs and Hugo Symbol will be supported and the can exchange
#' automatically.
#' @name datasets
#' @rdname datasets
#' @param id A dataset ID.
#' @param cohort A cohort ID.
#' @param select A vector of column names or numbers to keep, drop the rest.
#' @param subset A expression to subset the dataset.
#' @param verbose Whether to print extra information.
#' @param ... other parameters passing to [data.table::fread()].
#' @export
#' @examples
#' datasets("example_TCGA_LAML")
#'
#' dataset_info("example_TCGA_LAML_MAF", "example_TCGA_LAML")
#'
#' dataset_query_idx("example_TCGA_LAML_MAF", "example_TCGA_LAML")
#' dataset_query_idx("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML")
#'
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML")
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML", select = 1:3)
#' dataset_load("example_TCGA_LAML_gene_expr_HTSeq_count", "example_TCGA_LAML",
#'   select = c("Ensembl_ID", "TCGA-AB-2918-03A"), subset = Ensembl_ID == "ENSG00000251400"
#' )
datasets = function(id, verbose = FALSE) {
  cohort_ls(id, verbose)$dataset.id
}

#' @describeIn datasets Show information of a specified dataset
#' @export
dataset_info = function(id, cohort, verbose = FALSE) {
  stopifnot(length(id) == 1L, length(cohort) == 1L)
  qq.options(cat_verbose = verbose, LOCAL = TRUE)
  ch = cohort_ls(cohort, verbose)
  if (is.null(ch)) {
    return(NULL)
  }
  if (!id %in% ch$dataset.id) {
    qqcat("bad dataset ID, valid lists are:\n", cat_prefix = cat_prefix_err)
    print(ch$dataset.id)
    return(NULL)
  }
  idx = ch$dataset.id %in% id
  data = ch[idx]
  fls = as.character(fs::dir_ls(data$path, regexp = id))
  fls_row = fls[endsWith(fls, "rowidx")]
  c(
    as.list(data[, startsWith(colnames(data), "dataset"), with = FALSE]),
    list(
      rowidx = fls_row,
      colidx = fls[endsWith(fls, "colidx")],
      path = data$path
    )
  )
}

#' @describeIn datasets Query index information for a dataset
#' @export
dataset_query_idx = function(id, cohort, verbose = FALSE) {
  stopifnot(length(id) == 1L, length(cohort) == 1L)
  qq.options(cat_verbose = verbose, LOCAL = TRUE)

  data = dataset_info(id, cohort, verbose = verbose)
  fl_list = data[c("rowidx", "colidx")]
  rv = list()
  rv$rowidx = list()
  for (i in seq_along(fl_list)) {
    if (names(fl_list)[i] == "colidx") {
      if (verbose) qqcat("query column index data\n")
      rv$colidx = data.table::fread(fl_list[[i]], header = FALSE)[[1]]
    } else {
      if (verbose) qqcat("query row index data\n")
      if (length(fl_list[[i]]) > 1) {
        for (j in seq_along(fl_list[[i]])) {
          rv$rowidx[[j]] = data.table::fread(fl_list[[i]][[j]], header = FALSE)[[1]]
        }
        new_names = rm_strs(
          fs::path_file(fl_list[[i]]),
          c(paste0(data$dataset.filename, "."), ".rowidx")
        )
        names(rv$rowidx) = new_names
      } else {
        rv$rowidx = data.table::fread(fl_list[[i]], header = FALSE)[[1]]
      }
    }
  }
  rv
}

rm_strs = function(x, patterns) {
  for (i in patterns) {
    x = gsub(i, "", x)
  }
  x
}

#' @describeIn datasets Load data from specified dataset
#' @export
dataset_load = function(id, cohort, subset = NULL, select = NULL, verbose = FALSE, ...) {
  stopifnot(length(id) == 1L, length(cohort) == 1L)
  qq.options(cat_verbose = verbose, LOCAL = TRUE)

  info = dataset_info(id, cohort, verbose = verbose)
  datafile = file.path(info$path, info$dataset.filename)

  if (verbose) qqcat("loading dataset file with column subsetting\n")
  # colnames_data = colnames(data.table::fread(datafile, nrows = 1L, verbose = verbose))
  # if (verbose) qqcat("detected colnames of dataset @{collapse_text(colnames_data)}\n")
  data = data.table::fread(datafile, select = select, verbose = verbose, showProgress = verbose)

  if (verbose) qqcat("subsetting data")
  subset = substitute(subset)

  if (is.language(subset)) {
    data = subset(data, subset = eval(subset))
  }

  data
}
ShixiangWang/coco documentation built on July 9, 2022, 4:43 a.m.