R/cohort.R

#' Functions to access/manage cohort level data
#'
#' A cohort is a set of datasets from the same study or from the same group of
#' samples; it is the individual data unit in omics database construction.
#' TODO: Ensembl IDs and HUGO gene symbols will be supported and converted
#' automatically.
#' @name cohorts
#' @rdname cohorts
#' @param verbose Whether to print extra information.
#' @export
#' @examples
#' cohorts(verbose = TRUE)
cohorts = function(verbose = FALSE) {
  qq.options(cat_verbose = verbose, LOCAL = TRUE)
  databases = unlist(get_golem_config("coco.databases"))
  if (verbose) qqcat("checking databases: @{collapse_text(databases)}\n")
  ext_idx = fs::dir_exists(databases)
  if (sum(!ext_idx) > 0 && verbose) {
    qqcat("non-exist database path(s): @{collapse_text(databases[!ext_idx])}\n",
      cat_prefix = cat_prefix_warn
    )
  }
  databases = databases[ext_idx]
  paths = fs::dir_ls(databases)
  if (length(paths) < 1) {
    qqcat("no cohort available, please check your setting\n", cat_prefix = cat_prefix_warn)
    return(NULL)
  }

  cv = data.table(
    cohort = fs::path_file(paths),
    path = fs::path_expand(paths)
  )
  dup_ids = anyDuplicated(cv$cohort)
  if (dup_ids > 0) {
    dup_cohorts = cv$cohort[duplicated(cv$cohort)]
    qqcat("duplicated cohort names found, @{collapse_text(dup_cohorts)}\n",
      cat_prefix = cat_prefix_err
    )
    return(NULL)
  }
  return(cv)
}
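
# Sketch: how the cohort database is expected to be laid out on disk. The paths
# and file names below are illustrative only (not shipped with the package); the
# golem config key "coco.databases" lists one or more database directories, and
# each cohort is a subdirectory whose name becomes the cohort ID returned by
# cohorts().
#
#   /path/to/database/
#   |-- example_TCGA_LAML/
#   |   |-- example_TCGA_LAML_patient_info.txt.gz
#   |   `-- meta.yml
#   `-- another_cohort/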

#' @describeIn cohorts Create a new cohort with data stored in a path
#' @param path A directory path pointing to the new cohort.
#' @param name A descriptive name of the cohort.
#' @param cancer_type Cancer type the cohort belongs to.
#' @param data_provider Typically a university or an institute.
#' @param maintainer Maintainer name and email in format `xxx <xxx@xxx.com>`.
#' @param doi DOI linking to the reference.
#' @param year The year the cohort was generated; if not set, the current year is used.
#' @param dataset_rowidx Specifies which columns are used to generate row index
#' files from a dataset file. You can check `SPEC_ROWINDEX` for the implemented
#' specification. E.g., ".c1" for the "Matrix" data format means that if a dataset
#' is in "Matrix" format (typically gene expression), the first column is the index.
#' Alternatively, you can directly specify a column name, e.g., "Sample" for the
#' "Segment" data format.
#' @param dataset_ncount Similar to `dataset_rowidx`, but used to specify the
#' column for sample counting. See `SPEC_SAMPINDEX`.
#' @param dataset_options Valid dataset options, see `SPEC_DATASET_OPTIONS`.
#' NOTE: you can expand `SPEC_DATASET_OPTIONS` and `SPEC_ROWINDEX` yourself;
#' see the sketch below.
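#'
#' For illustration only, a hypothetical pair of custom specifications could look
#' like the sketch below. The object names (`my_rowidx`, `my_ncount`, `my_options`)
#' are assumptions for this example; check the package source for the exact shape
#' of the shipped `SPEC_*` objects.
#' ```r
#' my_rowidx  = list(Matrix = ".c1", Segment = "Sample")
#' my_ncount  = list(Matrix = "ncol-1", Segment = "Sample")
#' my_options = list(
#'   platforms = c("Clinical", "RNA-Seq", "WES"),
#'   types     = c("Phenotype", "Gene expression", "Mutation"),
#'   formats   = c("PatientInfo", "Matrix", "Segment", "MAF")
#' )
#' ```
#' These could then be passed as `dataset_rowidx`, `dataset_ncount` and
#' `dataset_options` to `cohort_new()`.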
#' @param dataset_list A `data.frame` containing information about the datasets in
#' this cohort. It should have the following fields:
#'   - `id`: The file name without the `.txt`/`.maf`(`.gz`) extension, which is used
#' as the identifier of the dataset and must start with the cohort ID as a prefix.
#' For example, if you have a gene expression dataset in cohort `abc` and the
#' dataset file is named `abc_expression.txt`, then the id should be set to
#' `abc_expression`; this way the file is unique within the whole database hosted
#' by `{coco}`.
#'   - `name`: Description of the dataset.
#'   - `genome_build`: Reference genome version, e.g., "hg38".
#'   - `data_platform`: See `SPEC_DATASET_OPTIONS`.
#'   - `data_type`: See `SPEC_DATASET_OPTIONS`.
#'   - `data_format`: See `SPEC_DATASET_OPTIONS`; used to filter and search.
#' For the "Segment" format, refer to https://github.com/ShixiangWang/DoAbsolute/blob/master/inst/extdata/SNP6_solid_tumor.seg.txt
#' as an example.
#'   - `tags`: Other labels for the dataset to help search and filter.
#' Separate multiple tags with commas.
#' @export
#' @examples
#' \donttest{
#' cohort_new(
#'   path = system.file("cohorts/example_TCGA_LAML", package = "coco"),
#'   name = "TCGA Acute Myeloid Leukemia (LAML) for examples utils",
#'   cancer_type = "LAML",
#'   data_provider = "TCGA",
#'   maintainer = "Shixiang Wang <wangsx1@sysucc.org.cn>",
#'   doi = NA,
#'   year = 2016,
#'   dataset_list = data.frame(
#'     id = c(
#'       "example_TCGA_LAML_patient_info",
#'       "example_TCGA_LAML_gene_expr_HTSeq_count",
#'       "example_TCGA_LAML_MAF"
#'     ),
#'     name = c(
#'       "Patient information", "Gene expression in log2(count)",
#'       "Mutation list"
#'     ),
#'     genome_build = c(NA, "hg19", "hg19"),
#'     data_platform = c("Clinical", "RNA-Seq", "WES"),
#'     data_type = c("Phenotype", "Gene expression", "Mutation"),
#'     data_format = c("PatientInfo", "Matrix", "MAF"),
#'     tags = c("survival,FAB", "HTSeq", NA)
#'   )
#' )
#' }
cohort_new = function(path, name, cancer_type, data_provider, maintainer,
                      dataset_list, doi = NA, year = NULL,
                      dataset_rowidx = SPEC_ROWINDEX,
                      dataset_options = SPEC_DATASET_OPTIONS,
                      dataset_ncount = SPEC_SAMPINDEX,
                      verbose = FALSE) {
  # Create a meta.yml file to store all the information
  stopifnot(
    length(path) == 1L,
    is.character(path),
    is.character(name),
    is.character(cancer_type),
    is.character(data_provider),
    is.character(maintainer),
    is.data.frame(dataset_list),
    nrow(dataset_list) > 0
  )
  qq.options(cat_verbose = verbose, LOCAL = TRUE)
  if (!fs::dir_exists(path)) {
    qqcat("hey, cohort path @{path} does not exist, please check\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }

  if (verbose) qqcat("recoding information ...\n")
  if (is.null(year)) year = format(Sys.Date(), "%Y")
  info = list(
    id = fs::path_file(path),
    name = name,
    cancer_type = cancer_type,
    data_provider = data_provider,
    maintainer = maintainer,
    doi = doi,
    year = year
  )

  if (verbose) {
    qqcat("checking dataset general metadata\n")
    print(info)
  }
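  # Validate the per-dataset platform/type/format values against the allowed
  # options before touching any files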
  if (!all(dataset_list$data_platform %in% dataset_options$platforms)) {
    qqcat("data platform does not pass, valid lists are: @{collapse_text(dataset_options$platforms)}\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }
  if (!all(dataset_list$data_type %in% dataset_options$types)) {
    qqcat("data type does not pass, valid lists are: @{collapse_text(dataset_options$types)}\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }
  if (!all(dataset_list$data_format %in% dataset_options$formats)) {
    qqcat("data format does not pass, valid lists are: @{collapse_text(dataset_options$formats)}\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }

  if (verbose) qqcat("parsing/checking input dataset list and generating metadata\n")
  if (sum(duplicated(dataset_list$id)) > 0) {
    qqcat("duplicated data list ID detected, this is invalid\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }
  if (!all(startsWith(dataset_list$id, info$id))) {
    qqcat("all dataset id should use id as prefix, which is: @{info$id}\n", cat_prefix = cat_prefix_err)
    return(NULL)
  }
  nd = nrow(dataset_list)
  dataset_list$n = NA_integer_
  dataset_list$filename = NA_character_
  for (i in seq_len(nd)) {
    if (verbose) qqcat("handling dataset `@{dataset_list$id[i]}`\n")
    fp = fs::dir_ls(path, regexp = dataset_list$id[i])
    fp = fp[!(endsWith(fp, "rowidx") | endsWith(fp, "colidx"))]
    if (length(fp) != 1) {
      qqcat("no unique dataset file path found, either zero or multiple files matched\n",
        cat_prefix = cat_prefix_err
      )
      qqcat("matched file list: @{collapse_text(fp)}\n", cat_prefix = cat_prefix_err)
      return(NULL)
    }
    # Sample index for sample count
    sampidx = dataset_ncount[[dataset_list$data_format[i]]]
    if (is.null(sampidx)) {
      qqcat("no info specified to count samples in dataset\n",
        cat_prefix = cat_prefix_err
      )
      return(NULL)
    }

    # Row index for metadata
    rowidx = dataset_rowidx[[dataset_list$data_format[i]]]
    if (is.null(rowidx)) {
      qqcat("no column to specified to generate row index metadata file\n",
        cat_prefix = cat_prefix_err
      )
      return(NULL)
    }
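    # Entries of the form ".c<N>" select a column by position; convert them to
    # integer indices, otherwise keep the column name as-is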
    rowidx = lapply(rowidx, function(x) {
      if (startsWith(x, ".c")) {
        # Convert ".c<N>" to an integer column index
        x = as.integer(sub(".c", "", x, fixed = TRUE))
      }
      x
    })

    if (length(rowidx) == 1) rowidx = rowidx[[1]]
    if (verbose) qqcat("reading data file @{fp}\n")
    data = data.table::fread(fp)
    if (verbose) qqcat("writing index files\n")

    # Sample count
    if (verbose) qqcat("count samples\n")
    nsample = if (sampidx == "ncol-1") {
      ncol(data) - 1
    } else {
      if (sampidx %in% colnames(data)) {
        length(unique(data[[sampidx]]))
      } else {
        NA_integer_
      }
    }
    if (verbose) qqcat("number: @{nsample}\n")

    # Column index
    if (verbose) qqcat("generating column index files\n")
    cx_fp = paste0(fp, ".colidx")
    cx_data = colnames(data)
    data.table::fwrite(data.table(id = cx_data), file = cx_fp, col.names = FALSE)

    # Row index
    if (verbose) qqcat("generating row index files\n")
    if (length(rowidx) > 1) {
      # Multiple row index files may be available for non-matrix data
      for (j in rowidx) {
        if (is.integer(j)) {
          rx_fp = paste0(fp, ".", colnames(data)[j], ".rowidx")
        } else {
          if (!j %in% colnames(data)) {
            qqcat("no column @{j} found in the dataset\n",
              cat_prefix = cat_prefix_err
            )
            return(NULL)
          }
          rx_fp = paste0(fp, ".", j, ".rowidx")
        }
        rx_data = unique(data[[j]])
        data.table::fwrite(data.table(id = rx_data), file = rx_fp, col.names = FALSE)
      }
    } else {
      rx_fp = paste0(fp, ".rowidx")
      rx_data = unique(data[[rowidx]])
      data.table::fwrite(data.table(id = rx_data), file = rx_fp, col.names = FALSE)
    }

    if (verbose) qqcat("done for dataset `@{dataset_list$id[i]}`\n")
    dataset_list$n[i] = nsample
    dataset_list$filename[i] = fs::path_file(fp)
  }

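  # Serialize the cohort-level fields plus the per-dataset table (now augmented
  # with the sample count `n` and the file name `filename`) into meta.yml, which
  # is what cohort_ls() reads back later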
  if (verbose) qqcat("generating cohort meta.yml\n")
  meta_fp = fs::path_join(c(path, "meta.yml"))
  info$dataset = dataset_list
  write_yaml(info, file = meta_fp)
  message("job finished, you can check `meta.yml` and index files in cohort path")
  return(invisible(NULL))
}

#' @describeIn cohorts List datasets (or metadata) in a cohort
#' @param id Cohort ID, which can be obtained from [cohorts()].
#' @export
#' @examples
#' cohort_ls("example_TCGA_LAML")
cohort_ls = function(id, verbose = FALSE) {
  # Read meta.yml file and parse it
  stopifnot(length(id) == 1L)
  qq.options(cat_verbose = verbose, LOCAL = TRUE)
  chs = cohorts(verbose)
  if (!id %in% chs$cohort) {
    qqcat("bad cohort ID, valid lists are:\n", cat_prefix = cat_prefix_err)
    print(chs$cohort)
    return(NULL)
  }
  chs = chs[chs$cohort == id]
  meta_fl = fs::path_join(c(chs$path, "meta.yml"))
  if (!fs::file_exists(meta_fl)) {
    qqcat("meta file `@{meta_fl}` does not exist, have you created? If yes, contact developer!",
      cat_prefix = cat_prefix_err
    )
    return(NULL)
  }
  data = as.data.frame(read_yaml(meta_fl))
  data$path = chs$path
  return(data.table(data))
}

cohort_load = function() {
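  # Placeholder: loading the data of a specific dataset from a cohort is not
  # implemented here yet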

}