R/array.R
In seeker: Simplified Fetching and Processing of Microarray and RNA-Seq Data

Documented in seekerArray

checkSeekerArrayArgs = function(
    study, geneIdType, platform, parentDir, metadataOnly) {
  assertString(study)
  assertTRUE(any(startsWith(study, c('GSE', 'E-', 'LOCAL'))))
  assertChoice(geneIdType, c('ensembl', 'entrez'))

  assertString(parentDir)
  assertDirectoryExists(parentDir)
  outputDir = file.path(path.expand(parentDir), study) # untar no like ~
  rawDir = file.path(outputDir, 'raw')
  metadataPath = file.path(outputDir, 'sample_metadata.csv')
  sampColname = 'sample_id'

  if (startsWith(study, 'GSE')) {
    repo = 'geo'
    assertString(platform, min.chars = 4, null.ok = TRUE)
    assert(is.null(platform), startsWith(platform, 'GPL'))
  } else if (startsWith(study, 'E-')) {
    repo = 'ae'
    assertNull(platform)
  } else {
    repo = 'local'
    assertString(platform, min.chars = 4)
    assertTRUE(startsWith(platform, 'GPL'))
    assertDirectoryExists(rawDir)
    assertFileExists(metadataPath)
    d = fread(metadataPath, na.strings = '')
    assertNames(colnames(d), must.include = sampColname)
    files = dir(rawDir, '\\.cel(\\.gz)?$', ignore.case = TRUE)
    fileSamples = gsub('\\.cel(\\.gz)?$', '', files, ignore.case = TRUE)
    assertSetEqual(fileSamples, d[[sampColname]])}

  assertFlag(metadataOnly)

  return(list(repo = repo, outputDir = outputDir, rawDir = rawDir,
              metadataPath = metadataPath, sampColname = sampColname))}


#' Process microarray data end to end
#'
#' This function fetches data and metadata from NCBI GEO and ArrayExpress,
#' processes raw Affymetrix data using RMA and custom CDFs from Brainarray, and
#' maps probes to genes. See also the vignettes:
#' \code{browseVignettes('seeker')}.
#'
#' The standard output:
#' * naive_expression_set.qs: Initial `ExpresssionSet` generated by
#'   [GEOquery::getGEO] or [ArrayExpress::ae2bioc()]. Should generally *not* be
#'   used if sample_metadata.csv and gene_expression_matrix.qs are available.
#' * sample_metadata.csv: Table of sample metadata. Column `sample_id` matches
#'   colnames of the gene expression matrix.
#' * gene_expression_matrix.qs: Rows correspond to genes, columns to samples.
#'   Expression values are log2-transformed.
#' * custom_cdf_name.txt: Name of custom CDF package used by [affy::justRMA()]
#'   to process and normalize raw Affymetrix data and map probes to genes.
#' * feature_metadata.qs: `GPL` object, if gene expression matrix was generated
#'   from processed data.
#' * probe_gene_mapping.csv.gz: Table of probes and genes, if gene expression
#'   matrix was generated from processed data.
#' * "raw" directory: Contains raw Affymetrix files.
#' * params.yml: Parameters used to process the dataset.
#' * session.log: R session information.
#'
#' The output may include other files from NCBI GEO or ArrayExpress. Files with
#' extension "qs" can be read into R using [qs::qread()].
#'
#' @param study String indicating the study accession and used to name the output
#'   directory within `parentDir`. Must start with "GSE", "E-", or "LOCAL". If
#'   starts with "GSE", data are fetched using [GEOquery::getGEO()]. If starts
#'   with "E-", data are fetched using [ArrayExpress::getAE()]. If starts with
#'   "LOCAL", data in the form of cel(.gz) files must in the directory
#'   `parentDir`/`study`/raw, and `parentDir`/`study` must contain
#'   a file "sample_metadata.csv" that has a column `sample_id` containing the
#'   names of the cel(.gz) files without the file extension.
#' @param geneIdType String indicating whether to map probes to gene IDs from
#'   Ensembl ("ensembl") or Entrez ("entrez").
#' @param platform String indicating the GEO-based platform accession for the raw
#'   data. See <https://www.ncbi.nlm.nih.gov/geo/browse/?view=platforms>.
#'   Only necessary if `study` starts with "LOCAL", or starts with "GSE"
#'   and the study uses multiple platforms.
#' @param metadataOnly Logical indicating whether to only process the sample
#'   metadata, and skip processing the expression data.
#'
#' @param parentDir Directory in which to store the output, which will be a
#'   directory named according to `study`.
#'
#' @return Path to the output directory `parentDir`/`study`, invisibly.
#'
#' @examples
#' \dontrun{
#' seekerArray('GSE25585', 'entrez')
#' }
#'
#' @seealso [seeker()]
#'
#' @export
seekerArray = function(
    study, geneIdType, platform = NULL, parentDir = '.', metadataOnly = FALSE) {

  r = checkSeekerArrayArgs(study, geneIdType, platform, parentDir, metadataOnly)
  repo = r$repo
  outputDir = r$outputDir
  rawDir = r$rawDir
  metadataPath = r$metadataPath
  sampColname = r$sampColname

  if (!dir.exists(outputDir)) dir.create(outputDir)

  withr::local_options(timeout = max(300, getOption('timeout')))
  withr::local_envvar(VROOM_CONNECTION_SIZE = 131072 * 20)

  result = if (repo == 'geo') {
    getNaiveEsetGeo(study, outputDir, rawDir, platform, metadataOnly)

  } else if (repo == 'ae') {
    if (!requireNamespace('ArrayExpress', quietly = TRUE)) {
      stop(
        glue('Processing {study} requires the ArrayExpress package.'),
        .call = FALSE)}

    rmaOk = glue('Cannot process {study} until the ArrayExpress package',
                 ' is updated to use the BioStudies API.')
    tryCatch(
      getNaiveEsetAe(study, outputDir, rawDir),
      error = function(e) list(eset = NULL, rmaOk = rmaOk))

  } else {
    getNaiveEsetLocal(study, platform)}

  eset = result$eset
  rmaOk = result$rmaOk

  if (repo == 'local') {
    metadata = fread(metadataPath, na.strings = '')
  } else if (!is.null(eset)) {
    qs::qsave(eset, file.path(outputDir, 'naive_expression_set.qs'))
    metadata = data.table(eset@phenoData@data, keep.rownames = sampColname)
    set(metadata, j = sampColname, value = stripFileExt(metadata[[sampColname]]))
    fwrite(metadata, metadataPath)}

  if (metadataOnly) return(invisible(outputDir))

  if (is.character(rmaOk)) {
    warning(rmaOk)
    return(invisible(outputDir))}

  if (rmaOk) {
    cdfname = getCdfname(eset@annotation, geneIdType)
    if (!requireNamespace(cdfname, quietly = TRUE)) {
      installCustomCdfPackages(cdfname)}
    fwrite(list(cdfname), file.path(outputDir, 'custom_cdf_name.txt'))

    emat = tryCatch({seekerRma(rawDir, cdfname)}, error = function(e) e)
    if (inherits(emat, 'error')) {
      rmaOk = FALSE
      warning(paste(
        'Attempting to use processed data, as RMA of raw Affymetrix data',
        'failed due to the following error:', trimws(as.character(emat))))
    } else {
      colnames(emat) = getNewEmatColnames(colnames(emat), repo)
      idx = metadata[[sampColname]] %in% colnames(emat)
      if (!all(idx)) {
        warning(paste(
          'The following sample(s) in the sample metadata',
          'is/are missing from the expression matrix:',
          paste(metadata[[sampColname]][!idx], collapse = ', ')))}
      emat = emat[, metadata[[sampColname]][idx], drop = FALSE]}

    paths = dir(rawDir, '\\.cel$', full.names = TRUE, ignore.case = TRUE)
    for (path in paths) {
      R.utils::gzip(path, overwrite = TRUE)}}

  if (!rmaOk) {
    # only GEO datasets
    featureMetadata = GEOquery::getGEO(eset@annotation)
    qs::qsave(featureMetadata, file.path(outputDir, 'feature_metadata.qs'))
    featureDt = setDT(featureMetadata@dataTable@table)

    platforms = getPlatforms('mapping')
    platformDt = platforms[platforms$platform == eset@annotation]

    mapping = getProbeGeneMapping(featureDt, platformDt, geneIdType)
    fwrite(mapping, file.path(outputDir, 'probe_gene_mapping.csv.gz'))

    emat = getLogTransformedEmat(eset@assayData$exprs)
    emat = getEmatGene(emat, mapping)}

  qs::qsave(emat, file.path(outputDir, 'gene_expression_matrix.qs'))

  params = list(study = study, geneIdType = geneIdType, platform = platform)
  yaml::write_yaml(params, file.path(outputDir, 'params.yml'))
  sessioninfo::session_info(
    info = 'auto', to_file = file.path(outputDir, 'session.log'))

  invisible(outputDir)}