Nothing
checkSeekerArrayArgs = function(
study, geneIdType, platform, parentDir, metadataOnly) {
assertString(study)
assertTRUE(any(startsWith(study, c('GSE', 'E-', 'LOCAL'))))
assertChoice(geneIdType, c('ensembl', 'entrez'))
assertString(parentDir)
assertDirectoryExists(parentDir)
outputDir = file.path(path.expand(parentDir), study) # untar no like ~
rawDir = file.path(outputDir, 'raw')
metadataPath = file.path(outputDir, 'sample_metadata.csv')
sampColname = 'sample_id'
if (startsWith(study, 'GSE')) {
repo = 'geo'
assertString(platform, min.chars = 4, null.ok = TRUE)
assert(is.null(platform), startsWith(platform, 'GPL'))
} else if (startsWith(study, 'E-')) {
repo = 'ae'
assertNull(platform)
} else {
repo = 'local'
assertString(platform, min.chars = 4)
assertTRUE(startsWith(platform, 'GPL'))
assertDirectoryExists(rawDir)
assertFileExists(metadataPath)
d = fread(metadataPath, na.strings = '')
assertNames(colnames(d), must.include = sampColname)
files = dir(rawDir, '\\.cel(\\.gz)?$', ignore.case = TRUE)
fileSamples = gsub('\\.cel(\\.gz)?$', '', files, ignore.case = TRUE)
assertSetEqual(fileSamples, d[[sampColname]])}
assertFlag(metadataOnly)
return(list(repo = repo, outputDir = outputDir, rawDir = rawDir,
metadataPath = metadataPath, sampColname = sampColname))}
#' Process microarray data end to end
#'
#' This function fetches data and metadata from NCBI GEO and ArrayExpress,
#' processes raw Affymetrix data using RMA and custom CDFs from Brainarray, and
#' maps probes to genes. See also the vignettes:
#' \code{browseVignettes('seeker')}.
#'
#' The standard output:
#' * naive_expression_set.qs: Initial `ExpresssionSet` generated by
#' [GEOquery::getGEO] or [ArrayExpress::ae2bioc()]. Should generally *not* be
#' used if sample_metadata.csv and gene_expression_matrix.qs are available.
#' * sample_metadata.csv: Table of sample metadata. Column `sample_id` matches
#' colnames of the gene expression matrix.
#' * gene_expression_matrix.qs: Rows correspond to genes, columns to samples.
#' Expression values are log2-transformed.
#' * custom_cdf_name.txt: Name of custom CDF package used by [affy::justRMA()]
#' to process and normalize raw Affymetrix data and map probes to genes.
#' * feature_metadata.qs: `GPL` object, if gene expression matrix was generated
#' from processed data.
#' * probe_gene_mapping.csv.gz: Table of probes and genes, if gene expression
#' matrix was generated from processed data.
#' * "raw" directory: Contains raw Affymetrix files.
#' * params.yml: Parameters used to process the dataset.
#' * session.log: R session information.
#'
#' The output may include other files from NCBI GEO or ArrayExpress. Files with
#' extension "qs" can be read into R using [qs::qread()].
#'
#' @param study String indicating the study accession and used to name the output
#' directory within `parentDir`. Must start with "GSE", "E-", or "LOCAL". If
#' starts with "GSE", data are fetched using [GEOquery::getGEO()]. If starts
#' with "E-", data are fetched using [ArrayExpress::getAE()]. If starts with
#' "LOCAL", data in the form of cel(.gz) files must in the directory
#' `parentDir`/`study`/raw, and `parentDir`/`study` must contain
#' a file "sample_metadata.csv" that has a column `sample_id` containing the
#' names of the cel(.gz) files without the file extension.
#' @param geneIdType String indicating whether to map probes to gene IDs from
#' Ensembl ("ensembl") or Entrez ("entrez").
#' @param platform String indicating the GEO-based platform accession for the raw
#' data. See <https://www.ncbi.nlm.nih.gov/geo/browse/?view=platforms>.
#' Only necessary if `study` starts with "LOCAL", or starts with "GSE"
#' and the study uses multiple platforms.
#' @param metadataOnly Logical indicating whether to only process the sample
#' metadata, and skip processing the expression data.
#'
#' @param parentDir Directory in which to store the output, which will be a
#' directory named according to `study`.
#'
#' @return Path to the output directory `parentDir`/`study`, invisibly.
#'
#' @examples
#' \dontrun{
#' seekerArray('GSE25585', 'entrez')
#' }
#'
#' @seealso [seeker()]
#'
#' @export
seekerArray = function(
study, geneIdType, platform = NULL, parentDir = '.', metadataOnly = FALSE) {
r = checkSeekerArrayArgs(study, geneIdType, platform, parentDir, metadataOnly)
repo = r$repo
outputDir = r$outputDir
rawDir = r$rawDir
metadataPath = r$metadataPath
sampColname = r$sampColname
if (!dir.exists(outputDir)) dir.create(outputDir)
withr::local_options(timeout = max(300, getOption('timeout')))
withr::local_envvar(VROOM_CONNECTION_SIZE = 131072 * 20)
result = if (repo == 'geo') {
getNaiveEsetGeo(study, outputDir, rawDir, platform, metadataOnly)
} else if (repo == 'ae') {
if (!requireNamespace('ArrayExpress', quietly = TRUE)) {
stop(
glue('Processing {study} requires the ArrayExpress package.'),
.call = FALSE)}
rmaOk = glue('Cannot process {study} until the ArrayExpress package',
' is updated to use the BioStudies API.')
tryCatch(
getNaiveEsetAe(study, outputDir, rawDir),
error = function(e) list(eset = NULL, rmaOk = rmaOk))
} else {
getNaiveEsetLocal(study, platform)}
eset = result$eset
rmaOk = result$rmaOk
if (repo == 'local') {
metadata = fread(metadataPath, na.strings = '')
} else if (!is.null(eset)) {
qs::qsave(eset, file.path(outputDir, 'naive_expression_set.qs'))
metadata = data.table(eset@phenoData@data, keep.rownames = sampColname)
set(metadata, j = sampColname, value = stripFileExt(metadata[[sampColname]]))
fwrite(metadata, metadataPath)}
if (metadataOnly) return(invisible(outputDir))
if (is.character(rmaOk)) {
warning(rmaOk)
return(invisible(outputDir))}
if (rmaOk) {
cdfname = getCdfname(eset@annotation, geneIdType)
if (!requireNamespace(cdfname, quietly = TRUE)) {
installCustomCdfPackages(cdfname)}
fwrite(list(cdfname), file.path(outputDir, 'custom_cdf_name.txt'))
emat = tryCatch({seekerRma(rawDir, cdfname)}, error = function(e) e)
if (inherits(emat, 'error')) {
rmaOk = FALSE
warning(paste(
'Attempting to use processed data, as RMA of raw Affymetrix data',
'failed due to the following error:', trimws(as.character(emat))))
} else {
colnames(emat) = getNewEmatColnames(colnames(emat), repo)
idx = metadata[[sampColname]] %in% colnames(emat)
if (!all(idx)) {
warning(paste(
'The following sample(s) in the sample metadata',
'is/are missing from the expression matrix:',
paste(metadata[[sampColname]][!idx], collapse = ', ')))}
emat = emat[, metadata[[sampColname]][idx], drop = FALSE]}
paths = dir(rawDir, '\\.cel$', full.names = TRUE, ignore.case = TRUE)
for (path in paths) {
R.utils::gzip(path, overwrite = TRUE)}}
if (!rmaOk) {
# only GEO datasets
featureMetadata = GEOquery::getGEO(eset@annotation)
qs::qsave(featureMetadata, file.path(outputDir, 'feature_metadata.qs'))
featureDt = setDT(featureMetadata@dataTable@table)
platforms = getPlatforms('mapping')
platformDt = platforms[platforms$platform == eset@annotation]
mapping = getProbeGeneMapping(featureDt, platformDt, geneIdType)
fwrite(mapping, file.path(outputDir, 'probe_gene_mapping.csv.gz'))
emat = getLogTransformedEmat(eset@assayData$exprs)
emat = getEmatGene(emat, mapping)}
qs::qsave(emat, file.path(outputDir, 'gene_expression_matrix.qs'))
params = list(study = study, geneIdType = geneIdType, platform = platform)
yaml::write_yaml(params, file.path(outputDir, 'params.yml'))
sessioninfo::session_info(
info = 'auto', to_file = file.path(outputDir, 'session.log'))
invisible(outputDir)}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.