R/array_utils.R
In seeker: Simplified Fetching and Processing of Microarray and RNA-Seq Data

Documented in getPlatforms installCustomCdfPackages

#' Get supported microarray platforms
#'
#' @param type String indicating whether to get supported platforms for
#'   processing raw Affymetrix data using custom CDF or for mapping already
#'   processed data from probes to genes.
#'
#' @return A `data.table`.
#'
#' @export
getPlatforms = function(type = c('cdf', 'mapping')) {
  type = match.arg(type)
  f = if (type == 'cdf') 'platform_cdf.csv' else 'platform_mapping.csv'
  path = system.file('extdata', f, package = 'seeker')
  d = unique(fread(path, na.strings = ''))

  if (type == 'cdf') {
    set(d, j = 'ensembl', value = paste0(d$custom_cdf_prefix, 'ensgcdf'))
    set(d, j = 'entrez', value = paste0(d$custom_cdf_prefix, 'entrezgcdf'))}
  return(d)}


getCdfname = function(anno, geneIdType) {
  d = getPlatforms('cdf')
  cdfname = d[d$platform == anno][[geneIdType]]
  return(cdfname)}


#' Install custom CDF packages
#'
#' Install Brainarray custom CDFs for processing raw Affymetrix data. See
#' <http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/CDF_download.asp>.
#'
#' @param pkgs Character vector of package names, e.g., "hgu133ahsentrezgcdf".
#' @param ver Integer version number (25 as of 5 Jan 2021).
#' @param dryRun Logical indicating whether to actually install the packages.
#'
#' @return A character vector of URLs, invisibly.
#'
#' @export
installCustomCdfPackages = function(pkgs, ver = 25, dryRun = FALSE) {
  assertCharacter(pkgs, any.missing = FALSE)
  assertIntegerish(ver, len = 1L, any.missing = FALSE)
  assertFlag(dryRun)
  urlPre = glue('http://mbni.org/customcdf/{ver}.0.0')
  pkg = NULL

  urls = foreach(pkg = unique(pkgs), .combine = c) %do% {
    hint = substr(pkg, nchar(pkg) - 6, nchar(pkg) - 3)
    urlMid = switch(hint, ensg = 'ensg', rezg = 'entrezg')
    if (is.null(urlMid)) {
      warning(glue('Cannot install {pkg}, since it doesn\'t correspond ',
                   'to gene IDs from Ensembl or Entrez.'))
      NA_character_
    } else {
      glue('{urlPre}/{urlMid}.download/{pkg}_{ver}.0.0.tar.gz')}}

  names(urls) = pkgs
  if (!dryRun) utils::install.packages(urls[!is.na(urls)], repos = NULL)
  invisible(urls)}


getNaiveEsetGeo = function(
    study, outputDir, rawDir, platform = NULL, metadataOnly = FALSE) {

  gq = GEOquery::getGEO(study, destdir = outputDir, getGPL = FALSE)
  idx = if (length(gq) == 1L) {
    1L
  } else if (is.null(platform)) {
    return(list(rmaOk = glue('{study} uses multiple platforms, but params does ',
                             'not specify which one to use for processing.')))
  } else {
    which(sapply(gq, function(x) x@annotation) == platform)}

  if (length(idx) == 0) {
    return(list(rmaOk = glue(
      '{study} uses multiple platforms, none of which is {platform}.')))}
  eset = gq[[idx]]

  rmaOk = eset@annotation %in% getPlatforms('cdf')$platform
  procOk = eset@annotation %in% getPlatforms('mapping')$platform

  dSuppTmp = GEOquery::getGEOSuppFiles(
    study, makeDirectory = FALSE, baseDir = outputDir, fetch_files = FALSE)
  isRaw = grepl('_RAW\\.tar$', dSuppTmp$fname)

  if (metadataOnly) return(list(eset = eset))

  if (rmaOk && any(isRaw)) {
    dSupp = GEOquery::getGEOSuppFiles(
      study, makeDirectory = FALSE, baseDir = outputDir)
    paths = rownames(dSupp)[isRaw]
    for (path in paths) {
      utils::untar(path, exdir = rawDir)}
    unlink(paths)

  } else if (procOk) {
    rmaOk = FALSE
  } else {
    rmaOk = glue('{study} uses platform {eset@annotation}, ',
                 'which is not supported with the available data.')}

  return(list(eset = eset, rmaOk = rmaOk))}


getAeMetadata = function(study, type = c('search', 'studies')) {
  type = match.arg(type)
  # urlBase = 'https://www.ebi.ac.uk/arrayexpress/json/v3'
  # url = glue('{urlBase}/{type}/{study}')
  urlBase = 'https://www.ebi.ac.uk/biostudies/api/v1'
  url = if (type == 'search') {
    glue('{urlBase}/search?accession={study}')
  } else {
    glue('{urlBase}/studies/{study}')}

  raw = curl::curl_fetch_memory(url)
  x = jsonlite::fromJSON(rawToChar(raw$content))
  # x = if (type == 'experiments') {
  #   setDT(x$experiments$experiment)
  # } else {
  #   x$files$experiment$file}
  return(x)}


getNaiveEsetAe = function(study, outputDir, rawDir) {
  withr::local_options(timeout = max(300, getOption('timeout')))

  metaList = getAeMetadata(study, 'search')
  if (metaList$totalHits != 1L) {
    return(list(rmaOk = glue(
    "'{study}' does not match exactly one study, cannot proceed.")))}

  meta1 = getAeMetadata(study, 'studies')$section$subsections
  idx = sapply(meta1, function(y) identical(y$type, 'Assays and Data'))
  meta2 = meta1[[which(idx)]]$subsections
  idx = meta2$type == 'Array Designs'
  meta3 = meta2$links[[which(idx)]]

  if (length(meta3) != 1L) {
    return(list(rmaOk = glue(
      '{study} does not use exactly one platform, cannot proceed.')))}

  platform = meta3[[1L]]$url
  idx = sapply(meta1, function(y) identical(y$type, 'MIAME Score'))
  meta4 = meta1[[which(idx)]]$attributes
  hasRaw = meta4$value[meta4$name == 'Raw'] == '*'
  hasProc = meta4$value[meta4$name == 'Processed'] == '*'

  # arrays = expers[1L]$arraydesign
  # if (length(arrays) != 1L) {
  #   return(list(rmaOk = glue(
  #     '{study} does not use exactly one platform, cannot proceed.')))}
  # platform = arrays[[1L]]$accession
  #
  # files = getAeMetadata(study, 'files') # for one study, should be a data.frame
  # hasRaw = any(files$kind == 'raw')
  # hasProc = any(sapply(files$kind, function(k) any(k == 'processed', na.rm = TRUE)))
  #
  # if (metadataOnly) return(list(eset = eset))

  if ((platform %in% getPlatforms('cdf')$ae_accession) && hasRaw) {
    mage = ArrayExpress::getAE(study, path = outputDir, type = 'full')
    eset = ArrayExpress::ae2bioc(mage)
    rmaOk = TRUE
  } else {
    type = if (hasRaw && hasProc) 'full' else if (hasRaw) 'raw' else 'processed'
    mage = ArrayExpress::getAE(
      study, path = outputDir, type = type, extract = FALSE)
    eset = NULL
    rmaOk = glue('{study} does not have raw data from a supported ',
                 'Affymetrix platform. You take it from here.')}

  if (!is.null(mage$rawArchive)) {
    unlink(file.path(outputDir, mage$rawArchive))}

  if (!dir.exists(rawDir)) dir.create(rawDir)

  . = file.rename(file.path(outputDir, basename(mage$rawFiles)),
                  file.path(rawDir, basename(mage$rawFiles)))

  return(list(eset = eset, rmaOk = rmaOk))}


methods::setClass('Eset', slots = c(annotation = 'character'))


getNaiveEsetLocal = function(study, platform) {
  rmaOk = platform %in% getPlatforms('cdf')$platform
  # procOk = platform %in% getPlatforms('mapping')$platform
  # TODO: allow processed data?
  if (rmaOk) {
    eset = methods::new('Eset', annotation = platform)
  } else {
    eset = NULL
    rmaOk = glue(
      '{study} uses platform {platform}, which is not supported for local data.')}
  return(list(eset = eset, rmaOk = rmaOk))}


seekerRma = function(inputDir, cdfname) {
  withr::local_dir(inputDir)
  eset = affy::justRMA(cdfname = cdfname)
  emat = eset@assayData$exprs
  rownames(emat) = gsub('_at$', '', rownames(emat))
  return(emat)}


stripFileExt = function(x) {
  y = gsub('\\.cel(\\.gz)?$', '', x, ignore.case = TRUE)
  return(y)}


getNewEmatColnames = function(old, repo) {
  if (repo == 'geo') {
    r = regexpr('^GSM\\d+', old)
    new = substr(old, r, attr(r, 'match.length'))
  } else {
    new = stripFileExt(old)}
  return(new)}


getProbeGeneMappingDirect = function(featureDt, geneColname, probeColname = 'ID') {
  mapping = featureDt[, c(probeColname, geneColname), with = FALSE]
  mapping = mapping[apply(mapping, 1, function(x) all(!is.na(x) & x != '')), ]
  setnames(mapping, c(probeColname, geneColname), c('probe_set', 'gene_id'))

  idx = attr(regexpr('^[0-9]+', mapping$gene_id), 'match.length')
  mapping = mapping[idx > 0]
  set(mapping, j = 'gene_id', value = substr(mapping$gene_id, 1, idx[idx > 0]))
  return(mapping)}


getProbeGeneMappingAnno = function(featureDt, dbName, interName) {
  mappingProbeTmp = featureDt[
    !is.na(featureDt[[interName]]) & featureDt[[interName]] != '',
    c('ID', interName), with = FALSE]
  setnames(mappingProbeTmp, c('ID', interName), c('probe_set', 'geneInter'))

  pkgName = paste0(substr(dbName, 1, 9), '.db')
  if (!requireNamespace(pkgName, quietly = TRUE)) {
    BiocManager::install(pkgName)}

  mapTmp1 = eval(parse(text = glue('{pkgName}::{dbName}')))
  mapTmp2 = AnnotationDbi::mappedkeys(mapTmp1)
  mapTmp3 = as.list(mapTmp1[mapTmp2])

  mappingIdInter = data.table(
    geneInter = rep.int(names(mapTmp3), lengths(mapTmp3)),
    gene_id = unlist(mapTmp3, use.names = FALSE))

  if (dbName == 'org.Hs.egUNIGENE2EG') {
    set(mappingIdInter, j = 'geneInter',
        value = gsub('^Hs\\.', '', mappingIdInter$geneInter))}

  mapping = merge(mappingIdInter, mappingProbeTmp, by = 'geneInter', sort = FALSE)
  mapping = mapping[, c('probe_set', 'gene_id')]
  return(mapping)}


getEntrezEnsemblMapping = function(organism) {
  pkgName = glue('org.{organism}.eg.db')
  if (!requireNamespace(pkgName, quietly = TRUE)) {
    BiocManager::install(pkgName)}

  mapTmp1 = eval(parse(text = glue('{pkgName}::org.{organism}.egENSEMBL')))
  mapTmp2 = AnnotationDbi::mappedkeys(mapTmp1)
  mapTmp3 = as.list(mapTmp1[mapTmp2])

  m = data.table(entrez = rep.int(names(mapTmp3), lengths(mapTmp3)),
                 ensembl = unlist(mapTmp3, use.names = FALSE))
  return(m)}


getProbeGeneMapping = function(featureDt, platformDt, geneIdType) {
  . = probe_set = gene_id = NULL

  if (platformDt$mappingFunction == 'Direct') {
    mapping = getProbeGeneMappingDirect(featureDt, platformDt$geneColname)
  } else {
    if (!is.na(platformDt$splitColumn)) {
      set(featureDt, j = platformDt$interName,
          value = gsub('\\.\\d+$', '', featureDt[[platformDt$splitColumn]]))}
    mapping = getProbeGeneMappingAnno(
      featureDt, platformDt$dbName, platformDt$interName)}

  for (col in c('probe_set', 'gene_id')) {
    set(mapping, j = col, value = as.character(mapping[[col]]))}

  if (geneIdType == 'ensembl') {
    m = getEntrezEnsemblMapping(platformDt$organism)
    mapping = merge(mapping[, .(probe_set, entrez = gene_id)], m, by = 'entrez')
    set(mapping, j = 'entrez', value = NULL)
    setnames(mapping, 'ensembl', 'gene_id')
    data.table::setorderv(mapping, 'gene_id')}

  return(mapping)}


getLogTransformedEmat = function(emat) {
  # adapted from GEO2R
  qx = stats::quantile(emat, c(0, 0.25, 0.99, 1.0), na.rm = TRUE)
  needsLog = (qx[3L] > 100) || (qx[4L] - qx[1L] > 50 && qx[2L] > 0)
  if (needsLog) {
    idx = emat > 0
    minVal = min(emat[idx], na.rm = TRUE)
    emat[!idx & !is.na(emat)] = minVal
    emat = log2(emat)}
  return(emat)}


getEmatGene = function(ematProbe, mapping) {
  .SD = NULL
  idx = rowSums(is.na(ematProbe)) == 0
  dProbe = data.table(ematProbe[idx, , drop = FALSE], keep.rownames = 'probe_set')
  dProbe = merge(mapping, dProbe, by = 'probe_set', sort = FALSE)
  dGene = dProbe[, lapply(.SD, stats::median),
                 by = 'gene_id', .SDcols = !'probe_set']
  ematGene = as.matrix(dGene[, !'gene_id'])
  rownames(ematGene) = dGene$gene_id
  return(ematGene)}

Any scripts or data that you put into this service are public.

seeker documentation built on Sept. 11, 2024, 7:54 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

seeker
Simplified Fetching and Processing of Microarray and RNA-Seq Data

R/array_utils.R
In seeker: Simplified Fetching and Processing of Microarray and RNA-Seq Data

Defines functions getProbeGeneMapping getEntrezEnsemblMapping getProbeGeneMappingAnno getProbeGeneMappingDirect getNewEmatColnames stripFileExt seekerRma getNaiveEsetLocal getNaiveEsetAe getAeMetadata getNaiveEsetGeo installCustomCdfPackages getCdfname getPlatforms

Documented in getPlatforms installCustomCdfPackages

Try the seeker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

seeker Simplified Fetching and Processing of Microarray and RNA-Seq Data

R/array_utils.R In seeker: Simplified Fetching and Processing of Microarray and RNA-Seq Data

Defines functions getProbeGeneMapping getEntrezEnsemblMapping getProbeGeneMappingAnno getProbeGeneMappingDirect getNewEmatColnames stripFileExt seekerRma getNaiveEsetLocal getNaiveEsetAe getAeMetadata getNaiveEsetGeo installCustomCdfPackages getCdfname getPlatforms

Documented in getPlatforms installCustomCdfPackages

Try the seeker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

seeker
Simplified Fetching and Processing of Microarray and RNA-Seq Data

R/array_utils.R
In seeker: Simplified Fetching and Processing of Microarray and RNA-Seq Data