R/loadGEO.R
In phantasus: Visual and interactive gene expression analysis

Documented in checkGPLsFallback getArchs4Files getES getGDS getGSE loadFromARCHS4 loadGEO reparseCachedESs

#' Load GEO Dataset.
#'
#' \code{loadGEO} returns the file with serialized ExpressionSet using
#'     ProtoBuf, parsed from data downloaded from GEO by identifier.
#'
#' @param name String, containing GEO identifier of the dataset.
#'     It should start with 'GSE' or 'GDS' and can include exact GPL
#'     to annotate dataset, separated with dash ('-') from the identifier.
#'
#' @param type Type of the dataset: 'GSE' or 'GDS'. If not specified,
#'     the function will take first three letters
#'     of \code{name} variable as type.
#'
#' @return File with ProtoBuf-serialized ExpressionSet-s
#'     that were downloaded by this identifier.
#'     For GSE-datasets there can be multiple annotations, so in file will be a
#'     list mapping name with GPL to ExpressionSet.
#'
#' @examples
#' \dontrun{
#'     loadGEO("GSE27112")
#'     loadGEO("GDS4922")
#' }
#'
#' @importFrom utils tail
#' @import Biobase
#' @import GEOquery
loadGEO <- function(name, type = NA) {
    cacheDir <- getOption("phantasusCacheDir")

    if (is.null(cacheDir)) {
        cacheDir <- tempdir()
    } else if (!dir.exists(cacheDir)) {
        dir.create(cacheDir)
    }

    mirrorPath <- getOption('phantasusMirrorPath')
    if (is.null(mirrorPath)) {
        mirrorPath <- "https://ftp.ncbi.nlm.nih.gov"
    }

    geoDir <- getGEODir(name, cacheDir)
    binaryName <- paste0(name, '.bin')
    filePath <- file.path(geoDir, binaryName)
    urls <- c()


    ess <- getES(name, type, destdir = cacheDir, mirrorPath = mirrorPath)

    files <- list()
    assign("es", utils::tail(ess, n=1)[[1]], envir = parent.frame())
    if (!file.exists(filePath)) {
        for (i in seq_along(ess)) {
            seriesName <- if (!grepl(pattern = "-", name) && length(ess) > 1)
                paste0(name, "-", annotation(ess[[i]])) else name
            files[[seriesName]] <- writeToList(ess[[i]])
        }
        tempBinFile <- tempfile(paste0(binaryName, ".binsave"), tmpdir=geoDir)
        protolite::serialize_pb(files, tempBinFile)
        message('Saved binary file: ', tempBinFile)
        file.rename(tempBinFile, filePath)
    }


    urls <-c(urls, unlist(strsplit(filePath, cacheDir, fixed = TRUE))[2])
    jsonlite::toJSON(urls)
}

#' Load ExpressionSet from GEO Datasets
#'
#'\code{getGDS} return the ExpressionSet object corresponding
#'     to GEO Dataset identifier.
#'
#' @param name String, containing GEO identifier of the dataset.
#'     It should start with 'GSE' or 'GDS' and can include exact GPL
#'     to annotate dataset, separated with dash ('-') from the identifier.
#'
#' @param destdir Directory for caching loaded Series and GPL
#'     files from GEO database.
#'
#' @param mirrorPath URL string which specifies the source of matrices.
#'
#' @return ExpressionSet object wrapped in list, that was available by given
#'     in \code{name} variable GEO identifier.
#'
#' @examples
#' getGDS('GDS4922')
#'
#' @export
getGDS <- function(name, destdir = tempdir(),
                   mirrorPath = "https://ftp.ncbi.nlm.nih.gov") {
    stub <- gsub("\\d{1,3}$", "nnn", name, perl = TRUE)
    filename <- sprintf("%s.soft.gz", name)
    gdsurl <- "%s/geo/datasets/%s/%s/soft/%s"

    fullGEODirPath <- getGEODir(name, destdir)
    destfile <- file.path(fullGEODirPath, filename)

    infile <- file.exists(destfile)

    if (!infile) {
        tempDestFile <- tempfile(paste0(filename, ".load"), tmpdir=fullGEODirPath)
        tryCatch({
            utils::download.file(sprintf(gdsurl, mirrorPath,
                                         stub, name, filename),
                            destfile = tempDestFile,
                            method="libcurl")
            file.rename(tempDestFile, destfile)
            infile <- TRUE
        },
        error = function(e) {
            file.remove(tempDestFile)
        },
        warning = function(w) {
            file.remove(tempDestFile)
        })
    } else {
        message(paste("Loading from locally found file", destfile))
    }

    l <- suppressWarnings(getGEO(GEO = name, destdir = fullGEODirPath, AnnotGPL = TRUE))

    # extracting all useful information on dataset
    table <- methods::slot(l, "dataTable")


    # extracting table ID_REF | IDENTIFIER/SAMPLE | SAMPLE1 | ...
    data <- Table(table)
    columnsMeta <- Columns(table)  # phenoData
    sampleNames <- as.vector(columnsMeta[["sample"]])
    rownames <- as.vector(data[["ID_REF"]])
    symbol <- as.vector(data[["IDENTIFIER"]])

    data <- data[, sampleNames]  # expression data
    exprs <- as.matrix(data)
    row.names(exprs) <- rownames

    row.names(columnsMeta) <- sampleNames
    pData <- AnnotatedDataFrame(data.frame(columnsMeta, check.names = FALSE))

    fData <- data.frame(id=rownames, symbol=symbol, row.names = rownames, stringsAsFactors = FALSE)
    fData <- AnnotatedDataFrame(fData)

    mabstract=ifelse(is.null(Meta(l)$description),"",Meta(l)$description)
    mpubmedids=ifelse(is.null(Meta(l)$pubmed_id),"",Meta(l)$pubmed_id)
    mtitle=ifelse(is.null(Meta(l)$title),"",Meta(l)$title)

    list(ExpressionSet(assayData = exprs,
                       phenoData = pData,
                       featureData = fData,
                       experimentData=new("MIAME",
                                          abstract=mabstract,
                                          title=mtitle,
                                          pubMedIds=mpubmedids,
                                          other=Meta(l))))
}


#' Returns list of ARCHS4 hdf5 files with expression data
#' @param cacheDir base directory for cache
#' @return list of .h5 files
getArchs4Files <- function(cacheDir) {
    list.files(paste(file.path(cacheDir), 'archs4', sep = .Platform$file.sep), '\\.h5$', full.names = TRUE)
}


#' Loads expression data from ARCHS4 count files.
#' Only sapmles with counted expression are kept.
#' If es already containts expression data it is returned as is.
#' @param es ExpressionSet from GEO to check for expression in ARCHS4
#' @param archs4_files list of available .h5 files from ARCHS4 project
#' @return either original es or an ExpressionSet with loaded count data from ARCHS4
loadFromARCHS4 <- function(es, archs4_files) {
    if (nrow(es) > 0 ) {
        return(es)
    }
    for (destfile in archs4_files) {

        samples <- h5read(destfile, "meta/Sample_geo_accession")
        sampleIndexes <- match(es$geo_accession,
                               samples)

        if (sum(!is.na(sampleIndexes)) == 0) {
            # no needed samples in this file
            H5close()
            next
        }

        genes <- as.character(h5read(destfile, "meta/genes"))
        h5Indexes = list(seq_len(length(genes)),
                         stats::na.omit(sampleIndexes))

        expression <- h5read(destfile,
                             "data/expression",
                             index=h5Indexes)
        rownames(expression) <- genes
        colnames(expression) <- colnames(es)[!is.na(sampleIndexes)]

        es2 <- ExpressionSet(assayData = expression,
                             phenoData = phenoData(es[, !is.na(sampleIndexes)]),
                             annotation = annotation(es),
                             experimentData = experimentData(es)
                            )

        tryCatch({
          fData(es2) <- cbind(fData(es2), "ENSEMBLID"=as.character(h5read(destfile, "meta/gene_ensemblid")))
        }, error=function (e) {})

        tryCatch({
          fData(es2) <- cbind(fData(es2), "Gene ID"=as.character(h5read(destfile, "meta/gene_entrezid")))
        }, error=function (e) {})

        fData(es2) <- cbind(fData(es2), "Gene symbol"=rownames(es2))
        H5close()
        return(es2)
    }

    return(es)
}


filterFeatureAnnotations <- function(es) {
    fvarsToKeep <- c()
    if ("Gene symbol" %in% fvarLabels(es)) {
        fvarsToKeep <- c(fvarsToKeep, "Gene symbol")
    } else {
        fvarsToKeep <- c(fvarsToKeep, grep("symbol",
                                           fvarLabels(es),
                                           ignore.case = TRUE,
                                           value = TRUE))
    }

    if ("Gene ID" %in% fvarLabels(es)) {
        fvarsToKeep <- c(fvarsToKeep, "Gene ID")
    } else if ("ID" %in% fvarLabels(es)) {
        fvarsToKeep <- c(fvarsToKeep, "ID")
    } else {
        fvarsToKeep <- c(fvarsToKeep, grep("entrez",
                                           fvarLabels(es),
                                           ignore.case = TRUE,
                                           value = TRUE))
    }

    featureData(es) <- featureData(es)[, fvarsToKeep]

    if (!any(sapply(fData(es),
                    function(x) identical(rownames(es), as.character(x))
                    ))) {
        fData(es) <- cbind("id"=rownames(es), fData(es))
    }

    es
}

filterPhenoAnnotations <- function(es) {
    phenoData(es) <- phenoData(es)[,
                                   grepl("characteristics",
                                         varLabels(es),
                                         ignore.case = TRUE) |
                                       (varLabels(es) %in% c("title",
                                                             "id",
                                                             "geo_accession"
                                       ))]

    chr <- varLabels(es)[grepl("characteristics",
                               varLabels(es),
                               ignore.case = TRUE)]

    parsePData <- function(old.phenodata) {
        old.pdata <- pData(old.phenodata)
        labels <- varLabels(old.phenodata)

        new.pdata <- as.data.frame(matrix(NA, nrow = nrow(old.pdata), ncol = 0))


        for (i in seq_len(ncol(old.pdata))) {
            splitted <- strsplit(as.vector(old.pdata[[i]]), ':')
            lengths <- sapply(splitted, length)
            if (any(lengths != 2 & lengths != 0)) {
                new.pdata[[labels[i]]] <- old.pdata[[i]]
            } else {
                zeros <- which(lengths == 0)

                splitted[zeros] <- replicate(length(zeros), list(c(NA, NA)))

                newnames <- unique(trimws(take(splitted, 1)))
                newnames <- newnames[which(!is.na(newnames))]

                for (j in seq_along(newnames)) {
                    name <- newnames[j]
                    if (!(name %in% names(new.pdata))) {
                        new.pdata[[name]] <- replicate(nrow(new.pdata), NA)
                    }
                    indices <- which(name == trimws(take(splitted, 1)))
                    new.pdata[[name]][indices] <- trimws(take(splitted, 2)[indices])
                }
            }
        }
        rownames(new.pdata) <- rownames(old.pdata)
        AnnotatedDataFrame(new.pdata)
    }

    if (ncol(es) > 0) {
        phenoData(es) <- parsePData(phenoData(es))
    }

    es
}


#' Load ExpressionSet from GEO Series
#'
#'\code{getGSE} return the ExpressionSet object(s) corresponding
#'     to GEO Series Identifier.
#'
#' @param name String, containing GEO identifier of the dataset.
#'     It should start with 'GSE' or 'GDS' and can include exact GPL
#'     to annotate dataset, separated with dash ('-') from the identifier.
#'
#' @param destdir Directory for caching loaded Series and GPL
#'     files from GEO database.
#'
#' @param mirrorPath URL string which specifies the source of matrices.
#'
#' @return List of ExpressionSet objects, that were available by given
#'     in \code{name} variable GEO identifier.
#'
#' @examples
#' \dontrun{
#'     getGSE('GSE14308', destdir = 'cache')
#'     getGSE('GSE27112')
#' }
#' getGSE('GSE53986')
#'
#' @export
#' @import rhdf5
getGSE <- function(name, destdir = tempdir(),
                   mirrorPath = "https://ftp.ncbi.nlm.nih.gov") {
    GEO <- unlist(strsplit(name, "-"))[1]

    stub <- gsub("\\d{1,3}$", "nnn", GEO, perl = TRUE)
    filename <- sprintf("%s_series_matrix.txt.gz", name)
    gseurl <- "%s/geo/series/%s/%s/matrix/%s"

    fullGEODirPath <- getGEODir(name, destdir)
    destfile <- file.path(fullGEODirPath, filename)

    if (!checkGSEType(name, destdir)) {
      stop('Currently unsupported experiment type')
    }

    infile <- file.exists(destfile)

    if (!infile) {
        tempDestFile <- tempfile(paste0(filename, ".load"), tmpdir=fullGEODirPath)
        tryCatch({
            utils::download.file(sprintf(gseurl, mirrorPath,
                                         stub, GEO, filename),
                                    destfile = tempDestFile,
                                 method="libcurl")
            file.rename(tempDestFile, destfile)
            infile <- TRUE
        },
        error = function(e) {
            file.remove(tempDestFile)
        },
        warning = function(w) {
            file.remove(tempDestFile)
        })
    } else {
        message(paste("Loading from locally found file", destfile))
    }

    if (infile && file.size(destfile) > 0) {
        ess <- list(suppressWarnings(getGEO(filename = destfile,destdir = fullGEODirPath, getGPL = FALSE, AnnotGPL = FALSE)))
        for (i in seq_len(length(ess))) {
            ess[[i]] <- annotateFeatureData(ess[[i]], destdir)
        }
    } else {
        gpls <- fromJSON(checkGPLs(name))
        if (length(gpls) == 0) {
            stop(paste("Dataset", name, "not found"))
        }
        if (length(gpls) == 1 && gpls == name) {
            stop(paste("Can't download dataset ", name))

        }
        ess <- list()
        for (i in 1:length(gpls)) {
          ess[[gpls[[i]]]] <- getGSE(gpls[[i]], destdir = destdir, mirrorPath = mirrorPath)[[1]]
        }
        return(ess)
    }


    archs4_files <- getArchs4Files(destdir)
    if (length(archs4_files) > 0)  {
        ess <- lapply(ess, loadFromARCHS4, archs4_files=archs4_files)
    }

    ess <- lapply(ess, filterFeatureAnnotations)

    ess <- lapply(ess, filterPhenoAnnotations)

    ess <- lapply(ess, inferCondition)

    ess


}

#' Load ExpressionSet by GEO identifier
#'
#'\code{getES} return the ExpressionSet object(s) corresponding
#'     to GEO identifier.
#'
#' @param name String, containing GEO identifier of the dataset.
#'     It should start with 'GSE' or 'GDS' and can include exact GPL
#'     to annotate dataset, separated with dash ('-') from the identifier.
#'
#' @param type Type of the dataset: 'GSE' or 'GDS'. If not specified,
#'     the function will take first three letters
#'     of \code{name} variable as type.
#'
#' @param destdir Directory for caching loaded Series and GPL
#'     files from GEO database.
#'
#' @param mirrorPath URL string which specifies the source of matrices.
#'
#' @return List of ExpressionSet objects, that were available by given
#'     in \code{name} variable GEO identifier.
#'
#' @examples
#' \dontrun{
#'     getES('GSE14308', type = 'GSE', destdir = 'cache')
#'     getES('GSE27112')
#' }
#' getES('GDS4922')
#'
#' @export
getES <- function(name, type = NA, destdir = tempdir(),
                  mirrorPath = "https://ftp.ncbi.nlm.nih.gov") {
    if (is.na(type)) {
        type <- substr(name, 1, 3)
    }
    geoDir <- getGEODir(name, destdir)
    possibly.cached <- file.path(geoDir, paste0(name, ".rda"))
    if (file.exists(possibly.cached)) {
        load(possibly.cached)
        message(paste("Loaded from locally cached parsed file", possibly.cached))
    } else {
        if (type == "GSE") {
            res <- getGSE(name, destdir, mirrorPath)
        } else if (type == "GDS") {
            res <- getGDS(name, destdir, mirrorPath)
        } else {
            stop("Incorrect name or type of the dataset")
        }
        if (length(res) > 1) {
            for (i in 1:length(res)) {
                ess <- list(res[[i]])
                destfile <- file.path(geoDir,
                                      paste0(name,
                                             "-",
                                             annotation(res[[i]]),
                                             ".rda"))
                message(paste("Cached dataset to ", destfile))
                save(ess, file = destfile)
            }
        }
        ess <- res
        destfile <- file.path(geoDir, paste0(name, ".rda"))
        message(paste("Cached dataset to ", destfile))
        save(ess, file = destfile)
    }
    ess
}


listCachedESs <- function(destdir) {
    correctFileName <-

    res <- list.files(destdir, pattern=".*\\.rda$", recursive = TRUE)
    res <- grep("\\.gz\\.rda$", res, invert = TRUE, value = TRUE)
    res <- res[grep("^(GSE|GDS)", basename(res), value = FALSE)]
    res <- sub("\\.rda$", "", res)
    res
}

#' Reparse cached expression sets from GEO.
#'
#' The function should be used on phantasus version updates that change
#' behavior of loading datasets from GEO. It finds all the datasets
#' that were cached and runs `getES` for them again. The function
#' uses cached Series and other files from GEO.
#'
#' @param destdir Directory used for caching loaded Series files from GEO database.
#'
#' @param mirrorPath URL string which specifies the source of matrices.
#'
#' @return vector of previously cached GSE IDs
#'
#' @examples
#' reparseCachedESs(destdir=tempdir())
#'
#' @export
reparseCachedESs <- function(destdir,
                                mirrorPath = "https://ftp.ncbi.nlm.nih.gov") {
    toReparse <- listCachedESs(destdir)

    for (path in toReparse) {
        name <- basename(path)
        geoDir <- getGEODir(name, destdir)
        message(paste0("Reparsing dataset ", name))
        destfile <- file.path(geoDir, paste0(name, ".rda"))
        bakfile <- paste0(destfile, ".bak")
        binfile <- file.path(geoDir, paste0(name, ".bin"))
        if (file.exists(binfile)) {
            file.remove(binfile)
        }

        tryCatch({
            file.rename(destfile, bakfile)
            getES(name, destdir = destdir, mirrorPath = mirrorPath)
            file.remove(bakfile)
        }, error = function(e) {
            message(paste0("Error occured while reparsing, old file stored as ",
                           bakfile))
        })
    }
    return(basename(toReparse))
}


#' Check possible annotations for GEO Dataset.
#'
#' \code{checkGPLs} returns GPL-names for
#'     the specified GEO identifier.
#'
#' @param name String, containing GEO identifier of the dataset.
#'
#' @return Vector of filenames serialized in JSON format.
#'     If there is only one GPL for that dataset, the function will
#'     return \code{name}.
#'
#' @examples
#' \dontrun{
#' checkGPLs('GSE27112')
#' checkGPLs('GSE14308')
#' }
checkGPLsFallback <- function(name) {
    spl <- unlist(strsplit(name, "-", fixed=TRUE))
    if (length(spl) == 2) {
        gpls <- jsonlite::fromJSON(checkGPLs(spl[1]))
        gpls <- intersect(gpls, name)
        return(jsonlite::toJSON(gpls))
    }

    mirrorPath <- getOption('phantasusMirrorPath')
    if (is.null(mirrorPath)) {
      mirrorPath <- "https://ftp.ncbi.nlm.nih.gov"
    }

    cacheDir <- getOption("phantasusCacheDir")

    if (is.null(cacheDir)) {
      cacheDir <- tempdir()
    } else if (!dir.exists(cacheDir)) {
      dir.create(cacheDir)
    }

    type <- substr(name, 1, 3)
    assertthat::assert_that( (type == "GDS" || type == "GSE")
                            && nchar(name) >= 4)

    stub <- gsub("\\d{1,3}$", "nnn", name, perl = TRUE)
    gdsurl <- "%s/geo/%s/%s/%s/"

    url <- sprintf(gdsurl, mirrorPath,
                   if (type == "GDS") "datasets" else "series", stub, name)

    cachePath <- paste0(sprintf(gdsurl, cacheDir,
                        if (type == "GDS") "datasets" else "series", stub, name), "gpls")

    dir.create(dirname(cachePath), recursive = TRUE, showWarnings = FALSE)

    if (file.exists(cachePath)) {
        gpls <- readLines(cachePath)
        if (length(gpls) != 0) {
            return(jsonlite::toJSON(gpls))
        }
    }

    if (type != "GDS") {
        url <- paste0(url, "matrix/")
    }

    gpls <- c()

    tryCatch({
        resp <- httr::GET(url)
        if (httr::status_code(resp) == 404) {
            warning("No such dataset")
            return(jsonlite::toJSON(c()))
        } else {
            if (type == "GDS") {
                gpls <- c(name)
            } else {
                con <- rawConnection(resp$content)
                file.names <- GEOquery:::getDirListing(con)
                close(con)

                file.names <- file.names[grepl(pattern = paste0("^", name),
                                            x = file.names)]

                file.names <- unlist(lapply(file.names, function(x) {
                    paste0(substr(x, 1, regexpr("_", x) - 1))
                }))
                if (length(file.names) == 1) {
                    file.names <- c(name)
                }
                gpls <- file.names
            }

            writeLines(gpls, cachePath)
            return(jsonlite::toJSON(gpls))
        }
    },
    error = function(e) {
        message("Problems establishing connection.")
        return(jsonlite::toJSON(c()))
    })
}

checkGPLs <- function(name) {
  spl <- unlist(strsplit(name, "-", fixed=TRUE))
  GEO <- spl[1]
  if (length(spl) == 2) {
    gpls <- jsonlite::fromJSON(checkGPLs(spl[1]))
    gpls <- intersect(gpls, name)
    return(jsonlite::toJSON(gpls))
  }

  cacheDir <- getOption("phantasusCacheDir")
  if (is.null(cacheDir)) {
    cacheDir <- tempdir()
  }
  GEOdir <- dirname(getGEODir(GEO, cacheDir))
  GPLCacheFile <- file.path(GEOdir, 'gpls')
  if (file.exists(GPLCacheFile)) {
    gpls <- readLines(GPLCacheFile)
    if (length(gpls) != 0) {
      return(jsonlite::toJSON(gpls))
    }
  }

  tryCatch({
    briefData <- getBriefData(name, cacheDir)
    platforms <- briefData$platform_id

    if (length(platforms) < 1) {
      stop('No platforms in brief data')
    }

    if (length(platforms) == 1) {
      gpls <- c(name)
    }

    if (length(platforms) >= 2) {
      gpls <- lapply(platforms, function (platform) {
        paste(spl[1], platform, sep='-')
      })
    }

    writeLines(gpls, GPLCacheFile)
    return(jsonlite::toJSON(gpls))
  }, error=function(e) {
    return(checkGPLsFallback(name))
  })
}

checkGSEType <- function (name, destDir) {
  spl <- unlist(strsplit(name, "-", fixed=TRUE))
  GEO <- spl[1]

  briefData <- getBriefData(name, destDir)
  types <- briefData$type

  if (length(types) < 1) {
    stop('No types in brief data')
  }

  supportedTypes <- c('Expression profiling by array', 'Expression profiling by high throughput sequencing')
  return (all(types %in% supportedTypes))
}

removeRepeatWords <- function(titles) {
    titles_without_repeat_words <- titles
    repeat_words <- regmatches(titles, regexpr("(?![-+}\\]\\)])(\\W|_)*\\w*$", titles, ignore.case = TRUE, perl = TRUE))
    lsuff <- lcSuffix(repeat_words, ignore.case = TRUE)
    if ((all(stringr::str_length(lsuff) == stringr::str_length(repeat_words))) & (all(sub("(\\W*|_)*\\w*$", "", titles, ignore.case = TRUE, perl = TRUE) != ""))) {
        titles_without_repeat_words <- sub("(?![-+}\\]\\)])(\\W|_)*\\w*$", "", titles, ignore.case = TRUE, perl = TRUE)
    }
    if (all(grepl("-$", titles_without_repeat_words, ignore.case = TRUE))) titles_without_repeat_words <- sub("-$", "", titles_without_repeat_words, ignore.case = TRUE, perl = TRUE)
    return(titles_without_repeat_words)
}

inferConditionImpl <- function(gse_titles) {
    inferCondition <- gse_titles
    rep_num <- NULL
    if ((length(inferCondition) > 40) | (length(inferCondition) < 3))
    {
        return(list())
    } else if (! all(grepl("[A-z]", inferCondition, ignore.case = TRUE)))
    {
        return(list())
    } else if (all(grepl(" vs[. ]{1}", inferCondition)))
    {
        return(list())
    } else
    {
        lsuff <- lcSuffix(inferCondition)
        suff_length <- stringr::str_length(lsuff)
        if (suff_length > 1) inferCondition <- stringr::str_sub(inferCondition, 1, -suff_length-1)
        if (all(grepl("((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*\\d+", inferCondition, ignore.case = TRUE)))
        {
            sample <- regmatches(inferCondition, regexpr("((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*\\d+", inferCondition, ignore.case = TRUE))
            rep_num <- regmatches(sample, regexpr("\\d+$", sample))
            inferCondition <- sub("(?![-+}\\]\\)])(\\W|_)*((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*\\d+(\\W|_)*", "", inferCondition, ignore.case = TRUE, perl = TRUE)

        }
        else if (all(grepl("((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*[A-z]$", inferCondition, ignore.case = TRUE)))
        {
            sample <- regmatches(inferCondition, regexpr("((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*[A-z]$", inferCondition, ignore.case = TRUE))
            rep_num <- regmatches(sample, regexpr("[A-z]$", sample))
            inferCondition <- sub("(?![-+}\\]\\)])(\\W|_)*((bio[A-z]*)|(tech[A-z]*))?[ .#_-]*((rep.*)|(set.*)|(sample)|(exp.*)|(case))[ .#_-]*[A-z]$", "", inferCondition, ignore.case = TRUE, perl = TRUE)

        }
        else if (length(unique(sub("[- \\.#_]*\\d+$", "", inferCondition))) == 1) {
            return(list())
        }
        else if (all(grepl("[- \\.#_]*\\d+$", inferCondition)) & (length(unique(regmatches(inferCondition, regexpr("\\d+$", inferCondition)))) > 1))
        {
            sample <- sub("\\d+$", "", inferCondition)
            if (length(unique(regmatches(sample, regexpr("[- \\.#_]$", sample)))) > 1)
            {
                return(list())
            }
            else
            {
                rep_num <- regmatches(inferCondition, regexpr("\\d+$", inferCondition))
                inferCondition <- removeRepeatWords(sub("[ \\.#_]*\\d+$", "", inferCondition))

            }
        }
        else return(list())
    }

    if ((length(rep_num) > 1) & (length(unique(inferCondition)) < length(unique(gse_titles))) & (length(unique(inferCondition)) > 1))
        return(list(condition=inferCondition, replicate=rep_num))
    else return(list())
}

inferCondition <- function(es) {
    newAnnot <- inferConditionImpl(es$title)
    if (length(newAnnot) == 2) {
        pData(es)$condition <- newAnnot$condition
        pData(es)$replicate <- newAnnot$replicate
    }
    es
}


downloadGPL <- function (GPL, destdir = tempdir()) {
  GPL <- toupper(GPL)
  stub = gsub('\\d{1,3}$','nnn',GPL,perl=TRUE)
  GPLDirPath <- '%s/geo/platforms/%s/%s/annot'
  fullGPLDirPath <- file.path(sprintf(GPLDirPath, destdir, stub, GPL))

  cachedFile <- file.path(fullGPLDirPath, paste0(GPL, ".annot.gz"))
  cachedSoft <- file.path(fullGPLDirPath, paste0(GPL, ".soft"))
  cachedSoftGz <- file.path(fullGPLDirPath, paste0(GPL, ".soft.gz"))
  if (file.exists(cachedFile)) {
    if (file.size(cachedFile) == 0) {
      file.remove(cachedFile)
      message(cachedFile, ' size is 0')
    } else {
      return (cachedFile)
    }
  } else if (file.exists(cachedSoft)) {
    if (file.size(cachedSoft) == 0) {
      file.remove(cachedSoft)
      message(cachedSoft, ' size is 0')
    } else {
      return (cachedSoft)
    }
  } else if (file.exists(cachedSoftGz)) {
    if (file.size(cachedSoftGz) == 0) {
      file.remove(cachedSoftGz)
      message(cachedSoftGz, ' size is 0')
    } else {
      return (cachedSoftGz)
    }
  }


  annotPath <- 'https://ftp.ncbi.nlm.nih.gov/geo/platforms/%s/%s/annot/%s'
  annotURL <- sprintf(annotPath,stub,GPL,paste0(GPL,'.annot.gz'))
  dir.create(fullGPLDirPath, showWarnings = FALSE, recursive = TRUE)
  targetFile <- ''

  req <- httr::HEAD(annotURL)
  if (httr::status_code(req) != 404) {
    # annot available
    tmp <- tempfile(pattern=paste0(GPL, ".annot.gz"), tmpdir=fullGPLDirPath)
    tryCatch({
      download.file(annotURL, tmp)
    }, error=function (e) {
      unlink(tmp)
      stop('Could not download GPL ', GPL, e)
    })

    file.copy(tmp, cachedFile)
    unlink(tmp)
    targetFile <- cachedFile
  } else {
    # need submitter
    tmp <- tempfile(pattern=paste0(GPL, ".soft"), tmpdir=fullGPLDirPath)
    apiURL <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
    submitterURL <- paste(apiURL,'?targ=self&acc=',GPL,'&form=text&view=quick',sep='')
    tryCatch({
      download.file(submitterURL, tmp)
    }, error=function (e) {
      unlink(tmp)
      stop('Could not download GPL ', GPL, e)
    })

    gzFile <- gzfile(cachedSoftGz, 'w')
    lines <- readLines(tmp)
    writeLines(lines, gzFile)
    close(gzFile)
    unlink(tmp)
    targetFile <- cachedSoftGz
  }

  if (file.size(targetFile) == 0) {
    file.remove(targetFile)
    stop('Could not download GPL ', GPL, '. Zero file')
  }

  con <- file(targetFile, 'r')
  first_hundred_lines <- readLines(con, n=100)
  close(con)
  found<-grep('^\\^(PLATFORM|ANNOTATION)', first_hundred_lines, ignore.case = TRUE)
  if (!length(found)) {
    file.remove(targetFile)
    stop('Could not download GPL ', GPL, '.  Possible NCBI problems')
  }


  return(targetFile)
}

getGPLAnnotation <- function (GPL, destdir = tempdir()) {
    filename <- downloadGPL(GPL, destdir)
    ret <- parseGEO(filename)

    return(ret)
}

annotateFeatureData <- function (es, destdir = tempdir()) {
    platform <- as.character(es$platform_id)[1]
    platformParsed <- getGPLAnnotation(platform, destdir)

    #https://github.com/seandavi/GEOquery/blob/master/R/parseGEO.R#L569
    #############################
    vmd <- Columns(platformParsed)
    dat <- Table(platformParsed)
    ## GEO uses "TAG" instead of "ID" for SAGE GSE/GPL entries, but it is apparently
    ##     always the first column, so use dat[,1] instead of dat$ID
    ## The next line deals with the empty GSE
    tmpnames=character(0)
    if(ncol(dat)>0) {
        tmpnames=as.character(dat[,1])
    }
    ## Fixed bug caused by an ID being "NA" in GSE15197, for example
    tmpnames[is.na(tmpnames)]="NA"
    rownames(dat) <- make.unique(tmpnames)
    ## Apparently, NCBI GEO uses case-insensitive matching
    ## between platform IDs and series ID Refs ???
    dat <- dat[match(tolower(rownames(es)),tolower(rownames(dat))),]
    # Fix possibility of duplicate column names in the
    # GPL files; this is prevalent in the Annotation GPLs
    rownames(vmd) <- make.unique(colnames(Table(platformParsed)))
    colnames(dat) <- rownames(vmd)
    ##############################

    featureData(es) <- new('AnnotatedDataFrame',data=dat,varMetadata=vmd)
    es
}
Any scripts or data that you put into this service are public.
phantasus documentation built on Nov. 8, 2020, 6:39 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
phantasus
Visual and interactive gene expression analysis

R/loadGEO.R
In phantasus: Visual and interactive gene expression analysis

Defines functions inferCondition inferConditionImpl removeRepeatWords checkGPLs checkGPLsFallback reparseCachedESs listCachedESs getES getGSE filterPhenoAnnotations filterFeatureAnnotations loadFromARCHS4 getArchs4Files getGDS loadGEO

Documented in checkGPLsFallback getArchs4Files getES getGDS getGSE loadFromARCHS4 loadGEO reparseCachedESs

Try the phantasus package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

phantasus Visual and interactive gene expression analysis

R/loadGEO.R In phantasus: Visual and interactive gene expression analysis

Defines functions inferCondition inferConditionImpl removeRepeatWords checkGPLs checkGPLsFallback reparseCachedESs listCachedESs getES getGSE filterPhenoAnnotations filterFeatureAnnotations loadFromARCHS4 getArchs4Files getGDS loadGEO

Documented in checkGPLsFallback getArchs4Files getES getGDS getGSE loadFromARCHS4 loadGEO reparseCachedESs

Try the phantasus package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

phantasus
Visual and interactive gene expression analysis

R/loadGEO.R
In phantasus: Visual and interactive gene expression analysis