# R/parseGEO.R
#
# Defines functions: parseGSEMatrix, getAndParseGSEMatrices, txtGrab, parseGPL,
# .parseGPLTxt, parseGSM, .parseGSMTxt, .parseGPLWithLimits, parseGDS,
# .genericGEOTableParser, fastTabRead, findFirstEntity, parseGSE, filegrep,
# .parseGSMWithLimits, parseGeoColumns, splitOnFirst, parseGDSSubsets,
# parseGeoMeta, parseGEO, .read_lines
#
# Documented in: parseGDS, parseGEO, parseGPL, parseGSE, parseGSEMatrix, parseGSM

# Strings that are treated as missing values when reading GEO tables.
.na_strings <- c("NA", "null", "NULL", "Null")


#' @importFrom data.table fread
.read_lines <- function(...) {
    # Fast line reader: with sep = "" every physical line becomes a single
    # field, so the first (only) column holds the raw lines.
    content <- data.table::fread(..., sep = "", header = FALSE)
    content[[1]]
}

#' Parse GEO text
#' 
#' Workhorse GEO parsers.
#' 
#' These are probably not useful to the end-user.  Use getGEO to access these
#' functions.  parseGEO simply delegates to the appropriate specific parser.
#' There should be no reason to use the parseGPL, parseGDS, parseGSE, or
#' parseGSM functions directly.
#' 
#' @aliases parseGEO parseGPL parseGSE parseGDS parseGSM
#' @param fname The filename of a SOFT format file.  If the filename ends in
#' .gz, a gzfile() connection is used to read the file directly.
#' @param GSElimits Used to limit the number of GSMs parsed into the GSE
#' object; useful for memory management for large GSEs.
#' @param destdir The destination directory into which files will be saved (to
#' be used for caching)
#' @param AnnotGPL Fetch the annotation GPL if available
#' @param getGPL Fetch the GPL associated with a GSEMatrix entity (should
#' remain TRUE for all normal use cases)
#' @return parseGEO returns an object of the associated type.  For example, if
#' it is passed the text from a GDS entry, a GDS object is returned.
#' @author Sean Davis
#' @seealso \code{\link{getGEO}}
#' @keywords IO
#' 
#' @export
parseGEO <- function(fname, GSElimits, destdir = tempdir(), AnnotGPL = FALSE, getGPL = TRUE) {
    # Peek at the file to determine which GEO entity type it contains.
    con <- fileOpen(fname)
    first.entity <- findFirstEntity(con)
    close(con)
    entity_type <- as.character(first.entity[1])
    # Dispatch to the matching parser.  "0" marks a series-matrix file (no
    # SOFT entity header was found); unrecognized types yield NULL.
    switch(entity_type,
        sample = parseGSM(fname),
        series = parseGSE(fname, GSElimits),
        dataset = parseGDS(fname),
        platform = parseGPL(fname),
        `0` = parseGSEMatrix(fname, destdir = destdir, AnnotGPL = AnnotGPL,
            getGPL = getGPL)$eset)
}

parseGeoMeta <- function(txt) {
    # Extract header metadata from SOFT text: lines shaped like
    # "!<Entity>_<key> = <value>" become a named list, with repeated keys
    # collected into one character vector per key.
    header_lines <- txt[grep("!\\w*?_", txt)]
    # Strip the "!<Entity>_" prefix (lazy match stops at the first "_"),
    # leaving "key = value".
    header_lines <- gsub("!\\w*?_", "", header_lines)
    eq_pos <- regexpr(" = ", header_lines)
    keys <- substring(header_lines, first = 1, last = eq_pos - 1)
    values <- substring(header_lines, first = eq_pos + 3)
    pairs <- cbind(keys, values)
    # Drop lines that had no " = " separator (empty key).
    pairs <- pairs[pairs[, 1] != "", ]
    split(pairs[, 2], pairs[, 1])
}

parseGDSSubsets <- function(txt) {
    ## takes GDS text as input returns a data frame suitable for inclusion as a
    ## 'Column' slot in a GDS GeoDataTable object.  One row per sample, with
    ## one factor column per subset type plus a free-text 'description'.
    numSubsets <- length(grep("^\\^subset", txt, ignore.case = TRUE))
    subset.lists <- list()
    if (numSubsets > 0) {
        subset.types <- do.call("rbind", strsplit(txt[grep("!subset_type", txt)], " = "))[,
            2]
        subset.descriptions <- do.call("rbind", strsplit(txt[grep("!subset_description",
            txt)], " = "))[, 2]
        subset.samples <- strsplit(do.call("rbind", strsplit(txt[grep("!subset_sample_id",
            txt)], " = "))[, 2], ",")
        # Build, per subset type, a sample -> description lookup.
        # seq_along() (not 1:length()) so empty vectors iterate zero times.
        for (i in seq_along(subset.types)) {
            for (j in seq_along(subset.samples[[i]])) {
                subset.lists[[subset.types[i]]][[subset.samples[[i]][j]]] <- subset.descriptions[i]
            }
        }
    }
    # "#GSMxxx = <description>" lines define the samples themselves.
    sample.descriptions <- do.call("rbind", strsplit(txt[grep("#GSM", txt)], " = "))
    sample.descriptions[, 1] <- gsub("#", "", sample.descriptions[, 1])
    samp.table <- data.frame(sample = as.character(sample.descriptions[, 1]))
    if (length(subset.lists) > 0) {
        # One factor column per subset type, aligned to the sample rows.
        for (i in seq_along(subset.lists)) {
            samp.table[match(names(subset.lists[[i]]), samp.table[, 1]), names(subset.lists)[i]] <- as.factor(unlist(subset.lists[[i]]))
        }
        # Spaces in subset-type names are not valid column names.
        colnames(samp.table) <- c("sample", gsub(" ", ".", names(subset.lists)))
    } else {
        colnames(samp.table) <- "sample"
    }
    samp.table[match(sample.descriptions[, 1], samp.table[, 1]), "description"] <- sample.descriptions[,
        2]
    return(as.data.frame(samp.table))
}


splitOnFirst <- function(x, pattern) {
    # Split each string at the FIRST occurrence of `pattern` only; any later
    # occurrences remain intact in the right-hand side.
    pat_width <- nchar(pattern)
    first_hit <- regexpr(pattern, x)
    lhs <- substr(x, start = 1, stop = first_hit - 1)
    rhs <- substr(x, start = first_hit + pat_width, stop = 1e+07)
    data.frame(leftside = lhs, rightside = rhs)
}


parseGeoColumns <- function(txt) {
    # Column descriptor lines start with "#", e.g. "#ID_REF = identifier".
    desc_lines <- txt[grep("^#", txt, perl = TRUE)]
    cols <- as.data.frame(splitOnFirst(desc_lines, " = "))
    # Drop the leading "#" from each column name.
    cols[, 1] <- sub("#", "", as.character(cols[, 1]))
    colnames(cols) <- c("Column", "Description")
    cols
}

### Parse a GSM; the limit is established by the parameter n, used by getGSE to
### limit the number of lines read to only the size of ONE GSM, since a single
### GSE contains many GSMs
#' @importFrom readr read_lines
.parseGSMWithLimits <- function(con, n = NULL) {
    # Read header lines one at a time until the data table begins (or input
    # is exhausted), then hand the remainder to the fast table reader.
    txt <- vector("character")
    i <- 0
    hasDataTable = FALSE
    while (i <- i + 1) {
        # n_max must be named: in recent readr versions the second
        # positional argument of read_lines() is `skip`, not `n_max`.
        tmp <- try(read_lines(con, n_max = 1))
        if (inherits(tmp, "try-error") || length(tmp) == 0) {
            i <- i - 1  # counted an extra line
            hasDataTable = FALSE
            break
        }
        txt[i] <- tmp
        if (i == length(txt))
            txt <- c(txt, character(i))  # double vector size
        if (length(grep("!\\w+_table_begin", txt[i], perl = TRUE)) > 0) {
            hasDataTable = TRUE
            break
        }
    }
    txt <- txt[1:i]
    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new("GEODataTable", columns = data.frame(), table = data.frame())
    if (hasDataTable) {
        nLinesToRead <- NULL
        if (!is.null(n)) {
            # Remaining line budget after the header already consumed.
            nLinesToRead <- n - length(txt)
        }
        dat3 <- fastTabRead(con, n = nLinesToRead)
        # Drop the trailing "!sample_table_end" row read in with the table.
        geoDataTable <- new("GEODataTable", columns = cols, table = dat3[1:(nrow(dat3) -
            1), ])
    }
    # Explicit return (previously the function ended on an assignment, which
    # returns its value invisibly).
    gsm <- new("GSM", header = meta, dataTable = geoDataTable)
    return(gsm)
}


### This function does a grep on a file by doing a readline in chunks of size
### chunksize.  Return value is a data.frame with the line number of each match
### and the line itself, or NULL when there are no matches at all.
filegrep <- function(con, regex, chunksize = 10000) {
    offset <- 0
    chunks <- list()
    repeat {
        lines <- readLines(con, n = chunksize)
        if (length(lines) == 0) {
            break
        }
        foundLines <- grep(regex, lines)
        if (length(foundLines) > 0) {
            # Collect per-chunk hits and bind once at the end instead of
            # growing a data.frame inside the loop (avoids O(n^2) copying).
            chunks[[length(chunks) + 1]] <- data.frame(foundLines = foundLines +
                offset, foundTypes = lines[foundLines])
        }
        offset <- offset + length(lines)
    }
    if (length(chunks) == 0) {
        return(NULL)
    }
    return(do.call(rbind, chunks))
}

#' @importFrom readr read_lines
parseGSE <- function(fname, GSElimits = NULL) {
    # Parse a full SOFT-format GSE file into a GSE object: the leading
    # header plus one GSM/GPL object per ^SAMPLE/^PLATFORM entity.
    gsmlist <- list()
    gpllist <- list()
    GSMcount <- 0
    message("Reading file....")
    lines = .read_lines(fname)
    message("Parsing....")
    lineCounts <- grep("\\^(SAMPLE|PLATFORM)", lines)
    nEntities <- length(lineCounts)
    message(sprintf("Found %d entities...", nEntities))
    ## Everything before the first entity line is the GSE header
    a <- lines[1:(lineCounts[1] - 1)]
    header = parseGeoMeta(a)
    ## Append an end-of-file sentinel so entity j spans lineCounts[j] up to
    ## lineCounts[j + 1] - 1
    lineCounts = c(lineCounts, length(lines))
    ## parse the actual entities, now
    for (j in seq_len(nEntities)) {
        tmp <- strsplit(lines[lineCounts[j]], " = ")[[1]]
        accession <- tmp[2]
        # Progress uses nEntities: the previous code reported "of N+1"
        # because it counted the appended sentinel.
        message(sprintf("%s (%d of %d entities)", accession, j, nEntities))
        entityType <- tolower(sub("\\^", "", tmp[1]))
        if (entityType == "sample") {
            gsmlist[[accession]] <- .parseGSMTxt(lines[lineCounts[j]:(lineCounts[j +
                1] - 1)])
        }
        if (entityType == "platform") {
            gpllist[[accession]] <- .parseGPLTxt(lines[lineCounts[j]:(lineCounts[j +
                1] - 1)])
        }
    }
    return(new("GSE", header = header, gsms = gsmlist, gpls = gpllist))
}


findFirstEntity <- function(con) {
    # Scan the connection in 100-line chunks looking for the first SOFT
    # entity header.  Returns c(type, accession) on success, or 0 for
    # end-of-input / series-matrix files (which carry no entity header).
    repeat {
        chunk <- suppressWarnings(readLines(con, 100))
        if (length(chunk) == 0)
            return(0)
        hit <- grep("^\\^(DATASET|SAMPLE|SERIES|PLATFORM|ANNOTATION)", chunk,
            ignore.case = TRUE, value = TRUE, perl = TRUE)
        # Annotation records are treated as platforms downstream.
        hit <- gsub("annotation", "platform", hit, ignore.case = TRUE)
        if (length(hit) > 0) {
            parts <- strsplit(hit, " = ")[[1]]
            return(c(tolower(sub("\\^", "", parts[1])), parts[2]))
        }
        # catch GSE SeriesMatrix files
        checkline = grep("^!Series_title", chunk, ignore.case = TRUE, value = TRUE,
            perl = TRUE)
        # Messy, but GSEMatrix files use tab-separation rather than '=' for
        # separation, so if the ' = ' is not present, we presume to have a
        # GSEMatrix file (return 0)
        if (length(checkline) > 0) {
            if (!grepl(" = ", checkline))
                return(0)
        }
    }
    return(0)
}

fastTabRead <- function(con, sep = "\t", header = TRUE, sampleRows = 100, colClasses = NULL,
    n = NULL, quote = "\"", ...) {
    ### Need to read tables quickly, so guess the colclasses on the fly.  This is
    ### a bit dangerous since the first rows might not be representative of the
    ### entire set, but it is SO MUCH FASTER than the alternative, I have to do
    ### it.
    ### Args:
    ###   con:        open connection positioned at the table header
    ###   sampleRows: number of leading rows used to guess column classes
    ###   n:          total number of lines to read (NULL = read to end)
    ### Returns a data.frame of the table contents.
    dat3 <- data.frame(NULL)
    numberOfLines <- -1
    if (!is.null(n)) {
        # Budget for the second read: total requested minus the sample chunk.
        numberOfLines <- n - sampleRows
    }
    if (is.null(colClasses)) {
        if (!is.null(n)) {
            sampleRows <- min(sampleRows, n)
        }
        # First pass: read a small sample chunk to infer column classes.
        dat1 <- read.table(con, sep = sep, header = header, nrows = sampleRows, fill = TRUE,
            check.names = FALSE, quote = quote, comment.char = "", na.strings = c("NA",
                "null", "NULL", "Null"), ...)
        # NOTE(review): apply() coerces the data.frame to a matrix first, so
        # these guessed classes are typically all "character" -- confirm
        # whether true per-column class guessing was intended here.
        colclasses <- apply(dat1, 2, class)
        colclasses[1] <- "factor"
        # Second pass: continue reading from the same connection with fixed
        # classes; this may fail at EOF, in which case the sample chunk is
        # the whole table.
        dat2 <- try(read.delim(con, sep = sep, colClasses = colclasses, header = FALSE,
            quote = quote, comment.char = "", na.strings = c("NA", "null", "NULL",
                "Null"), nrows = numberOfLines, ...), silent = TRUE)
        if (inherits(dat2, "try-error")) {
            dat3 = dat1
        } else {
            colnames(dat2) <- colnames(dat1)
            dat3 <- rbind(dat1, dat2)
        }
    } else {
        # Caller supplied column classes: single read, no sampling needed.
        dat3 <- read.delim(con, sep = sep, colClasses = colClasses, header = header,
            quote = quote, comment.char = "", na.strings = c("NA", "null", "NULL",
                "Null"), nrows = numberOfLines, ...)
    }
    return(dat3)
}

.genericGEOTableParser <- function(txt) {
    # Split the text of one SOFT record into header metadata lines and
    # (optionally) an embedded data table delimited by the
    # "!<entity>_table_begin" / "!<entity>_table_end" markers.
    begin_hits <- grep("!\\w+_table_begin", txt, perl = TRUE)
    has_table = TRUE
    tbl_begin = begin_hits[1] + 1
    if (is.na(tbl_begin)) {
        # No data table at all, so everything is header info
        tbl_begin = length(txt)
        has_table = FALSE
    }

    if (has_table) {
        metadata_rows <- seq_len(tbl_begin - 2)
    } else {
        metadata_rows <- seq_along(txt)
    }

    # Some records lack a table_end marker; fall back to the final line.
    end_hits = grep("!\\w+_table_end", txt, perl = TRUE)
    if (length(end_hits) == 0) {
        tbl_end = length(txt)
    } else {
        tbl_end = end_hits[1] - 1
    }
    dat3 = data.frame()
    if (has_table) {
        dat3 <- data.table::fread(text = txt[tbl_begin:tbl_end], na = .na_strings)
    }
    return(list(meta_text = txt[metadata_rows], data_frame = dat3))
}

#' @importFrom data.table fread
parseGDS <- function(fname) {
    # Read the whole SOFT file as raw lines (sep = "" -> one column).
    raw_lines = data.table::fread(fname, sep = "")[[1]]

    parsed = .genericGEOTableParser(raw_lines)

    # Subset/sample annotations become the Columns slot of the data table.
    cols <- parseGDSSubsets(parsed$meta_text)
    meta <- parseGeoMeta(parsed$meta_text)
    geoDataTable <- new("GEODataTable", columns = cols, table = parsed$data_frame)
    return(new("GDS", header = meta, dataTable = geoDataTable))
}

### Parse a GPL record incrementally from a connection.  `n`, when given,
### caps the total number of lines consumed (header plus data table).
.parseGPLWithLimits <- function(con, n = NULL) {
    # Read header lines one at a time until the data table begins, the
    # input runs out, or the line budget n is exhausted.
    txt <- vector("character")
    i <- 0
    hasDataTable = FALSE
    while (i <- i + 1) {
        tmp <- try(readLines(con, 1))
        if (inherits(tmp, "try-error") || length(tmp) == 0 || (!is.null(n) && i ==
            n)) {
            i <- i - 1  # counted an extra line
            hasDataTable = FALSE
            break
        }
        txt[i] <- tmp
        if (i == length(txt))
            txt <- c(txt, character(i))  # double vector size
        if (length(grep("!\\w+_table_begin", txt[i], perl = TRUE)) > 0) {
            hasDataTable = TRUE
            break
        }
    }
    txt <- txt[1:i]
    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new("GEODataTable", columns = data.frame(), table = data.frame())
    if (hasDataTable) {
        nLinesToRead <- NULL
        if (!is.null(n)) {
            # Remaining line budget after the header has been consumed.
            nLinesToRead <- n - length(txt)
        }
        dat3 <- fastTabRead(con, n = nLinesToRead, quote = "")
        # Drop the trailing "!platform_table_end" row read in with the table.
        geoDataTable <- new("GEODataTable", columns = cols, table = dat3[1:(nrow(dat3) -
            1), ])
    }
    # Explicit return (previously the function ended on an assignment, which
    # returns its value invisibly).
    gpl <- new("GPL", header = meta, dataTable = geoDataTable)
    return(gpl)
}


#' @importFrom readr read_tsv
.parseGSMTxt <- function(txt) {
    # Build a GSM object from the raw SOFT text lines of one sample record.
    parsed = .genericGEOTableParser(txt)

    cols <- parseGeoColumns(parsed$meta_text)
    meta <- parseGeoMeta(parsed$meta_text)
    geoDataTable <- new("GEODataTable", columns = cols, table = parsed$data_frame)
    return(new("GSM", header = meta, dataTable = geoDataTable))
}


#' @importFrom data.table fread
parseGSM <- function(fname) {
    # sep = "" -> each physical line is one element of the first column.
    raw_lines = data.table::fread(fname, sep = "")[[1]]
    # The reader can yield separate blank lines on Windows; drop them,
    # otherwise the table header is repeated as the first row of the table.
    raw_lines = raw_lines[raw_lines != ""]
    return(.parseGSMTxt(raw_lines))
}

### In memory cache for GPL objects parsed from locally cached versions of GPL
### SOFT files.  It is disabled by default with
### options('GEOquery.inmemory.gpl'=FALSE).
### Entries are keyed by file path; parseGPL() stores the parsed GPL together
### with the file.info() used to invalidate the entry if the file changes.
GPLcache <- new.env(parent = emptyenv())


#' @importFrom readr read_tsv
.parseGPLTxt <- function(txt) {
    # Build a GPL object from the raw SOFT text lines of one platform record.
    parsed = .genericGEOTableParser(txt)

    cols <- parseGeoColumns(parsed$meta_text)
    meta <- parseGeoMeta(parsed$meta_text)
    geoDataTable <- new("GEODataTable", columns = cols, table = parsed$data_frame)
    return(new("GPL", header = meta, dataTable = geoDataTable))
}


#' @importFrom readr read_lines
parseGPL <- function(fname) {
    # Optionally serve a previously parsed GPL from the in-memory cache,
    # provided the on-disk file has not been modified since it was cached.
    if (getOption("GEOquery.inmemory.gpl")) {
        info <- file.info(fname, extra_cols = FALSE)
        cache <- get0(fname, envir = GPLcache, inherits = FALSE)
        if (!is.null(cache) && cache$info$mtime == info$mtime) {
            message("Using GPL object found in memory from locally cached version.")
            return(cache$gpl)
        }
    }
    raw_lines = data.table::fread(fname, sep = "")[[1]]
    # The reader can yield separate blank lines on Windows; drop them,
    # otherwise the table header is repeated as the first row of the table.
    raw_lines = raw_lines[raw_lines != ""]
    return(.parseGPLTxt(raw_lines))
}


txtGrab <- function(regex, x) {
    # Return the substring of `x` matched by `regex`; "" when there is no
    # match (regexpr returns -1 and substr then yields an empty string).
    x <- as.character(x)
    m <- regexpr(regex, x, perl = TRUE)
    substr(x, m, m + attr(m, "match.length") - 1)
}

### Function wrapper to get and parse ALL the GSEMatrix files associated with a
### GSE into a list of ExpressionSets (one element per matrix file found).
getAndParseGSEMatrices <- function(GEO, destdir, AnnotGPL, getGPL = TRUE, parseCharacteristics = TRUE) {
    GEO <- toupper(GEO)
    ## This stuff functions to get the listing of available files for a given GSE
    ## given that there may be many GSEMatrix files for a given GSE.
    stub = gsub("\\d{1,3}$", "nnn", GEO, perl = TRUE)
    gdsurl <- "https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/matrix/"
    b = getDirListing(sprintf(gdsurl, stub, GEO))
    message(sprintf("Found %d file(s)", length(b)))
    ret <- list()
    ## Loop over the files, returning a list, one element for each file.
    ## seq_along() (not 1:length()) so an empty listing iterates zero times.
    for (i in seq_along(b)) {
        message(b[i])
        destfile = file.path(destdir, b[i])
        if (file.exists(destfile)) {
            message(sprintf("Using locally cached version: %s", destfile))
        } else {
            url = sprintf("https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/matrix/%s",
                stub, GEO, b[i])
            downloadFile(url, destfile = destfile, mode = "wb")
        }
        # Forward parseCharacteristics: it was previously accepted here but
        # silently ignored (parseGSEMatrix always used its own default).
        ret[[b[i]]] <- parseGSEMatrix(destfile, destdir = destdir, AnnotGPL = AnnotGPL,
            getGPL = getGPL, parseCharacteristics = parseCharacteristics)$eset
    }
    return(ret)
}

#' Parse a GSE matrix file
#'
#' Not meant for user calling, but parses
#' a single GSEMatrix file.
#'
#' @importFrom dplyr select filter mutate mutate_all
#' @importFrom tidyr gather spread separate
#' @importFrom readr read_lines
#' @importClassesFrom Biobase ExpressionSet
#' @importFrom magrittr %>%
#'
#' @param fname filename
#' @param AnnotGPL set to TRUE to get the annotation GPL version
#' @param destdir the destination directory for download
#' @param getGPL whether or not to get the GPL associated
#' @param parseCharacteristics Whether or not to do full 'characteristic' parsing
#' @return a list with elements \code{GPL} (the platform accession) and
#' \code{eset} (an ExpressionSet built from the matrix file)
#' @keywords internal
#' 
parseGSEMatrix <- function(fname, AnnotGPL = FALSE, destdir = tempdir(), getGPL = TRUE,
    parseCharacteristics = TRUE) {
    # Convenient VERY fast line reader based on data.table See:
    # https://stackoverflow.com/a/32924981/459633 data read into single character
    # column, so subset to get just text.
    dat <- data.table::fread(fname, sep = "")[[1]]

    ## get the number of !Series and !Sample lines
    series_header_row_count <- sum(grepl("^!Series_", dat))
    # In the case of ^M in the metadata (GSE781, for example), the line counts
    # for 'skip' and read.table are different.  This next line gets the 'skip'
    # line count for use below in the tmpdat reading 'skip'
    sample_header_start <- grep("^!Sample_", dat)[1]
    samples_header_row_count <- sum(grepl("^!Sample_", dat))
    series_table_begin_line = grep("^!series_matrix_table_begin", dat)
    series_table_end_line = grep("^!series_matrix_table_end", dat)
    if (length(series_table_begin_line) != 1) {
        # NOTE(review): the message names '!series_data_table_begin' but the
        # pattern searched above is '!series_matrix_table_begin'.
        stop("parsing failed--expected only one '!series_data_table_begin'")
    }
    # con <- fileOpen(fname) Read the !Series_ and !Sample_ lines
    header <- data.table::fread(fname, header = FALSE, nrows = series_header_row_count)
    tmpdat <- data.table::fread(fname, header = FALSE, nrows = samples_header_row_count,
        skip = sample_header_start - 1)

    # Transpose the !Series_ lines into one column per header key; repeated
    # keys are collapsed into a single newline-joined string.
    headertmp <- t(header)
    headerdata <- rbind(data.frame(), headertmp[-1, ])
    colnames(headerdata) <- sub("!Series_", "", as.character(header[[1]]))
    headerlist <- lapply(split.default(headerdata, names(headerdata)), function(x) {
        as.character(Reduce(function(a, b) {
            paste(a, b, sep = "\n")
        }, x))
    })

    # Build the experiment link: prefer an explicit web_link, then fall back
    # to the accession query URL, then the bare GEO site.
    link = "https://www.ncbi.nlm.nih.gov/geo/"
    if (!is.null(headerlist$web_link)) {
        link <- headerlist$web_link
    } else if (!is.null(headerlist$geo_accession)) {
        link <- paste(link, "query/acc.cgi?acc=", headerlist$geo_accession, sep = "")
    }

    # Assemble the MIAME experiment metadata; absent fields become "".
    ed <- new("MIAME", name = ifelse(is.null(headerlist$contact_name), "", headerlist$contact_name),
        title = headerlist$title, contact = ifelse(is.null(headerlist$contact_email),
            "", headerlist$contact_email), pubMedIds = ifelse(is.null(headerlist$pubmed_id),
            "", headerlist$pubmed_id), abstract = ifelse(is.null(headerlist$summary),
            "", headerlist$summary), url = link, other = headerlist)

    # Transpose the !Sample_ lines into a per-sample phenotype data.frame,
    # rows keyed by GEO accession.
    tmptmp <- t(tmpdat)
    sampledat <- rbind(data.frame(), tmptmp[-1, ])
    colnames(sampledat) <- make.unique(sub("!Sample_", "", as.character(tmpdat[[1]])))
    sampledat[["geo_accession"]] = as.character(sampledat[["geo_accession"]])
    rownames(sampledat) = sampledat[["geo_accession"]]
    ## Lots of GSEs now use 'characteristics_ch1' and 'characteristics_ch2' for
    ## key-value pairs of annotation. If that is the case, this simply cleans
    ## those up and transforms the keys to column names and the values to column
    ## values.
    if (length(grep("characteristics_ch", colnames(sampledat))) > 0 && parseCharacteristics) {
        pd = sampledat %>%
            dplyr::select(dplyr::contains("characteristics_ch")) %>%
            dplyr::mutate(accession = rownames(.)) %>%
            # these next two lines avoid warnings due to columns having different
            # factor levels (attributes).
        mutate_all(as.character) %>%
            tidyr::gather(characteristics, kvpair, -accession) %>%
            dplyr::filter(grepl(":", kvpair) & !is.na(kvpair))
        # Thx to Mike Smith (@grimbough) for this code sometimes the
        # 'characteristics_ch1' fields are empty and contain no key:value pairs.
        # spread() will fail when called on an empty data_frame.  We catch this
        # case and remove the 'charactics_ch1' column instead
        if (nrow(pd)) {
            # Suffix each key with its channel (ch1/ch2), split "key: value"
            # pairs, join duplicate values with ";", and pivot to one column
            # per key.
            pd = dplyr::mutate(pd, characteristics = ifelse(grepl("_ch2", characteristics),
                "ch2", "ch1")) %>%
                tidyr::separate(kvpair, into = c("k", "v"), sep = ":", fill = "right",
                  extra = "merge") %>%
                dplyr::mutate(k = paste(k, characteristics, sep = ":")) %>%
                dplyr::select(-characteristics) %>%
                dplyr::filter(!is.na(v)) %>%
                dplyr::group_by(accession, k) %>%
                dplyr::mutate(v = paste0(trimws(v), collapse = ";")) %>%
                unique() %>%
                tidyr::spread(k, v)
        } else {
            pd = pd %>%
                dplyr::select(accession)
        }
        ## dplyr::mutate(characteristics=ifelse(grepl('_ch2',characteristics),'ch2','ch1'))
        ## %>% dplyr::filter(grepl(':',kvpair)) %>% tidyr::separate(kvpair, into=
        ## c('k','v'), sep=':') if(nrow(pd)>0) { pd = pd %>% dplyr::mutate(k =
        ## paste(k,characteristics,sep=':')) %>% dplyr::select(-characteristics)
        ## %>% tidyr::spread(k,v)
        sampledat = sampledat %>%
            dplyr::left_join(pd, by = c(geo_accession = "accession"))
    }

    # Read the expression matrix between the table_begin/table_end markers.
    datamat = NULL
    if (series_table_end_line - series_table_begin_line == 2) {
        # Exactly one data row: read.table handles this degenerate case.
        datamat = read.table(textConnection(dat[(series_table_begin_line + 1):(series_table_end_line -
            1)]), header = TRUE, sep = "\t")
    } else {
        datamat <- data.table::fread(text = dat[(series_table_begin_line + 1):(series_table_end_line -
            1)], quote = "\"", na.strings = c("NA", "null", "NULL", "Null"))
    }
    ## kip = series_table_begin_line) comment.char = '!series_matrix_table_end')
    tmprownames = as.character(datamat[[1]])
    # need the as.matrix for single-sample or empty GSE
    datamat <- as.matrix(datamat[!is.na(tmprownames), -1])
    rownames(datamat) <- tmprownames[!is.na(tmprownames)]
    datamat <- as.matrix(datamat)
    rownames(sampledat) <- colnames(datamat)
    GPL = as.character(sampledat[1, grep("^platform_id", colnames(sampledat), ignore.case = TRUE)])
    ## if getGPL is FALSE, skip this and featureData is then a data.frame with no
    ## columns
    fd = new("AnnotatedDataFrame", data = data.frame(row.names = rownames(datamat)))
    if (getGPL) {
        gpl <- getGEO(GPL, AnnotGPL = AnnotGPL, destdir = destdir)
        vmd <- Columns(gpl)
        dat <- Table(gpl)
        ## GEO uses 'TAG' instead of 'ID' for SAGE GSE/GPL entries, but it is
        ## apparently always the first column, so use dat[,1] instead of dat$ID
        ## The next line deals with the empty GSE
        tmpnames = character(0)
        if (ncol(dat) > 0) {
            tmpnames = as.character(dat[, 1])
        }
        ## Fixed bug caused by an ID being 'NA' in GSE15197, for example
        tmpnames[is.na(tmpnames)] = "NA"
        rownames(dat) <- make.unique(tmpnames)
        ## Apparently, NCBI GEO uses case-insensitive matching between platform
        ## IDs and series ID Refs ???
        dat <- dat[match(tolower(rownames(datamat)), tolower(rownames(dat))), ]
        # Fix possibility of duplicate column names in the GPL files; this is
        # prevalent in the Annotation GPLs
        rownames(vmd) <- make.unique(colnames(Table(gpl)))
        colnames(dat) <- rownames(vmd)
        fd <- new("AnnotatedDataFrame", data = dat, varMetadata = vmd)
    }
    if (is.null(nrow(datamat))) {
        ## fix empty GSE datamatrix samplename stuff above does not work with
        ## empty GSEs, so fix here, also
        tmpnames <- names(datamat)
        rownames(sampledat) <- tmpnames
        datamat = matrix(nrow = 0, ncol = nrow(sampledat))
        colnames(datamat) <- tmpnames
    } else {
        ## This looks like a dangerous operation but is needed to deal with the
        ## fact that NCBI GEO allows case-insensitive matching and we need to
        ## pick one.
        rownames(datamat) <- rownames(dat)
    }
    eset <- new("ExpressionSet", phenoData = as(sampledat, "AnnotatedDataFrame"), annotation = GPL,
        featureData = fd, experimentData = ed, exprs = as.matrix(datamat))
    return(list(GPL = as.character(sampledat[1, grep("platform_id", colnames(sampledat),
        ignore.case = TRUE)]), eset = eset))
}
# seandavi/GEOquery documentation built on Oct. 29, 2024, 1:24 p.m.