R/parseGEO.R

Defines functions parseGSEMatrix getAndParseGSEMatrices txtGrab parseGPL .parseGPLTxt parseGSM .parseGSMTxt .parseGPLWithLimits parseGDS fastTabRead findFirstEntity parseGSE filegrep .parseGSMWithLimits parseGeoColumns splitOnFirst parseGDSSubsets parseGeoMeta parseGEO

Documented in parseGDS parseGEO parseGPL parseGSE parseGSEMatrix parseGSM

.na_strings = c('NA','null','NULL','Null')



#' Parse GEO text
#' 
#' Workhorse GEO parsers.
#' 
#' These are probably not useful to the end-user.  Use getGEO to access these
#' functions.  parseGEO simply delegates to the appropriate specific parser.
#' There should be no reason to use the parseGPL, parseGDS, parseGSE, or
#' parseGSM functions directly.
#' 
#' @aliases parseGEO parseGPL parseGSE parseGDS parseGSM
#' @param fname The filename of a SOFT format file.  If the filename ends in
#' .gz, a gzfile() connection is used to read the file directly.
#' @param GSElimits Used to limit the number of GSMs parsed into the GSE
#' object; useful for memory management for large GSEs.
#' @param destdir The destination directory into which files will be saved (to
#' be used for caching)
#' @param AnnotGPL Fetch the annotation GPL if available
#' @param getGPL Fetch the GPL associated with a GSEMatrix entity (should
#' remain TRUE for all normal use cases)
#' @return parseGEO returns an object of the associated type.  For example, if
#' it is passed the text from a GDS entry, a GDS object is returned.
#' @author Sean Davis
#' @seealso \code{\link{getGEO}}
#' @keywords IO
#' 
#' @export
parseGEO <- function(fname,GSElimits,destdir=tempdir(),AnnotGPL=FALSE,getGPL=TRUE) {
    con <- fileOpen(fname)
    first.entity <- findFirstEntity(con)
    close(con)
    ret <- switch(as.character(first.entity[1]),
                  sample= {
                      parseGSM(fname)
                  },
                  series= parseGSE(fname,GSElimits),
                  dataset= {
                      parseGDS(fname)
                  },
                  platform= {
                      parseGPL(fname)
                  },
                  "0" = {
                      parseGSEMatrix(fname,destdir=destdir,AnnotGPL=AnnotGPL,getGPL=getGPL)$eset
                  },
                  )
    return(ret)
}

parseGeoMeta <- function(txt) {
    ## leader <- strsplit(grep('!\\w*_',txt,perl=TRUE,value=TRUE)[1],'_')[[1]][1]
    ## pull out only lines that are in the header
    tmp <- txt[grep("!\\w*?_",txt)]
    tmp <- gsub("!\\w*?_",'',tmp)
    first.eq <- regexpr(' = ',tmp)
    tmp <- cbind(substring(tmp,first=1,last=first.eq-1),
                 substring(tmp,first=first.eq+3))
    tmp <- tmp[tmp[,1]!="",]
    header <- split(tmp[,2],tmp[,1])
    return(header)
}

parseGDSSubsets <- function(txt) {
    ## takes GDS text as input
    ## returns a data frame suitable for inclusion as a "Column" slot
    ##   in a GDS GeoDataTable object
    numSubsets <- length(grep('^\\^subset',txt,ignore.case=TRUE))
    subset.lists <- list()
    if (numSubsets>0) {
        subset.types <-
            do.call('rbind',strsplit(txt[grep("!subset_type",txt)],' = '))[,2]
        subset.descriptions <-
            do.call('rbind',strsplit(txt[grep("!subset_description",txt)],' = '))[,2]
        subset.samples <-
            strsplit(do.call('rbind',strsplit(txt[grep("!subset_sample_id",txt)],' = '))[,2],',')
        for(i in 1:length(subset.types)) {
            for(j in 1:length(subset.samples[[i]])) {
                subset.lists[[subset.types[i]]][[subset.samples[[i]][j]]] <- subset.descriptions[i]
            }
        }
    }
    sample.descriptions <-
        do.call('rbind',strsplit(txt[grep('#GSM',txt)],' = '))
    sample.descriptions[,1] <- gsub('#','',sample.descriptions[,1])
    samp.table <- data.frame(sample=as.character(sample.descriptions[,1]))
    if(length(subset.lists)>0) {
        for(i in 1:length(subset.lists)) {
            samp.table[match(names(subset.lists[[i]]),samp.table[,1]),
                       names(subset.lists)[i]] <- as.factor(unlist(subset.lists[[i]]))
        }
        colnames(samp.table) <- c('sample',gsub(' ','.',names(subset.lists)))
    } else {
        colnames(samp.table) <- 'sample'
    }
    samp.table[match(sample.descriptions[,1],samp.table[,1]),'description'] <-
        sample.descriptions[,2]
    return(as.data.frame(samp.table))
}


splitOnFirst <- function(x,pattern) {
    patlen <- nchar(pattern)
    matches <- regexpr(pattern,x)
    leftside <- substr(x,start=1,stop=matches-1)
    rightside <- substr(x,start=matches+patlen,stop=10000000)
    return(data.frame(leftside,rightside))
}


parseGeoColumns <- function(txt) {
    cols <- as.data.frame(splitOnFirst(txt[grep('^#',txt,perl=TRUE)],' = '))
    cols[,1] <- sub('#','',as.character(cols[,1]))
    colnames(cols) <- c('Column','Description')
    return(cols)
}

### Parse a GSM; the limit is established by the
### parameter n, used by getGSE to limit the number
### of lines read to only the size of ONE GSM,
### since a single GSE contains many GSMs
#' @importFrom readr read_lines
.parseGSMWithLimits <- function(con,n=NULL) {
    txt <- vector('character')
    i <- 0
    hasDataTable=FALSE
    while(i <- i+1) {
        tmp <- try(read_lines(con,1))
        if(inherits(tmp,"try-error") | length(tmp)==0) {
            i <- i-1  # counted an extra line
            hasDataTable=FALSE
            break
        }
        txt[i] <- tmp
        if(i==length(txt)) txt <- c(txt, character(i))  # double vector size
        if(length(grep('!\\w+_table_begin',txt[i],perl=TRUE))>0) {
            hasDataTable=TRUE
            break
        }
    }
    txt <- txt[1:i]
    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new("GEODataTable",columns=data.frame(),table=data.frame())
    if(hasDataTable) {
        nLinesToRead <- NULL
        if(!is.null(n)) {
            nLinesToRead <- n-length(txt)
        }
        dat3 <- fastTabRead(con,n=nLinesToRead)
        geoDataTable <- new('GEODataTable',columns=cols,table=dat3[1:(nrow(dat3)-1),])
    } 
    gsm <- new('GSM',
               header=meta,
               dataTable = geoDataTable)
}


### This function does a grep on a file
### by doing a readline in chunks of size
### chunksize.
### Return value is a data.frame with the line number
### of each match and the line itself.
filegrep <-
    function(con,regex,chunksize=10000) {
        i <- 0
        ret <- NULL
        while(TRUE) {
            lines <- readLines(con,n=chunksize)
            if(length(lines)==0) {
                break
            }
            foundLines <- grep(regex,lines)
            foundTypes <- lines[foundLines]
            if(length(foundLines)>0) {
                foundLines <- foundLines+i
                tmp <- data.frame(foundLines=foundLines,foundTypes=foundTypes)
                if(is.null(ret)) {
                    ret <- tmp
                } else {
                    ret <- rbind(ret,tmp)
                }
            }
            i <- i+length(lines)
        }
        return(ret) 
    }

#' @importFrom readr read_lines
parseGSE <- function(fname,GSElimits=NULL) {
    gsmlist <- list()
    gpllist <- list()
    GSMcount <- 0
    message('Reading file....')
    lines = read_lines(fname)
    message('Parsing....')
    lineCounts <- grep("\\^(SAMPLE|PLATFORM)",lines)
    message(sprintf("Found %d entities...",length(lineCounts)))
    ## I close and reopen the file because on Windows, the seek
    ## function is pretty much guaranteed to NOT work
    ## This gets the header information for the GSE
    a <- lines[1:(lineCounts[1]-1)]
    header=parseGeoMeta(a)
    lineCounts = c(lineCounts,length(lines))
    ## parse the actual entities, now
    for(j in 1:(length(lineCounts)-1)) {
        tmp <- strsplit(lines[lineCounts[j]]," = ")[[1]]
        accession <- tmp[2]
        message(sprintf("%s (%d of %d entities)",accession,j,length(lineCounts)))
        entityType <- tolower(sub("\\^","",tmp[1]))
        if(entityType=="sample") {
            gsmlist[[accession]] <- .parseGSMTxt(lines[lineCounts[j]:(lineCounts[j+1]-1)])
        }
        if(entityType=="platform") {
            gpllist[[accession]] <- .parseGPLTxt(lines[lineCounts[j]:(lineCounts[j+1]-1)])
        }
    }
    return(new("GSE",
               header= header,
               gsms  = gsmlist,
               gpls  = gpllist))
}


findFirstEntity <- function(con) {
    while(TRUE) {
        line <- suppressWarnings(readLines(con,100))
        if (length(line) == 0) return(0)
        entity.line <- grep('^\\^(DATASET|SAMPLE|SERIES|PLATFORM|ANNOTATION)',
                            line,ignore.case=TRUE,value=TRUE,perl=TRUE)
        entity.line <- gsub('annotation','platform',entity.line,ignore.case=TRUE)
        if(length(entity.line)>0) {
            ret <- c(tolower(sub('\\^','',strsplit(entity.line,' = ')[[1]][1])),
                     strsplit(entity.line,' = ')[[1]][2])
            return(ret)
        }
        # catch GSE SeriesMatrix files
        checkline = grep('^!Series_title',line,ignore.case=TRUE,value=TRUE,perl=TRUE)
        # Messy, but GSEMatrix files use tab-separation rather
        # than '=' for separation, so if the ' = ' is not present,
        # we presume to have a GSEMatrix file (return 0)
        if(length(checkline) > 0) {
            if(!grepl(' = ',checkline)) return(0)
        }
    }
    return(0)
}

fastTabRead <- function(con,sep="\t",header=TRUE,sampleRows=100,
                        colClasses=NULL,n=NULL,quote='"',...) {
### Need to read tables quickly, so guess the colclasses on the
### fly.  This is a bit dangerous since the first rows might not
### be representative of the entire set, but it is SO MUCH FASTER
### than the alternative, I have to do it.
    dat3 <- data.frame(NULL)
    numberOfLines <- -1
    if(!is.null(n)) {
        numberOfLines <- n-sampleRows
    }
    if(is.null(colClasses)) {
        if(!is.null(n)) {
            sampleRows <- min(sampleRows,n)
        }
        dat1 <- read.table(con,sep=sep,header=header,nrows=sampleRows,fill=TRUE,check.names=FALSE,
                           quote=quote,comment.char="",na.strings=c('NA','null','NULL','Null'),...)
        colclasses <- apply(dat1,2,class)
        colclasses[1] <- "factor"
        dat2 <- try(read.delim(con,sep=sep,colClasses=colclasses,
                               header=FALSE,quote=quote,comment.char="",
                               na.strings=c('NA','null','NULL','Null'),
                               nrows=numberOfLines,...),silent=TRUE)
        if(inherits(dat2,'try-error')) {
            dat3=dat1
        } else {
            colnames(dat2) <- colnames(dat1)
            dat3 <- rbind(dat1,dat2)
        }
    } else {
        dat3 <- read.delim(con,sep=sep,colClasses=colClasses,
                           header=header,quote=quote,comment.char="",
                           na.strings=c('NA','null','NULL',"Null"),nrows=numberOfLines,...)
    }
    return(dat3)
}



#' @importFrom readr read_lines read_tsv
parseGDS <- function(fname) {
    txt = read_lines(fname)
    tbl_begin = grep('!\\w+_table_begin',txt,perl=TRUE)
    if(length(tbl_begin>0)) {
        txt = txt[1:tbl_begin[1]]
        dat3 <- read_tsv(fname, comment='!dataset_table_end', skip = tbl_begin[1],
                         guess_max = 10000, na = .na_strings)
    }
    cols <- parseGDSSubsets(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new('GEODataTable',columns=cols,table=as.data.frame(dat3))
    gds <- new('GDS',
               header=meta,
               dataTable = geoDataTable)
    return(gds)
}

.parseGPLWithLimits <- function(con,n=NULL) {
    txt <- vector('character')
    i <- 0
    hasDataTable=FALSE
    while(i <- i+1) {
        tmp <- try(readLines(con,1))
        if(inherits(tmp,"try-error")
           || length(tmp)==0
           || (!is.null(n) && i==n)) {
            i <- i-1  # counted an extra line
            hasDataTable=FALSE
            break
        }
        txt[i] <- tmp
        if(i==length(txt)) txt <- c(txt, character(i))  # double vector size
        if(length(grep('!\\w+_table_begin',txt[i],perl=TRUE))>0) {
            hasDataTable=TRUE
            break
        }
    }
    txt <- txt[1:i]
    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new("GEODataTable",columns=data.frame(),table=data.frame())
    if(hasDataTable) {
        nLinesToRead <- NULL
        if(!is.null(n)) {
            nLinesToRead <- n-length(txt)
        }
        dat3 <- fastTabRead(con,n=nLinesToRead,quote='')
        geoDataTable <- new('GEODataTable',columns=cols,table=dat3[1:(nrow(dat3)-1),])
    } 
    gpl <- new('GPL',
               header=meta,
               dataTable = geoDataTable)
}


#' @importFrom readr read_tsv
.parseGSMTxt <- function(txt) {
    tbl_begin = grep('!\\w+_table_begin',txt,perl=TRUE)
    if(length(tbl_begin>0)) {
        tbltxt = txt[(tbl_begin[1]+1):length(txt)]
        txt = txt[1:tbl_begin[1]]
        dat3 <- read_tsv(paste(tbltxt,collapse="\n"), comment='!sample_table_end',
                         guess_max = 10000, na = .na_strings)
    } else {
        # empty data table
        dat3 = data.frame()
    }

    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new('GEODataTable',columns=cols,table=as.data.frame(dat3))
    geo <- new('GSM',
               header=meta,
               dataTable = geoDataTable)
    return(geo)
}
    

#' @importFrom readr read_lines
parseGSM <- function(fname) {
    txt = read_lines(fname)
    # read_lines reads separate blank lines
    # on windows, so remove them before 
    # proceeding. NOT doing so results in 
    # the Table header being repeated as the
    # first line of the Table and test failures
    # galore.
    txt = txt[txt != '']
    return(.parseGSMTxt(txt))
}

### In memory cache for GPL objects parsed from locally cached versions of GPL SOFT files.
### It is disabled by default with options('GEOquery.inmemory.gpl'=FALSE).
GPLcache <- new.env(parent=emptyenv())


#' @importFrom readr read_tsv
.parseGPLTxt <- function(txt) {
    tbl_begin = grep('!\\w+_table_begin',txt,perl=TRUE)
    
    if(length(tbl_begin>0)) {
        tbltxt = txt[(tbl_begin[1]+1):length(txt)]
        txt = txt[1:tbl_begin[1]]
        dat3 <- suppressMessages(read_tsv(paste(tbltxt,collapse="\n"), comment='!platform_table_end',
                         guess_max = 10000, na = .na_strings))
    } else {
        # empty data table
        dat3 = data.frame()
    }
    cols <- parseGeoColumns(txt)
    meta <- parseGeoMeta(txt)
    geoDataTable <- new('GEODataTable',columns=cols,table=as.data.frame(dat3))
    geo <- new('GPL',
               header=meta,
               dataTable = geoDataTable)
    return(geo)
}


#' @importFrom readr read_lines
parseGPL <- function(fname) {
    if(getOption('GEOquery.inmemory.gpl')) {
        info <- file.info(fname,extra_cols=FALSE)
        cache <- get0(fname,envir=GPLcache,inherits=FALSE)
        ## Check if the locally cached version wasn't modified.
        if(!is.null(cache) && cache$info$mtime==info$mtime) {
            message("Using GPL object found in memory from locally cached version.")
            return(cache$gpl)
        }
    }
    txt = read_lines(fname)
    # read_lines reads separate blank lines
    # on windows, so remove them before 
    # proceeding. NOT doing so results in 
    # the Table header being repeated as the
    # first line of the Table and test failures
    # galore.
    txt = txt[txt != '']
    return(.parseGPLTxt(txt))
}
    

txtGrab <- function(regex,x) {
    x <- as.character(x)
    a <- regexpr(regex,x,perl=TRUE)
    return(substr(x,a,a+attr(a,'match.length')-1))
}

### Function wrapper to get and parse ALL
### the GSEMatrix files associated with a GSE
### into a list of ExpressionSets
getAndParseGSEMatrices <- function(GEO,destdir,AnnotGPL,getGPL=TRUE,parseCharacteristics=TRUE) {
    GEO <- toupper(GEO)
    ## This stuff functions to get the listing of available files
    ## for a given GSE given that there may be many GSEMatrix
    ## files for a given GSE.
    stub = gsub('\\d{1,3}$','nnn',GEO,perl=TRUE)
    gdsurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/matrix/'
    b = getDirListing(sprintf(gdsurl,stub,GEO))
    message(sprintf('Found %d file(s)',length(b)))
    ret <- list()
    ## Loop over the files, returning a list, one element
    ## for each file
    for(i in 1:length(b)) {
        message(b[i])
        destfile=file.path(destdir,b[i])
        if(file.exists(destfile)) {
            message(sprintf('Using locally cached version: %s',destfile))
        } else {
            download.file(sprintf('https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/matrix/%s',
                                  stub,GEO,b[i]),destfile=destfile,mode='wb',
                          method=getOption('download.file.method.GEOquery'))
        }
        ret[[b[i]]] <- parseGSEMatrix(destfile,destdir=destdir,AnnotGPL=AnnotGPL,getGPL=getGPL)$eset
    }
    return(ret)
}

#' Parse a GSE mstrix file
#'
#' Not meant for user calling, but parses
#' a single GSEMatrix file.
#'
#' @importFrom dplyr select filter mutate mutate_all
#' @importFrom tidyr gather spread separate
#' @importFrom readr read_lines
#' @importClassesFrom Biobase ExpressionSet
#' @importFrom magrittr %>%
#'
#' @param fname filename
#' @param AnnotGPL set to TRUE to get the annotation GPL version
#' @param destdir the destdination directory for download
#' @param getGPL whether or not to get the GPL associated
#' @param parseCharacteristics Whether or not to do full "characteristic" parsing
#' @keywords internal
#' 
parseGSEMatrix <- function(fname,AnnotGPL=FALSE,destdir=tempdir(),getGPL=TRUE,parseCharacteristics=TRUE) {
    dat <- read_lines(fname)
    ## get the number of !Series and !Sample lines
    series_header_row_count <- sum(grepl("^!Series_", dat))
    # In the case of ^M in the metadata (GSE781, for example), the line counts
    # for "skip" and read.table are different.
    # This next line gets the "skip" line count for use
    # below in the tmpdat reading "skip"
    sample_header_start <- grep("^!Sample_", dat)[1]
    samples_header_row_count <- sum(grepl("^!Sample_", dat))
    series_table_begin_line = grep("^!series_matrix_table_begin", dat)
    if(length(series_table_begin_line) != 1) {
        stop("parsing failed--expected only one '!series_data_table_begin'")
    }
    #con <- fileOpen(fname)
    ## Read the !Series_ and !Sample_ lines
    header <- read.table(fname,sep="\t",header=FALSE,nrows=series_header_row_count)
    tmpdat <- read.table(fname,sep="\t",header=FALSE,nrows=samples_header_row_count,
                         skip=sample_header_start-1)
    
    headertmp <- t(header)
    headerdata <- rbind(data.frame(), headertmp[-1,])
    colnames(headerdata) <- sub('!Series_','',as.character(header[,1]))
    headerlist <- lapply(split.default(headerdata, names(headerdata)),
                           function(x) {
                             as.character(Reduce(function (a,b) {paste(a,b,sep = "\n")}, x))
                           })
    
    link = "https://www.ncbi.nlm.nih.gov/geo/"
    if (!is.null(headerlist$web_link)) {
      link <- headerlist$web_link
    } else if (!is.null(headerlist$geo_accession)) {
      link <- paste(link, 'query/acc.cgi?acc=', headerlist$geo_accession, sep="")
    }
    
    ed <- new ("MIAME",
               name = ifelse(is.null(headerlist$contact_name), '', headerlist$contact_name),
               title = headerlist$title,
               contact = ifelse(is.null(headerlist$contact_email), '', headerlist$contact_email),
               pubMedIds = ifelse(is.null(headerlist$pubmed_id), '', headerlist$pubmed_id), 
               abstract = ifelse(is.null(headerlist$summary), '', headerlist$summary),
               url = link,
               other = headerlist)
    
    tmptmp <- t(tmpdat)
    sampledat <- rbind(data.frame(),tmptmp[-1,])
    colnames(sampledat) <- make.unique(sub('!Sample_','',as.character(tmpdat[,1])))
    sampledat[['geo_accession']]=as.character(sampledat[['geo_accession']])
    rownames(sampledat) = sampledat[['geo_accession']]
    ## Lots of GSEs now use "characteristics_ch1" and
    ## "characteristics_ch2" for key-value pairs of
    ## annotation. If that is the case, this simply
    ## cleans those up and transforms the keys to column
    ## names and the values to column values.
    if(length(grep('characteristics_ch',colnames(sampledat)))>0 && parseCharacteristics) {
        pd = sampledat %>%
            dplyr::select(dplyr::contains('characteristics_ch')) %>%
            dplyr::mutate(accession = rownames(.)) %>%
            # these next two lines avoid warnings due
            # to columns having different factor levels
            # (attributes).
            mutate_all(as.character) %>%
            tidyr::gather(characteristics, kvpair, -accession) %>%
            dplyr::filter(grepl(':',kvpair) & !is.na(kvpair))
        # Thx to Mike Smith (@grimbough) for this code
        # sometimes the "characteristics_ch1" fields are empty and contain no 
        # key:value pairs. spread() will fail when called on an
        # empty data_frame.  We catch this case and remove the 
        # "charactics_ch1" column instead
        if(nrow(pd)) {
            pd = dplyr::mutate(pd, characteristics=ifelse(grepl('_ch2',characteristics),'ch2','ch1')) %>%
                tidyr::separate(kvpair, into= c('k','v'), sep=":", fill = 'right', extra = "merge") %>%
                dplyr::mutate(k = paste(k,characteristics,sep=":")) %>%
                dplyr::select(-characteristics) %>%
                dplyr::filter(!is.na(v)) %>%
                dplyr::group_by(accession,k) %>%
                dplyr::mutate(v = paste0(trimws(v), collapse = ";")) %>%
                unique() %>%
                tidyr::spread(k,v)
        } else {
            pd = pd %>% 
                dplyr::select(accession)
        }
        ##     dplyr::mutate(characteristics=ifelse(grepl('_ch2',characteristics),'ch2','ch1')) %>%
        ##     dplyr::filter(grepl(':',kvpair)) %>% 
        ##     tidyr::separate(kvpair, into= c('k','v'), sep=":")
        ## if(nrow(pd)>0) {
        ##     pd = pd %>% dplyr::mutate(k = paste(k,characteristics,sep=":")) %>%
        ##         dplyr::select(-characteristics) %>%
        ##         tidyr::spread(k,v)
        sampledat = sampledat %>% dplyr::left_join(pd,by=c('geo_accession'='accession'))
    }
    
    ## used to be able to use colclasses, but some SNP arrays provide only the
    ## genotypes in AA AB BB form, so need to switch it up....
    ##  colClasses <- c('character',rep('numeric',nrow(sampledat)))
    datamat <- read_tsv(fname,quote='"',
                        na=c('NA','null','NULL','Null'), 
                        # somewhere in the past month or so, read_tsv changed
                        # the way it dealt with skip!!! Had to add the -1 to
                        # avoid the problem.
                        skip = series_table_begin_line,
                        comment = '!series_matrix_table_end',
                        skip_empty_rows = FALSE)
    tmprownames = datamat[[1]]
                                        # need the as.matrix for single-sample or empty GSE
    datamat <- as.matrix(datamat[!is.na(tmprownames),-1])
    rownames(datamat) <- tmprownames[!is.na(tmprownames)]
    datamat <- as.matrix(datamat)
    rownames(sampledat) <- colnames(datamat)
    GPL=as.character(sampledat[1,grep('platform_id',colnames(sampledat),ignore.case=TRUE)])
    ## if getGPL is FALSE, skip this and featureData is then a data.frame with no columns
    fd = new("AnnotatedDataFrame",data=data.frame(row.names=rownames(datamat)))
    if(getGPL) {
        gpl <- getGEO(GPL,AnnotGPL=AnnotGPL,destdir=destdir)
        vmd <- Columns(gpl)
        dat <- Table(gpl)
        ## GEO uses "TAG" instead of "ID" for SAGE GSE/GPL entries, but it is apparently
        ##     always the first column, so use dat[,1] instead of dat$ID
        ## The next line deals with the empty GSE
        tmpnames=character(0)
        if(ncol(dat)>0) {
          tmpnames=as.character(dat[,1])
        }
        ## Fixed bug caused by an ID being "NA" in GSE15197, for example
        tmpnames[is.na(tmpnames)]="NA"
        rownames(dat) <- make.unique(tmpnames)
        ## Apparently, NCBI GEO uses case-insensitive matching
        ## between platform IDs and series ID Refs ???
        dat <- dat[match(tolower(rownames(datamat)),tolower(rownames(dat))),]
                                        # Fix possibility of duplicate column names in the
                                        # GPL files; this is prevalent in the Annotation GPLs
        rownames(vmd) <- make.unique(colnames(Table(gpl)))
        colnames(dat) <- rownames(vmd)
        fd <- new('AnnotatedDataFrame',data=dat,varMetadata=vmd)
    }
    if(is.null(nrow(datamat))) {
        ## fix empty GSE datamatrix
        ## samplename stuff above does not work with
        ## empty GSEs, so fix here, also
        tmpnames <- names(datamat)
        rownames(sampledat) <- tmpnames
        datamat=matrix(nrow=0,ncol=nrow(sampledat))
        colnames(datamat) <- tmpnames
    } else {
        ## This looks like a dangerous operation but is needed
        ## to deal with the fact that NCBI GEO allows case-insensitive
        ## matching and we need to pick one.
        rownames(datamat) <- rownames(dat)
    }
    eset <- new('ExpressionSet',
                phenoData=as(sampledat,'AnnotatedDataFrame'),
                annotation=GPL,
                featureData=fd,
                experimentData=ed,
                exprs=as.matrix(datamat))
    return(list(GPL=as.character(sampledat[1,grep('platform_id',colnames(sampledat),ignore.case=TRUE)]),eset=eset))
}

Try the GEOquery package in your browser

Any scripts or data that you put into this service are public.

GEOquery documentation built on Nov. 8, 2020, 5:18 p.m.