R/getGEOfile.R

Defines functions gunzip getGEORaw getGEOfile

Documented in getGEOfile gunzip

#' Download a file from GEO soft file to the local machine
#' 
#' This function simply downloads a SOFT format file associated with the GEO
#' accession number given.
#' 
#' This function downloads GEO SOFT files based on accession number.  It does
#' not do any parsing.  The first two arguments should be fairly
#' self-explanatory, but the last is based on the input to the acc.cgi url at
#' the geo website.  In the default "full" mode, the entire SOFT format file is
#' downloaded.  Both "brief" and "quick" offer shortened versions of the files,
#' good for "peeking" at the file before a big download on a slow connection.
#' Finally, "data" downloads only the data table part of the SOFT file and is
#' good for downloading a simple EXCEL-like file for use with other programs (a
#' convenience).
#' 
#' @param GEO Character string, the GEO accession for download (eg., GDS84,
#' GPL96, GSE2553, or GSM10)
#' @param destdir Directory in which to store the resulting downloaded file.
#' Defaults to tempdir()
#' @param AnnotGPL A boolean defaulting to FALSE as to whether or not to use
#' the Annotation GPL information.  These files are nice to use because they
#' contain up-to-date information remapped from Entrez Gene on a regular basis.
#' However, they do not exist for all GPLs; in general, they are only available
#' for GPLs referenced by a GDS
#' @param amount Amount of information to pull from GEO.  Only applies to GSE,
#' GPL, or GSM.  See details...
#' @return Invisibly returns the full path of the downloaded file.
#'
#' @importFrom utils download.file
#'
#' 
#' @author Sean Davis
#' @seealso \code{\link{getGEO}}
#' @references http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi
#' @keywords IO
#' @examples
#' 
#'   # myfile <- getGEOfile('GDS10')
#'
#' @export
getGEOfile <- function(GEO,destdir=tempdir(),AnnotGPL=FALSE,
                       amount=c('full','brief','quick','data'))
  {
    amount <- match.arg(amount)
    geotype <- toupper(substr(GEO,1,3))
    mode <- 'wb'
    GEO <- toupper(GEO)
    stub = gsub('\\d{1,3}$','nnn',GEO,perl=TRUE)
    if (geotype == 'GDS') {
      gdsurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/datasets/%s/%s/soft/%s'
      myurl <- sprintf(gdsurl,stub,GEO,paste0(GEO,'.soft.gz'))
      destfile <- file.path(destdir,paste0(GEO,'.soft.gz'))
    }
    if (geotype == 'GSE' & amount=='full') {
      gseurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/soft/%s'
      myurl <- sprintf(gseurl,stub,GEO,paste0(GEO,'_family.soft.gz'))
      destfile <- file.path(destdir,paste(GEO,'.soft.gz',sep=""))
    }
    if (geotype == 'GSE' & amount!='full' & amount!='table') {
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft',sep=""))
      mode <- 'w'
    }
    if (geotype == 'GPL') {
      if (AnnotGPL) {
        gplurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/platforms/%s/%s/annot/%s'
        myurl <- sprintf(gplurl,stub,GEO,paste0(GEO,'.annot.gz'))
        destfile <- file.path(destdir,paste(GEO,'.annot.gz',sep=""))
        # check to see if Annotation GPL is present.  If so,
        # use it, else move on to submitter GPL
        res=try({
          if(!file.exists(destfile)) {
            download.file(myurl,destfile,mode=mode,quiet=TRUE,method=getOption('download.file.method.GEOquery'))
            message('File stored at: ')
            message(destfile)
          } else {
            message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
          }
        },silent=TRUE)
        if(!inherits(res,'try-error')) {
          return(invisible(destfile))
        } else {
          message('Annotation GPL not available, so will use submitter GPL instead')
        }
      } 
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft',sep=""))
      mode <- 'w'
      if(!file.exists(destfile)) {
        download.file(myurl,destfile,mode=mode,quiet=TRUE,method=getOption('download.file.method.GEOquery'))
        message('File stored at: ')
        message(destfile)
      } else {
        message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
      }
      return(invisible(destfile))
    }
    if (geotype == 'GSM') {
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft',sep=""))
      mode <- 'w'
    }
    if(!file.exists(destfile)) {
      download.file(myurl,destfile,mode=mode,quiet=TRUE,method=getOption('download.file.method.GEOquery'))
      message('File stored at: ')
      message(destfile)
    } else {
      message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
    }
#    if(length(grep('\\.gz',destfile,perl=TRUE))>0) {
#      gunzip(destfile,overwrite=TRUE,remove=TRUE)
#      destfile <- sub('\\.gz$','',destfile)
#    }
    invisible(destfile)
  }

getGEORaw <- function(GEO,destdir=tempdir()) {
  return(getGEOSuppFiles(GEO,baseDir=destdir))
}
                             


#' Gunzip a file
#' 
#' gunzip a file
#' 
#' This function was stripped out of R.utils due to breaking some stuff on the
#' bioconductor build machine.
#' 
#' @param filename The filename to be unzipped
#' @param destname The destination file
#' @param overwrite Boolean indicating whether or not to overwrite a destfile
#' of the same name
#' @param remove Boolean indicating whether or not to remove the original file
#' after completion
#' @param BFR.SIZE The size of the read buffer....
#' @return Invisibly, the number of bytes read.
#' @author Original author: Henrik Bengtsson
#' @seealso \code{\link{gzfile}}
#' @keywords IO
#'
#' @examples
#' 
#' # gunzip('file.gz',remove=FALSE)
#' 
#' 
#' @export
gunzip <- function(filename, destname=gsub("[.]gz$", "", filename), overwrite=FALSE, remove=TRUE, BFR.SIZE=1e7) {
  if (filename == destname) 
    stop(sprintf("Argument 'filename' and 'destname' are identical: %s", filename));
  if (!overwrite && file.exists(destname))
    stop(sprintf("File already exists: %s", destname));

  inn <- gzfile(filename, "rb");
  on.exit(if (!is.null(inn)) close(inn));

  out <- file(destname, "wb"); 
  on.exit(close(out), add=TRUE);

  nbytes <- 0;
  repeat { 
    bfr <- readBin(inn, what=raw(0), size=1, n=BFR.SIZE);
    n <- length(bfr);
    if (n == 0)
      break;
    nbytes <- nbytes + n;
    writeBin(bfr, con=out, size=1); 
  };

  if (remove) {
    close(inn);
    inn <- NULL;
    file.remove(filename);
  }
    
  invisible(nbytes);
}

Try the GEOquery package in your browser

Any scripts or data that you put into this service are public.

GEOquery documentation built on Nov. 8, 2020, 5:18 p.m.