R/GEO.R
In ChIPseeker: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

Documented in downloadGEObedFiles downloadGSMbedFiles getGEOgenomeVersion getGEOInfo getGEOspecies

########################################
##                                    ##
## data last update: Mar 03, 2015     ##
##                                    ##
########################################



##' accessing species statistics collecting from GEO database
##'
##'
##' @title getGEOspecies
##' @return data.frame
##' @author G Yu
##' @export
getGEOspecies <- function() {
    gsminfo <- get_gsminfo()
    species <- gsminfo$organism
    res <- as.data.frame(table(species))
    return(res)
}

##' get genome version statistics collecting from GEO ChIPseq data
##'
##'
##' @title getGEOgenomeVersion
##' @return data.frame
##' @author G Yu
##' @export
getGEOgenomeVersion <- function() {
    gsminfo <- get_gsminfo()
    gv <- gsminfo[, c("organism",
                      "genomeVersion")]
    genomeVersion <- gv$genomeVersion
    res <- as.data.frame(table(genomeVersion))
    gv <- unique(gv)

    res <- merge(gv, res, by.x="genomeVersion", by.y="genomeVersion", all.y=TRUE)
    res <- res[, c("organism", "genomeVersion", "Freq")]
    return(res)
}

##' get subset of GEO information by genome version keyword
##'
##'
##' @title getGEOInfo
##' @param genome genome version
##' @param simplify simplify result or not
##' @return data.frame
##' @author G Yu
##' @export
getGEOInfo <- function(genome, simplify =TRUE) {
    gsminfo <- get_gsminfo()
    genomeVersion <- NULL ## to satisfy codetools
    res <- subset(gsminfo, subset = genomeVersion == genome)
    if (simplify) {
        res <- res[,c("series_id", "gsm", "organism", "title", "supplementary_file", "genomeVersion", "pubmed_id")]
    }
    return(res)
}

##' download all BED files of a particular genome version
##'
##'
##' @title downloadGEObedFiles
##' @param genome genome version
##' @param destDir destination folder
##' @return NULL
##' @author G Yu
##' @export
downloadGEObedFiles <- function(genome, destDir=getwd()) {
    info <- getGEOInfo(genome)
    downloadGEO.internal(info, destDir)
}

##' download BED supplementary files of a list of GSM accession numbers
##'
##'
##' @title downloadGSMbedFiles
##' @param GSM GSM accession numbers
##' @param destDir destination folder
##' @return NULL
##' @author G Yu
##' @export
downloadGSMbedFiles <- function(GSM, destDir=getwd()) {
    gsminfo <- get_gsminfo()
    info <- gsminfo[gsminfo$gsm %in% GSM,]
    downloadGEO.internal(info, destDir)
}

##' @importFrom utils download.file
downloadGEO.internal <- function(info, destDir) {
    fnames <- as.character(info$supplementary_file)
    destfiles <- sub(".*\\/", paste(destDir, "/", sep=""), fnames)
    names(destfiles) <- NULL

    for (i in seq_along(fnames)) {
        if ( ! file.exists(destfiles[i]) )
            tryCatch(download.file(fnames[i],
                          destfile=destfiles[i],
                          mode="wb"),
                     error = function(e) message(fnames[i], ': file not found and skip'))
    }
}

##' @importFrom utils data
## @importFrom GEOmetadb
## @importFrom RSQLite dbConnect
## @importFrom RSQLite dbGetQuery
prepareGSMInfo <- function() {
    pkg <- "GEOmetadb"
    require(pkg, character.only=TRUE)
    getSQLiteFile <- eval(parse(text="getSQLiteFile"))
    ## get the latest version of sql file
    is.dl <- tryCatch(getSQLiteFile(), error = function(e) NULL)

    if (is.null(is.dl)) {
        url <- 'http://starbuck1.s3.amazonaws.com/sradb/GEOmetadb.sqlite.gz'
        HEAD <- eval(parse(text = "httr::HEAD"))
        hh <- HEAD(url)
        size <- hh$headers[["content-length"]]
        cmd <- paste('wget -c', url)
        while(file.info("GEOmetadb.sqlite.gz")$size < size) {
            system(cmd)
        }
        if (file.exists('GEOmetadb.sqlite') && file.exists('GEOmetadb.sqlite.gz')) {
            file.remove("GEOmetadb.sqlite")
        }
        system('gunzip GEOmetadb.sqlite.gz')
    }

    GEOmetadbFile="GEOmetadb.sqlite"
    file.info(GEOmetadbFile)

    sqlpkg <- "RSQLite"
    require(sqlpkg, character.only=TRUE)
    dbConnect <- eval(parse(text="dbConnect"))
    dbGetQuery <- eval(parse(text="dbGetQuery"))
    SQLite <- eval(parse(text="SQLite"))

    con <- dbConnect(SQLite(),GEOmetadbFile)
    ## dbListTables(con)


    pkg <- "GEOquery"
    require(pkg, character.only=TRUE)
    getGEO <- eval(parse(text="getGEO"))
    Meta <- eval(parse(text="Meta"))

    ## get all GPL IDs
    ## download soft using gpl = getGEO("GPLXXX")
    ## using Meta(gpl) find the technology match sequencing
    ## get all gsm IDs
    ## parse it

    gpl <- dbGetQuery(con, 'select gpl, technology from gpl')
    gpl <- gpl[gpl[,2] == "high-throughput sequencing",1]
    gpl <- gpl[!is.na(gpl)]

    ## save the processedGSM vector that contain all the GSM that have been processed.
    ## next time when preparing GSMInfo, filter those have been processed before.
    load(system.file("extdata/processedGSM.rda", package="ChIPseeker"))
    processedGSM <- get("processedGSM")
    newGSM <- c()

    gpldir <- "GPL"
    if (!file.exists(gpldir)) {
        dir.create(gpldir)
    }

    for (gid in gpl) {
        gg <- tryCatch(getGEO(gid, destdir=gpldir), error=function(e) NULL)
        if (is.null(gg)) {
            next
        }
        gsm <- Meta(gg)$sample_id
        gsm <- gsm[! (gsm %in% processedGSM) ]
        if (length(gsm) == 0) {
            next
        }
        newGSM <- c(newGSM, gsm)

        sf <- batchGetGSMsuppFile(gsm)
        if (!is.null(sf)) {
            save(sf, file=paste(gid, "_sf.rda", sep=""))
        }
    }

    processedGSM <- c(processedGSM, newGSM)
    processedGSM <- unique(processedGSM)
    save(processedGSM, file="../processedGSM.rda", compress="xz")


    sfiles <- list.files(pattern="_sf.rda")
    res <- data.frame(gsm=NULL, remoteFile=NULL)
    for (ff in sfiles) {
        load(ff)
        if (!is.null(sf)) {
            res <- rbind(res, sf)
        }
    }
    colnames(res)[2] <- "supplementary_file"


    GSMInfo <- lapply(unique(as.character(res$gsm)), function(i) {
        dbGetQuery(con,paste("select gsm,series_id,gpl,organism_ch1,title,characteristics_ch1,source_name_ch1,extract_protocol_ch1,description,data_processing,submission_date ",
                             "from gsm where gsm='", i, "'", sep=""))
    })

    GSMInfo <- do.call("rbind", GSMInfo)

    colnames(GSMInfo) <- sub("_ch1", "", colnames(GSMInfo))

    gsminfo <- merge(GSMInfo, res, by.x="gsm", by.y="gsm")

    tryCatch(utils::data("ucsc_release", package="ChIPseeker"))
    ucsc_release <- get("ucsc_release")

    genVer <- lapply(1:nrow(gsminfo), function(i)
                     getGenomicVersion(ucsc_release,
                                       gsminfo[i, "data_processing"],
                                       gsminfo[i, "organism"],
                                       gsminfo[i, "supplementary_file"])
                     )

    gsminfo$genomeVersion <- unlist(genVer)

    gse <- as.character(gsminfo$series_id)
    pubmed <- lapply(gse, function(i) {
        dbGetQuery(con,paste("select gse,pubmed_id ",
                             "from gse where gse='", i, "'", sep=""))
    })
    pm <- do.call(rbind, pubmed)
    pm <- unique(pm)
    gsminfo <- merge(gsminfo, pm, by.x="series_id", by.y="gse", all.x=TRUE)

    ## remove non-ASCII characters
    for(i in 1:ncol(gsminfo)) {
        gsminfo[,i] = iconv(gsminfo[,i], "latin1", "ASCII", sub="")
    }
    gsminfo2 <- gsminfo
    rm(gsminfo)

    utils::data("gsminfo", package="ChIPseeker")
    gsminfo <- get("gsminfo")
    gsminfo <- rbind(gsminfo, gsminfo2)
    gsminfo <- unique(gsminfo)

    save(gsminfo, file="../gsminfo.rda", compress="xz")
}


getGenomicVersion <- function(ucsc_release, data_processing, organism, supplementary_file) {
    data_processing <- as.character(data_processing)
    organism <- as.character(organism)
    supplementary_file <- as.character(supplementary_file)

    species <- NULL
    gs <- subset(ucsc_release, subset = species == organism)
    if (nrow(gs) == 0) return(NA)

    genMatch <- unlist(sapply(gs$ucsc_version, grep, data_processing))
    if (length(genMatch) == 0) {
        genMatch <- unlist(sapply(gs$ucsc_version, grep, supplementary_file))
        if (length(genMatch) == 0) {
            return(NA)
        }
    }

    genVer <- names(genMatch)
    if (length(genVer) > 1) {
        genVer <- genVer[1]
    }

    return(genVer)
}

## getGSE_ENCODE <- function() {
##     encode=readLines("http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html")
##     encode.chipseq <- encode[grep("ChIP-Seq", encode)]
##     ## require(gsubfn)
##     ## gse <- sapply(encode.chipseq, function(i) {
##     ##     res <- strapply(i, "(GSE\\d+)")
##     ##     unique(unlist(res))
##     ## })
##     gse <- sapply(encode.chipseq, gsub, pattern='.*(GSE\\d+).*', replacement='\\1')
##     names(gse) <- NULL
##     return(gse)
## }

## GSE2GSM <- function(GSE) {
##     info <- getGEO(GSE, GSEMatrix=FALSE)
##     metaInfo <- Meta(info)
##     gsm <- metaInfo$sample_id
##     return(gsm)
## }

##' @importFrom parallel mclapply
##' @importFrom parallel detectCores
batchGetGSMsuppFile <- function(gsm) {
    suppfiles <- mclapply(seq_along(gsm), function(i) {
        cat("processing ", gsm[i], "\t",  i , " of ", length(gsm), "\n")
        tryCatch(getGSMsuppFile(gsm[i]), error=function(e) NULL)
    }, mc.cores=detectCores())

    suppfiles <- suppfiles[!unlist(lapply(suppfiles, is.null))]

    sf <- do.call("rbind", suppfiles)
    return(sf)
}

getGSMsuppFile <- function(GSM) {
    pkg <- "GEOquery"
    require(pkg, character.only=TRUE)
    getGEO <- eval(parse(text="getGEO"))
    Meta <- eval(parse(text="Meta"))

    destdir="geo_soft"
    if (!file.exists(destdir)) {
        dir.create(destdir)
    }
    info <- getGEO(GSM, GSEMatrix=FALSE, destdir=destdir)
    ## http://www.ncbi.nlm.nih.gov/geo/info/soft2.html
    metaInfo <- Meta(info)

    ## suppmentary file names
    fnames <- unlist(metaInfo[grep("supplementary", names(metaInfo))])
    names(fnames) <- NULL
    i <- c(grep("bed.gz", fnames), grep("Peak.gz", fnames), grep("bedGraph.gz", fnames))

    if (length(i) == 0) {
        message("No bed files found")
        return(NULL)
    }
    fnames <- fnames[i]
    res <- data.frame(gsm=GSM, remoteFile = fnames)
    return(res)
}

get_gsminfo <- function() {
    tryCatch(utils::data("gsminfo", package="ChIPseeker"))
    gsminfo <- get("gsminfo")
    return(gsminfo)
}

Any scripts or data that you put into this service are public.

ChIPseeker documentation built on March 6, 2021, 2 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ChIPseeker
ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

R/GEO.R
In ChIPseeker: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

Defines functions get_gsminfo getGSMsuppFile batchGetGSMsuppFile getGenomicVersion prepareGSMInfo downloadGEO.internal downloadGSMbedFiles downloadGEObedFiles getGEOInfo getGEOgenomeVersion getGEOspecies

Documented in downloadGEObedFiles downloadGSMbedFiles getGEOgenomeVersion getGEOInfo getGEOspecies

Try the ChIPseeker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPseeker ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

R/GEO.R In ChIPseeker: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

Defines functions get_gsminfo getGSMsuppFile batchGetGSMsuppFile getGenomicVersion prepareGSMInfo downloadGEO.internal downloadGSMbedFiles downloadGEObedFiles getGEOInfo getGEOgenomeVersion getGEOspecies

Documented in downloadGEObedFiles downloadGSMbedFiles getGEOgenomeVersion getGEOInfo getGEOspecies

Try the ChIPseeker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPseeker
ChIPseeker for ChIP peak Annotation, Comparison, and Visualization

R/GEO.R
In ChIPseeker: ChIPseeker for ChIP peak Annotation, Comparison, and Visualization