R/makeUCSCTracks.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

# -----------------------------------------------------------------------------
## Import UCSC track to GRanges
## 

trackToGRangesRecipe <- function(ahm)
{
    outputFile <- outputFile(ahm)
    if (!file.exists(outputFile)) {
        trackName <- basename(metadata(ahm)$SourceFile)
        session <- browserSession()
        genome(session) <- metadata(ahm)$Genome
        query <- ucscTableQuery(session, trackName)
        gr <- track(query)
        save(gr, file=outputFile)
        ## if (!getOption("AnnotationHub_Use_Disk", FALSE)) {
        ##     upload_to_S3(outputFile, metadata(ahm)$RDataPath)
        ## }
    }
    outputFile(ahm)
}

## -----------------------------------------------------------------------------
## Memo-ize some web access, to avoid unnecessary internet traffic
## 

.ucscSession <- local({
    session <- NULL
    function() {
        if (is.null(session)) {
            session <<- browserSession()
        }
        session
    }
})

.GRangesForUSCSGenome <- local({
    env <- new.env(parent=emptyenv())
    function(genome) {
        if (!exists(genome, envir=env))
            env[[genome]] <- GRangesForUCSCGenome(genome)
        env[[genome]]
    }
})

.ucscTableQuery <- local({
    env <- new.env(parent=emptyenv())
    function(genome, trackName) {
        if (!exists(genome, envir=env)) {
            range <- .GRangesForUSCSGenome(genome)
            env[[genome]] <- ucscTableQuery(.ucscSession(), trackName, range)
        } else {
            trackName(env[[genome]]) <- trackName
        }
        env[[genome]]
    }
})

###########################################################################
## Pre-Processing code to allow saving of "bad" tracks that will not load...
## Run these helper functions ahead of time and save output objects

## This function takes about 20 + minutes to run.  The next two (for
## finding bad tracks) take overnight.
.getTracksForGenomes <- function(genomes, session){

    res <- list()
    ## Temp only look at a couple genomes
    ## genomes <- genomes[1:3]
    ## This step alone takes a very long time.
    for (i in seq_along(genomes)){
        genome(session) <- genomes[[i]]
        res[[i]] <- trackNames(session)
    }
    
    ## return a list of genomes with a character of tracks for each.
    names(res) <- genomes
    ## save(res, file="allPossibleTracks.rda") ## auto-save
    res
}
## Usage:
## genomes <- ucscGenomes()$db; session <- browserSession()
## tracks <- .getTracksForGenomes(genomes, session)


## Need a helper to find "bad tracks" for a genome.
.findBadTracks <- function(tracks, genome){
    session <- .ucscSession()
    genome(session) <- genome    
    errors <- list()
    for(i in seq_along(tracks)){
        errors[[i]] <- tryCatch({
            ucscTableQuery(session, tracks[[i]])
        }, error = function(err){return(err)})
    }
    idx = unlist(lapply(errors, is, "error"))
    badTracks <- tracks[idx]
    ## save(badTracks, file=paste0(genome,"BadTracks.rda")) ## safety net
    badTracks
}
## Takes a vector of possible tracks, and a genome (as a string)
## Usage: (after calling:  tracks <- .getTracksForGenomes(genomes, session) )
## badTracks = .findBadTracks(tracks[[1]], "hg19")

## helper to find bad tracks for ALL the genomes (this takes all night)
.getBadTracksForGenomes <- function(genomes, tracksList){
    allBadTracks <- list()
    ## This step takes a lot of very long times.
    for(i in seq_along(genomes)){
        allBadTracks[[i]] <- .findBadTracks(tracksList[[i]], genomes[[i]])
    }
    ## return a list of genomes with a character of tracks for each.
    names(allBadTracks) <- genomes
    ## save(allBadTracks, file="allBadTracks.rda") ## auto-save
    allBadTracks
}
## just gets the bad tracks for all the genomes (should take ages to run)
## Usage: (after calling:  tracks <- .getTracksForGenomes(genomes, session))
## badTracks = .getBadTracksForGenomes(genomes, tracks)


##############################################################################
##############################################################################
##############################################################################

##############################################################################
## Code to import tracks from UCSC and to make them into AHMs
UCSCTrackImportPreparer <-
    setClass("UCSCTrackImportPreparer", contains="ImportPreparer")

UCSCFullTrackImportPreparer <-
    setClass("UCSCFullTrackImportPreparer", contains="ImportPreparer")


## To store data on each track:
## ftp://hgdownload.cse.ucsc.edu/goldenpath/<genome name>/database/<track name>

## SO: hg19, track oreganno looks like this.
## ftp://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/oreganno
## The reason is that the 1st part (up to oreganno) is actual location
## of ftp data the oreganno data will be there in several files dumped
## from the DB.

.checkAllTracks <- function(allTracks) {
    species <-
        GenomicFeatures:::lookup_organism_by_UCSC_genome(names(allTracks))
    if (any(idx <- is.na(species))) {
        badSpecies <- names(species)[idx]
        stop("update GenomicFeatures:::lookup_organism_by_UCSC_genome",
             " to support ", paste(sQuote(badSpecies), collapse=","))
    }
    TRUE
}

## Helper for loading tracks
.cachedTracks <- function(filename) {
    loadFile <- system.file("extdata","badUCSCTracks", filename,
                            package = "AnnotationHubData")
    x <- load(loadFile)
    get(x)
}

.UCSCTrackSourceTracks <- function(){
    ## retrieve all possible tracks from UCSC
    genomes <- ucscGenomes()$db

    ## get the tracks for each genome. (pre-computed)
    allTracks <- .cachedTracks("allPossibleTracks.rda")
    badTracks <- .cachedTracks("allBadTracks.rda")

    ## check that we can know all species names for all these tracks.
    .checkAllTracks(allTracks)
    
    ## Now I just have to merge the results of these two things
    Map(function(a, b) a[!(a %in% b)], allTracks, badTracks)
}





## FIXME: does not pass BiocVersion
## STEP 1: make a function to process metadata into AHMs
.UCSCTrackMetadata <-
    function(type = c("FULL", "TRACKONLY"))
{
    ## just process all the sourceTracks
    sourceTracks <- .UCSCTrackSourceTracks()
    
    type <- match.arg(type)
    recipe <- switch(type, FULL="trackandTablesToGRangesRecipe",
                     TRACKONLY="trackToGRangesRecipe",
                     stop("No valid type argument"))
    
    ## To store data on each track:
    ## ftp://hgdownload.cse.ucsc.edu/goldenpath/<genome>/database/<track>

    genome <- rep(names(sourceTracks), sapply(sourceTracks,length))
    track <- unlist(sourceTracks, use.names=FALSE)
    names(track) <- genome    
    trackName <- unlist(lapply(sourceTracks, names), use.names=FALSE)

    ## This really has to be the same for both. (parsed later on for trackName)
    sourceFile <- paste0("goldenpath/", genome, "/database/", track)
    
    ## customize name and description depending if it's the full track or not
    description <-
        paste0("GRanges object from UCSC track ", sQuote(trackName))
    if (type=="FULL")
        description <- paste0(description, ", with additional tables")

    sourceUrl <- paste0("rtracklayer://hgdownload.cse.ucsc.edu/", sourceFile)
    title <- trackName
    species <-  unname(GenomicFeatures:::lookup_organism_by_UCSC_genome(genome))

    sourceVersion <- genome
    stockTags <- c("UCSC", "track", "Gene", "Transcript", "Annotation")
    tags <- lapply(track, c, stockTags)

    uspecies <- unique(species)
    utaxid <- vapply(uspecies, GenomeInfoDb:::lookup_tax_id_by_organism,
                     integer(1))
    taxonomyId <- utaxid[match(species, uspecies)]

    ## use Map to make all these from vectors

    Map(AnnotationHubMetadata,
        Description=description,
        Genome=genome,
        SourceFile=sourceFile,
        SourceUrl=sourceUrl,
        SourceVersion=sourceVersion,
        Species=species,
        TaxonomyId=taxonomyId,
        Title=title,
        Tags=tags,
        MoreArgs=list(
          ## AnnotationHubRoot=NA_character_,
          Coordinate_1_based = TRUE,
          DataProvider = "hgdownload.cse.ucsc.edu",
          Maintainer = "Marc Carlson <mcarlson@fhcrc.org>",
          RDataClass = "GRanges",
          RDataDateAdded = format(Sys.time(), "%Y-%m-%d GMT"),
          RDataVersion = "0.0.1",
          Recipe = c(recipe, package="AnnotationHubData"),
          SourceMd5=NA_character_,
          SourceSize=NA_real_))
}

## Open questions:
## 1) How did I decide which method to call (which recipe?) before?  (methods)
## 2) make sure that allGoodTracks is used when we make the metadata, and that the AHMs are made based on the answer to #1.


## STEP 2: Make a recipe function
## In this case these live in: trackToGRangesRecipe.R and
## trackWithAuxiliaryTableToGRangesRecipe.R



## STEP 3:  Call the helper to set up the newResources() method
makeAnnotationHubResource("UCSCTrackImportPreparer",
                          .UCSCTrackMetadata(type="TRACKONLY"))

makeAnnotationHubResource("UCSCFullTrackImportPreparer",
                          .UCSCTrackMetadata(type="FULL"))







################################################################################
## Then comment all that is below this:
################################################################################

## TODO: find an alternative way to get the extra stuff below called.  (there should be some other examples by now).
## Also the code at the top of these two methods does not even look 'correct' (subsetting issues)

## ## method for track only recipe
## setMethod(newResources, "UCSCTrackImportPreparer",
##     function(importPreparer, currentMetadata=list(), numberGenomesToProcess=NULL,
##              ...)
## {
    
##     allGoodTracks <- .UCSCTrackSourceTracks()
##     if( is.null(numberGenomesToProcess)){
##         sourceTracks <- allGoodTracks
##     }else{
##         sourceTracks <- allGoodTracks[numberGenomesToProcess]
##     }

##     ## filter known    
## ##     knownTracks <- sapply(currentMetadata, function(elt) {
## ##         sub("^.+/database/","",(metadata(elt)@SourceFile) 
## ##     })
## ##     sourceTracks <- sourceTracks[!sourceTracks %in% knownTracks]
   
##     ## AnnotationHubMetadata
##     .UCSCTrackMetadata(sourceTracks, type="TRACKONLY")
## })


## ## For full tracks
## setMethod(newResources, "UCSCFullTrackImportPreparer",
##     function(importPreparer, currentMetadata=list(), numberGenomesToProcess=NULL,
##              ...)
## {
    
##     allGoodTracks <- .UCSCTrackSourceTracks()
##     if( is.null(numberGenomesToProcess)){
##         sourceTracks <- allGoodTracks
##     }else{
##         sourceTracks <- allGoodTracks[1:numberGenomesToProcess]
##     }

##     ## filter known    
## ##     knownTracks <- sapply(currentMetadata, function(elt) {
## ##         sub("^.+/database/","",(metadata(elt)@SourceFile) 
## ##     })
## ##     sourceTracks <- sourceTracks[!sourceTracks %in% knownTracks]

##     ## AnnotationHubMetadata
##     .UCSCTrackMetadata(sourceTracks, type="FULL")
## })
















## I need to be able to learn what the files are that are associated
## with each track in an automatic way.  Maybe I can use rtracklayer?
## Or maybe I will just have to look for common names in the dir
## listed?

## ?trackName
## library(rtracklayer)
## session <- browserSession()
## head(ucscGenomes())
## genome(session) <- "hg19"
## head(trackNames(session))


## Already can do stuff like this:
## query <- ucscTableQuery(session, "Conservation",
##                         GRangesForUCSCGenome("mm9", "chr12",
##                                              IRanges(57795963, 57815592)))
## track(query)  # gets a GRanges object


## So does this work with my example track?  Well only "kinda".
## You get the main table alright, but you don't get the rest.
## query <- ucscTableQuery(session, "oreganno")
## foo = track(query)  


## But here is something that is more useful (potentially)
## tableNames(query)  ## lists table names
## This is good so long as table names match the filenames...

## or how about just doing this:
## tableName(query) <- "oregannoAttr"
## bar  = getTable(query)

## ?ucscGenomes




## There are some other issues: (some tracks are listed but are also unknown).
## BUT: these failure tracks seem to be consistently failing.

## So for example: c("ruler", "oligoMatch","cutters") all fail on hg19 and hg16.
## And others too?  (untested).  This suggests that I can probably
## just make a black list and use that to prevent certain things.

##

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeUCSCTracks.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions .UCSCTrackMetadata .UCSCTrackSourceTracks .cachedTracks .checkAllTracks .getBadTracksForGenomes .findBadTracks .getTracksForGenomes trackToGRangesRecipe

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/makeUCSCTracks.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions .UCSCTrackMetadata .UCSCTrackSourceTracks .cachedTracks .checkAllTracks .getBadTracksForGenomes .findBadTracks .getTracksForGenomes trackToGRangesRecipe

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeUCSCTracks.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures