R/UCSCTrackUpdateChecker.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

#############################################################
### Code to get the dir list and then filter it etc.



## helper to filter for each
.filter <- function(allTracks, badTracks){
    allTracks[!(allTracks %in% badTracks)]
}
## goodTracks <- mapply(.filter, allTracks, badTracks)


## Try a new strategy for getting ALL the dir listings at ONCE.

.goldenPathUrl <- "ftp://hgdownload.cse.ucsc.edu/goldenPath/"

###############################################
.getListing <- function(){
    listing <- .ftpDirectoryInfo(.goldenPathUrl)
    save(listing, file="listing.Rda")
    listing
}

###############################################
# Get a list of genomes from the UCSC FTP server
.getGenomeAbbrevs <- function(genomes){
    res <- .listRemoteFiles(.goldenPathUrl)
    # By arrow I mean symlinks
    dirsWithoutArrow <- sub(" -> [[:alnum:]]+/?$", "", res)
    listOfDirs <- read.table(text=dirsWithoutArrow, stringsAsFactors=FALSE)[[1]]
    # The dir list points to the current and parent, remove them
    listOfDirs <- listOfDirs[! (listOfDirs %in% c(".", "..")) ]
    listOfDirs <- listOfDirs[ listOfDirs %in% genomes ]
    listOfDirs
    }    

###############################################
## make this into a couple of functions and call
## Now I need to split that up based on subdirs of interest.
## there are empty rows whenever we switch dirs.
.getAllBreakPoints <- function(listing){
    grep('^$',listing) ## index of ALL split points
}
## subsIdx <- .getAllBreakPoints(listing)


## SECOND function
## There are a subset of other dirs that I want after some of those...
## To get these 1st make a set of strings based on genomes
.getNamedGenomeBreakPoints <- function(genomes, listing){
    genStrs1 <- paste(genomes, "/database:",sep="")
    genStrs2 <- paste("^", genomes, "/database:",sep="")
    ## and then grep to get these indices
    genDbIdx <- unlist(lapply(genStrs2, grep, listing))  ## takes a min
    names(genDbIdx) <- sub("/database:","",listing[genDbIdx])
    genDbIdx
}
## genDbIdx <- .getNamedGenomeBreakPoints(genomes2, listing)



## PROCESSES LATEST DATE AND TABLE NAME INFO FOR A GENOME
.getTableDates <- function(genome, listing, subsIdx, genDbIdx){

    ## helper takes all subsIdx and one genDbIdx
    getRange <- function(genIdx, subsIdx, listing){
        if(length(genIdx) >1){ message("too many genIdxs for: \n", genome)}
        end <- min(subsIdx[subsIdx > genIdx])
        ## if(length(end) >1){ message("too many ends for: \n", genome)}
        listing[genIdx:(end-1)]
    }
    
    dirs <- getRange(genDbIdx[names(genDbIdx) %in% genome],
                     subsIdx, listing)
    dirs <- dirs[-(1:3)]
    ## drop all but the txt.gz files.
    dirs <- dirs[grep(".txt.gz$",dirs)]

    ## Then remove unwanted stuff from the front end
    dirs <- sub("^.+ftp\\s+\\d+\\s","",dirs)
    ## remove .txt.gz from name
    dirs <- sub(".txt.gz$","",dirs)

    
    ## UCSC means for abbreviating months.
    months <- c('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep',
                'Oct','Nov','Dec')
    names(months) <- c('01','02','03','04','05','06','07','08','09','10',
                       '11','12')
    ## helper for cleanup
    dateClean <- function(str){
        ## then replace whitespaces with single tabs
        swp = gsub("\\s+","\t",str)
        ## then split on tabs
        res <- unlist(strsplit(swp, "\t"))
        if(length(grep(":",res[3]))>0){
            res[3] <- "2013"
        }
        paste(res[3],
              names(months)[months %in% res[1]],
              res[2],
              sep="-")
        ##as.POSIXct(date, tz = "GMT")
    }

    dates <- unlist(lapply(dirs, dateClean))
    nameClean <- function(str){
        swp = gsub("\\s+","\t",str)
        res <- unlist(strsplit(swp, "\t"))[4]
    }
    tables <- unlist(lapply(dirs, nameClean))
    ## hand back as data.frame
    #data.frame(dates=dates, tables=tables)
    names(tables) <- dates
    tables
}
## example of how you can use this for a particular genome.
## res <- .getTableDates(genome="hg19", listing, subsIdx, genDbIdx)



## Wrapper function to do all the work of getting and processing the
## latest date information from the FTP site.
getLatestTableDates <- function(){
    listing <- .getListing()
    allTracks <- .cachedTracks("allPossibleTracks.rda") 
    badTracks <- .cachedTracks("allBadTracks.rda")
    goodTracks <- mapply(.filter, allTracks, badTracks)
    genomes <- names(goodTracks)
    genomes2 <- .getGenomeAbbrevs(genomes)
    subsIdx <- .getAllBreakPoints(listing)
    genDbIdx <- .getNamedGenomeBreakPoints(genomes2, listing)
    curTables <- lapply(genomes2, .getTableDates, listing, subsIdx, genDbIdx)
    names(curTables) <- genomes
    curTables
}
## curTables <- getLatestTableDates()




############################################################################
## I also need the association between a table and it's tracks

## get a list of tables for a genome and track
.findTrackTables <- function(track, session){
    query <- ucscTableQuery(session, track)
    tableNames(query)
}
## session <- browserSession()
## genome(session) <- "hg19"
## res <- .findTrackTables("cytoBand", session)


## get all tables for a genome based on tracks
.getTrackTablesForGenome <- function(genome, tracks){
    ## iterate through the list to get all the tableNames for each track
    session <- browserSession()
    genome(session) <- genome
    tables <- lapply(tracks, .findTrackTables, session)
    names(tables) <- tracks
    tables
}
## tracks <- goodTracks[[1]][1:3] ## reduce to a simple vector o track names
## genome <- "hg19"
## res <- .getTrackTablesForGenome(genome, tracks)


## Actual code to make and save this will be done as a loop instead of
## the more elegant mapply just so that I can save as I go...
.getTrackTablesForAllGenomes <- function(genomes, trackLists){
    if(length(genomes) != length(trackLists)) stop("The number of genomes must equal the number of track lists.")
    genomeTrackTable <- list()
    for(i in seq_along(genomes)){
        message("Now processing: ",genomes[i])
        genomeTrackTable[[i]] <- .getTrackTablesForGenome(genomes[i],
                                                          trackLists[[i]])
        names(genomeTrackTable)[i] <- genomes[i]
        save(genomeTrackTable,file="genomeTrackTable.Rda") 
    }
    genomeTrackTable
}
## res <- .getTrackTablesForAllGenomes(genomes, goodTracks)
## Can also load this with load("genomeTrackTable.Rda")  ## it will be saved




##############################################################################
## Helpers for translating table dates to track dates.
.getLatestTrackDate <- function(track, genome, genomeTrackTable, curTables){
    tables <- genomeTrackTable[[genome]][[track]]
    dates <- curTables[[genome]][curTables[[genome]] %in% tables]
    maxDate <- max(as.POSIXct(names(dates)))
    as.character(maxDate) ## cast because there were issues storing this data.
}
## genome <- "hg19"
## track <- "stsMap"
## res <- .getLatestTrackDate(track, genome, genomeTrackTable, curTables)


getLatestTrackDates <- function(){
    curTables <- getLatestTableDates()
    genomeTrackTable <-
        .cachedTracks("genomeTrackTable.Rda") 
    ## rough sketch (for now)
    trackDates <- list()
    for(i in seq_along(names(genomeTrackTable))){
        genome <- names(genomeTrackTable[i])
        tracks <- names(genomeTrackTable[[i]])
        trackDates[[i]] <- unlist(lapply(tracks, .getLatestTrackDate,
                                         genome,  genomeTrackTable, curTables))
        names(trackDates[[i]]) <- tracks
    }
    names(trackDates) <- names(genomeTrackTable)
    trackDates
}
## This runs
## trackDates <- getLatestTrackDates()


## BUT there are strange warnings!
##  1: In max.default(structure(numeric(0), class = c("POSIXct",  ... :
##  no non-missing arguments to max; returning -Inf

## It causes me to have some NA values for some of the dates...
## This in turn is caused by a small percentage of the tracks
## (134/3573) that are misnamed in the FS presented by UCSC (named
## differently from the tracks).


### Not used:

#############################################################################
### Now I just need some code to compare this data with what is in the
### metadata
#
### So get the metadata
#getAHTrackDates <- function(){
#    ah = AnnotationHub()
#    m = AnnotationHub:::.metadata(snapshotUrl(ah),
#      filters = list(DataProvider="hgdownload.cse.ucsc.edu"),
#      cols = c("SourceFile","RDataDateAdded","DataProvider"))
#
#    ## SourceFile is easier to clean up than sourceUrl...
#    data <- strsplit(m$SourceFile,"/")
#    trackNames <- unlist(lapply(data, function(x){x[4]}))
#    genNames <- unlist(lapply(data, function(x){x[2]}))
#    if(length(trackNames) == length(genNames) &&
#       dim(m)[1] == length(trackNames)){
#        m <- cbind(as.data.frame(m), trackNames, genNames)
#    }
#
#    ## Now make m into a similar structure to trackDates
#    trx <- as.character(m$RDataDateAdded)
#    names(trx) <- trackNames
#    ## then split it
#    split(trx, f=m$genNames)
#}
### ahTrackDates <-  getAHTrackDates()

#############################################################################
### And now we can use this to write code that compares and just lists
### tracks that are out of date.
#tracksToUpdate <- function(){
#    trackDates <- getLatestTrackDates()
#    ahTrackDates <-  getAHTrackDates()
#    
#    ## go along and for each list element in ahTrackDates we will need
#    ## to look at the equivalent element from trackDates and then
#    ## compare the dates.  Dates will be not OK (TRUE for update
#    ## this), OK, or NA.
#    ## This requires a helper to test indiv, elements of a vector
#    .testDate <- function(x, other){
#        if(is.na(x)){
#            return(NA)
#        }else if(is.na(other[names(x)])){
#            return(NA)
#        }else{
#            if(as.POSIXct(x) > as.POSIXct(other[names(x)])){
#                return(FALSE)
#            }else{ ## this means it needs an update 
#                return(TRUE)
#            }
#        }
#    }
#    ## And another helper to lets us loop across genomes
#    .compareDates <- function(genomeDateSet,trackDates){
#        ah <- genomeDateSet[[1]]
#        other <- trackDates[names(genomeDateSet)][[1]]
#        ## res <- lapply(ah, .testDate, other=other) ## rips names from vector
#        res <- vector()
#        for(i in seq_along(ah)){
#            res[i] <- .testDate(ah[i], other)
#        }
#        names(res) <- names(ah)
#        res
#    }
#    
#
#    ## I have to loop along the ahTrackDates object. I can use lapply
#    ## (but not mapply), and have to pass in all of trackDates every
#    ## time.
#
#    ## And again I am burned because lapply() jetisons the names...
#    res <- list()
#    for(i in seq_along(ahTrackDates)){
#        res[[i]] <- .compareDates(ahTrackDates[i], trackDates)
#    }
#    names(res) <- names(ahTrackDates)
#
#    ## return list of vectors.
#    res
#}
### res <- tracksToUpdate()

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/UCSCTrackUpdateChecker.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions getLatestTrackDates .getLatestTrackDate .getTrackTablesForAllGenomes .getTrackTablesForGenome .findTrackTables getLatestTableDates .getTableDates .getNamedGenomeBreakPoints .getAllBreakPoints .getGenomeAbbrevs .getListing .filter

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/UCSCTrackUpdateChecker.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions getLatestTrackDates .getLatestTrackDate .getTrackTablesForAllGenomes .getTrackTablesForGenome .findTrackTables getLatestTableDates .getTableDates .getNamedGenomeBreakPoints .getAllBreakPoints .getGenomeAbbrevs .getListing .filter

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/UCSCTrackUpdateChecker.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures