R/trackWithAuxiliaryTableToGRangesRecipe.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

.makeAuxTable <- function(n, auxFiles, ahm){
    
     colClasses <- metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$cols
     auxFile <- auxFiles[n]
     tbl.aux <- read.table(auxFile, sep="\t", colClasses=colClasses)
     colnames(tbl.aux) <- names(colClasses)
     tbl.aux
}

.getMergeArgs <- function(n, ahm){
    metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$merge
}


## from FILES (with json)
trackWithAuxiliaryTablesToGRanges <- function(ahm)
{
     mainFile <- inputFiles(ahm)[1] ## always the 1st one? - discuss with Dan and Paul
     auxFiles <- inputFiles(ahm)[-1]
     if(!(length(mainFile) == 1)) stop("No files present in input json.") 
     if(!(length(auxFiles) >= 1)) stop("No auxiliary files listed in input json.  Wrong recipe?")

     colClasses <- metadata(ahm)$RecipeArgs$mainColClasses
     tbl.main <- read.table(gzfile(mainFile), sep="\t", header=FALSE,
                           colClasses=colClasses)
     colnames(tbl.main) <- names(colClasses)

     auxLen <- length(auxFiles)
     ## a couple for loops because we need to know 'n'...
     auxTabs <- list()
     for(i in seq_len(auxLen)){
         auxTabs[[i]] <- .makeAuxTable(i, auxFiles, ahm)
     } 
     mergeArgs <- list()
     for(i in seq_len(auxLen)){
         mergeArgs[[i]] <- .getMergeArgs(i, ahm)
     } 
     
     ## merge together uses for loop again (to concentrate result down to one thing)
     for(i in  seq_len(auxLen)){
         if(i ==1){tbl <- tbl.main}
         ## otherwise recycle
         tbl <- merge(tbl, auxTabs[[i]], by.x=mergeArgs[[i]][["byX"]],
                       by.y=mergeArgs[[i]][["byY"]],
                       all.x=TRUE)  
     }
     
     tbl <- .sortTableByChromosomalLocation(tbl)
     colnames <- colnames(tbl)
     requiredColnames <- c("seqname", "start", "end")
     stopifnot(all(requiredColnames %in% colnames))
     otherColnames <- setdiff(colnames, requiredColnames)

     ## drop any rows withouth a seqname
     tbl <- tbl[!is.na(tbl$seqname),]
     
     if("strand" %in%  otherColnames){
         gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand))
         otherColnames <- setdiff(colnames, c(requiredColnames,"strand"))
     }else{  
         gr <- with(tbl, GRanges(seqname, IRanges(start, end)))
     }
     
     mcols(gr) <- DataFrame(tbl[, otherColnames])

        # add seqlength & chromosome circularity information
    newSeqInfo <- constructSeqInfo(metadata(ahm)$Species,
                                    metadata(ahm)$Genome)
        # if gr only has a subset of all possible chromosomes,
        # then update those only
    seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))]

    save(gr, file=outputFile(ahm))
    if (!getOption("AnnotationHub_Use_Disk", FALSE)) {
        upload_to_S3(outputFile(ahm), metadata(ahm)$RDataPath)
    }

    outputFile(ahm)

} # trackWithAuxiliaryTableToGRanges
#-------------------------------------------------------------------------------




## helper to remove 'id' col
.removeId <- function(table){ 
    newColnames <- setdiff(colnames(table), "id")
    table[,newColnames]
}
## compress a whole table one col at a time.
## This (currently) assumes all cols should be characters
.compressTable <- function(table, levels){
    sf <- factor(table$id,levels=levels)
    table <- .removeId(table)
    res <- DataFrame()
    for(i in seq_len(ncol(table))){ ## for ea. column
        col <- splitAsList(as.character(table[[i]]), f=sf)
        if(i==1){
            res <- DataFrame(col)
        }else{
            res <- DataFrame(res, DataFrame(col)) ## cbind doesn't work?
        }
    }
    colnames(res) <- colnames(table)
    res
}


.makeComplexGR <- function(tbl,auxFiles,auxTabs){
    ## replace "chrom" with "seqnames".
    colnames(tbl)[colnames(tbl) %in% "chrom"] <- "seqname"
    colnames(tbl)[colnames(tbl) %in% "chromStart"] <- "start"
    colnames(tbl)[colnames(tbl) %in% "chromEnd"] <- "end"
    
    tbl <-.sortTableByChromosomalLocation(tbl)
    colnames <- colnames(tbl)
    requiredColnames <- c("seqname", "start", "end")
    stopifnot(all(requiredColnames %in% colnames))
    otherColnames <- setdiff(colnames, requiredColnames)
    
    ## drop any rows withouth a seqname
    tbl <- tbl[!is.na(tbl$seqname),]
    
    if("strand" %in%  otherColnames){
        gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand))
        otherColnames <- setdiff(colnames, c(requiredColnames,"strand"))
    }else{  
        gr <- with(tbl, GRanges(seqname, IRanges(start, end)))
    }
    ## append the initial mcols
    mcols(gr) <- DataFrame(tbl[, otherColnames])
    
    
    
    ## make a spliting factor based on the initial table
    splitFactor <- factor(tbl$id, levels=tbl$id)
    
    for(i in seq_along(auxFiles)){
        new <- auxTabs[[i]]
        if(identical(as.character(splitFactor), as.character(new$id))){
            ## Add it in
            mcols(gr) <- DataFrame(mcols(gr), .removeId(new))
        }else{## otherwise compress it 1st               
            mcols(gr) <- DataFrame(mcols(gr),.compressTable(new,
                                                    levels(splitFactor)))
        }
    }
    gr
}

## Track AND auxiliary tables.
## Unfortunately, the schemas for some tracks are complex.
## This means that in the future I will have to use ucscSchema etc. to
## get the addional information so that I can properly assemble them.
## For now, we will check for "id" and only proceed if all tables have this.
trackandTablesToGRangesRecipe <- function(ahm)
{
    session <- browserSession()
    genome <- metadata(ahm)$Genome
    genome(session) <- genome
    sourceFile <- metadata(ahm)$SourceFile
    track <- sub("^.+/database/","",sourceFile)
    query <- ucscTableQuery(session, track)
    tableNames <- tableNames(query)
    
    mainFile <- tableNames[1] ## always the 1st one to be 2main table 
    auxFiles <- tableNames[-1]    
    if(!(length(auxFiles) >= 1)) { ## this means we are done already
        gr <- track(query)
    }else{ ## have to do a merge 1st
        ## have to "get" primary in table form to assure "id" will be present
        tbl <- getTable(ucscTableQuery(session, mainFile))

        ## Now get the other tables
        auxTabs <- list()
        for(i in seq_along(auxFiles)){
            ## query <- ucscTableQuery(session, track)
            tableName(query) <- auxFiles[i]
            auxTabs[[i]] <- getTable(query)
        }

        allColNames <- list()
        allColNames[[1]] <- colnames(tbl)
        for(i in seq_len(length(auxTabs))){
            idx <- i+1
            #print(idx)
            allColNames[[idx]] <- colnames(auxTabs[[i]])
        }
        ## for each element is there a value called "id"?
        idPresent <- unlist(lapply(allColNames, function(x){'id' %in% x}))
        
        if(all(idPresent)){
            gr <- .makeComplexGR(tbl,auxFiles,auxTabs)
        }else{
            message("track schema is too complex: using basic track instead")
            query <- ucscTableQuery(session, track)
            gr <- track(query)
        }
        
    }
    
    
##     ## add seqlength & chromosome circularity information
##     newSeqInfo <- constructSeqInfo(metadata(ahm)$Species,
##                                     metadata(ahm)$Genome)
##     ## if gr only has a subset of all possible chromosomes,
##     ## then update those only
##     seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))]
    save(gr, file=outputFile(ahm))
    outputFile(ahm)

} # trackandTablesToGRangesRecipe
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/trackWithAuxiliaryTableToGRangesRecipe.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions trackandTablesToGRangesRecipe .makeComplexGR .compressTable .removeId trackWithAuxiliaryTablesToGRanges .getMergeArgs .makeAuxTable

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/trackWithAuxiliaryTableToGRangesRecipe.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions trackandTablesToGRangesRecipe .makeComplexGR .compressTable .removeId trackWithAuxiliaryTablesToGRanges .getMergeArgs .makeAuxTable

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/trackWithAuxiliaryTableToGRangesRecipe.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures