R/Methods.R

Defines functions dropProperty .toSaf getUTRsByTranscript .transcriptLengths .ensVersionFromSourceUrl .reduceAH validateEnsDb

##***********************************************************************
##
##     Methods for EnsDb classes
##
##***********************************************************************
setMethod("show", "EnsDb", function(object) {
    if (is.null(object@ensdb)) {
        cat("Dash it! Got an empty thing!\n")
    } else {
        info <- dbGetQuery(object@ensdb, "select * from metadata")
        cat("EnsDb for Ensembl:\n")
        if (inherits(object@ensdb, "SQLiteConnection"))
            cat(paste0("|Backend: SQLite\n"))
        if (inherits(object@ensdb, "MySQLConnection"))
            cat(paste0("|Backend: MySQL\n"))
        for (i in 1:nrow(info)) {
            cat(paste0("|", info[ i, "name" ], ": ",
                       info[ i, "value" ], "\n"))
        }
        ## gene and transcript info.
        cat(paste0("| No. of genes: ",
                   dbGetQuery(object@ensdb,
                              "select count(distinct gene_id) from gene")[1, 1],
                   ".\n"))
        cat(paste0("| No. of transcripts: ",
                   dbGetQuery(object@ensdb,
                              "select count(distinct tx_id) from tx")[1, 1],
                   ".\n"))
        if (hasProteinData(object))
            cat("|Protein data available.\n")
        flts <- .activeFilter(object)
        if (is(flts, "AnnotationFilter") | is(flts, "AnnotationFilterList")) {
            cat("|Active filter(s):\n")
            show(flts)
        }
    }
})

############################################################
## organism
setMethod("organism", "EnsDb", function(object){
    Species <- .getMetaDataValue(object@ensdb, "Organism")
    ## reformat the e.g. homo_sapiens string into Homo sapiens
                                        #
    Species <- gsub(Species, pattern="_", replacement=" ", fixed=TRUE)
    Species <- .organismName(Species)
    return(Species)
})

############################################################
## metadata
setMethod("metadata", "EnsDb", function(x, ...){
    Res <- dbGetQuery(dbconn(x), "select * from metadata")
    return(Res)
})

############################################################
## Validation
##
validateEnsDb <- function(object){
    ## check if the database contains all required tables...
    if(!is.null(object@ensdb)){
        msg <- validMsg(NULL, NULL)
        OK <- dbHasRequiredTables(object@ensdb)
        if (is.character(OK))
            msg <- validMsg(msg, OK)
        OK <- dbHasValidTables(object@ensdb)
        if (is.character(OK))
            msg <- validMsg(msg, OK)
        if (hasProteinData(object)) {
            OK <- dbHasRequiredTables(
                object@ensdb,
                tables = .ensdb_protein_tables(dbSchemaVersion(dbconn(object))))
            if (is.character(OK))
                msg <- validMsg(msg, OK)
            OK <- dbHasValidTables(
                object@ensdb,
                tables = .ensdb_protein_tables(dbSchemaVersion(dbconn(object))))
            if (is.character(OK))
                msg <- validMsg(msg, OK)
            cdsTx <- dbGetQuery(dbconn(object),
                                "select tx_id, tx_cds_seq_start from tx");
            if (is.character(cdsTx$tx_cds_seq_start)) {
                suppressWarnings(
                    cdsTx[, "tx_cds_seq_start"] <- as.numeric(cdsTx$tx_cds_seq_start)
                )
            }
            cdsTx <- cdsTx[!is.na(cdsTx$tx_cds_seq_start), "tx_id"]
            protTx <- dbGetQuery(dbconn(object),
                                 "select distinct tx_id from protein")$tx_id
            if (!all(cdsTx %in% protTx))
                msg <- validMsg(msg, paste0("Not all transcripts with a CDS ",
                                            "are assigned to a protein ID!"))
            if (!all(protTx %in% cdsTx))
                msg <- validMsg(msg, paste0("Not all proteins are assigned to ",
                                            "a transcript with a CDS!"))

        }
        if (is.null(msg)) TRUE
        else msg
    }
    return(TRUE)
}
setValidity("EnsDb", validateEnsDb)
setMethod("initialize", "EnsDb", function(.Object,...){
    OK <- validateEnsDb(.Object)
    if(class(OK)=="character"){
        stop(OK)
    }
    callNextMethod(.Object, ...)
})

############################################################
## dbconn
setMethod("dbconn", "EnsDb", function(x){
    return(x@ensdb)
})

############################################################
## ensemblVersion
##
## returns the ensembl version of the package.
setMethod("ensemblVersion", "EnsDb", function(x){
    eVersion <- getMetadataValue(x, "ensembl_version")
    return(eVersion)
})

############################################################
## getMetadataValue
##
## returns the metadata value for the specified name/key
setMethod("getMetadataValue", "EnsDb", function(x, name){
    if(missing(name))
        stop("Argument name has to be specified!")
    return(metadata(x)[metadata(x)$name==name, "value"])
})

############################################################
## seqinfo
setMethod("seqinfo", "EnsDb", function(x){
    Chrs <- .fix_is_circular(dbGetQuery(dbconn(x), "select * from chromosome"))
    Chr.build <- .getMetaDataValue(dbconn(x), "genome_build")
    Chrs$seq_name <- formatSeqnamesFromQuery(x, Chrs$seq_name)
    SI <- Seqinfo(seqnames = Chrs$seq_name,
                  seqlengths = Chrs$seq_length,
                  isCircular = Chrs$is_circular == 1,
                  genome = Chr.build)
    SI
})

############################################################
## seqlevels
setMethod("seqlevels", "EnsDb", function(x){
    Chrs <- dbGetQuery(dbconn(x), "select distinct seq_name from chromosome")
    Chrs <- formatSeqnamesFromQuery(x, Chrs$seq_name)
    return(Chrs)
})

############################################################
## getGenomeFaFile
##
## queries the dna.toplevel.fa file from AnnotationHub matching the current
## Ensembl version
## Update: if we can't find a FaFile matching the Ensembl version we suggest
## ones that might match.
setMethod("getGenomeFaFile", "EnsDb", function(x, pattern = "dna.toplevel.fa") {
    if (!requireNamespace("AnnotationHub", quietly = TRUE)) {
        stop("The 'AnnotationHub' package is needed for this function to ",
             "work. Please install it.", call. = FALSE)
    }
    ah <- AnnotationHub::AnnotationHub()
    ## Reduce the AnnotationHub to species, provider and genome version.
    ah <- .reduceAH(ah, organism = organism(x), dataprovider = "Ensembl",
                    genome = unique(genome(x)))
    if(length(ah) == 0)
        stop("Can not find any ressources in AnnotationHub for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "!")
    ## Reduce to all Fasta files with toplevel or primary_assembly.
    ah <- ah[ah$rdataclass == "FaFile", ]
    if(length(ah) == 0)
        stop("No FaFiles available in AnnotationHub for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "! You might also try to use the",
             " 'getGenomeTwoBitFile' method instead.")
    ## Reduce to dna.toplevel or dna.primary_assembly.
    idx <- c(grep(ah$title, pattern = "dna.toplevel"),
             grep(ah$title, pattern = "dna.primary_assembly"))
    if(length(idx) == 0)
        stop("No genome assembly fasta file available for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "!")
    ah <- ah[idx, ]
    ## Get the Ensembl version from the source url.
    ensVers <- .ensVersionFromSourceUrl(ah$sourceurl)
    if(any(ensVers == ensemblVersion(x))){
        ## Got it.
        itIs <- which(ensVers == ensemblVersion(x))
    }else{
        ## Get the "closest" one.
        diffs <- abs(ensVers - as.numeric(ensemblVersion(x)))
        itIs <- which(diffs == min(diffs))[1]
        message("Returning the Fasta file for Ensembl version ", ensVers[itIs],
                " since no file for Ensembl version ", ensemblVersion(x),
                " is available.")
    }
    ## Getting the ressource.
    Dna <- ah[[names(ah)[itIs]]]
    ## generate an index if none is available
    if(is.na(index(Dna))){
        indexFa(Dna)
        Dna <- FaFile(path(Dna))
    }
    return(Dna)
})
## Just restricting the Annotation Hub to entries matching the species and the
## genome; not yet the Ensembl version.
.reduceAH <- function(ah, organism = NULL, dataprovider = "Ensembl",
                      genome = NULL){
    if(!is.null(dataprovider))
        ah <- ah[ah$dataprovider == dataprovider, ]
    if(!is.null(organism))
        ah <- ah[ah$species == organism, ]
    if(!is.null(genome))
        ah <- ah[ah$genome == genome, ]
    return(ah)
}

.ensVersionFromSourceUrl <- function(url){
    url <- strsplit(url, split="/", fixed=TRUE)
    ensVers <- unlist(lapply(url, function(z){
        idx <- grep(z, pattern="^release")
        if(length(idx) == 0)
            return(-1)
        return(as.numeric(unlist(strsplit(z[idx], split="-"))[2]))
    }))
    return(ensVers)
}

############################################################
##  getGenomeTwoBitFile
##
##  Search and retrieve a genomic DNA resource through a TwoBitFile
##  from AnnotationHub.
setMethod("getGenomeTwoBitFile", "EnsDb", function(x){
    ah <- AnnotationHub::AnnotationHub()
    ## Reduce the AnnotationHub to species, provider and genome version.
    ah <- .reduceAH(ah, organism = organism(x), dataprovider = "Ensembl",
                    genome = unique(genome(x)))
    if(length(ah) == 0)
        stop("Can not find any ressources in AnnotationHub for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "!")
    ## Reduce to all Fasta files with toplevel or primary_assembly.
    ah <- ah[ah$rdataclass == "TwoBitFile", ]
    if(length(ah) == 0)
        stop("No TwoBitFile available in AnnotationHub for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "!")
    ## Reduce to dna.toplevel or dna.primary_assembly.
    idx <- c(grep(ah$title, pattern = "dna.toplevel"),
             grep(ah$title, pattern = "dna.primary_assembly"))
    if(length(idx) == 0)
        stop("No genome assembly fasta file available for organism: ",
             organism(x), ", data provider: Ensembl and genome version: ",
             unique(genome(x)), "!")
    ah <- ah[idx, ]
    ## Get the Ensembl version from the source url.
    ensVers <- .ensVersionFromSourceUrl(ah$sourceurl)
    if(any(ensVers == ensemblVersion(x))) {
        ## Got it.
        itIs <- which(ensVers == ensemblVersion(x))
    } else {
        ## Get the "closest" one.
        diffs <- abs(ensVers - as.numeric(ensemblVersion(x)))
        itIs <- which(diffs == min(diffs))[1]
        message("Returning the TwoBit file for Ensembl version ", ensVers[itIs],
                " since no file for Ensembl version ", ensemblVersion(x),
                " is available.")
    }
    ## Getting the ressource.
    Dna <- ah[[names(ah)[itIs]]]
    return(Dna)
})

############################################################
## listTables
setMethod("listTables", "EnsDb", function(x, ...){
    if(length(x@tables)==0){
        tables <- dbListTables(dbconn(x))
        ## read the columns for these tables.
        Tables <- vector(length=length(tables), "list")
        for(i in 1:length(Tables)){
            Tables[[ i ]] <- colnames(dbGetQuery(dbconn(x),
                                                 paste0("select * from ",
                                                        tables[ i ],
                                                        " limit 1")))
        }
        names(Tables) <- tables
        x@tables <- Tables
    }
    Tab <- x@tables
    Tab <- Tab[tablesByDegree(x, tab=names(Tab))]
    ## Manually add tx_name as a "virtual" column; getWhat will insert the tx_id into that.
    Tab$tx <- unique(c(Tab$tx, "tx_name"))
    ## Manually add the symbol as a "virtual" column.
    Tab$gene <- unique(c(Tab$gene, "symbol"))
    return(Tab)
})

############################################################
## listColumns
setMethod("listColumns", "EnsDb", function(x,
                                           table,
                                           skip.keys = TRUE,
                                           metadata = FALSE, ...){
    if (length(x@tables) == 0) {
        tables <- dbListTables(dbconn(x))
        ## read the columns for these tables.
        Tables <- vector(length = length(tables), "list")
        for (i in 1:length(Tables)){
            Tables[[i]] <- colnames(dbGetQuery(dbconn(x),
                                               paste0("select * from ",
                                                      tables[i],
                                                      " limit 1")))
        }
        names(Tables) <- tables
        x@tables <- Tables
    }
    Tab <- x@tables
    if (!metadata)
        Tab <- Tab[names(Tab) != "metadata"]
    ## Manually add tx_name as a "virtual" column; getWhat will insert
    ## the tx_id into that.
    Tab$tx <- unique(c(Tab$tx, "tx_name"))
    ## Manually add the symbol as a "virtual" column.
    Tab$gene <- unique(c(Tab$gene, "symbol"))
    if (!missing(table))
        columns <- unlist(Tab[names(Tab) %in% table], use.names = FALSE)
    else
        columns <- unlist(Tab, use.names=FALSE)
    if (skip.keys) {
        ## remove everything that has a _pk or _fk...
        idx <- grep(columns, pattern = "_fk$")
        if(length(idx) > 0)
            columns <- columns[ -idx ]
        idx <- grep(columns, pattern="_pk$")
        if(length(idx) > 0)
            columns <- columns[ -idx ]
    }
    unique(columns)
})

############################################################
## listGenebiotypes
setMethod("listGenebiotypes", "EnsDb", function(x, ...){
    return(dbGetQuery(dbconn(x), "select distinct gene_biotype from gene")[,1])
})

############################################################
## listTxbiotypes
setMethod("listTxbiotypes", "EnsDb", function(x, ...){
    return(dbGetQuery(dbconn(x), "select distinct tx_biotype from tx")[,1])
})

############################################################
## cleanColumns
##
## checks columns and removes all that are not present in database tables
## the method checks internally whether the columns are in the full form,
## i.e. gene.gene_id (<table name>.<column name>)
setMethod("cleanColumns", "EnsDb", function(x, columns, ...){
    if(missing(columns))
        stop("No columns submitted!")
    ## vote of the majority
    full.name <- length(grep(columns, pattern=".", fixed=TRUE)) >
        floor(length(columns) / 2)
    if (full.name) {
        suppressWarnings(
            full.columns <- unlist(prefixColumns(x,
                                                 unlist(listTables(x)),
                                                 clean = FALSE),
                                   use.names=TRUE)
        )
        bm <- columns %in% full.columns
        removed <- columns[ !bm ]
    } else {
        dbtabs <- names(listTables(x))
        dbtabs <- dbtabs[dbtabs != "metadata"]
        bm <- columns %in% unlist(listTables(x)[dbtabs])
        removed <- columns[!bm]
    }
    if(length(removed) > 0){
        if (length(removed) == 1)
            warning("Column ", paste(sQuote(removed), collapse=", "),
                    " is not present in the database and has been removed")
        else
            warning("Columns ", paste(sQuote(removed), collapse=", "),
                    " are not present in the database and have been removed")
    }
    return(columns[bm])
})

############################################################
## tablesForColumns
##
## returns the tables for the specified columns.
setMethod("tablesForColumns", "EnsDb", function(x, columns, ...){
    if(missing(columns))
        stop("No columns submitted!")
    bm <- unlist(lapply(listTables(x), function(z){
        return(any(z %in% columns))
    }))
    if(!any(bm))
        return(NULL)
    Tables <- names(bm)[ bm ]
    Tables <- Tables[ !(Tables %in% c("metadata")) ]
    return(Tables)
})

############################################################
## tablesByDegree
##
## returns the table names ordered by degree, i.e. edges to other tables
setMethod("tablesByDegree", "EnsDb", function(x,
                                              tab=names(listTables(x)),
                                              ...){
    Table.order <- c(gene = 1, tx = 2, tx2exon = 3, exon = 4, chromosome = 5,
                     protein = 6, uniprot = 7, protein_domain = 8,
                     entrezgene = 9,
                     metadata = 99)
    Tab <- tab[ order(Table.order[ tab ]) ]
    return(Tab)
})

############################################################
## hasProteinData
##
## Simply check if the database has required tables protein, uniprot
## and protein_domain.
#' @title Determine whether protein data is available in the database
#'
#' @aliases hasProteinData
#'
#' @description Determines whether the \code{\linkS4class{EnsDb}}
#'     provides protein annotation data.
#'
#' @param x The \code{\linkS4class{EnsDb}} object.
#'
#' @return A logical of length one, \code{TRUE} if protein annotations are
#'     available and \code{FALSE} otherwise.
#'
#' @author Johannes Rainer
#'
#' @seealso \code{\link{listTables}}
#'
#' @examples
#' library(EnsDb.Hsapiens.v86)
#' ## Does this database/package have protein annotations?
#' hasProteinData(EnsDb.Hsapiens.v86)
setMethod("hasProteinData", "EnsDb", function(x) {
    tabs <- listTables(x)
    return(all(c("protein", "uniprot", "protein_domain") %in%
               names(tabs)))
})

############################################################
## genes
##
## get genes from the database.
setMethod("genes", "EnsDb", function(x,
                                     columns = c(listColumns(x, "gene"),
                                                 "entrezid"),
                                     filter = AnnotationFilterList(),
                                     order.by = "",
                                     order.type = "asc",
                                     return.type = "GRanges"){
    return.type <- match.arg(return.type, c("data.frame", "GRanges", "DataFrame"))
    columns <- cleanColumns(x, unique(c(columns, "gene_id")))
    ## if return.type is GRanges we require columns: seq_name, gene_seq_start
    ## and gene_seq_end and seq_strand
    if(return.type=="GRanges"){
        columns <- unique(c(columns, c("gene_seq_start", "gene_seq_end",
                                       "seq_name", "seq_strand")))
    }
    filter <- .processFilterParam(filter, x)
    filter <- setFeatureInGRangesFilter(filter, "gene")
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    retColumns <- columns
    ## If we don't have an order.by define one.
    if(all(order.by == "")){
        order.by <- NULL
        if (any(columns == "seq_name"))
            order.by <- c(order.by, "seq_name")
        if( any(columns == "gene_seq_start"))
            order.by <- c(order.by, "gene_seq_start")
        if(is.null(order.by))
            order.by <- ""
    }
    Res <- getWhat(x, columns=columns, filter=filter,
                   order.by=order.by, order.type=order.type,
                   startWith = "gene", join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "gene_id")
    if (return.type=="data.frame" | return.type=="DataFrame") {
        notThere <- !(retColumns %in% colnames(Res))
        if(any(notThere))
            warning("Columns ",
                           paste0("'", retColumns[notThere], "'", collapse=", "),
                           " not found in the database!")
        retColumns <- retColumns[!notThere]
        Res <- Res[, retColumns, drop = FALSE]
        if(return.type=="DataFrame")
            Res <- DataFrame(Res)
        return(Res)
    }
    if (return.type=="GRanges") {
        metacols <- columns[ !(columns %in% c("seq_name",
                                              "seq_strand",
                                              "gene_seq_start",
                                              "gene_seq_end")) ]
        suppressWarnings(
            SI <- seqinfo(x)
        )
        SI <- SI[as.character(unique(Res$seq_name))]
        GR <- GRanges(seqnames=Rle(Res$seq_name),
                      ranges=IRanges(start=Res$gene_seq_start, end=Res$gene_seq_end),
                      strand=Rle(Res$seq_strand),
                      seqinfo=SI[as.character(unique(Res$seq_name))],
                      Res[ , metacols, drop=FALSE ]
                    )
        names(GR) <- Res$gene_id
        return(GR)
    }
})

############################################################
## transcripts:
##
## get transcripts from the database.
setMethod("transcripts", "EnsDb", function(x, columns = listColumns(x, "tx"),
                                           filter = AnnotationFilterList(),
                                           order.by = "", order.type = "asc",
                                           return.type = "GRanges"){
    return.type <- match.arg(return.type, c("data.frame", "GRanges", "DataFrame"))
    columns <- cleanColumns(x, unique(c(columns, "tx_id")))
    ## if return.type is GRanges we require columns: seq_name, gene_seq_start
    ## and gene_seq_end and seq_strand
    if(return.type=="GRanges"){
        columns <- unique(c(columns, c("tx_seq_start",
                                       "tx_seq_end",
                                       "seq_name",
                                       "seq_strand")))
    }
    filter <- .processFilterParam(filter, x)
    filter <- setFeatureInGRangesFilter(filter, "tx")
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    retColumns <- columns
    ## If we don't have an order.by define one.
    if(all(order.by == "")){
        order.by <- NULL
        if(any(columns == "seq_name"))
            order.by <- c(order.by, "seq_name")
        if(any(columns == "tx_seq_start"))
            order.by <- c(order.by, "tx_seq_start")
        if(is.null(order.by))
            order.by <- ""
    }
    Res <- getWhat(x, columns=columns, filter = filter,
                   order.by=order.by, order.type=order.type,
                   startWith = "tx", join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "tx_id")
    if(return.type=="data.frame" | return.type=="DataFrame"){
        notThere <- !(retColumns %in% colnames(Res))
        if(any(notThere))
            warning("Columns ", paste0("'", retColumns[notThere], "'",
                                       collapse=", "),
                           " not found in the database!")
        retColumns <- retColumns[!notThere]
        Res <- Res[, retColumns, drop = FALSE]
        if(return.type=="DataFrame")
            Res <- DataFrame(Res)
        return(Res)
    }
    if(return.type=="GRanges"){
        notThere <- !(columns %in% colnames(Res))
        if(any(notThere))
            warning(paste0("Columns ", paste(columns[notThere], collapse=", "),
                           " not present in the result data.frame!"))
        columns <- columns[!notThere]
        metacols <- columns[ !(columns %in% c("seq_name",
                                              "seq_strand",
                                              "tx_seq_start",
                                              "tx_seq_end")) ]
        suppressWarnings(
            SI <- seqinfo(x)
        )
        SI <- SI[as.character(unique(Res$seq_name))]
        GR <- GRanges(seqnames=Rle(Res$seq_name),
                      ranges=IRanges(start=Res$tx_seq_start, end=Res$tx_seq_end),
                      strand=Rle(Res$seq_strand),
                      seqinfo=SI[as.character(unique(Res$seq_name))],
                      Res[ , metacols, drop=FALSE ]
                    )
        names(GR) <- Res$tx_id
        return(GR)
    }
})

############################################################
## promoters:
##
setMethod("promoters", "EnsDb",
          function(x, upstream=2000, downstream=200, ...)
          {
              gr <- transcripts(x, ...)
              trim(suppressWarnings(promoters(gr,
                                              upstream=upstream,
                                              downstream=downstream)))
          }
)

############################################################
## exons
##
## get exons from the database.
setMethod("exons", "EnsDb", function(x, columns = listColumns(x, "exon"),
                                     filter = AnnotationFilterList(),
                                     order.by = "", order.type = "asc",
                                     return.type = "GRanges"){
    return.type <- match.arg(return.type, c("data.frame", "GRanges", "DataFrame"))
    if(!any(columns %in% c(listColumns(x, "exon"), "exon_idx"))){
        ## have to have at least one column from the gene table...
        columns <- c(columns, "exon_id")
    }
    columns <- cleanColumns(x, unique(c(columns, "exon_id")))
    ## if return.type is GRanges we require columns: seq_name, gene_seq_start
    ## and gene_seq_end and seq_strand
    if(return.type=="GRanges"){
        columns <- unique(c(columns, c("exon_seq_start",
                                       "exon_seq_end",
                                       "seq_name",
                                       "seq_strand")))
    }
    filter <- .processFilterParam(filter, x)
    filter <- setFeatureInGRangesFilter(filter, "exon")
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    retColumns <- columns
    ## If we don't have an order.by define one.
    if (order.by == "") {
        order.by <- NULL
        if (any(columns == "seq_name"))
            order.by <- c(order.by, "seq_name")
        if (any(columns == "exon_seq_start"))
            order.by <- c(order.by, "exon_seq_start")
        if(is.null(order.by))
            order.by <- ""
    }
    Res <- getWhat(x, columns=columns, filter=filter,
                   order.by=order.by, order.type=order.type,
                   startWith = "exon", join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "exon_id")
    if(return.type=="data.frame" | return.type=="DataFrame"){
        notThere <- !(retColumns %in% colnames(Res))
        if(any(notThere))
            warning("Columns ", paste0("'", retColumns[notThere], "'",
                                       collapse=", "),
                           " not found in the database!")
        retColumns <- retColumns[!notThere]
        Res <- Res[, retColumns, drop = FALSE]
        if(return.type=="DataFrame")
            Res <- DataFrame(Res)
        return(Res)
    }
    if(return.type=="GRanges"){
        notThere <- !(columns %in% colnames(Res))
        if(any(notThere))
            warning(paste0("Columns ", paste(columns[notThere], collapse=", "),
                           " not present in the result data.frame!"))
        columns <- columns[!notThere]
        metacols <- columns[ !(columns %in% c("seq_name",
                                              "seq_strand",
                                              "exon_seq_start",
                                              "exon_seq_end")) ]
        suppressWarnings(
            SI <- seqinfo(x)
        )
        SI <- SI[as.character(unique(Res$seq_name))]
        GR <- GRanges(seqnames=Rle(Res$seq_name),
                      ranges=IRanges(start=Res$exon_seq_start, end=Res$exon_seq_end),
                      strand=Rle(Res$seq_strand),
                      seqinfo=SI[as.character(unique(Res$seq_name))],
                      Res[ , metacols, drop=FALSE ]
                    )
        names(GR) <- Res$exon_id
        return(GR)
    }
})

############################################################
## exonsBy
##
## should return a GRangesList
setMethod("exonsBy", "EnsDb", function(x, by = c("tx", "gene"),
                                       columns = listColumns(x, "exon"),
                                       filter = AnnotationFilterList(),
                                       use.names = FALSE) {
    by <- match.arg(by, c("tx", "gene"))
    bySuff <- "_id"
    if (use.names) {
        if (by == "tx") {
            use.names <- FALSE
            warning("Argument use.names ignored as no transcript names are available.")
        } else {
            columns <- unique(c(columns, "gene_name"))
            bySuff <- "_name"
        }
    }
    filter <- .processFilterParam(filter, x)
    ## We're applying eventual GRangesFilter to either gene or tx.
    filter <- setFeatureInGRangesFilter(filter, by)
    ## Eventually add columns for the filters:
    columns <- cleanColumns(x, unique(c(columns, "exon_id")))
    columns <- addFilterColumns(columns, filter, x)
    ## Quick fix; rename any exon_rank to exon_idx.
    columns[columns == "exon_rank"] <- "exon_idx"

    ## The minimum columns we need, in addition to "columns"
    min.columns <- c(paste0(by, "_id"), "seq_name","exon_seq_start",
                     "exon_seq_end", "exon_id", "seq_strand")
    by.id.full <- unlist(prefixColumns(x, columns = paste0(by, "_id"),
                                        clean = FALSE),
                         use.names = FALSE)
    if (by == "gene") {
        ## tx columns have to be removed, since the same exon can be part of
        ## more than one tx
        txcolumns <- c(listColumns(x, "tx"), "exon_idx")
        txcolumns <- txcolumns[txcolumns != "gene_id"]
        torem <- columns %in% txcolumns
        if (any(torem))
            warning("Columns ",
                    paste(columns[ torem ], collapse = ","),
                    " have been removed as they are not allowed if exons",
                    " are fetched by gene.")
        columns <- columns[!torem]
    } else {
        min.columns <- unique(c(min.columns, "exon_idx"))
        columns <- c(columns, "exon_idx")
    }
    ## define the minimal columns that we need...
    ret_cols <- unique(columns)  ## before adding the "min.columns"
    columns <- unique(c(columns, min.columns))
    ## get the seqinfo:
    suppressWarnings(
        SI <- seqinfo(x)
    )
    ## Resolve ordering problems.
    orderR <- orderResultsInR(x)
    if (orderR) {
        order.by <- ""
    } else {
        if (by == "gene") {
            order.by <- paste0("gene.gene_id, ",
                               "case when seq_strand = 1 then exon_seq_start",
                               " when seq_strand = -1 then (exon_seq_end * -1)",
                               " end")
        } else {
            ## Funny thing is the query takes longer if I use tx2exon.tx_id!
            order.by <- "tx.gene_id, tx.tx_id, tx2exon.exon_idx"
        }
    }
    Res <- getWhat(x, columns = columns, filter = filter,
                   order.by = order.by, skip.order.check = TRUE,
                   startWith = by, join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "exon_id")
    ## Now, order in R, if not already done in SQL.
    if (orderR) {
        if (by == "gene") {
            startend <- (Res$seq_strand == 1) * Res$exon_seq_start +
                (Res$seq_strand == -1) * (Res$exon_seq_end * -1)
            Res <- Res[order(Res$gene_id, startend,
                             method = "radix"), ]
        } else {
            Res <- Res[order(Res$tx_id, Res$exon_idx,
                             method = "radix"), ]
        }
    }
    SI <- SI[as.character(unique(Res$seq_name))]
    ## replace exon_idx with exon_rank
    colnames(Res)[colnames(Res) == "exon_idx"] <- "exon_rank"
    columns[columns == "exon_idx"] <- "exon_rank"
    ret_cols[ret_cols == "exon_idx"] <- "exon_rank"
    notThere <- !(ret_cols %in% colnames(Res))
    if (any(notThere))
        warning("Columns ", paste0("'", ret_cols[notThere], "'",
                                   collapse = ", "),
                " not found in the database!")
    ret_cols <- ret_cols[!notThere]
    columns.metadata <- ret_cols[!(ret_cols %in% c("seq_name", "seq_strand",
                                                   "exon_seq_start",
                                                   "exon_seq_end"))]
    columns.metadata <- match(columns.metadata, colnames(Res))
    GR <- GRanges(seqnames = Rle(Res$seq_name),
                  strand = Rle(Res$seq_strand),
                  ranges = IRanges(start = Res$exon_seq_start,
                                   end = Res$exon_seq_end),
                  seqinfo = SI,
                  Res[, columns.metadata, drop=FALSE]
                  )
    split(GR, Res[, paste0(by, bySuff)])
})

############################################################
## transcriptsBy
##
setMethod("transcriptsBy", "EnsDb", function(x, by = c("gene", "exon"),
                                             columns = listColumns(x, "tx"),
                                             filter = AnnotationFilterList()) {
    if (any(by == "cds"))
        stop("fetching transcripts by cds is not (yet) implemented.")
    by <- match.arg(by, c("gene", "exon"))
    byId <- paste0(by, "_id")
    min.columns <- c(paste0(by, "_id"), "seq_name", "tx_seq_start",
                     "tx_seq_end", "tx_id", "seq_strand")
    ## can not have exon columns!
    ex_cols <- c(listColumns(x, "exon"), "exon_idx")
    ex_cols <- ex_cols[ex_cols != "tx_id"]
    torem <- columns %in% ex_cols
    if (any(torem))
        warning("Columns ",
                paste(columns[ torem ], collapse=","),
                " have been removed as they are not allowed if",
                " transcripts are fetched.")
    columns <- columns[!torem]
    ## Process filters
    filter <- .processFilterParam(filter, x)
    ## GRanges filter should be based on either gene or exon coors.
    filter <- setFeatureInGRangesFilter(filter, by)
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    ret_cols <- unique(columns)
    ## define the minimal columns that we need...
    columns <- cleanColumns(x, unique(c(columns, min.columns)))
    ## get the seqinfo:
    suppressWarnings(
        SI <- seqinfo(x)
    )
    byIdFull <- unlist(prefixColumns(x, columns = byId, clean = FALSE),
                         use.names = FALSE)
    orderR <- orderResultsInR(x)
    if (orderR) {
        order.by <- ""
    } else {
        order.by <- paste0(byIdFull ,
                           ", case when seq_strand = 1 then tx_seq_start",
                           " when seq_strand = -1 then (tx_seq_end * -1) end")
    }
    Res <- getWhat(x, columns=columns, filter=filter,
                   order.by=order.by, skip.order.check=TRUE,
                   startWith = by, join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "tx_id")
    if (orderR) {
        startEnd <- (Res$seq_strand == 1) * Res$tx_seq_start +
            (Res$seq_strand == -1) * (Res$tx_seq_end * -1)
        Res <- Res[order(Res[, byId], startEnd, method = "radix"), ]
    }
    SI <- SI[as.character(unique(Res$seq_name))]
    ## Replace exon_idx with exon_rank
    colnames(Res) <- gsub(colnames(Res), pattern = "exon_idx",
                                   replacement = "exon_rank", fixed = TRUE)
    ret_cols[ret_cols == "exon_idx"] <- "exon_rank"
    notThere <- !(ret_cols %in% colnames(Res))
    if(any(notThere))
        warning("Columns ", paste0("'", ret_cols[notThere], "'", collapse=", "),
                " not found in the database!")
    ret_cols <- ret_cols[!notThere]
    columns.metadata <- ret_cols[!(ret_cols %in% c("seq_name", "seq_strand",
                                                   "tx_seq_start",
                                                   "tx_seq_end"))]
    columns.metadata <- match(columns.metadata, colnames(Res))
    GR <- GRanges(seqnames=Rle(Res$seq_name),
                  strand=Rle(Res$seq_strand),
                  ranges=IRanges(start=Res$tx_seq_start, end=Res$tx_seq_end),
                  seqinfo=SI,
                  Res[ , columns.metadata, drop=FALSE ]
                )
    return(split(GR, Res[ , byId]))
})

############################################################
## lengthOf
## for GRangesList...
setMethod("lengthOf", "GRangesList", function(x, ...){
    return(sum(width(reduce(x))))
##    return(unlist(lapply(width(reduce(x)), sum)))
})
## return the length of genes or transcripts
setMethod("lengthOf", "EnsDb", function(x, of="gene",
                                        filter=AnnotationFilterList()){
    of <- match.arg(of, c("gene", "tx"))
    ## get the exons by gene or transcript from the database...
    suppressWarnings(
        GRL <- exonsBy(x, by=of, filter=filter)
      )
    return(lengthOf(GRL))
})

####============================================================
##  transcriptLengths
##
##  For TxDb: calls just the function (not method!) from the GenomicFeatures
##            package.
##  For EnsDb: calls the .transcriptLengths function.
.transcriptLengths <- function(x, with.cds_len = FALSE, with.utr5_len = FALSE,
                               with.utr3_len = FALSE,
                               filter = AnnotationFilterList(), 
                               exons = NA, transcripts = NA) {
    ## The preloaded data option is currently only for the coordinates mapping
    ## functions, therefore the filter is "tx_id" only.
    preload_ranges_missing <- which(c(
        identical(exons,NA),
        identical(transcripts,NA)
    ))
    if(identical(integer(0), preload_ranges_missing)){
        if (!is(exons, "CompressedGRangesList"))
            stop("Argument 'exons' has to be a 'CompressedGRangesList' object")
        if (!is(transcripts, "GRanges"))
            stop("Argument 'transcripts' has to be an 'GRanges' object")
        if (identical(integer(0),grep('T[0-9]', names(exons)[[1]])))
            stop("Argument 'exons' has to be by 'tx'.")
        ## Check if x is a formula and eventually translate it.
        if (is(filter, "formula"))
            filter <- AnnotationFilter(filter)
        tryCatch({
            filter_type <- filter@field
            filter_tx <- filter@value
        }, error = function(e){
            message("No filter detected, all transcripts will be returned")
            filter_tx <<- names(transcripts)
        })
        if (exists('filter_type')){
            if (filter_type != "tx_id")
                stop("Filter must be 'tx_id'.")            
        }
        tryCatch({
            allTxs <- transcripts[names(transcripts) %in% unique(filter_tx)]
        }, error = function(e){
            allTxs <<- GRanges()
        })
    } else if (length(preload_ranges_missing) == 2){
        filter <- .processFilterParam(filter, x)
        allTxs <- transcripts(x, filter = filter)        
    } else {
        stop(paste(
            "Argument", 
            c("'exons'", "'transcripts'")[preload_ranges_missing],
            'missing.'
            , sep = " "
        ))
    }
    if (length(allTxs) == 0)
        return(data.frame(tx_id = character(), gene_id = character(),
                          nexon = integer(), tx_len = integer()))
    if(identical(integer(0), preload_ranges_missing)){
        exns <- exons[match(allTxs$tx_id, names(exons))]
    } else {
        exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
        ## Match ordering
        exns <- exns[match(allTxs$tx_id, names(exns))]        
    }
    ## Calculate length of transcripts.
    txLengths <- sum(width(exns))
    ## Calculate no. of exons.
    ## build result data frame:
    Res <- data.frame(tx_id = allTxs$tx_id, gene_id = allTxs$gene_id,
                      nexon = lengths(exns), tx_len = txLengths,
                      stringsAsFactors = FALSE)
    if(!any(c(with.cds_len, with.utr5_len, with.utr3_len))) {
        ## Return what we've got thus far.
        return(Res)
    }
    if (with.cds_len)
        Res$cds_len <- NA
    if (with.utr5_len)
        Res$utr5_len <- NA
    if (with.utr3_len)
        Res$utr3_len <- NA
    ## Otherwise do the remaining stuff...
    txs <- allTxs[!is.na(allTxs$tx_cds_seq_start)]
    if (length(txs) > 0) {
        cExns <- exns[txs$tx_id]
        cReg <- GRanges(seqnames = seqnames(txs),
                        ranges = IRanges(start = txs$tx_cds_seq_start,
                                         end = txs$tx_cds_seq_end),
                        strand = strand(txs),
                        tx_id = txs$tx_id)
        cReg <- split(cReg, f = cReg$tx_id)
        ## Match order.
        cReg <- cReg[match(txs$tx_id, names(cReg))]
        cdsExns <- intersect(cReg, cExns)
        ## cExns: all exons of coding transcripts (includes untranslated
        ##        and translated region)
        ## cReg: just the start-end position of the coding region of the tx.
        ## cdsExns: the coding part of all exons of the tx.
        if (with.cds_len) {
            ## Calculate CDS length
            cdsLengths <- sum(width(cdsExns))
            Res[names(cdsLengths), "cds_len"] <- cdsLengths
        }
        if (with.utr3_len | with.utr5_len) {
            ## ! UTR is the difference between the exons and the cds-exons
            ## Note: order of parameters is important!
            utrReg <- setdiff(cExns, cdsExns)
            leftOfCds <- utrReg[end(utrReg) < start(cReg)]
            rightOfCds <- utrReg[start(utrReg) > end(cReg)]
            ## Calculate lengths.
            leftOfLengths <- sum(width(leftOfCds))
            rightOfLengths <- sum(width(rightOfCds))
            minusTx <- which(as.character(strand(txs)) == "-" )
            if (with.utr3_len) {
                ## Ordering of txs and all other stuff matches.
                tmp <- rightOfLengths
                tmp[minusTx] <- leftOfLengths[minusTx]
                Res[names(tmp), "utr3_len"] <- tmp
            }
            if (with.utr5_len) {
                tmp <- leftOfLengths
                tmp[minusTx] <- rightOfLengths[minusTx]
                Res[names(tmp), "utr5_len"] <- tmp
            }
        }
    }
    Res
}

############################################################
## cdsBy
##
## Return coding region ranges by tx or by gene.
setMethod("cdsBy", "EnsDb", function(x, by = c("tx", "gene"),
                                     columns = NULL,
                                     filter = AnnotationFilterList(),
                                     use.names = FALSE){
    by <- match.arg(by, c("tx", "gene"))
    filter <- .processFilterParam(filter, x)
    filter <- setFeatureInGRangesFilter(filter, by)
    columns <- cleanColumns(x, columns)
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    ## Add a filter ensuring that only coding transcripts are queried.
    filter <- AnnotationFilterList(OnlyCodingTxFilter() ,filter)
    bySuff <- "_id"
    if (by == "tx") {
        ## adding exon_id, exon_idx to the columns.
        columns <- unique(c(columns, "exon_id", "exon_idx"))
        if (use.names)
            warning("Not considering use.names as no transcript names are",
                    " available.")
    } else {
        columns <- unique(c("gene_id", columns))
        if( use.names) {
            bySuff <- "_name"
            columns <- c(columns, "gene_name")
        }
    }
    byId <- paste0(by, bySuff)
    ## Query the data
    fetchCols <- unique(c(byId, columns, "tx_cds_seq_start", "tx_cds_seq_end",
                          "seq_name", "seq_strand", "exon_idx", "exon_id",
                          "exon_seq_start", "exon_seq_end"))
    ## Ordering of the results:
    ## Force ordering in R by default here to fix issue #11
    ##orderR <- orderResultsInR(x)
    orderR <- TRUE
    if (orderR) {
        order.by <- ""
    } else {
        if (by == "tx") {
            ## Here we want to sort the exons by exon_idx
            order.by <- "tx.tx_id, tx2exon.exon_idx"
        } else {
            ## Here we want to sort the transcripts by tx start.
            order.by <- paste0("gene.gene_id, case when seq_strand = 1 then",
                               " tx_cds_seq_start when seq_strand = -1 then",
                               "(tx_cds_seq_end * -1) end")
        }
    }
    Res <- getWhat(x, columns = fetchCols,
                   filter = filter,
                   order.by = order.by,
                   skip.order.check = TRUE,
                   startWith = by, join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "exon_id")
    ## Remove rows with NA in tx_cds_seq_start; that's the case for "old"
    ## databases.
    nas <- is.na(Res$tx_cds_seq_start)
    if (any(nas))
        Res <- Res[!nas, ]
    ## Remove exons that are not within the cds.
    Res <- Res[Res$exon_seq_end >= Res$tx_cds_seq_start &
               Res$exon_seq_start <= Res$tx_cds_seq_end,
             , drop = FALSE]
    if (orderR) {
        ## And finally ordering them.
        if (by == "tx") {
            Res <- Res[order(Res$tx_id, Res$exon_idx, method = "radix"), ]
        } else {
            startend <- (Res$seq_strand == 1) * Res$tx_cds_seq_start +
                (Res$seq_strand == -1) * (Res$tx_cds_seq_end * -1)
            Res <- Res[order(Res$gene_id, startend, method = "radix"), ]
        }
    }
    if(nrow(Res)==0){
        warning("No cds found!")
        return(NULL)
    }
    cdsStarts <- pmax.int(Res$exon_seq_start, Res$tx_cds_seq_start)
    cdsEnds <- pmin.int(Res$exon_seq_end, Res$tx_cds_seq_end)
    ## get the seqinfo:
    suppressWarnings(
        SI <- seqinfo(x)
    )
    SI <- SI[as.character(unique(Res$seq_name))]
    ## Rename columns exon_idx to exon_rank, if present
    if(any(colnames(Res) == "exon_idx")){
        colnames(Res)[colnames(Res) == "exon_idx"] <- "exon_rank"
        columns[columns == "exon_idx"] <- "exon_rank"
    }
    ## Building the result.
    if(length(columns) > 0){
        notThere <- !(columns %in% colnames(Res))
        if(any(notThere))
            warning(paste0("Columns ", paste(columns[notThere], collapse=", "),
                           " not present in the result data.frame!"))
        columns <- columns[!notThere]
        GR <- GRanges(seqnames=Rle(Res$seq_name),
                      strand=Rle(Res$seq_strand),
                      ranges=IRanges(start=cdsStarts, end=cdsEnds),
                      seqinfo=SI,
                      Res[, columns, drop=FALSE])
    }else{
        GR <- GRanges(seqnames=Rle(Res$seq_name),
                      strand=Rle(Res$seq_strand),
                      ranges=IRanges(start=cdsStarts, end=cdsEnds),
                      seqinfo=SI)
    }
    GR <- split(GR, Res[, paste0(by, bySuff)])
    ## For "by gene" we reduce the redundant ranges;
    ## that way we loose however all additional columns!
    if(by == "gene")
        GR <- reduce(GR)
    return(GR)
})


############################################################
## getUTRsByTranscript
##
getUTRsByTranscript <- function(x, what, columns = NULL,
                                filter = AnnotationFilterList()) {
    filter <- .processFilterParam(filter, x)
    columns <- cleanColumns(x, columns)
    filter <- setFeatureInGRangesFilter(filter, "tx")
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, x)
    columns <- unique(c(columns, "exon_id", "exon_idx"))
    ## Add the filter for coding tx only.
    filter <- AnnotationFilterList(OnlyCodingTxFilter(), filter)
    ## what do we need: tx_cds_seq_start, tx_cds_seq_end and exon_idx
    fetchCols <- unique(c("tx_id", columns, "tx_cds_seq_start",
                          "tx_cds_seq_end", "seq_name", "seq_strand",
                          "exon_seq_start", "exon_seq_end"))
    order.by <- "tx.tx_id"
    ## get the seqinfo:
    suppressWarnings(
        SI <- seqinfo(x)
    )
    ## Note: doing that with a single query and some coordinate juggling
    ## is faster than calling exonsBy and GRangesList setdiff etc.
    Res <- getWhat(x, columns=fetchCols,
                   filter=filter,
                   order.by=order.by,
                   skip.order.check=TRUE,
                   startWith = "tx", join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(x)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "exon_id")
    nas <- is.na(Res$tx_cds_seq_start)
    if (any(nas))
        Res <- Res[!nas, ]
    ## Remove exons that are within the cds.
    Res <- Res[Res$exon_seq_start <= Res$tx_cds_seq_start |
               Res$exon_seq_end >= Res$tx_cds_seq_end, , drop=FALSE]
    if (nrow(Res) == 0) {
        warning(paste0("No ", what, "UTR found!"))
        return(NULL)
    }
    ## Rename columns exon_idx to exon_rank, if present
    if (any(colnames(Res) == "exon_idx")) {
        colnames(Res) <- sub(colnames(Res), pattern = "exon_idx",
                             replacement = "exon_rank", fixed = TRUE)
        columns[columns == "exon_idx"] <- "exon_rank"
    }
    if (what == "five") {
        ## All those on the forward strand for which the exon start is smaller
        ## than the cds start and those on the reverse strand with an exon end
        ## larger than the cds end.
        Res <- Res[(Res$seq_strand > 0 &
                    Res$exon_seq_start <= Res$tx_cds_seq_start) |
                   (Res$seq_strand < 0 &
                    Res$exon_seq_end >= Res$tx_cds_seq_end),
                 , drop = FALSE]
    } else {
        ## Other way round.
        Res <- Res[(Res$seq_strand > 0 &
                    Res$exon_seq_end >= Res$tx_cds_seq_end) |
                   (Res$seq_strand < 0 &
                    Res$exon_seq_start <= Res$tx_cds_seq_start),
                 , drop = FALSE]
    }
    if (nrow(Res) == 0) {
        warning(paste0("No ", what, "UTR found!"))
        return(NULL)
    }
    ## Increase the cds end by 1 and decrease the start by 1, thus,
    ## avoiding that the UTR overlaps the cds
    Res$tx_cds_seq_end <- Res$tx_cds_seq_end + 1L
    Res$tx_cds_seq_start <- Res$tx_cds_seq_start - 1L
    if (what == "five") {
        utrStarts <- Res$tx_cds_seq_start
        utrEnds <- utrStarts - 1L
        rs <- which(Res$seq_strand != 1)
        if (length(rs)) {
            utrStarts[rs] <- Res$tx_cds_seq_end[rs]
            utrEnds[rs] <- utrStarts[rs] -1L
        }
    } else {
        utrStarts <- Res$tx_cds_seq_end
        utrEnds <- utrStarts - 1L
        rs <- which(Res$seq_strand != 1)
        if (length(rs)) {
            utrStarts[rs] <- Res$tx_cds_seq_start[rs]
            utrEnds[rs] <- utrStarts[rs] -1L
        }
    }
    ## Distinguish between stuff which is left of and right of the CDS:
    ## Left of the CDS: can be either 5' for + strand or 3' for - strand.
    bm <- which(Res$exon_seq_start <= Res$tx_cds_seq_start)
    if (length(bm) > 0) {
        if (what == "five") {
            ## 5' and left of CDS means we're having 5' CDSs
            bm <- bm[Res$seq_strand[bm] > 0]
            if(length(bm) > 0){
                utrStarts[bm] <- Res$exon_seq_start[bm]
                utrEnds[bm] <- pmin.int(Res$exon_seq_end[bm],
                                        Res$tx_cds_seq_start[bm])
            }
        } else {
            bm <- bm[Res$seq_strand[bm] < 0]
            if (length(bm) > 0) {
                utrStarts[bm] <- Res$exon_seq_start[bm]
                utrEnds[bm] <- pmin.int(Res$exon_seq_end[bm],
                                        Res$tx_cds_seq_start[bm])
            }
        }
    }
    ## Right of the CDS: can be either 5' for - strand of 3' for + strand.
    bm <- which(Res$exon_seq_end >= Res$tx_cds_seq_end)
    if (length(bm) > 0) {
        if (what == "five") {
            ## Right of CDS is 5' for - strand.
            bm <- bm[Res$seq_strand[bm] < 0]
            if (length(bm) > 0) {
                utrStarts[bm] <- pmax.int(Res$exon_seq_start[bm],
                                          Res$tx_cds_seq_end[bm])
                utrEnds[bm] <- Res$exon_seq_end[bm]
            }
        } else {
            ## Right of CDS is 3' for + strand
            bm <- bm[Res$seq_strand[bm] > 0]
            if (length(bm) > 0) {
                utrStarts[bm] <- pmax.int(Res$exon_seq_start[bm],
                                          Res$tx_cds_seq_end[bm])
                utrEnds[bm] <- Res$exon_seq_end[bm]
            }
        }
    }
    notThere <- !(columns %in% colnames(Res))
    if (any(notThere))
        warning(paste0("Columns ", paste(columns[notThere], collapse=", "),
                       " not present in the result data.frame!"))
    columns <- columns[!notThere]
    SI <- SI[as.character(unique(Res$seq_name))]
    GR <- GRanges(seqnames = Rle(Res$seq_name),
                  strand = Rle(Res$seq_strand),
                  ranges = IRanges(start=utrStarts, end=utrEnds),
                  seqinfo = SI,
                  Res[, columns, drop = FALSE])
    GR <- split(GR, Res[, "tx_id"])
    return(GR)
}

############################################################
## threeUTRsByTranscript
##
setMethod("threeUTRsByTranscript", "EnsDb",
          function(x, columns = NULL, filter = AnnotationFilterList()) {
              filter <- .processFilterParam(filter, x)
              getUTRsByTranscript(x = x, what = "three", columns = columns,
                                  filter = filter)
})

############################################################
## fiveUTRsByTranscript
##
setMethod("fiveUTRsByTranscript", "EnsDb",
          function(x, columns = NULL, filter = AnnotationFilterList()) {
    filter <- .processFilterParam(filter, x)
    getUTRsByTranscript(x = x, what = "five", columns = columns,
                        filter = filter)
})

############################################################
## toSAF... function to transform a GRangesList into a data.frame
## corresponding to the SAF format.
## assuming the names of the GRangesList to be the GeneID and the
## element (GRanges) the start/end coordinates
## of an exon, transcript or the gene itself.
.toSaf <- function(x){
    DF <- as.data.frame(x)
    colnames(DF)[ colnames(DF)=="group_name" ] <- "GeneID"
    colnames(DF)[ colnames(DF)=="seqnames" ] <- "Chr"
    colnames(DF)[ colnames(DF)=="start" ] <- "Start"
    colnames(DF)[ colnames(DF)=="end" ] <- "End"
    colnames(DF)[ colnames(DF)=="strand" ] <- "Strand"
    return(DF[ , c("GeneID", "Chr", "Start", "End", "Strand")])
}
## for GRangesList...
setMethod("toSAF", "GRangesList", function(x, ...){
    return(.toSaf(x))
})

############################################################
## buildQuery
setMethod("buildQuery", "EnsDb",
          function(x, columns=c("gene_id", "gene_biotype", "gene_name"),
                   filter = AnnotationFilterList(), order.by="",
                   order.type="asc",
                   skip.order.check=FALSE){
              return(.buildQuery(x=x,
                                 columns=columns,
                                 filter=filter,
                                 order.by=order.by,
                                 order.type=order.type,
                                 skip.order.check=skip.order.check))
          })

############################################################
## getWhat
##
## Method that wraps the internal .getWhat function to retrieve data from the
## database. In addition, if present, we're renaming chromosome names depending
## on the ucscChromosomeNames option.
## Additional parameters:
## o startWith: the name of the database table from which the join should start
##   or NULL for the default behaviour (i.e. genes-> tx etc).
## o join: the type of join that should be used; one of "join",
##   "left outer join" or "suggested".
setMethod("getWhat", "EnsDb",
          function(x, columns = c("gene_id", "gene_biotype", "gene_name"),
                   filter = AnnotationFilterList(), order.by = "",
                   order.type = "asc", group.by = NULL,
                   skip.order.check = FALSE, startWith = NULL,
                   join = "suggested") {
              Res <- .getWhat(x = x,
                              columns = columns,
                              filter = filter,
                              order.by = order.by,
                              order.type = order.type,
                              group.by = group.by,
                              skip.order.check = skip.order.check,
                              startWith = startWith,
                              join = join)
              ## Eventually renaming seqnames according to the specified style.
              if(any(colnames(Res) == "seq_name"))
                  Res$seq_name <- formatSeqnamesFromQuery(x, Res$seq_name)
              return(Res)
          })

############################################################
## getGeneRegionTrackForGviz
## Fetch data to add as a GeneTrack.
## filter ...                 Used to filter the result.
## chromosome, start, end ... Either all or none has to be specified. If
##                            specified, the function first retrieves all
##                            transcripts that have an exon in the specified
##                            range and adds them as a TranscriptidFilter to
##                            the filters. The query to fetch the "real" data
##                            is performed afterwards.
## featureIs ...              Wheter gene_biotype or tx_biotype should be
##                            mapped to the column feature.
setMethod(
    "getGeneRegionTrackForGviz",
    "EnsDb",
    function(x, filter = AnnotationFilterList(), chromosome = NULL,
             start = NULL, end = NULL, featureIs = "gene_biotype")
    {
        featureIs <- match.arg(featureIs, c("gene_biotype", "tx_biotype"))
        filter <- .processFilterParam(filter, x)
        if(missing(chromosome))
            chromosome <- NULL
        if(missing(start))
            start <- NULL
        if(missing(end))
            end <- NULL
        ## If only chromosome is specified, create a SeqNameFilter and
        ## add it to the filter
        if(is.null(start) & is.null(end) & !is.null(chromosome)){
            filter <- AnnotationFilterList(filter, SeqNameFilter(chromosome))
            chromosome <- NULL
        }
        if(any(c(!is.null(chromosome), !is.null(start), !is.null(end)))){
            ## Require however that all are defined!!!
            if(all(c(!is.null(chromosome), !is.null(start), !is.null(end)))){
                ## Fix eventually provided UCSC chromosome names:
                chromosome <- ucscToEns(chromosome)
                grg <- GRanges(seqnames = chromosome,
                               ranges = IRanges(start, end))
                tids <- transcripts(
                    x, filter = GRangesFilter(grg, feature = "tx",
                                              type = "any"),
                    columns = "tx_id", return.type = "data.frame")$tx_id
                if (!length(tids))
                    return(GRanges())
                filter <- AnnotationFilterList(filter, TxIdFilter(tids))
            }else{
                stop("Either all or none of arguments 'chromosome', 'start' and",
                     " 'end' have to be specified!")
            }
        }
        ## Return a data.frame with columns: chromosome, start, end, width,
        ## strand, feature,
        ## gene, exon, transcript and symbol.
        ## 1) Query the data as we usually would.
        ## 2) Perform an additional query to get cds and utr, remove all entries
        ##    from the first result for the same transcripts and rbind the
        ##    data.frames.
        needCols <- c("seq_name", "exon_seq_start", "exon_seq_end", "seq_strand",
                      featureIs, "gene_id", "exon_id",
                      "exon_idx", "tx_id", "gene_name")
        ## That's the names to which we map the original columns from the EnsDb.
        names(needCols) <- c("chromosome", "start", "end", "strand",
                             "feature", "gene", "exon", "exon_rank", "transcript",
                             "symbol")
        txs <- transcripts(x, filter = filter,
                           columns = needCols, return.type = "data.frame")
        ## Rename columns
        idx <- match(needCols, colnames(txs))
        notThere <- is.na(idx)
        idx <- idx[!notThere]
        colnames(txs)[idx] <- names(needCols)[!notThere]
        txs <- txs[, !(colnames(txs) %in% c("tx_seq_start", "tx_seq_end",
                                            "seq_name"))]
        ## now processing the 5utr
        fUtr <- fiveUTRsByTranscript(x, filter = filter, columns = needCols)
        if(length(fUtr) > 0){
            fUtr <- as(unlist(fUtr, use.names = FALSE), "data.frame")
            fUtr <- fUtr[, !(colnames(fUtr) %in% c("width", "seq_name",
                                                   "exon_seq_start",
                                                   "exon_seq_end", "strand"))]
            colnames(fUtr)[1] <- "chromosome"
            idx <- match(needCols, colnames(fUtr))
            notThere <- is.na(idx)
            idx <- idx[!notThere]
            colnames(fUtr)[idx] <- names(needCols)[!notThere]
            ## Force being in the correct ordering:
            fUtr <- fUtr[, names(needCols)]
            fUtr$feature <- "utr5"
            ## Remove transcripts from the txs data.frame
            txs <- txs[!(txs$transcript %in% fUtr$transcript), , drop=FALSE]
        }
        tUtr <- threeUTRsByTranscript(x, filter = filter, columns=needCols)
        if(length(tUtr) > 0){
            tUtr <- as(unlist(tUtr, use.names = FALSE), "data.frame")
            tUtr <- tUtr[, !(colnames(tUtr) %in% c("width", "seq_name",
                                                   "exon_seq_start",
                                                   "exon_seq_end", "strand"))]
            colnames(tUtr)[1] <- "chromosome"
            idx <- match(needCols, colnames(tUtr))
            notThere <- is.na(idx)
            idx <- idx[!notThere]
            colnames(tUtr)[idx] <- names(needCols)[!notThere]
            ## Force being in the correct ordering:
            tUtr <- tUtr[, names(needCols)]
            tUtr$feature <- "utr3"
            ## Remove transcripts from the txs data.frame
            if(nrow(txs) > 0){
                txs <- txs[!(txs$transcript %in% tUtr$transcript), , drop=FALSE]
            }
        }
        cds <- cdsBy(x, filter = filter, columns = needCols)
        if(length(cds) > 0){
            cds <- as(unlist(cds, use.names=FALSE), "data.frame")
            cds <- cds[, !(colnames(cds) %in% c("width", "seq_name",
                                                "exon_seq_start",
                                                "exon_seq_end", "strand"))]
            colnames(cds)[1] <- "chromosome"
            idx <- match(needCols, colnames(cds))
            notThere <- is.na(idx)
            idx <- idx[!notThere]
            colnames(cds)[idx] <- names(needCols)[!notThere]
            ## Force being in the correct ordering:
            cds <- cds[, names(needCols)]
            ## Remove transcripts from the txs data.frame
            if(nrow(txs) > 0){
                txs <- txs[!(txs$transcript %in% cds$transcript), , drop=FALSE]
            }
        }
        ## If there are any txs, they are noncoding...
        if (nrow(txs))
            txs$feature <- "utr"
        if(length(fUtr) > 0){
            txs <- rbind(txs, fUtr)
        }
        if(length(tUtr) > 0){
            txs <- rbind(txs, tUtr)
        }
        if(length(cds) > 0){
            txs <- rbind(txs, cds)
        }
        ## Convert into GRanges.
        suppressWarnings(
            SI <- seqinfo(x)
        )
        SI <- SI[as.character(unique(txs$chromosome))]
        GR <- GRanges(seqnames=Rle(txs$chromosome),
                      strand=Rle(txs$strand),
                      ranges=IRanges(start=txs$start, end=txs$end),
                      seqinfo=SI,
                      txs[, c("feature", "gene", "exon", "exon_rank",
                              "transcript", "symbol"), drop=FALSE])
        return(GR)
    })

####============================================================
##  properties
##
##  Get access to the "hidden" .properties slot and return it.
##  This ensures that we're not generating an error for objects that
##  do not have yet that slot.
####------------------------------------------------------------
setMethod("properties", "EnsDb", function(x, ...){
    if(.hasSlot(x, ".properties")){
        return(x@.properties)
    }else{
        warning("The present EnsDb instance has no .properties slot! ",
                "Please use 'updateEnsDb' to update the object!")
        return(list())
    }
})

####============================================================
##  getProperty
##
##  Return the value for the property with the specified name or
##  NA if not present.
####------------------------------------------------------------
setMethod("getProperty", "EnsDb", function(x, name, default = NA){
    props <- properties(x)
    if(any(names(props) == name)){
        return(props[[name]])
    }else{
        return(default)
    }
})

####============================================================
##  setProperty
##
##  Sets a property in the object. The value has to be a named vector.
####------------------------------------------------------------
setMethod("setProperty", "EnsDb", function(x, ...){
    dotL <- list(...)
    if(length(dotL) == 0){
        stop("No property specified! The property has to be submitted ",
                "in the format name=value!")
        return(x)
    }
    if(length(dotL) > 1){
        warning("'setProperty' does only support setting of a single property!",
                " Using the first submitted one.")
        dotL <- dotL[1]
    }
    if(is.null(names(dotL)) | names(dotL) == "")
        stop("A name is required! Use name=value!")
    if(.hasSlot(x, ".properties")){
        x@.properties[names(dotL)] <- dotL[[1]]
    }else{
        warning("The present EnsDb instance has no .properties slot! ",
                "Please use 'updateEnsDb' to update the object!")
    }
    return(x)
})

#' remove the property with the specified name.
#' @noRd
dropProperty <- function(x, name) {
    if (missing(name))
        return(x)
    prps <- x@.properties
    if (any(names(prps) == name))
        prps <- prps[names(prps) != name]
    x@.properties <- prps
    x
}

####============================================================
##  updateEnsDb
##
##  Update any "old" EnsDb instance to the most recent implementation.
####------------------------------------------------------------
setMethod("updateEnsDb", "EnsDb", function(x, ...){
    newE <- new("EnsDb", ensdb=x@ensdb, tables=x@tables)
    if(.hasSlot(x, ".properties"))
        newE@.properties <- x@.properties
    return(newE)
})


####============================================================
##  transcriptsByOverlaps
##
##  Just "re-implementing" the transcriptsByOverlaps methods from the
##  GenomicFeature package, finetuning and adapting it for EnsDbs
####------------------------------------------------------------
setMethod("transcriptsByOverlaps", "EnsDb",
          function(x, ranges, maxgap = -1L, minoverlap = 0L,
                   type = c("any", "start", "end"),
                   columns = listColumns(x, "tx"),
                   filter = AnnotationFilterList()) {
    if (missing(ranges))
        stop("Parameter 'ranges' is missing!")
    filter <- .processFilterParam(filter, x)
    SLs <- unique(as.character(seqnames(ranges)))
    filter <- AnnotationFilterList(filter, SeqNameFilter(SLs))
    columns <- cleanColumns(x, columns)
    subsetByOverlaps(transcripts(x, columns = columns, filter = filter),
                     ranges, maxgap = maxgap, minoverlap = minoverlap,
                     type = match.arg(type))
})

####============================================================
##  exonsByOverlaps
##
####------------------------------------------------------------
setMethod("exonsByOverlaps", "EnsDb",
          function(x, ranges, maxgap = -1L, minoverlap = 0L,
                   type = c("any", "start", "end"),
                   columns = listColumns(x, "exon"),
                   filter = AnnotationFilterList()) {
    if(missing(ranges))
        stop("Parameter 'ranges' is missing!")
    filter <- .processFilterParam(filter, x)
    SLs <- unique(as.character(seqnames(ranges)))
    filter <- AnnotationFilterList(filter, SeqNameFilter(SLs))
    columns <- cleanColumns(x, columns)
    subsetByOverlaps(exons(x, columns = columns, filter = filter),
                     ranges, maxgap = maxgap, minoverlap = minoverlap,
                     type = match.arg(type))
})

############################################################
## returnFilterColumns
##
## Method to set the option whether or not the filter columns should be
## returned too.
setMethod("returnFilterColumns", "EnsDb", function(x) {
    return(getProperty(x, "returnFilterColumns"))
})
setReplaceMethod("returnFilterColumns", "EnsDb", function(x, value) {
    if(!is.logical(value))
        stop("'value' has to be a logical!")
    if(length(value) > 1)
        stop("'value' has to be a logical of length 1!")
    x <- setProperty(x, returnFilterColumns=value)
    return(x)
})

############################################################
## orderResultsInR
##
## Whether the results should be ordered in R instead of in the
## SQL call
setMethod("orderResultsInR", "EnsDb", function(x) {
    return(getProperty(x, "orderResultsInR", default = FALSE))
})
setReplaceMethod("orderResultsInR", "EnsDb", function(x, value) {
    if(!is.logical(value))
        stop("'value' has to be a logical!")
    if(length(value) > 1)
        stop("'value' has to be a logical of length 1!")
    x <- setProperty(x, orderResultsInR = value)
    return(x)
})

############################################################
## useMySQL
##
## Switch from RSQlite backend to a MariaDB/MySQL backend.
#' @title Use a MariaDB/MySQL backend
#'
#' @aliases useMySQL
#'
#' @description Change the SQL backend from \emph{SQLite} to \emph{MySQL}.
#'     When first called on an \code{\linkS4class{EnsDb}} object, the function
#'     tries to create and save all of the data into a MySQL database. All
#'     subsequent calls will connect to the already existing MySQL database.
#'
#' @details This functionality requires that the \code{RMariaDB} package is
#'     installed and that the user has (write) access to a running MySQL server.
#'     If the corresponding database does already exist users without write
#'     access can use this functionality.
#'
#' @note At present the function does not evaluate whether the versions
#'     between the SQLite and MariaDB/MySQL database differ.
#'
#' @param x The \code{\linkS4class{EnsDb}} object.
#'
#' @param host Character vector specifying the host on which the MariaDB/MySQL
#'     server runs.
#'
#' @param port The port on which the MariaDB/MySQL server can be accessed.
#'
#' @param user The user name for the MariaDB/MySQL server.
#'
#' @param pass The password for the MariaDB/MySQL server.
#'
#' @return A \code{\linkS4class{EnsDb}} object providing access to the
#'      data stored in the MySQL backend.
#'
#' @author Johannes Rainer
#'
#' @examples
#' ## Load the EnsDb database (SQLite backend).
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' ## Now change the backend to MySQL; my_user and my_pass should
#' ## be the user name and password to access the MySQL server.
#' \dontrun{
#' edb_mysql <- useMySQL(edb, host = "localhost", user = my_user, pass = my_pass)
#' }
setMethod("useMySQL", "EnsDb", function(x, host = "localhost",
                                        port = 3306, user, pass) {
    if (missing(user))
        stop("'user' has to be specified.")
    if (missing(pass))
        stop("'pass' has to be specified.")
    ## Check if RMariaDB package is available.
    if(requireNamespace("RMariaDB", quietly = TRUE)) {
        ## Check if we can connect to MySQL.
        driva <- dbDriver("MariaDB")
        con <- dbConnect(driva, host = host, user = user, pass = pass,
                         port = port)
        ## Check if database is available.
        dbs <- dbGetQuery(con, "show databases;")
        ## sqliteName should be in the format EnsDb.Hsapiens.v75!
        sqliteName <- .makePackageName(dbconn(x))
        ## sqliteName <- sub(basename(dbfile(dbconn(x))),
        ##                   pattern = ".sqlite", replacement = "",
        ##                   fixed = TRUE)
        mysqlName <- SQLiteName2MySQL(sqliteName)
        if (nrow(dbs) == 0 | !any(dbs$Database == mysqlName)) {
            message("Database not available, trying to create it...",
                    appendLF = FALSE)
            dbExecute(con, paste0("create database ", mysqlName))
            message("OK")
        }
        dbDisconnect(con)
        ## Connect to the database and check if we've got all tables.
        con <- dbConnect(driva, host = host, user = user, pass = pass,
                         dbname = mysqlName)
        ## If we've got no tables we try to feed the SQLite database
        if (length(dbListTables(con)) == 0)
            feedEnsDb2MySQL(x, mysql = con)
        ## Check if we've got all required tables.
        OK <- dbHasRequiredTables(con)
        if (is.character(OK))
            stop(OK)
        OK <- dbHasValidTables(con)
        if (is.character(OK))
            stop(OK)
        ## Check if the versions/creation date differ.
        metadata_pkg <- metadata(x)
        ## Now store the connection into the @ensdb slot
        ## dbDisconnect(x@ensdb)
        ## x@ensdb <- NULL
        x@ensdb <- con
        metadata_db <- metadata(x)
        cre_pkg <- metadata_pkg[metadata_pkg$name == "Creation time", "value"]
        cre_db <- metadata_db[metadata_db$name == "Creation time", "value"]
        if (cre_pkg != cre_db) {
            message("Creation date between the package and the information in",
                    " the database differ:\n o package: ", cre_pkg,
                    "\n o database: ", cre_db, ".\nYou might consider to delete",
                    " the database and re-install it calling this function.")
        }
        return(x)
    } else {
        stop("Package 'RMariaDB' not available.")
    }
})

############################################################
## proteins
##
## If return type is GRanges, make a seqlevel and seqinfo for each protein, i.e.
## put each protein on its own sequence.
#' @title Protein related functionality
#'
#' @aliases proteins
#'
#' @description This help page provides information about most of the
#'     functionality related to protein annotations in \code{ensembldb}.
#'
#'     The \code{proteins} method retrieves protein related annotations from
#'     an \code{\linkS4class{EnsDb}} database.
#'
#' @details The \code{proteins} method performs the query starting from the
#'     \code{protein} tables and can hence return all annotations from the
#'     database that are related to proteins and transcripts encoding these
#'     proteins from the database. Since \code{proteins} does thus only query
#'     annotations for protein coding transcripts, the \code{\link{genes}} or
#'     \code{\link{transcripts}} methods have to be used to retrieve annotations
#'     for non-coding transcripts.
#'
#' @param object The \code{\linkS4class{EnsDb}} object.
#'
#' @param columns For \code{proteins}: character vector defining the columns to
#'     be extracted from the database. Can be any column(s) listed by the
#'     \code{\link{listColumns}} method.
#'
#' @param filter For \code{proteins}: A filter object extending
#'     \code{AnnotationFilter} or a list of such objects to select
#'     specific entries from the database. See \code{\link{Filter-classes}} for
#'     a documentation of available filters and use
#'     \code{\link{supportedFilters}} to get the full list of supported filters.
#'
#' @param order.by For \code{proteins}: a character vector specifying the
#'     column(s) by which the result should be ordered.
#'
#' @param order.type For \code{proteins}: if the results should be ordered
#'     ascending (\code{order.type = "asc"}) or descending
#'     (\code{order.type = "desc"})
#'
#' @param return.type For \code{proteins}: character of lenght one specifying
#'     the type of the returned object. Can be either \code{"DataFrame"},
#'     \code{"data.frame"} or \code{"AAStringSet"}.
#'
#' @return The \code{proteins} method returns protein related annotations from
#'     an \code{\linkS4class{EnsDb}} object with its \code{return.type} argument
#'     allowing to define the type of the returned object. Note that if
#'     \code{return.type = "AAStringSet"} additional annotation columns are
#'     stored in a \code{DataFrame} that can be accessed with the \code{mcols}
#'     method on the returned object.
#'
#' @rdname ProteinFunctionality
#'
#' @author Johannes Rainer
#'
#' @examples
#' library(ensembldb)
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' ## Get all proteins from tha database for the gene ZBTB16, if protein
#' ## annotations are available
#' if (hasProteinData(edb))
#'     proteins(edb, filter = GeneNameFilter("ZBTB16"))
setMethod("proteins", "EnsDb", function(object,
                                        columns = listColumns(object, "protein"),
                                        filter = AnnotationFilterList(),
                                        order.by = "",
                                        order.type = "asc",
                                        return.type = "DataFrame") {
    if (!hasProteinData(object))
        stop("The used EnsDb does not provide protein annotations!",
             " Thus, 'proteins' can not be used.")
    return.type <- match.arg(return.type, c("DataFrame", "AAStringSet",
                                            "data.frame"))
    columns <- cleanColumns(object, unique(c(columns, "protein_id")))
    filter <- .processFilterParam(filter, object)
    filter <- setFeatureInGRangesFilter(filter, "tx")
    ## Eventually add columns for the filters:
    columns <- addFilterColumns(columns, filter, object)
    ## Check that we don't have any exon columns here.
    ex_cols <- unique(listColumns(object, c("exon", "tx2exon")))
    ex_cols <- ex_cols[ex_cols != "tx_id"]
    if (any(columns %in% ex_cols)) {
        warning("Exon specific columns are not allowed for proteins. Columns ",
                paste0("'", columns[columns %in% ex_cols], "'", collapse = ", "),
                " have been removed.")
        columns <- columns[!(columns %in% ex_cols)]
    }
    retColumns <- columns
    ## Process order.by:
    ## If not specified we might want to order them by seq_name or tx_seq_start
    ## if present in parameter columns
    if (all(order.by == "")) {
        order.by <- NULL
        if (any(columns == "seq_name"))
            order.by <- "seq_name"
        seq_col_idx <- grep(columns, pattern = "_seq_")
        if (length(seq_col_idx) > 0)
            order.by <- c(order.by, columns[seq_col_idx[1]])
        if (is.null(order.by))
            order.by <- ""
    }
    ## If we're going to return a GRanges we need to know the length of the
    ## peptide sequence.
    if (return.type == "AAStringSet") {
        columns <- unique(c(columns, "protein_sequence"))
    }
    ## protein_id is *always* required
    columns <- unique(c(columns), "protein_id")
    ## Get the data
    Res <- getWhat(object, columns = columns, filter = filter,
                   order.by = order.by, order.type = order.type,
                   startWith = "protein", join = "suggested")
    ## issue #48: collapse entrezid column if dbschema 2.0 is used.
    if (as.numeric(dbSchemaVersion(object)) > 1 & any(columns == "entrezid"))
        Res <- .collapseEntrezidInTable(Res, by = "protein_id")
    ## Now process the result.
    cols_not_found <- !(retColumns %in% colnames(Res))
    retColumns <- retColumns[!cols_not_found]
    if (any(cols_not_found))
        warning("Columns ", paste0("'", retColumns[cols_not_found], "'",
                                   collapse = ", "),
                " not found in the database!")
    if (return.type == "AAStringSet") {
        aass <- AAStringSet(Res$protein_sequence)
        names(aass) <- Res$protein_id
        ## Add the mcols:
        retColumns <- retColumns[retColumns != "protein_sequence"]
        if (length(retColumns) > 0)
            mcols(aass) <- DataFrame(Res[, retColumns, drop = FALSE])
        return(aass)
    } else {
        Res <- Res[, retColumns, drop = FALSE]
        if (return.type == "DataFrame")
            Res <- DataFrame(Res)
        return(Res)
    }
    return(NULL)
})

############################################################
## listUniprotDbs
#' @aliases listUniprotDbs
#'
#' @description The \code{listUniprotDbs} method lists all Uniprot database
#'     names in the \code{EnsDb}.
#'
#' @examples
#'
#' ## List the names of all Uniprot databases from which Uniprot IDs are
#' ## available in the EnsDb
#' if (hasProteinData(edb))
#'     listUniprotDbs(edb)
#'
#' @rdname ProteinFunctionality
setMethod("listUniprotDbs", "EnsDb", function(object) {
    if (!hasProteinData(object))
        stop("The provided EnsDb database does not provide protein annotations!")
    res <- dbGetQuery(dbconn(object), "select distinct uniprot_db from uniprot")
    return(res$uniprot_db)
})

############################################################
## listUniprotMappingTypes
#' @aliases listUniprotMappingTypes
#'
#' @description The \code{listUniprotMappingTypes} method lists all methods
#'     that were used for the mapping of Uniprot IDs to Ensembl protein IDs.
#'
#' @examples
#'
#' ## List the type of all methods that were used to map Uniprot IDs to Ensembl
#' ## protein IDs
#' if (hasProteinData(edb))
#'     listUniprotMappingTypes(edb)
#'
#' @rdname ProteinFunctionality
setMethod("listUniprotMappingTypes", "EnsDb", function(object) {
    if (!hasProteinData(object))
        stop("The provided EnsDb database does not provide protein annotations!")
    res <- dbGetQuery(dbconn(object),
                      "select distinct uniprot_mapping_type from uniprot")
    return(res$uniprot_mapping_type)
})

#' @description `supportedFilters` returns a `data.frame` with the
#'     names of all filters and the corresponding field supported by the
#'     `EnsDb` object.
#'
#' @param object For `supportedFilters`: an `EnsDb` object.
#'
#' @param ... For `supportedFilters`: currently not used.
#'
#' @return For `supportedFilters`: a `data.frame` with the names and
#'     the corresponding field of the supported filter classes.
#'
#' @md
#'
#' @rdname Filter-classes
setMethod("supportedFilters", "EnsDb", function(object, ...) {
    .supportedFilters(object)
})

#' @title Globally add filters to an EnsDb database
#'
#' @aliases addFilter addFilter,EnsDb-method
#'
#' @description These methods allow to set, delete or show globally defined
#'     filters on an \code{\linkS4class{EnsDb}} object.
#'
#'     \code{addFilter}: adds an annotation filter to the \code{EnsDb} object.
#'
#' @details Adding a filter to an \code{EnsDb} object causes this filter to be
#'     permanently active. The filter will be used for all queries to the
#'     database and is added to all additional filters passed to the methods
#'     such as \code{\link{genes}}.
#'
#' @param x The \code{\linkS4class{EnsDb}} object to which the filter should be
#'     added.
#'
#' @param filter The filter as an
#'     \code{\link[AnnotationFilter]{AnnotationFilter}},
#'     \code{\link[AnnotationFilter]{AnnotationFilterList}} or filter
#'     expression. See
#'
#' @return \code{addFilter} and \code{filter} return an \code{EnsDb} object
#'     with the specified filter added.
#'
#'     \code{activeFilter} returns an
#'     \code{\link[AnnotationFilter]{AnnotationFilterList}} object being the
#'     active global filter or \code{NA} if no filter was added.
#'
#'     \code{dropFilter} returns an \code{EnsDb} object with all eventually
#'     present global filters removed.
#'
#' @author Johannes Rainer
#'
#' @rdname global-filters
#'
#' @seealso \code{\link{Filter-classes}} for a list of all supported filters.
#'
#' @examples
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#'
#' ## Add a global SeqNameFilter to the database such that all subsequent
#' ## queries will be applied on the filtered database.
#' edb_y <- addFilter(edb, SeqNameFilter("Y"))
#'
#' ## Note: using the filter function is equivalent to a call to addFilter.
#'
#' ## Each call returns now only features encoded on chromosome Y
#' gns <- genes(edb_y)
#'
#' seqlevels(gns)
#'
#' ## Get all lincRNA gene transcripts on chromosome Y
#' transcripts(edb_y, filter = ~ gene_biotype == "lincRNA")
#'
#' ## Get the currently active global filter:
#' activeFilter(edb_y)
#'
#' ## Delete this filter again.
#' edb_y <- dropFilter(edb_y)
#'
#' activeFilter(edb_y)
setMethod("addFilter", "EnsDb", function(x, filter = AnnotationFilterList()) {
    .addFilter(x, filter)
})

#' @aliases dropFilter dropFilter,EnsDb-method
#'
#' @description \code{dropFilter} deletes all globally set filters from the
#'     \code{EnsDb} object.
#'
#' @rdname global-filters
setMethod("dropFilter", "EnsDb", function(x) {
    .dropFilter(x)
})

#' @aliases activeFilter activeFilter,EnsDb-method
#'
#' @description \code{activeFilter} returns the globally set filter from an
#'     \code{EnsDb} object.
#'
#' @rdname global-filters
setMethod("activeFilter", "EnsDb", function(x) {
    .activeFilter(x)
})

setMethod("intronsByTranscript", "EnsDb", function(x, ..., use.names = FALSE) {
    txs <- transcripts(x, ...)
    exns <- exonsBy(x, by = "tx", ...)
    txs <- txs[match(names(exns), names(txs))]
    psetdiff(txs, exns)
})
jotsetung/ensembldb documentation built on Aug. 21, 2024, 11:23 a.m.