#' @include GeneRegionTrack-class.R
NULL
## BiomartGeneRegionTrack Class ----------------------------------------------
#' BiomartGeneRegionTrack class and methods
#'
#'
#' A class to hold gene model data for a genomic region fetched dynamically
#' from EBI's Biomart Ensembl data source.
#'
#'
#' A track containing all gene models in a particular region as fetched from
#' EBI's Biomart service. Usually the user does not have to take care of the
#' Biomart connection, which will be established automatically based on the
#' provided genome and chromosome information. However, for full flexibility a
#' valid \code{\linkS4class{Mart}} object may be passed on to the constructor.
#' Please note that this assumes a connection to one of the Ensembl gene data
#' sources, mapping the available query data back to the internal object slots.
#'
#' @template BiomartGeneRegionTrack-class_param
#'
#' @name BiomartGeneRegionTrack-class
#'
#' @return
#'
#' The return value of the constructor function is a new object of class
#' \code{BiomartGeneRegionTrack}.
#' @section Objects from the class:
#'
#' Objects can be created using the constructor function
#' \code{BiomartGeneRegionTrack}.
#'
#' @author Florian Hahne
#'
#' @inherit GdObject-class seealso
#'
#' @references EBI Biomart webservice at \url{http://www.biomart.org}.
#'
#' @examples
#' \dontshow{
#' ## Load some sample data
#' data(bmTrack)
#' }
#'
#' ## Construct the object
#' \dontrun{
#' bmTrack <- BiomartGeneRegionTrack(
#' start = 26682683, end = 26711643,
#' chromosome = 7, genome = "mm9"
#' )
#' }
#'
#' \dontshow{
#' ## For some annoying reason the postscript device does not know about
#' ## the sans font
#' if (!interactive()) {
#' font <- ps.options()$family
#' displayPars(bmTrack) <- list(fontfamily = font, fontfamily.title = font)
#' }
#' }
#'
#' ## Plotting
#' plotTracks(bmTrack)
#'
#' ## Track names
#' names(bmTrack)
#' names(bmTrack) <- "foo"
#' plotTracks(bmTrack)
#'
#' ## Subsetting and splitting
#' subTrack <- subset(bmTrack, from = 26700000, to = 26705000)
#' length(subTrack)
#' subTrack <- bmTrack[transcript(bmTrack) == "ENSMUST00000144140"]
#' split(bmTrack, transcript(bmTrack))
#'
#' ## Accessors
#' start(bmTrack)
#' end(bmTrack)
#' width(bmTrack)
#' position(bmTrack)
#' width(subTrack) <- width(subTrack) + 100
#'
#' strand(bmTrack)
#' strand(subTrack) <- "-"
#'
#' chromosome(bmTrack)
#' chromosome(subTrack) <- "chrX"
#'
#' genome(bmTrack)
#' genome(subTrack) <- "hg19"
#'
#' range(bmTrack)
#' ranges(bmTrack)
#'
#' ## Annotation
#' identifier(bmTrack)
#' identifier(bmTrack, "lowest")
#' identifier(subTrack) <- "bar"
#'
#' feature(bmTrack)
#' feature(subTrack) <- "foo"
#'
#' exon(bmTrack)
#' exon(subTrack) <- letters[1:2]
#'
#' gene(bmTrack)
#' gene(subTrack) <- "bar"
#'
#' symbol(bmTrack)
#' symbol(subTrack) <- "foo"
#'
#' transcript(bmTrack)
#' transcript(subTrack) <- c("foo", "bar")
#' chromosome(subTrack) <- "chr7"
#' plotTracks(subTrack)
#'
#' values(bmTrack)
#'
#' ## Grouping
#' group(bmTrack)
#' group(subTrack) <- "Group 1"
#' transcript(subTrack)
#' plotTracks(subTrack)
#'
#' ## Stacking
#' stacking(bmTrack)
#' stacking(bmTrack) <- "dense"
#' plotTracks(bmTrack)
#'
#' ## coercion
#' as(bmTrack, "data.frame")
#' as(bmTrack, "UCSCData")
#'
#' ## HTML image map
#' coords(bmTrack)
#' tags(bmTrack)
#' bmTrack <- plotTracks(bmTrack)$foo
#' coords(bmTrack)
#' tags(bmTrack)
#' @importFrom biomaRt getBM useEnsembl useMart listDatasets listAttributes listFilters
#'
#' @exportClass BiomartGeneRegionTrack
setClass("BiomartGeneRegionTrack",
contains = "GeneRegionTrack",
representation = representation(biomart = "MartOrNULL", filter = "list"),
prototype = prototype(
biomart = NULL,
filter = list(),
columns = c("feature", "transcript", "symbol", "gene", "rank"),
name = "BiomartGeneRegionTrack",
dp = DisplayPars(
C_segment = "burlywood4",
D_segment = "lightblue",
J_segment = "dodgerblue2",
Mt_rRNA = "yellow",
Mt_tRNA = "darkgoldenrod",
Mt_tRNA_pseudogene = "darkgoldenrod1",
V_segment = "aquamarine",
miRNA = "cornflowerblue",
miRNA_pseudogene = "cornsilk",
misc_RNA = "cornsilk3",
misc_RNA_pseudogene = "cornsilk4",
protein_coding = "#FFD58A",
pseudogene = "brown1",
rRNA = "darkolivegreen1",
rRNA_pseudogene = "darkolivegreen",
retrotransposed = "blueviolet",
scRNA = "gold4",
scRNA_pseudogene = "darkorange2",
snRNA = "coral",
snRNA_pseudogene = "coral3",
snoRNA = "cyan",
snoRNA_pseudogene = "cyan2",
tRNA_pseudogene = "antiquewhite3",
utr3 = "#FFD58A",
utr5 = "#FFD58A",
verbose = FALSE
)
)
)
## Helper to return the default biomart to feature mapping
.getBMFeatureMap <- function() {
return(list(
gene_id = "ensembl_gene_id", transcript_id = "ensembl_transcript_id", exon_id = "ensembl_exon_id",
start = "exon_chrom_start", end = "exon_chrom_end", rank = "rank", strand = "strand",
symbol = c("external_gene_name", "external_gene_id"), feature = "gene_biotype", chromosome = "chromosome_name",
u5s = "5_utr_start", u5e = "5_utr_end", u3s = "3_utr_start", u3e = "3_utr_end", cdsl = c("cds_length", "cds_start"),
phase = "phase"
))
}
## Helper to do the actual fetching of data from Biomart
#' @importFrom utils modifyList
.fetchBMData <- function(object, chromosome_name = NULL, staged = FALSE) {
if (!is.null(chromosome_name)) {
chromosome_name <- gsub("^chr", "", chromosome_name)
}
## The map between Biomart DB fields and annotation features. The individual values can be vectors for cases where there is
## ambiguity between different marts. This will be dynamically evaluated against available filters.
origFeatureMap <- .getBMFeatureMap()
featureMap <- modifyList(origFeatureMap, as.list(.dpOrDefault(object, ".__featureMap", list())))
needed <- c(
"gene_id", "transcript_id", "exon_id", "start", "end", "rank", "strand", "symbol", "feature",
"chromosome", "u5s", "u5e", "u3s", "u3e", "cdsl", "phase"
)
if (!all(needed %in% names(featureMap))) {
stop("'featureMap' needs to include items '", paste(setdiff(needed, names(featureMap)), collapse = ", "), "'")
}
avail <- listAttributes(object@biomart)[, 1]
ambig <- names(featureMap)[listLen(featureMap) > 1]
for (i in ambig) {
mt <- match(featureMap[[i]], avail)
if (!all(is.na(mt))) {
featureMap[[i]] <- featureMap[[i]][min(which(!is.na(mt)))]
} else {
featureMap[[i]] <- NA
}
}
featureMap <- unlist(featureMap)
## Deal with the filters
filterValues <- as.list(object@filter)
start <- object@start
end <- object@end
for (i in c("start", "end", "chromosome_name")) {
if (!is.null(get(i)) && length(get(i)) > 0) {
filterValues[[i]] <- get(i)
}
}
ens <- getBM(as.vector(featureMap),
filters = names(filterValues),
values = filterValues, bmHeader = FALSE,
mart = object@biomart, uniqueRows = TRUE
)
colnames(ens) <- names(featureMap)
if (staged && nrow(ens) > 0) {
filterValues <- list(start = min(ens$start), end = max(ens$end), chromosome_name = ens[1, "chromosome"])
ens <- getBM(as.vector(featureMap),
filters = names(filterValues),
values = filterValues, bmHeader = FALSE,
mart = object@biomart, uniqueRows = TRUE
)
colnames(ens) <- names(featureMap)
}
## Only those transcripts that have a CDS length will be considered protein_coding
ens$feature <- ifelse(is.na(ens$cdsl) & ens$feature == "protein_coding", "non_coding", ens$feature)
## We may have to split exons if they contain UTRs
hasUtr <- !is.na(ens$u5s) | !is.na(ens$u3s)
ensUtr <- ens[hasUtr, , drop = FALSE]
ensUtr$ffeature <- ifelse(is.na(ensUtr$u5s), "utr3", "utr5")
ensUtr$us <- ifelse(ensUtr$ffeature == "utr3", ensUtr$u3s, ensUtr$u5s)
ensUtr$ue <- ifelse(ensUtr$ffeature == "utr3", ensUtr$u3e, ensUtr$u5e)
ensUtr$u5e <- ensUtr$u5s <- ensUtr$u3e <- ensUtr$u3s <- NULL
allUtr <- ensUtr$us == ensUtr$start & ensUtr$ue == ensUtr$end
utrFinal <- ensUtr[allUtr, , drop = FALSE]
ensUtr <- ensUtr[!allUtr, , drop = FALSE]
ensUtrS <- split(ensUtr, ifelse(ensUtr$start == ensUtr$us, "left", "right"))
utrFinal <- rbind(utrFinal, do.call(rbind, lapply(names(ensUtrS), function(i) {
y <- ensUtrS[[i]]
if (nrow(y) == 0) {
return(NULL)
}
yy <- y[rep(seq_len(nrow(y)), each = 2), ]
sel <- seq(1, nrow(yy), by = 2)
yy[sel, "end"] <- if (i == "left") yy[sel, "ue"] else yy[sel, "us"] - 1
yy[sel, "ffeature"] <- yy[sel, ifelse(i == "left", "ffeature", "feature")]
yy[sel, "phase"] <- if (i == "left") -1 else 0
sel <- seq(2, nrow(yy), by = 2)
yy[sel, "start"] <- if (i == "left") yy[sel, "ue"] + 1 else yy[sel, "us"]
yy[sel, "ffeature"] <- yy[sel, ifelse(i == "left", "feature", "ffeature")]
yy[sel, "phase"] <- if (i == "left") yy[sel, "phase"] else -1
yy
})))
utrFinal$feature <- utrFinal$ffeature
keep <- c(
"gene_id", "transcript_id", "exon_id", "start",
"end", "rank", "strand", "symbol", "feature",
"chromosome", "phase"
)
ens <- rbind(ens[!hasUtr, keep, drop = FALSE], utrFinal[, keep])
ens$chromosome <- .chrName(ens$chromosome, force = TRUE)
range <- GRanges(
seqnames = ens$chromosome,
ranges = IRanges(start = ens$start, end = ens$end),
strand = ens$strand, feature = as.character(ens$feature),
gene = as.character(ens$gene_id), exon = as.character(ens$exon_id),
transcript = as.character(ens$transcript_id), symbol = as.character(ens$symbol),
rank = as.numeric(ens$rank), phase = as.integer(ens$phase)
)
suppressWarnings(genome(range) <- unname(genome(object)[1]))
range <- sort(range)
return(range)
}
## Create an MD5 hash for a BiomartGeneRegion track taking into account the Biomart details for caching
#' @importFrom digest digest
.bmGuid <- function(bmtrack) {
digest(list(
genome = genome(bmtrack), host = bmtrack@biomart@host, mart = bmtrack@biomart@biomart, schema = bmtrack@biomart@vschema,
dataset = bmtrack@biomart@dataset, filters = bmtrack@filter
))
}
## Initialize ----------------------------------------------------------------
# #' @noRd
# #' @keywords internal
#' @describeIn BiomartGeneRegionTrack-class Initialize.
#' @export
setMethod("initialize", "BiomartGeneRegionTrack", function(.Object, start = NULL, end = NULL, biomart, filter = list(), range, genome = NULL, chromosome = NULL, strand = NULL,
featureMap = NULL, symbol = NULL, gene = NULL, transcript = NULL, entrez = NULL, ...) {
if ((missing(range) || is.null(range)) && is.null(genome) && is.null(chromosome)) {
return(.Object)
}
## the display parameter defaults
.makeParMapping()
.Object <- .updatePars(.Object, "BiomartGeneRegionTrack")
verb <- list(...)$verbose
displayPars(.Object) <- list(!is.null(verb) && verb == TRUE)
## preparing filters
strand <- .strandName(strand, extended = TRUE)
if (strand %in% 0:1) {
filter$strand <- c(1, -1)[strand + 1]
}
filterOrig <- filter
idFilters <- list(
symbol = c("external_gene_name", "external_gene_id", "hgnc_symbol", "wikigene_name", "dbass3_name"),
gene = "ensembl_gene_id",
transcript = "ensembl_transcript_id",
entrez = "entrezgene"
)
staged <- FALSE
for (i in names(idFilters)) {
if (!is.null(get(i))) {
mt <- match(idFilters[[i]], listFilters(biomart)[, 1])
sfilt <- listFilters(biomart)[mt[!is.na(mt)], 1]
if (length(sfilt) > 0) {
staged <- TRUE
filter[[sfilt[1]]] <- get(i)
if (!is.null(filter$strand)) {
warning(sprintf("Cannot combine %s filter with a strand filter. Strand filtering is ignored.", i))
filter$strand <- NULL
}
if (!is.null(start) || !is.null(end) || !is.null(chromosome)) {
warning(sprintf("Cannot combine %s filter with a range restriction. Ignoring start and end coordinates.", i))
start <- end <- chromosome <- NULL
}
} else {
stop(sprintf("Unable to automatically map the %s filter. Manually provide adequate filter list.", i))
}
}
}
## We can't have only one in start or end
if (!is.null(end) && is.null(start)) {
start <- 1
} else if (!is.null(start) && is.null(end)) {
end <- 10e8
}
## filling slots
.Object@filter <- filter
.Object@biomart <- biomart
## Extending start and end positions to capture genes on the edges. We also need both coordinates set, or none.
extend <- if (is.null(start) || is.null(end)) 10000 else max(10000, abs(diff(c(end, start))))
.Object@start <- if (!is.null(start)) max(1, start - extend) else NULL
.Object@end <- if (!is.null(end)) end + extend else NULL
displayPars(.Object) <- list(".__featureMap" = featureMap)
genome(.Object) <- ifelse(is.null(genome), "ANY", genome)
## fetching data from Biomart
range <- if (!is.null(.Object@biomart) && (!is.null(.Object@start) || !is.null(.Object@end) || length(.Object@filter) != 0)) {
.cacheMartData(.Object, chromosome, staged)
} else {
tmp <- GRanges()
values(tmp) <- DataFrame(feature = "a", gene = "a", exon = "a", transcript = "a", symbol = "a", rank = 0, phase = as.integer(1))[0, ]
suppressWarnings(genome(tmp) <- unname(genome[1]))
displayPars(.Object) <- list(".__streamOnly" = TRUE)
tmp
}
.Object@filter <- filterOrig
if (length(range) == 0) {
.Object <- setPar(.Object, "size", 0, interactive = FALSE)
} else {
chromosome <- if (is.null(chromosome)) seqlevels(range)[1] else chromosome
rr <- range(range, ignore.strand = TRUE)
s <- start(rr[seqnames(rr) == chromosome])
e <- end(rr[seqnames(rr) == chromosome])
start <- min(s, start)
end <- max(e, end)
}
.Object <- callNextMethod(
.Object = .Object, range = range, start = start, end = end,
genome = genome, chromosome = chromosome, strand = strand, ...
)
## We want to warn if searching was performed based on an identifier and now values have been returned
if ((!is.null(symbol) || !is.null(gene) || !is.null(transcript) || !is.null(entrez)) && length(range) == 0) {
warning("Search by identifier did not yield any values", call. = FALSE)
}
return(.Object)
})
## Constructor ---------------------------------------------------------------
## Constructor. The following arguments are supported:
## o start, end: numeric vectors of the item start and end coordinates
## o biomart: a biomaRt object used to to query for gene annotations
## o genome, chromosome: the reference genome and active chromosome for the track.
## o strand: character, search for gene models on the plus strand ("+" or 0), the
## minus strand ("-" or 1) or both strands ("+-" or "-+" or 2)
## o stacking: character controlling the stacking of overlapping items. One in 'hide',
## 'dense', 'squish', 'pack' or 'full'.
## o filters: list of additional filters for the biomaRt query, where the item names
## are the filter identifiers and the item values are the filter values.
## o name: the name of the track. This will be used for the title panel.
## All additional items in ... are being treated as DisplayParameters
#' @describeIn BiomartGeneRegionTrack-class Constructor function for
#' `BiomartGeneRegionTrack-class`.
#' @export
BiomartGeneRegionTrack <- function(start = NULL, end = NULL, biomart, chromosome = NULL, strand, genome = NULL,
stacking = "squish", filters = list(), featureMap = NULL, name = "BiomartGeneRegionTrack",
symbol = NULL, gene = NULL, entrez = NULL, transcript = NULL, ...) {
## Some default checking
.missingToNull(c("genome"))
if (missing(strand)) {
strand <- "*"
}
if ((!is.null(start) || !is.null(end)) && is.null(chromosome)) {
stop("Also need to specify a chromsome when initializing a BiomartGeneRegionTrack with start or end coordinates")
}
if (!is.null(chromosome)) {
chromosome <- .chrName(chromosome)[1]
}
if (missing(biomart)) {
if (is.null(genome)) {
stop("Need either a valid Mart connection object as 'biomart' argument or a UCSC genome identifier as the 'genome' argument.")
}
biomart <- .genome2Dataset(genome)
} else if (is.null(genome)) {
genome <- biomart@dataset
}
new("BiomartGeneRegionTrack",
start = start, end = end, chromosome = chromosome, strand = strand,
biomart = biomart, name = name, genome = genome, stacking = stacking, filter = filters, featureMap = featureMap,
symbol = symbol, gene = gene, transcript = transcript, entrez = entrez, ...
)
}
## This filters for genes with refseq IDs only
.refseqFilter <- list("with_refseq_dna" = TRUE)
## General accessors ---------------------------------------------------------
## Annotation Accessors ------------------------------------------------------
## Stacking ------------------------------------------------------------------
## Consolidate ---------------------------------------------------------------
## Collapse -----------------------------------------------------------------
## Subset --------------------------------------------------------------------
#' @describeIn BiomartGeneRegionTrack-class subset a `BiomartGeneRegionTrack`
#' by coordinates and sort if necessary.
#' @export
setMethod("subset", signature(x = "BiomartGeneRegionTrack"), function(x, from, to, chromosome, use.defaults = TRUE, ...) {
granges <- unlist(range(split(ranges(x), group(x))))
ranges <- if (use.defaults) {
.defaultRange(x, from = from, to = to)
} else {
c(
from = ifelse(is.null(from), min(start(granges)) - 1, from),
to = ifelse(is.null(to), max(end(granges)) + 1, to)
)
}
if (ranges["from"] < x@start || ranges["to"] > x@end) {
x@start <- ranges["from"] - 10000
x@end <- ranges["to"] + 10000
ranges(x) <- .cacheMartData(x, .chrName(chromosome))
}
return(callNextMethod(x = x, from = ranges["from"], to = ranges["to"], use.defaults = FALSE, ...))
})
## Position ------------------------------------------------------------------
## DrawGrid ------------------------------------------------------------------
## DrawAxis ------------------------------------------------------------------
## DrawGD --------------------------------------------------------------------
## SetAs ---------------------------------------------------------------------
## Show ----------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.