inst/doc/ensembldb.R

## ----biocstyle, echo = FALSE, results = "asis", message = FALSE---------------
library(BiocStyle)
BiocStyle::markdown()

## ----load-libs, warning=FALSE, message=FALSE----------------------------------
library(EnsDb.Hsapiens.v86)

## Making a "short cut"
edb <- EnsDb.Hsapiens.v86
## print some informations for this package
edb

## For what organism was the database generated?
organism(edb)

## ----no-network, echo = FALSE, results = "hide"-------------------------------
## Disable code chunks that require network connection - conditionally
## disable this on Windows only. This is to avoid TIMEOUT errors on the
## Bioconductor Windows build maching (issue #47).
use_network <- TRUE

## ----filters------------------------------------------------------------------
supportedFilters(edb)

## ----transcripts--------------------------------------------------------------
Tx <- transcripts(edb, filter = GeneNameFilter("BCL2L11"))

Tx

## as this is a GRanges object we can access e.g. the start coordinates with
head(start(Tx))

## or extract the biotype with
head(Tx$tx_biotype)

## ----transcripts-filter-expression--------------------------------------------
## Use a filter expression to perform the filtering.
transcripts(edb, filter = ~ gene_name == "ZBTB16")

## ----transcripts-filter, message = FALSE--------------------------------------
library(magrittr)

edb %>% filter(~ symbol == "BCL2" & tx_biotype != "protein_coding") %>%
    transcripts

## ----filter-Y-----------------------------------------------------------------
edb_y <- addFilter(edb, SeqNameFilter("Y"))

## All subsequent filters on that EnsDb will only work on features encoded on
## chromosome Y
genes(edb_y)

## Get all lincRNAs on chromosome Y
genes(edb_y, filter = ~ gene_biotype == "lincRNA")

## ----list-columns-------------------------------------------------------------
## list all database tables along with their columns
listTables(edb)

## list columns from a specific table
listColumns(edb, "tx")

## ----transcripts-example2-----------------------------------------------------
Tx <- transcripts(edb,
		  columns = c(listColumns(edb , "tx"), "gene_name"),
		  filter = TxBiotypeFilter("nonsense_mediated_decay"),
		  return.type = "DataFrame")
nrow(Tx)
Tx

## ----cdsBy--------------------------------------------------------------------
yCds <- cdsBy(edb, filter = SeqNameFilter("Y"))
yCds

## ----genes-GRangesFilter------------------------------------------------------
## Define the filter
grf <- GRangesFilter(GRanges("11", ranges = IRanges(114129278, 114129328),
			     strand = "+"), type = "any")

## Query genes:
gn <- genes(edb, filter = grf)
gn

## Next we retrieve all transcripts for that gene so that we can plot them.
txs <- transcripts(edb, filter = GeneNameFilter(gn$gene_name))

## ----granges-zbtb16, message = FALSE, echo = FALSE----------------------------
plot(3, 3, pch = NA, xlim = c(start(gn), end(gn)), ylim = c(0, length(txs)),
     yaxt = "n", ylab = "")
## Highlight the GRangesFilter region
rect(xleft = start(grf), xright = end(grf), ybottom = 0, ytop = length(txs),
     col = "red", border = "red")
for(i in 1:length(txs)) {
    current <- txs[i]
    rect(xleft = start(current), xright = end(current), ybottom = i-0.975,
         ytop = i-0.125, border = "grey")
    text(start(current), y = i-0.5, pos = 4, cex = 0.75, labels = current$tx_id)
}

## ----transcripts-GRangesFilter------------------------------------------------
transcripts(edb, filter = grf)

## ----biotypes-----------------------------------------------------------------
## Get all gene biotypes from the database. The GeneBiotypeFilter
## allows to filter on these values.
listGenebiotypes(edb)

## Get all transcript biotypes from the database.
listTxbiotypes(edb)

## ----genes-BCL2---------------------------------------------------------------
## We're going to fetch all genes which names start with BCL.
BCLs <- genes(edb,
	      columns = c("gene_name", "entrezid", "gene_biotype"),
	      filter = GeneNameFilter("BCL", condition = "startsWith"),
	      return.type = "DataFrame")
nrow(BCLs)
BCLs

## ----example-AnnotationFilterList---------------------------------------------
## determine the average length of snRNA, snoRNA and rRNA genes encoded on
## chromosomes X and Y.
mean(lengthOf(edb, of = "tx",
	      filter = AnnotationFilterList(
		  GeneBiotypeFilter(c("snRNA", "snoRNA", "rRNA")),
		  SeqNameFilter(c("X", "Y")))))

## determine the average length of protein coding genes encoded on the same
## chromosomes.
mean(lengthOf(edb, of = "tx",
	      filter = ~ gene_biotype == "protein_coding" &
		  seq_name %in% c("X", "Y")))

## ----example-first-two-exons--------------------------------------------------
## Extract all exons 1 and (if present) 2 for all genes encoded on the
## Y chromosome
exons(edb, columns = c("tx_id", "exon_idx"),
      filter = list(SeqNameFilter("Y"),
                    ExonRankFilter(3, condition = "<")))

## ----transcriptsBy-X-Y--------------------------------------------------------
TxByGns <- transcriptsBy(edb, by = "gene", filter = SeqNameFilter(c("X", "Y")))
TxByGns

## ----exonsBy-RNAseq, message = FALSE, eval = FALSE----------------------------
#  ## will just get exons for all genes on chromosomes 1 to 22, X and Y.
#  ## Note: want to get rid of the "LRG" genes!!!
#  EnsGenes <- exonsBy(edb, by = "gene", filter = AnnotationFilterList(
#  					  SeqNameFilter(c(1:22, "X", "Y")),
#  					  GeneIdFilter("ENSG", "startsWith")))

## ----toSAF-RNAseq, message = FALSE, eval = FALSE------------------------------
#  ## Transforming the GRangesList into a data.frame in SAF format
#  EnsGenes.SAF <- toSAF(EnsGenes)

## ----disjointExons, message = FALSE, eval = FALSE-----------------------------
#  ## Create a GRanges of non-overlapping exon parts.
#  DJE <- disjointExons(edb, filter = AnnotationFilterList(
#  			      SeqNameFilter(c(1:22, "X", "Y")),
#  			      GeneIdFilter("ENSG%", "startsWith")))

## ----transcript-sequence-AnnotationHub, message = FALSE, eval = FALSE---------
#  library(EnsDb.Hsapiens.v86)
#  edb <- EnsDb.Hsapiens.v86
#  
#  ## Get the TwoBit with the genomic sequence matching the Ensembl version
#  ## using the AnnotationHub package.
#  dna <- ensembldb:::getGenomeTwoBitFile(edb)
#  
#  ## Get start/end coordinates of all genes.
#  genes <- genes(edb)
#  ## Subset to all genes that are encoded on chromosomes for which
#  ## we do have DNA sequence available.
#  genes <- genes[seqnames(genes) %in% seqnames(seqinfo(dna))]
#  
#  ## Get the gene sequences, i.e. the sequence including the sequence of
#  ## all of the gene's exons and introns.
#  geneSeqs <- getSeq(dna, genes)
#  

## ----transcript-sequence-extractTranscriptSeqs, message = FALSE, eval = FALSE----
#  ## get all exons of all transcripts encoded on chromosome Y
#  yTx <- exonsBy(edb, filter = SeqNameFilter("Y"))
#  
#  ## Retrieve the sequences for these transcripts from the TwoBitile.
#  library(GenomicFeatures)
#  yTxSeqs <- extractTranscriptSeqs(dna, yTx)
#  yTxSeqs
#  
#  ## Extract the sequences of all transcripts encoded on chromosome Y.
#  yTx <- extractTranscriptSeqs(dna, edb, filter = SeqNameFilter("Y"))
#  
#  ## Along these lines, we could use the method also to retrieve the coding sequence
#  ## of all transcripts on the Y chromosome.
#  cdsY <- cdsBy(edb, filter = SeqNameFilter("Y"))
#  extractTranscriptSeqs(dna, cdsY)
#  

## ----extractTranscriptSeqs-BSGenome, warning = FALSE, message = FALSE---------
library(BSgenome.Hsapiens.NCBI.GRCh38)
bsg <- BSgenome.Hsapiens.NCBI.GRCh38

## Get the genome version
unique(genome(bsg))
unique(genome(edb))

## Extract the full transcript sequences.
yTxSeqs <- extractTranscriptSeqs(
  bsg, exonsBy(edb, "tx", filter = SeqNameFilter("Y")))

yTxSeqs

## Extract just the CDS
Test <- cdsBy(edb, "tx", filter = SeqNameFilter("Y"))
yTxCds <- extractTranscriptSeqs(
  bsg, cdsBy(edb, "tx", filter = SeqNameFilter("Y")))
yTxCds


## ----seqlevelsStyle, message = FALSE------------------------------------------
## Change the seqlevels style form Ensembl (default) to UCSC:
seqlevelsStyle(edb) <- "UCSC"

## Now we can use UCSC style seqnames in SeqNameFilters or GRangesFilter:
genesY <- genes(edb, filter = ~ seq_name == "chrY")
## The seqlevels of the returned GRanges are also in UCSC style
seqlevels(genesY)


## ----seqlevelsStyle-2, message = FALSE----------------------------------------
seqlevelsStyle(edb) <- "UCSC"

## Getting the default option:
getOption("ensembldb.seqnameNotFound")

## Listing all seqlevels in the database.
seqlevels(edb)[1:30]

## Setting the option to NA, thus, for each seqname for which no mapping is available,
## NA is returned.
options(ensembldb.seqnameNotFound=NA)
seqlevels(edb)[1:30]

## Resetting the option.
options(ensembldb.seqnameNotFound = "ORIGINAL")


## ----seqlevelsStyle-restore---------------------------------------------------
seqlevelsStyle(edb) <- "Ensembl"


## ----gviz-plot, message=FALSE-------------------------------------------------
## Loading the Gviz library
library(Gviz)
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86

## Retrieving a Gviz compatible GRanges object with all genes
## encoded on chromosome Y.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "Y",
				start = 20400000, end = 21400000)
## Define a genome axis track
gat <- GenomeAxisTrack()

## We have to change the ucscChromosomeNames option to FALSE to enable Gviz usage
## with non-UCSC chromosome names.
options(ucscChromosomeNames = FALSE)

plotTracks(list(gat, GeneRegionTrack(gr)))

options(ucscChromosomeNames = TRUE)


## ----message=FALSE------------------------------------------------------------
seqlevelsStyle(edb) <- "UCSC"
## Retrieving the GRanges objects with seqnames corresponding to UCSC chromosome names.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
                                start = 20400000, end = 21400000)
seqnames(gr)
## Define a genome axis track
gat <- GenomeAxisTrack()
plotTracks(list(gat, GeneRegionTrack(gr)))


## ----gviz-separate-tracks, message=FALSE, warning=FALSE-----------------------
protCod <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
				     start = 20400000, end = 21400000,
				     filter = GeneBiotypeFilter("protein_coding"))
lincs <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
				   start = 20400000, end = 21400000,
				   filter = GeneBiotypeFilter("lincRNA"))

plotTracks(list(gat, GeneRegionTrack(protCod, name = "protein coding"),
		GeneRegionTrack(lincs, name = "lincRNAs")), transcriptAnnotation = "symbol")

## At last we change the seqlevels style again to Ensembl
seqlevelsStyle <- "Ensembl"


## ----pplot-plot, message = FALSE, eval = FALSE--------------------------------
#  library(ggbio)
#  
#  ## Create a plot for all transcripts of the gene SKA2
#  autoplot(edb, ~ gene_name == "SKA2")
#  

## ----pplot-plot-2, message = FALSE, eval = FALSE------------------------------
#  ## Get the chromosomal region in which the gene is encoded
#  ska2 <- genes(edb, filter = ~ gene_name == "SKA2")
#  strand(ska2) <- "*"
#  autoplot(edb, GRangesFilter(ska2), names.expr = "gene_name")
#  

## ----AnnotationDbi, message = FALSE-------------------------------------------
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86

## List all available columns in the database.
columns(edb)

## Note that these do *not* correspond to the actual column names
## of the database that can be passed to methods like exons, genes,
## transcripts etc. These column names can be listed with the listColumns
## method.
listColumns(edb)

## List all of the supported key types.
keytypes(edb)

## Get all gene ids from the database.
gids <- keys(edb, keytype = "GENEID")
length(gids)

## Get all gene names for genes encoded on chromosome Y.
gnames <- keys(edb, keytype = "GENENAME", filter = SeqNameFilter("Y"))
head(gnames)


## ----select, message = FALSE, warning=FALSE-----------------------------------
## Use the /standard/ way to fetch data.
select(edb, keys = c("BCL2", "BCL2L11"), keytype = "GENENAME",
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))

## Use the filtering system of ensembldb
select(edb, keys = ~ gene_name %in% c("BCL2", "BCL2L11") &
		tx_biotype == "protein_coding",
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))


## ----mapIds, message = FALSE--------------------------------------------------
## Use the default method, which just returns the first value for multi mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME")

## Alternatively, specify multiVals="list" to return all mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME",
       multiVals = "list")

## And, just like before, we can use filters to map only to protein coding transcripts.
mapIds(edb, keys = list(GeneNameFilter(c("BCL2", "BCL2L11")),
                        TxBiotypeFilter("protein_coding")), column = "TXID",
       multiVals = "list")


## ----AnnotationHub-query, message = FALSE, eval = use_network-----------------
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()

## Query for all available EnsDb databases
query(ah, "EnsDb")


## ----AnnotationHub-query-2, message = FALSE, eval = use_network---------------
ahDb <- query(ah, pattern = c("Xiphophorus Maculatus", "EnsDb", 87))
## What have we got
ahDb


## ----AnnotationHub-fetch, message = FALSE, eval = FALSE-----------------------
#  ahEdb <- ahDb[[1]]
#  
#  ## retriebe all genes
#  gns <- genes(ahEdb)
#  

## ----edb-from-ensembl, message = FALSE, eval = FALSE--------------------------
#  library(ensembldb)
#  
#  ## get all human gene/transcript/exon annotations from Ensembl (75)
#  ## the resulting tables will be stored by default to the current working
#  ## directory
#  fetchTablesFromEnsembl(75, species = "human")
#  
#  ## These tables can then be processed to generate a SQLite database
#  ## containing the annotations (again, the function assumes the required
#  ## txt files to be present in the current working directory)
#  DBFile <- makeEnsemblSQLiteFromTables()
#  
#  ## and finally we can generate the package
#  makeEnsembldbPackage(ensdb = DBFile, version = "0.99.12",
#                       maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
#                       author = "J Rainer")
#  

## ----gtf-gff-edb, message = FALSE, eval = FALSE-------------------------------
#  ## Load the AnnotationHub data.
#  library(AnnotationHub)
#  ah <- AnnotationHub()
#  
#  ## Query all available files for Ensembl release 77 for
#  ## Mus musculus.
#  query(ah, c("Mus musculus", "release-77"))
#  
#  ## Get the resource for the gtf file with the gene/transcript definitions.
#  Gtf <- ah["AH28822"]
#  ## Create a EnsDb database file from this.
#  DbFile <- ensDbFromAH(Gtf)
#  ## We can either generate a database package, or directly load the data
#  edb <- EnsDb(DbFile)
#  
#  
#  ## Identify and get the TwoBit object with the genomic DNA sequence matching
#  ## the EnsDb annotation.
#  Dna <- getGenomeTwoBitFile(edb)
#  ## We next retrieve the sequence of all exons on chromosome Y.
#  exons <- exons(edb, filter = SeqNameFilter("Y"))
#  exonSeq <- getSeq(Dna, exons)
#  

## ----EnsDb-from-Y-GRanges, message = FALSE, eval = use_network----------------
## Generate a sqlite database from a GRanges object specifying
## genes encoded on chromosome Y
load(system.file("YGRanges.RData", package = "ensembldb"))
Y

## Create the EnsDb database file
DB <- ensDbFromGRanges(Y, path = tempdir(), version = 75,
		       organism = "Homo_sapiens")

## Load the database
edb <- EnsDb(DB)
edb


## ----EnsDb-from-GTF, message = FALSE, eval = FALSE----------------------------
#  library(ensembldb)
#  
#  ## the GTF file can be downloaded from
#  ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
#  gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
#  ## generate the SQLite database file
#  DB <- ensDbFromGtf(gtf = gtffile)
#  
#  ## load the DB file directly
#  EDB <- EnsDb(DB)
#  
#  ## alternatively, build the annotation package
#  ## and finally we can generate the package
#  makeEnsembldbPackage(ensdb = DB, version = "0.99.12",
#                       maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
#                       author = "J Rainer")
#  

## ----sessionInfo--------------------------------------------------------------
sessionInfo()

Try the ensembldb package in your browser

Any scripts or data that you put into this service are public.

ensembldb documentation built on Nov. 8, 2020, 4:57 p.m.