knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL )
library(magrittr) library(dplyr) library(ggtranscript) library(ggplot2) library(rtracklayer)
ggtranscript
is designed to make it easy to visualize transcript structure and annotation using ggplot2
.
As the intended users are those who work with genetic and/or transcriptomic data in R
, this tutorial assumes a basic understanding of transcript annotation and familiarity with ggplot2
.
In order to showcase the package's functionality, ggtranscript
includes example transcript annotation for the genes SOD1 and PKNOX1, as well as a set of unannotated junctions associated with SOD1. These specific genes are unimportant, chosen arbitrarily for illustration. However, it worth noting that the input data for ggtranscript
, as a ggplot2
extension, is required be a data.frame
or tibble
.
sod1_annotation %>% head() pknox1_annotation %>% head() sod1_junctions
You may be asking, what if I have a gtf
file or a GRanges
object? The below demonstrates how to wrangle a gtf
into the required format for ggtranscript
and extract the relevant annotation for a particular gene of interest.
For the purposes of the vignette, here we download a gtf
(Ensembl version 105), then load the gtf
into R
using rtracklayer::import()
.
# download ens 105 gtf into a temporary directory gtf_path <- file.path(tempdir(), "Homo_sapiens.GRCh38.105.chr.gtf.gz") download.file( paste0( "http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/", "Homo_sapiens.GRCh38.105.chr.gtf.gz" ), destfile = gtf_path ) gtf <- rtracklayer::import(gtf_path) class(gtf)
To note, the loaded gtf
is a GRanges
class object. The input data for ggtranscript
, as a ggplot2
extension, is required be a data.frame
or tibble
. We can convert a GRanges
to a data.frame
using as.data.frame
or a tibble
via dplyr::as_tibble()
. Either is fine with respect to ggtranscript
, however we prefer tibble
s over data.frame
s for several reasons.
gtf <- gtf %>% dplyr::as_tibble() class(gtf)
Now that the gtf
is a tibble
(or data.frame
object), we can dplyr::filter()
rows and dplyr::select()
columns to keep the annotation columns required for any specific gene of interest. Here, we illustrate how you would obtain the annotation for the gene SOD1, ready for plotting with ggtranscript
.
# filter your gtf for the gene of interest, here "SOD1" gene_of_interest <- "SOD1" sod1_annotation_from_gtf <- gtf %>% dplyr::filter( !is.na(gene_name), gene_name == gene_of_interest ) # extract the required annotation columns sod1_annotation_from_gtf <- sod1_annotation_from_gtf %>% dplyr::select( seqnames, start, end, strand, type, gene_name, transcript_name, transcript_biotype ) sod1_annotation_from_gtf %>% head()
If users would like to plot ranges from a bed
file using ggtranscript
, they can first import the bed
file into R
using rtracklayer::import.bed()
. This method will create a UCSCData
object.
# for the example, we'll use the test bed file provided by rtracklayer test_bed <- system.file("tests/test.bed", package = "rtracklayer") bed <- rtracklayer::import.bed(test_bed) class(bed)
A UCSCData
object can be coerced into a tibble
, a data structure which can be plotted using ggplot2
/ggtranscript
, using dplyr::as_tibble()
.
bed <- bed %>% dplyr::as_tibble() class(bed) bed %>% head()
ggtranscript
introduces 5 new geoms designed to simplify the visualization of transcript structure and annotation; geom_range()
, geom_half_range()
, geom_intron()
, geom_junction()
and geom_junction_label_repel()
. The required aesthetics (aes()
) for these geoms are designed to match the data formats which are widely used in genetic and transcriptomic analyses:
dplyr::tribble( ~`Required aes()`, ~Type, ~Description, ~`Required by`, #-------|-----|----|-------- "xstart", "integer", "Start position in base pairs", "All geoms", "xend", "integer", "End position in base pairs", "All geoms", "y", "charactor or factor", "The group used for the y axis, most often a transcript id or name ", "All geoms", "label", "integer or charactor", "Variable used to label junction curves", "Only geom_junction_label_repel()", ) %>% knitr::kable()
In the simplest case, the core components of transcript structure are exons and introns, which can be plotted using geom_range()
and geom_intron()
. In order to facilitate this, ggtranscript
also provides to_intron()
, which converts exon co-ordinates into introns. Therefore, you can plot transcript structures with only exons as input and just a few lines of code.
📌: As
ggtranscript
geoms share required aesthetics, it is recommended to set these viaggplot()
, rather than in the individualgeom_*()
calls to avoid code duplication.
# to illustrate the package's functionality # ggtranscript includes example transcript annotation sod1_annotation %>% head() # extract exons sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon") sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) )
As suggested by it's name, geom_range()
is designed to visualize range-based transcript annotation. This includes but is not limited to exons. For instance, for protein coding transcripts it can be useful to visually distinguish the coding sequence (CDS) of a transcript from it's UTRs. This can be achieved by adjusting the height and fill of geom_range()
and overlaying the CDS on top of the exons (including UTRs).
# filter for only exons from protein coding transcripts sod1_exons_prot_cod <- sod1_exons %>% dplyr::filter(transcript_biotype == "protein_coding") # obtain cds sod1_cds <- sod1_annotation %>% dplyr::filter(type == "CDS") sod1_exons_prot_cod %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_cds ) + geom_intron( data = to_intron(sod1_exons_prot_cod, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 500, )
geom_junction()
plots curved lines that are intended to represent junction reads. Junctions are reads obtained through RNA-sequencing (RNA-seq) data that map with gapped alignment to the genome. Often, this gap is indicative of a splicing event, but can also originate from other genomic events such as indels.
It can be useful to visually overlay junctions on top of an existing transcript structure. For example, this can help to understand which existing transcripts are expressed in the RNA-seq sample or inform the location or interpretation of the novel splice sites.
geom_junction_label_repel()
adds labels to junction curves. This can useful for labeling junctions with a measure of their expression or support such as read counts or percent-spliced-in. Alternatively, you may choose to visually map this measure to the thickness of the junction curves by adjusting the the size aes()
. Or, as shown below, both of these options can be combined.
# extract exons and cds for the MANE-select transcript sod1_201_exons <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201") sod1_201_cds <- sod1_cds %>% dplyr::filter(transcript_name == "SOD1-201") # add transcript name column to junctions for plotting sod1_junctions <- sod1_junctions %>% dplyr::mutate(transcript_name = "SOD1-201") sod1_201_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( fill = "white", height = 0.25 ) + geom_range( data = sod1_201_cds ) + geom_intron( data = to_intron(sod1_201_exons, "transcript_name") ) + geom_junction( data = sod1_junctions, aes(size = mean_count), junction.y.max = 0.5 ) + geom_junction_label_repel( data = sod1_junctions, aes(label = round(mean_count, 2)), junction.y.max = 0.5 ) + scale_size_continuous(range = c(0.1, 1))
One of the primary reasons for visualizing transcript structures is to better observe the differences between them. Often this can be achieved by simply plotting the exons and introns as shown in basic usage. However, for longer, complex transcripts this may not be as straight forward.
For example, the transcripts of PKNOX1 have relatively long introns, which makes the comparison between transcript structures (especially small differences in exons) more difficult.
📌: For relatively short introns, strand arrows may overlap exons. In such cases, the
arrow.min.intron.length
parameter ofgeom_intron()
can be used to set the minimum intron length for a strand arrow to be plotted.
# extract exons pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(pknox1_exons, "transcript_name"), aes(strand = strand), arrow.min.intron.length = 3500 )
shorten_gaps()
ggtranscript
provides the helper function shorten_gaps()
, which reduces the size of the gaps (regions that do not overlap an exon). shorten_gaps()
then rescales the exon and intron co-ordinates, preserving the original exon alignment. This allows you to hone in the differences of interest, such as the exonic structure.
📌: The rescaled co-ordinates returned by
shorten_gaps()
will not match the original genomic positions. Therefore, it is recommended thatshorten_gaps()
is used for visualizations purposes only.
# extract exons pknox1_exons <- pknox1_annotation %>% dplyr::filter(type == "exon") pknox1_rescaled <- shorten_gaps( exons = pknox1_exons, introns = to_intron(pknox1_exons, "transcript_name"), group_var = "transcript_name" ) # shorten_gaps() returns exons and introns all in one data.frame() # let's split these for plotting pknox1_rescaled_exons <- pknox1_rescaled %>% dplyr::filter(type == "exon") pknox1_rescaled_introns <- pknox1_rescaled %>% dplyr::filter(type == "intron") pknox1_rescaled_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = pknox1_rescaled_introns, aes(strand = strand), arrow.min.intron.length = 300 )
geom_half_range()
If you are interested in the differences between two transcripts, you can use geom_half_range()
whilst adjusting range.orientation
to plot the exons from each on the opposite sides of the transcript structure. This can reveal small differences in exon structure, such as those observed here at the 5' ends of PKNOX1-201 and PKNOX1-203.
# extract the two transcripts to be compared pknox1_rescaled_201_exons <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name == "PKNOX1-201") pknox1_rescaled_203_exons <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name == "PKNOX1-203") pknox1_rescaled_201_exons %>% ggplot(aes( xstart = start, xend = end, y = "PKNOX1-201/203" )) + geom_half_range() + geom_intron( data = to_intron(pknox1_rescaled_201_exons, "transcript_name"), arrow.min.intron.length = 300 ) + geom_half_range( data = pknox1_rescaled_203_exons, range.orientation = "top", fill = "purple" ) + geom_intron( data = to_intron(pknox1_rescaled_203_exons, "transcript_name"), arrow.min.intron.length = 300 )
to_diff()
Sometimes, it can be useful to visualize the differences of several transcripts with respect to one transcript. For example, you may be interested in how other transcripts differ in structure to the MANE-select transcript. This exploration can reveal whether certain important regions are missing or novel regions are added, hinting at differences in transcript function.
to_diff()
is a helper function designed for this situation. Here, we apply this to PKNOX1, finding the differences between all other transcripts and the MANE-select transcript (PKNOX1-201).
📌: Although below, we apply
to_diff()
to the rescaled exons and intron (outputted byshorten_gaps()
),to_diff()
can also be applied to the original, unscaled transcripts with the same effect.
mane <- pknox1_rescaled_201_exons not_mane <- pknox1_rescaled_exons %>% dplyr::filter(transcript_name != "PKNOX1-201") pknox1_rescaled_diffs <- to_diff( exons = not_mane, ref_exons = mane, group_var = "transcript_name" ) pknox1_rescaled_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range() + geom_intron( data = pknox1_rescaled_introns, arrow.min.intron.length = 300 ) + geom_range( data = pknox1_rescaled_diffs, aes(fill = diff_type), alpha = 0.2 )
ggplot2
functionalityAs a ggplot2
extension, ggtranscript
inherits ggplot2
's familiarity and flexibility, enabling users to intuitively adjust aesthetics, parameters, scales etc as well as complement ggtranscript
geoms with existing ggplot2
geoms to create informative, publication-ready plots.
Below is a list outlining some examples of complementing ggtranscript
with existing ggplot2
functionality that we have found useful:
add_exon_number()
and geom_text()
base_sod1_plot <- sod1_exons %>% ggplot(aes( xstart = start, xend = end, y = transcript_name )) + geom_range( aes(fill = transcript_biotype) ) + geom_intron( data = to_intron(sod1_exons, "transcript_name"), aes(strand = strand) ) base_sod1_plot + geom_text( data = add_exon_number(sod1_exons, "transcript_name"), aes( x = (start + end) / 2, # plot label at midpoint of exon label = exon_number ), size = 3.5, nudge_y = 0.4 )
coord_cartesian()
or ggforce::facet_zoom()
base_sod1_plot + coord_cartesian(xlim = c(31665500, 31669000))
geom_vline()
example_mutation <- dplyr::tibble( transcript_name = "SOD1-204", position = 31661600 ) # xstart and xend are set here to override the default aes() base_sod1_plot + geom_vline( data = example_mutation, aes( xintercept = position, xstart = NULL, xend = NULL ), linetype = 2, colour = "red" )
base_sod1_plot + theme_bw() + scale_x_continuous(name = "Position") + scale_y_discrete(name = "Transcript name") + scale_fill_discrete( name = "Transcript biotype", labels = c("Processed transcript", "Protein-coding") )
Show/hide
library("sessioninfo") options(width = 120) session_info()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.