suppressPackageStartupMessages({ library(learnr) # 0.10.1.9006 (github) library(gradethis) # 0.1.0.9004 (github) library(testthat) # 3.0.0 library(tidyverse) # 1.3.0 library(learnr.proto) library(IRanges) library(GenomicRanges) library(plyranges) library(rtracklayer) library(VennDiagram) library(GenomicFeatures) library(ComplexHeatmap) library(genomation) library(ChIPpeakAnno) library(ggthemes) library(org.Hs.eg.db) # load the txdb package which holds transcript-based gene models of hg38 genome library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene seqlevels(txdb) <- "chr19" promoters_chr19 <- unique(promoters(genes(txdb), upstream=2000, downstream=200)) rm(txdb) ## installing additional packages by: # BiocManager::install('genomation') # BiocManager::install("ggthemes") # BiocManager::install("ComplexHeatmap") # for upset plot with GRanges # BiocManager::install("org.Hs.eg.db") # configuration options("tutorial.storage"="local") # save progress in ~/.local/share/R/ see https://bit.ly/3oNP3kF knitr::opts_chunk$set(echo=FALSE, message=F) gradethis::gradethis_setup() # data accessible to all exercises rdata = system.file("extdata", "week2.Rdata", package = "learnr.proto") load(rdata) #monocytes_list <- GRangesList(monocytes_list, compress=F) # go from compressed to simpleGRangesList # gene quantification chromosome 19 rdsfile <- system.file("extdata", "week2", "prepared_rds", "blueprint_c000s5_gene_quantification_chr19.rds", package = "learnr.proto") quantification_chr19 <- readRDS(rdsfile) })
# stop the app when the browser is closed (or refreshed*) # *there is **no way** to distinguish between refresh and browser (tab) closing # this is required as closing the browser window prevents the timeout below from working. session$onSessionEnded(stopApp)
# The timeout chunks stop the tutorial if it has been idle for too long # Their purpose is to keep the server running smoothly # Method: # This chunk (timeout1) sends a unique identifier "session_id" to timeout2. # When the session has been running idle for longer than "timeoutSec" seconds, # timeout2 will update shiny variable "input[[session_id]]" . # This signal is then be used in timeout1 to stop the session. # obtain the session's ID and send it to the javascript chunk isolate({ session_id <- sub('<environment: (.*)>', '\\1', capture.output(session$userData)) session$sendCustomMessage("session_id", session_id) }) # stop the tutorial when "input[[session_id]]" is updated observeEvent(input[[session_id]], ignoreNULL=T, { write(paste0("\nTutorial terminated due to inactivity.\nRestart to continue where you left off!\n"), stderr()) stopApp() }) # source 1: https://shiny.rstudio.com/reference/shiny/latest/session.html # source 2: https://stackoverflow.com/questions/18900955/get-environment-identifier-in-r
```{js timeout2} $(function() { <<<<<<< HEAD var timeoutSec = 2560; ======= var timeoutSec = 1560; // <-- change as desired
d77f4dcca65a462c074531435811701900c445a9 var idleTimer;
// receive this session's ID Shiny.addCustomMessageHandler("session_id", function(s_id) { session_id = s_id; // assigns the variable globally });
// update "input[[session_id]]" when called function onTimeout() { alert("Tutorial stopped due to inactivity.\nRestart to continue where you left off!") Shiny.setInputValue(session_id, "TRUE"); }
function startIdleTimer() { if (idleTimer) clearTimeout(idleTimer); idleTimer = setTimeout(onTimeout, timeoutSec * 1000); }
// (re)set timeout upon user input $(document).on('shiny:message shiny:inputchanged', startIdleTimer);
})();
// source 1: https://community.rstudio.com/t/keeping-track-of-idle-time-during-app-usage/1735 // source 2: https://bookdown.org/yihui/rmarkdown/language-engines.html#javascript-and-css
```{js open_links_in_new_tab} // open all links starting with http(s) in a new tab $(function() { var links = document.getElementsByTagName('a'); for (var i = 0; i < links.length; i++) { if (/^(https?:)?\/\//.test(links[i].getAttribute('href'))) { links[i].target = '_blank'; } } })(); // source: https://yihui.org/en/2018/09/target-blank/
# log the first time a user starts this tutorial. tutorial = "fg2" log_path = "/scratch/fg_log" # make this dir and set 777 permissions! if (file.access(log_path, 7)[[1]] == 0 ){ date = Sys.Date() user = basename(path.expand("~")) log_name = paste0(tutorial, "_", user, "_", date) log_file = file.path(log_path, log_name) logs = list.files(log_path) # all currently existing logs substring = paste0(tutorial, "_", user) # date is unimportant log_exists = any(lapply(logs, startsWith, substring) == TRUE) if (!log_exists){ invisible(file.create(log_file)) system(paste0("chmod 777 ", log_file)) } }
Please fill in your name and student number below and hit "Submit Answer" (the button will then disappear). At the end of the tutorial, print your answers and upload the PDF to Brightspace.
question_text( "Enter your name & student number", placeholder = "name s123456", answer("Little nosey, aren't we?", correct=TRUE), incorrect=NULL, # hide the red button allow_retry = FALSE # prevent cheating ) # alternative: # https://stackoverflow.com/questions/63894794/create-an-open-ended-question-with-learnrtutorial-in-r
In fg1 you studied the major steps involved in obtaining epigenomics data, from experiment to raw data and from raw data to normalized signal and peaks. You uploaded and inspected data of DNaseI-seq, RNA-seq and ChIP-seq of histone modifications of monocytes in the UCSC genome browser and you searched for regions with increased signal, so-called peaks.
By browsing through the data in the UCSC genome browser, you observed that the signal of the various epigenomics datasets have different shapes. E.g. narrow and sharp signal for H3K4me3 versus broad, relatively shallow signal covering a broad window and multiple genes for H3K9me3.
You also examined the position of these peaks in the chromosome, with respect to genes as well as the co-occurrences of different marks. Lastly, you practiced in discriminating active from silenced genes based on their associated histone mark ChIP-seq and DNase-I-seq signal
This and also next week you will computationally analyze the genome-wide trends in ChIP-seq data of histone modifications using R programming and answer common questions such as:
A tiny side note: instead of performing the analyses genome-wide, we have restricted that dataset to chromosome 19; one of the smallest chromosomes. This was done to reduce the sizes of the files that you are woking with and thereby speed up the tutorial.
At the end of week 2 you are able to:
- Import ChIP-seq peaks into a GRanges object in r.
- Perform exploratory data analysis on GRanges objects with ChIP-seq peaks using plyranges and ggplot.
- Detect and count overlap between two GRanges objects.
- Plot this overlap in a Venn diagram.
- Statistically test for enrichment of histone marks in a particular genomic region.
- Compare gene expression of genes with different histone marks at their promoters.
- Group genes based on the combination of histone mark ChIP-seq peaks overlapping their promoter.
- Using the grouping of (7), quantify the frequencty of active, silenced and bivalent promoters.
- These refer to global learning objectives #4-#7.
Answers will be saved each time you close your session and will be loaded if you restart the tutorial. Unless you hit Start over
at the bottom of the table-of-contents.
At the end of the tutorial you are instructed to print a pdf report of the tutorial (incl. your answers).
To reserve server space to active running tutorials, and stop those that aren't actively used anymore, we set a time limit to 25 minutes meaning that if your tutorial has been idle for over 25 minutes, you will see the warning "Tutorial stopped due to inactivity. Restart to continue where you left off!". When you see this warning, you should close the browser with the tutorial and restart the tutorial. As progress is saved, you can continue where you left once you restart the tutorial.
Links to reference material and used resources are formatted with r colorize("light blue font", col = "dodgerblue")
. It is advised to open these links using the "right-mouse click --> Open in a new browser tab" strategy (if you click on the links straight away, you are directed away from the tutorial).
Reading of references and background material is not required and not taken into account when estimating the time-required for this tutorial.
In this tutorial you will have multiple-choice and check-the-box questions as you had in fg1 but also complete-the-code exercises. The latter are sometimes evaluated on the printed object or summary (e.g a the top part of an object). At other times, the code itself is checked. When the code is evaluated we added "(code check)" to the question. In these cases, R will give custom feedback that helps you correct your code.
Due to technical reasons, when you hit Submit Answer of "(code check)" exercises, the output (if there is any) will not be printed. Eg a plot won't be shown. To visualize the plot, you must you use Run Code
.
For short lines of code, you have to decide which function or object to use and how to write this in code. For some longer lines of codes, we left blanks as '___' that indicate where in the code you should fill in a function, object, variable or parameter. Make sure you fill out all the blanks before Answer submission, they will otherwise result in errors.
Hit Hint
(when provided) for clues. Due to technical reasons you can't go back to previous hints, so make sure you read them well.
Hit Run Code
to test your code and preview the output (but make sure no '_' are left).
Hit Submit Answer
(if present) to submit the code for evaluation (make sure no '_' are left).
Try to keep yourself from hitting Solution
(when provided). Only use it when you do not understand the automatic feedback given by R.
Hit Start Over
in the header of the exercise to remove any adjustments you made and start again.
Although we ask for specific answers or completion of specific code, you are free to test your own code and use Run Code
and Start Over
. I.e., remove the pre-coded code and write your own code for a different graph or summary of the data. Use Run Code
to preview the results. Unfortunately these newly created code can not be evaluated. For the evaluation you need to use our pre-coded code. Hit Start Over
to get this pre-coded code.
Example exercise: Print the head of object
monocytes_h3k4me3
(code check).
# print the head of `monocytes_h3k4me3`
# You may want to use the function head()
# warning: the next hint is the solution!
head(monocytes_h3k4me3)
grade_code()
You are allocated to Breakout rooms in Zoom (or you can select one yourself). Either way, use the Breakout rooms to discuss questions and difficulties with your peers. If you still have question, use the "raise hand" option to notify the host of the session.
Now, let's get started!
In epigenomic data analyses you often work with peaks, genes, exons, motif locations and other genomic regions reported with the genomic coordinates: chromosome, start, and end. These kind of data are referred to as genomic interval data, also called genomic windows or regions.
As we discussed in fg1, peak files generally come in the tabular BED format (for Browser Extensible Data) with the basic information about the location of the peak and additional columns that contain for example, the name of the peak, a score given by the peak caller and the summit of the peak.
In practice, peak files come in two flavors, both cohering to the BED format but with small differences in columns 4 till 9. These are called narrowPeak and broadPeak files. "Why should peak files come in two flavors?"
This is because the factors and histone modifications on which ChIPs can be performed, show different enrichment shapes. In other words: some show broad domains of relatively low enrichment, think of H3K9me3; others have more sharp and narrow peaks that clearly have a peak summit, think of transcription factors and H3K4me3.
Peak calling software can take this into account resulting in slightly different peak calling settings and two different output formats with the major difference in the includes of a peak summit location for narrow-shaped peaks. If you have a narrowPeak file, this means that during peak calling, settings were used that fit ChIP-seq datasets with sharp, narrow enrichment signals. For broadPeak files, peak calling settings were used to detect broad domains of (overall lower) enrichment.
Discussing the differences between narrowPeak and broadPeak files is one thing, and not very significant. What is important is that you are aware of the different enrichment shapes that ChIP-seq datasets of histone modifications can have. We will come back to these differences in section 2.3.
Background info: File formats
What do BED look like? (click here to expand)
Browser Extensible Data - BED format:
- Used for peaks, motif locations or other custom intervals.
- Has 3 required columns: chromosome, start and end position.
- 6 or 7 optional columns, in case of peaks:
4th: peak name
5th: peak score (-10log(q-value) * 10, rounded down to integer value))
6th: strand to denote orientation (if applicable, otherwise "*" or "." if unstranded)
7th: signalValue
8th: p-value to denote statistical significance, given as -log10(p-value)
9th: q-value statistical significance using false discoveray rate, given as -log10(q-value)
10th (only for sharp peaks not broad domains): location of peak summit relative to the "start" coordinate.Example:
{width=80%}
Further reading on the file formats on the UCSC Genome Browser FAQ page: BED, and the narrowPeak BED and broadPeak BED.
Extra reading: Transcript and gene annotations come in GFF format.
What do GFF files look like? (click here to expand)
General Feature Format - GFF format:
- Common file format for storing gene annotations not only including genes but also transcripts/splice variants, cDNA sequences, exons, rRNA, ncRNA, etc.
- Begins with meta-data in headerlines, starting with #
- Records reported in 9 fixed columns
- Column 9 can contains various attirbutes (eg gene symbol, the transcript to which the exon belongs).
- Downloaded for example from Ensembl.
Example:
{width=80%}
Further reading on the gff file formats: UCSC Genome Browser, or ENSEMBL.
Peak data can be imported into a data.frame with:
object_name <- read.table("location/of/peak_file.bed")
Refresher:
Follow this link for a recap on data.frames: What are data.frames? by Data Carpentry.
The data.frame data structure is, however, not the most efficient way to work with interval data. E.g. a simple manipulation such as shifting all reported intervals 2 bp to the right, requires you to manipulate the "start" and "end" columns at the same time. And do not even start thinking about calcualting overlap among two or more data.rames with interval data.
Interval data such as genomic peaks can be more efficiently handled with the IRanges
package which works with a data structure especially developed for ranges of integers: IRanges objects.
To construct an IRanges object you need to define at least 2 of the following 3 values:
The GRanges objects of the GenomicRanges
package are very similar but require a additional sequence name (in other words a chromosome) for every interval and an optional strand column.
We installed and loaded the packages IRanges
and GenomicsRanges
for you with:
BiocManager::install(c("IRanges", "GenomicsRanges")) library(IRanges) library(GenomicRanges)
And here is an example where we create a GRanges object called gr
with 4 intervals on chr1:
## create `gr` # the custom variable "color" will be used for plotting the ranges. gr_chr1 <- GRanges(seqnames = c("chr1"), # argument is re-used for each interval ranges = IRanges(start=c(1,3,8,15), width = c(3,10,6,2)), color = c("black", "red", "green", "yellow")) # add names to `gr_chr1` names(gr_chr1) <- paste("interval", LETTERS[1:4], sep = "_") # print the object: gr_chr1
Observe the 3 main columns to the left of the "|\"-sign that are always present in a GRanges object: "seqnames", "ranges" and "strand". The start and end coordinates are summarized in the "ranges" column. And these windows are not stranded and therefore have a "*" in the "strand" column.
When we visualize these intervals we see three 'blocks' along the horizontal axis:
plotRanges(gr_chr1, col = gr_chr1$color, xlim = c(0, 20)) #rm(gr_chr1)
Notice that gr_chr1
holds 3 slots:
We will come back to these slots shortly.
There are several functions to retrieve and set the values in GRanges objects. Here is an image of a GRanges object called gr
with the respective functions. You will use many of these function in this tutorial, it may therefore be handy to save this image in your Downloads ("right-mouse-click --> save image as").
my_gr <- GRanges(seqnames = c("chr1", "chr2", "chr3", "chr4", "chr5"), ranges = IRanges(start=c(3, 5, 13, 18, 20), width = c(5, 7, 9, 11,4)), strand = c("-", "-", "-","+", "+"), score = round(runif(5, 0, 106), digits = 3), GC = round(runif(5, 0.35, 0.60), digits = 3)) # add rownames: names(my_gr) <- paste("peak", LETTERS[1:5], sep = "_") my_gr # made screenshot of the object and added boxes to mark the different slots with functions in illustrator. Save and upload to package as png
{width=100%}
To add a column to the metadata you can use:
granges_object$new_variable <- new_variable_calculation
A list of the above functions (and more) for GRanges objects can also be obtained by running methods(class = "GenomicRanges")
.
You can print the top and bottom 5 lines of the GRanges object with show(object_name)
and print(object_name)
(this works for many other R objects as well).
As we mentioned, the object my_gr
in the image above has 3 slots:
The main differences between the genomic coordinates and the metadata are:
Genomic coordinates | Metadata columns
:--|:---
Printed on the left-hand side of the \|-sign | Printed on the right-hand side of the \|-sign
Extract using granges(object_name)
| Extract as DataFrame with mcols(object_name)
or object_name$column_name
for a specific column
Restricted to variables seqnames
, ranges
and strand
| Almost anything can be stored in the metadata
Information about the genome is stored in the seqinfo part of the object. In the seqinfo slot of my_gr
you can read that the peaks are located in human genome 38 which holds 455 sequences in total of which 1 circular (mitochondrial genome).
Side note: 455 sequences is more than the number of chromosomes in a human genome. This is because sequences that can not be confidently placed on a specific chromosome is stored in chrUn, chrN_random and chrN_region explanation on UCSC GB website.
To parse peak files directly into a GRanges object you use the import()
function from the rtracklayer
package. We installed and loaded this package with:
BiocManager::install("rtracklayer") library(rtracklayer)
Peakfiles in .bed format can be imported with:
# import monocyte H3K4me3 peak locations monocytes_h3k4me3 <- import("h3k4me3_peaks.chr19.bed", format = "narrowPeak") # (you could also use rtracklayer::import("h3k4me3_peaks.chr19.bed", format = "narrowPeak") and skip loading the library)
As mentioned in [2.2.1 ChIP-seq peaks can be narrow or broad], different histone modifications show different peak shapes in ChIP-seq resulting in "narrowPeak" and "broadPeak" files. H3K4me3 ChIP-seq generally shows sharp(er) enrichment that can reach high enrichment values and is stored as "narrowPeak" file. For import()
to function properly, we therefore defined this format in the format =
parameter.
H3K9me3 covers broad(er) domains and peak calling took that into account, resulting in a "broadPeak" peak file.
question("Which function imports the H3K9me3 dataset into a GRanges object?", answer("import(\"h3k9me3_peaks.chr19.bed\", format = \"narrowPeak\")", message = "check the file format."), answer("import(\"h3k9me3_peaks.chr19.bed\", format = \"BED\")", message = "BED is not the right format."), answer("import(\"h3k9me3_peaks.chr19.bed\", format = \"broadPeak\")", correct =T), answer("read.table(\"h3k49me3_peaks.chr19.bed\", format = \"broadPeak\")", message = "read.table parses the data into a data.frame while we asked for a GRanges object."), allow_retry=TRUE, random_answer_order=TRUE )
Note that, to reduce the sizes of the files, we restricted them to chromosome 19.
Let's have a quick look at monocytes_h3k4me3
.
Exercise 1a: Print the head of
monocytes_h3k4me3
.
# print the head of monocytes_h3k4me3
# You may want to use the function: head()
# As in: head(monocytes_h3k4me3)
grade_result( pass_if(~identical(.result, head(monocytes_h3k4me3))) )
quiz(caption = "", question("How many metadata columns does the 'monocytes_h3k4me3' object have?", answer("2"), answer("3"), answer("6", correct=T, message = "The original .BED files lacked column headers. The `rtracklayer::import()` function appended variable names and converted the data to the correct data type (eg numeric, integer, character etc.) according to the 'narrowPeak' format we defined."), answer("9"), allow_retry=TRUE), question("How many different chromosomes are reported in this file?", answer("1", correct=T, message = "We have restricted all peak files to chromosome 19 to limit their file size. That is why 'seqinfo' holds only one sequence"), answer("24", message = "Look in seqinfo part of the object."), answer("455", message = "Look in seqinfo part of the object."), allow_retry=TRUE) )
monocytes_h3k4me3
Column | Section in GRanges object | Variable | Description
:--|:---|:---|:------
1. | GRanges | seqnames | chromosome on which the peak is located.
2. | GRanges | ranges | start and end positions of peak.
3. | GRanges | strand | if applicable, orientation of the peak, otherwise *.
4. | Metadata | name | peak name, given by the peak caller.
5. | Metadata | score | -10log(qvalue) * 10, rounded down to integer value.
6. | Metadata | signalValue | enrichment signal at the peak summit.
7. | Metadata | pValue | -log10(p-value), (e.g.if p-value = 1e-10, this value is 10), roughly put, the significance of the enrichment.
8. | Metadata | qValue | -log10(qvalue), the false discovery rate determined by swapping test and control.
9. | Metadata | peak | location of peak summit relative to the peak start.
A common first step in genomics data analyses is to look at some basic characteristics of the data by making exploratory summaries and plots. You will look at peak counts, the peak widths and the total number of bps covered by the different marks in this section. In a normal analysis, these plots could function as an additional quality control (QC) or provide new insights, depending on the system that you are working with (think of a newly discovered histone mark or TF).
In this case, you will visualize the differences between broad and narrow marks (and at the same time get used to coding in this R environment and working with GRanges objects, and refresh your understanding of dplyr and ggplot2).
Exercise 2a: Determine the number of H3K4me3 and H3K9me3 peaks in monocytes.
- The peak data are stored in the R objects
monocytes_h3k4me3
andmonocytes_h3k9me3
.
# How many peaks are stored in "monocytes_h3k4me3"?
monocytes_h3k4me3
# Look at the handy functions in section 2.2.4
# You may want to use the function length()
grade_result( pass_if(~identical(.result, length(monocytes_h3k4me3))) )
# And how many H3K9me3 peaks do you have?
# look at the command you used above
grade_result( pass_if(~identical(.result, length(monocytes_h3k9me3))) )
Exercise 2b: And what is the distribution of peak sizes for these marks?
summary()
gives you the summary statistics of a variable or object.
# What is the distribution of H3K4me3 peak sizes? summary(___(monocytes_h3k4me3))
# Review the handy functions in section 2.2.4
# You may want to use width()
# Ie summary(width(___))
grade_result( pass_if(~identical(.result, summary(width(monocytes_h3k4me3)))) )
# And what is the distribution of H3K9me3 peak sizes?
monocytes_h3k9me3
grade_result( pass_if(~identical(.result, summary(width(monocytes_h3k9me3)))) )
Notice the larger median and maximum peak size for H3K9me3 peaks compared to H3K4me3 peaks. This is related to H3K4me3 having having narrow peaks of enrichment (enrichment restricted to a relatively small region with high scores) and H3K9me3 broad enrichment profiles (enrichment covering a broad region).
In summary, marks with narrow and with broad peaks
Click here to view the summary...
Histone modifications with narrow peak shape in ChIP-seq | Histone modifications with broad peak shape in ChIP-seq
:-- |:--
H3K27ac | H3K27me3
H3K4me3 | H3K36me3
H3K9/14ac | H3K9me3
H2A.Zac | H3K4me1
You will use the R packages ggplot2
and plyranges
to explore the ChIP-seq data further. You have used ggplot2 in the year 1 course Genomics and Big Data. plyranges
is a relative of the dplyr
-package that allows structured manipulation of data. Use the refresher-boxes and package cheatsheets (tip: use google) when you need a quick reminder of the basics and how-to-use the package-functions.
We installed and loaded these packages for you by running:
BiocManager::install("tidyverse") # tidyverse contains ggplot2 and several other handy packages BiocManager::install("plyranges") library(tidyverse) library(plyranges)
Refresher: ggplot2
Click here for a very brief refresher on ggplot2-ting...
Adapted from the ggplot2 cheatsheet.
ggplot2 is based on the grammar of graphics: the idea that you can build every graph from the same components: a dataset, a coordinate system and geoms. Geoms are visual marks that represent data points.
To display values, map variables in the dataset to visual properties of the geom (aesthetics) like size, color, and x and y locations.
To build a ggplot graph you can use this template:
{width=50%}
In ggplot2 you begin with theggplot(data = ..)
function and add layers withgeom()
functions. Example geom functions include:
geom_histogram()
for a histogram,geom_density()
for a smoothened version of the histogram,geom_bar()
for a barplot,geom_boxplot()
for a boxplot,geom_point()
for a scatter plot, andgeom_smooth()
for a smoothing or trend line.
The geom functions take amapping = aes(x = ..., y = ..., ...)
argument that defines how data from variables in the dataset are mapped to visual properties (aes for "aesthetic mapping"). Depending on the geom function, you can map variables to the x- and y-axis, to the shape of symbols, the line type of line graphs, to the fill color of symbols and more.
Each plot has a coordinate system that determine the scales and orientation of the x and/or y-axis. The default coordinate system is given by thecoord_cartesian()
funtion. Adding this to a default ggplot won't change your plot. What is helpful is that withincoord_cartesian()
you can set thexlim=
andylim=
arguments. Other coordinate systems can be found on the ggplot2 cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/data-visualization-2.1.pdf).
Scales control the mapping from data to aesthetics and can be customized withscale_[]_[]
functions. These can be used to, for example, change a color palette, set limits to the values included in the mapping, or log transform an axis.
Theme elements include functions to specifify non-data elements of your graph. You can also provide function that contain complete themes, eg intheme_bw()
,theme_grey()
and many more.
Finally, faceting generates small multiples of the graph, each showing a different subset of the data grouped by a categorical variable.
Additional reference:
R for Data Science, chapter 3 Data visualization
ggplot2: elegant graphics for data analysis
In this exercise you will make a barplot showing the number of peaks per ChIP experiment. All ChIP-seq peaks are concatenated in the GRanges object monocytes_all
. To generate the barplot, you need to:
a. Identify the variable that you can use to count the number of peaks per ChIP in the ggplot2 barplot function.
b. As GRanges objects are not compatible with ggplot, you need to transform the metadata to a data.frame.
c. Identify the geom_
function that will give you a barplot.
Exercise 3a: Print the top and bottom rows of
mcols(monocytes_all)
usingshow()
.
- This will show the top rows in the metadata of
monocytes_all
.- Make sure you match every operning "(" with a closing ")".
# make one large granges object monocytes_all <- unlist(monocytes_list) monocytes_all$chip <- names(monocytes_all)
# print the head of object 'mcols(monocytes_all)'
# Use mcols() within head()
# ie: head(mcols(___))
grade_result( pass_if(~identical(.result, head(mcols(monocytes_all)))), correct = "Identify the variable (column in metadata) that you can use for counting the number of peaks per chip (this will be the statistic used in the barplot)." )
question("Which variable will you use for counting the number of peaks per ChIP-seq dataset?", answer("dataset"), answer("type"), answer("peak"), answer("chip", correct =T), allow_retry=T)
Secondly, you have to extract the metadata and write it into a data.frame as GRanges objects are not compatible with ggplot2.
Exercise 3b: Complete the code below to extract the metadata from the object
monocytes_all
(code check).
- The function
as.data.frame()
is used to convert the resulting 'DFrame' object into a 'data.frame'.
# extract metadata from 'monocytes_all' monocytes_metadata <- __(__) # convert to a data.frame monocytes_metadata <- as.data.frame(monocytes_metadata)
# extract metadata monocytes_metadata <- mcols(monocytes_all) # convert to a data.frame monocytes_metadata <- as.data.frame(monocytes_metadata)
grade_code()
Exercise 3c: Complete the code below to plot the number of peaks per ChIP in a bar chart using ggplot2 plotting (code check).
monocytes_all <- unlist(monocytes_list) monocytes_all$chip <- names(monocytes_all) monocytes_metadata <- as.data.frame(mcols(monocytes_all)) rm(monocytes_all)
# plot the number of peaks per chip ggplot(data = __)+ __(mapping=aes(y=__) )+ scale_fill_colorblind()+ theme_calc()+ theme(legend.position = "none")+ ggtitle("Number of peaks per ChIP, monocytes, chr19")
# You prepared your data object in exercise 3b
# Look at the ggplot refresher or cheatsheet to identify the geom_ function that will give you a barplot.
# Which variable should be counted to obtain the number of peaks per chip (see exercise 3a) # WARNING: the next hint is the solution!
# plot the number of peaks per chip ggplot(data = monocytes_metadata)+ geom_bar(mapping=aes(y=chip))+ scale_fill_colorblind()+ theme_calc()+ theme(legend.position = "none")+ ggtitle("Number of peaks per ChIP, monocytes, chr19")
grade_code( correct = "Hit Run Code to view your plot. For which marks do you observe the most peaks? And for which the fewest number of peaks?" )
question("Mark has the most peaks on chromosome 19 and which the fewest?", answer("H3K4me1 the most, H3K9me3 the fewest."), answer("H3K27ac the most, H3K9me3 the fewest.", correct=TRUE, message = "The range of peak counts can tell you whether your experiment and peak calling performed as expected. If they show extremely high or low values, it is likely something is off with your experiment (something you should have already noted in the Genome Browser). It can also function as an extra control step to check that the complete peakfiles are imported in R (and no strange things happened to your files along the way). Or give you insights in the occurrence of a newly characterized factor or the function of a known factor in a new system. In this case, the number of peaks is correct and you can continue with the analysis."), answer("H3K4me3 the most, H3K36me3 the fewest."), allow_retry=TRUE, random_answer_order = TRUE)
In this section you will visualize the fraction that each ChIP-seq peak set covers in total on chromosome 19.
To generate this plot, you have to calculate the total number of bps covered by each peak dataset. This can be achieved by summing up the peaksizes obtained with the width()
function.
GRanges objects follow the tidy data principle: each row of a Ranges object corresponds to an interval, and each column will represent a variable about that interval, and generally each row will represent a single unit of an observation. You can use dplyr-like functions from the plyranges
package to manipulate these objects and use the pipe operator %>%
to combine functions in a workflow.
Refresher: dplyr & plyranges
Click here for common dplyr-functions and examples of their plyranges-relatives on GRanges objects
Function | Purpose | Example on GRanges
:--|:----|:----
select()
| subset variables (=columns) |select(my_gr, GC)
group_by()
| group data into rows with the same value for the specified variable. |my_gr %>% group_by(strand)
.
filter()
| subset observations (= rows) |filter(my_gr, GC < 0.4)
ormy_gr %>% group_by(strand) %>% filter(GC == max(GC))
.
summarise()
| Summarise variables, often per group |group_by(my_gr, strand) %>% summarise(n = n(), gc = max(GC))
mutate()
| Add a new varialbe |See also the dplyr cheatsheet
Exercise 4a: Add a variable called 'peak_size' to
monocytes_metadata
. This new variable holds the peak widths (code check).
monocytes_all <- unlist(monocytes_list) monocytes_all$chip <- names(monocytes_all) monocytes_metadata <- as.data.frame(mcols(monocytes_all))
# add variable 'peak_size' to monocytes_metadata with the width of the peaks from monocytes_all
#Remember, to add new columns to GRanges objects you can use:
`granges_object$new_variable <- variable_calculation`
# to obtain peak sizes, use the function width() # WARNING: the next hint is the solution!
monocytes_metadata$peak_size <- width(monocytes_all)
grade_code()
Exercise 4b: Finish the code below using
group_by()
andsummarise()
to calculate the fraction of bps covered by each ChIP-seq peak dataset (code check).To do so ..
- First, group the 'monoyctes_all' dataset per ChIP.
- Then, calculate the total number of bps covered by each peak dataset as 'total_bps'. Use the 'peak_size' variable you defined in exercise 4a.
- Laslty, calculate what fraction this is of chromosome 19 (=total 58617616 bp).
monocytes_all <- unlist(monocytes_list) monocytes_all$chip <- names(monocytes_all) monocytes_metadata <- as.data.frame(mcols(monocytes_all)) monocytes_metadata$peak_size <- width(monocytes_all) rm(monocytes_all)
# calculate the total number of bps covered by each chipseq dataset # and as a fraction of total bps in chromosome 19 monocytes_metadata %>% ___ %>% summarise(total_bps = ___, fraction_chr19 = ___/58617616)
# group per chip with group_by(chip)
# calculate the total_bps by summing up peak_size
# ie tutal_bps = sum(peak_size) # WARNING: the next hint is the solution!
monocytes_metadata %>% group_by(chip) %>% summarise(total_bps = sum(peak_size), fraction_chr19=total_bps/58617616)
grade_code()
Exercise 5: To plot the
fraction_chr19
as bar chart, pipe the output of exercise 4b into ggplot (code check).
- We instruct the barplot function to plot the actual values (in contrast to counting the occurrences of each value as in exercise 3) with:
stat = "identity"
.- Hit "Run Code" to plot the plot in the console.
# plot the fraction covered in a bar chart monocytes_metadata %>% group_by(chip) %>% summarise(total_bps = sum(peak_size), fraction_chr19 = sum(peak_size)/58617616) %>% ggplot(.)+ geom_bar(mapping = aes(x = __, y = __), stat = "identity")+ theme_bw()+ theme(legend.position = "none")
monocytes_metadata %>% group_by(chip) %>% summarise(total_bps = sum(peak_size), fraction_chr19 = sum(peak_size) /58617616) %>% ggplot(.)+ geom_bar(mapping = aes(x = chip, y=fraction_chr19), stat = "identity")+ theme_bw()+ theme(legend.position = "none")
grade_code()
question("Which type of peaks (broad or narrow) covers relativeley more bp in chromosome 19?", answer("narrow peak ChIP-seq peak datasets"), answer("broad domain ChIP-seq peak datasets", correct=T), allow_retry=T)
So far for comparing ChIP-seq peaks of histone marks. Let's look at the overlap between marks and genomics features.
To understand the function of a ChIPped histone mark or factor we often want to know whether it is enriched in a particular genomic element. To find out, you need to calculate a quantitative summary of its genome-wide distribution across the different elements. By comparing this distribution to the genome-wide fraction of these different elements (null hypothesis), we can test for enrichment of the mark or factor.
The genomic elements are defined in a reference file. You can use various reference files for this exercise: from transcript-oriented objects, to functional elements from a database like ENCODE (lists regulattory elements like enhancers, silencers), to a custom one like TF motif locations.
In the coming exercise, you will quantify the overlap between H3K4me3 peaks and promoters and, subsequently, between H3K4me3 peaks and several different genomic elements simultaneously.
To identify and count overlap between two sets of GRanges objects (e.g. H3K4me3 peaks and promoters) you can use the findOverlaps()
function from the GenomicRanges package:
overlap <- findOverlaps(query = [GRanges_object1], subject = [GRanges_object2])
overlap
is a Hits
object containing the indexes of the overlapping elements from the query and subject objects. queryHits(overlap_object)
and subjectHits(overlap_object)
. queryHits(overlap_object)
and subjectHits(overlap_object)
can be used to extract the corresponding peaks or regions from the objects that were used as inputs (we will guide you through an example soon). Here we use findOverlaps()
to identify the intervals that overlap between gr_chr1
and my_gr
(see [2.2.3 Introducing IRanges and GRanges]):
# did not save the object in section 2.2.4 :-( my_gr <- GRanges(seqnames = c("chr1", "chr2", "chr3", "chr4", "chr5"), ranges = IRanges(start=c(3, 5, 13, 18, 20), width = c(5, 7, 9, 11,4)), strand = c("-", "-", "-","+", "+"), score = c(12.053, 65.964, 64.583, 66.078, 91.257), GC= c(0.510, 0.352, 0.408, 0.517, 0.479)) names(my_gr) <- paste("peak", LETTERS[1:5], sep = "_")
# We place () brackets around the code to instruct R to print the output # of the calculation on the right side of the arrow, and at the same time assign # the output to the new object on the left side of the arrow. (my_first_overlap <- findOverlaps(query = gr_chr1, subject = my_gr))
In this case, the output tells you that:
gr_chr1
contained 4 intervals ("queryLength: 4"). my_gr
contained 5 intervals ("subjectLength: 5"). gr_chr1
overlap with intervals in my_chr
. These are the intervals with indexes [1] and [2] in gr_chr1
. my_gr
overlaps with intervals in gr_chr1
. This the interval with index [1] in my_gr
. We can obtain the relevant intervals from gr_chr1
and my_gr
by running:
# obtain the intervals fom gr_chr1 (query) that are part of the overlap indexes_query <- queryHits(my_first_overlap) # subset gr_chr1 with these indexes gr_chr1[indexes_query]
And from my_gr
by running:
# obtain the overlapping intervals from my_gr (subject) indexes_subject <- subjectHits(my_first_overlap) # subset my_gr with these indexes my_gr[indexes_subject]
my_gr
had 5 intervals, each on a different chromosome. So only the interval on chromosome 1 showed overlap with the intervals in gr_chr1
, of which the intervals were restricted to chromosome 1. ranges
we can see that interval_A in gr_chr1
overlaps with the first bp of peak_A in my_gr
; and interval_B in gr_chr1
overlaps with the whole of peak_A in my_gr
. Finally, if you want to know the number of unique intervals that overlap you can limit the reported indexes to only the unique ones (do not report indexes more than once) with the function unique()
, subset the original GRanges object with these unique indexes, and count the length of the subsetted GRanges object.
# obtain the overlapping intervals from my_gr (subject) indexes_subject <- subjectHits(my_first_overlap) # take unique indexes indexes_subject_unique <- unique(indexes_subject) # subset my_gr with unique indexes and count the number of intervals in the resulting object length(my_gr[indexes_subject_unique])
Side note: in the code above, instead of running unique()
on the indexes, you can also run it on the GRanges object with duplicate intervals and then count the length (or count the number of unique indexes straight away, all roads lead to Rome):
# obtain the overlapping intervals from my_gr (subject) indexes_subject <- subjectHits(my_first_overlap) # take relevant intervals, restrict to only unique intervals and count these length(unique(my_gr[indexes_subject]))
You will determine the overlap among H3K4me3 peaks and promoters. We have extracted promoter regions from the TxDb.Hsapiens.UCSC.hg38.knownGene
database package that holds the coordinates of all transcripts for known genes defined in human genome release 38 by the UCSC genome browser. Besides the transcript coordinates, it also stores the genomic coordinates of TSSs, exons, introns, UTRs and genes. Using the function promoters()
we can extract promoter regions from this database.
See the following code for how we obtained these regions exactly:
# install the txdb package BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene") # load the txdb package which holds transcript-based gene models of hg38 genome library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene seqlevels(txdb) <- "chr19" # limits the database to chromosome 19 # extract promoter coordinates, filter for promoters promoters_chr19 <- unique(promoters(genes(txdb), upstream=2000, downstream=200))
This code:
TxDb.Hsapiens.UCSC.hg38.knownGene
database package. promoters()
function. This function outputs a GRanges object with intervals around the TSS. promoters()
works on transcripts but we are often more interested in promoters of genes therefore we add the genes()
function. Exercise 6a:
How many promoter are stored inpromoters_chr19
?
# How many regions do you have are stored in `promoters_chr19`?
## You may want to use the `length()` function.
## WARNING: the next hint is the solution!
length(promoters_chr19)
grade_result( pass_if(~identical(.result, length(promoters_chr19))) )
We want to know how many of the promoters overlap with a H3K4me3 ChIP-seq peak and vice versa.
Exercise 6b:
UsefindOverlaps()
to determine the overlap betweenmonocytes_h3k4me3
(query) andpromoters_chr19
(subject).
# Find overlap between monocytes_h3k4me3 peaks and promoters overlap <- ___(query = ___, subject = ___) # print the overlap output show(___)
# Find overlap between monocytes_h3k4me3 peaks and promoters overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) # print the overlap output show(overlap)
grade_result(pass_if(~identical(.result, show(findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19)))), correct = "The Hits object `overlap` reports the indexes of `monocytes_h3k4me3` and `promoters_chr19` that overlap. If a peak or a promoter overlaps several times, each overlap will be reported in a new row.")
Exercise 6c:
Complete the code below to determine the number and fraction of promoters that overlaps with H3K4me3 peaks (Code check).
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19)
# 1. Extract the unique indexes of the promoter that overlap unique_promoters <- unique(___(overlap)) # 2. How many of these unique promoters do you have? unique_promoter_count <- ___(promoters_chr19[___]) # 3. Print the resulting number (leave this code as it is) unique_promoter_count # 4. What fraction of promoters is part of the overlap? ___/___(promoters_chr19)
## Use `subjectHits()` to extract the indexes of overlapping promoters from `overlap`
## Use `unique()` to minimize this output to unique promoter indexes
## Use `length()` to count the number of unique indexes of the query and the subject.
## WARNING: the next hint is the solution!
# Extract the unique promoter indexes unique_promoters <- unique(subjectHits(overlap)) # How many of these unique promoters do you have? unique_promoters_count <- length(promoters_chr19[unique_promoters]) # Print the resulting number unique_promoters_count # What fraction of promoters is part of the overlap? unique_promoters_count/length(promoters_chr19)
grade_code(correct = "Now hit 'Run Code' and submit the reported values in the relevant boxes below.")
question_text( "Enter the number of promoters that are part of the overlap:", answer("1723", message = "This is the total number of promoters. Find the ones part of the overlap with subjectHits() unique() and length()"), answer("1150", correct=TRUE), answer("1366", message = "This is the length of the overlap object, select the subject and reduce with unique."), answer("1087", message = "This is number of unique H3K4me3 peaks that are part of the overlap. Make sure you use subjectHits() and not queryHits() function."), allow_retry=TRUE)
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) unique_promoters <- unique(subjectHits(overlap)) unique_promoters_count <- length(unique_promoters)
# enter the fraction of promoters that are part of the overlap, round to 3 decimal places:
round(unique_promoters_count/length(promoters_chr19),3)
grade_result( pass_if(~identical(.result,round(unique_promoters_count/length(promoters_chr19),3))), correct = "Hmmm, H3K4me3 is found at 1150 of the 1723 promoters (66.7%). That looks like a strong enrichment.")
To plot the overlap among promoters and H3K4me3 peaks in a Venn diagram you will use the the draw.pairwise.venn()
function of the VennDiagram
package. We installed this package for you with:
BiocManager::install("VennDiagram") library(VennDiagram)
The draw.pairwise.venn()
function has 3 "area"-parameters:
draw.pairwise.venn(area1=[integer_value], area2=[integer_value], cross.area=[integer_value], ....)
area1=[integer]
and area2=[integer]
represent the total counts (overlapping and not-overlapping) for each object. See exercise 6a or 2a for how you can calculate these numbers. cross.area=[integer]
refers to the number of overlaps. You will first calculate the 'unique_peaks_count' in a similar fashion as you did for the promoters and then use the minimum of the 'unique_peaks_count' and 'unique_promoters_count' as the number of 'common peaks' in your venn diagram. ...
means that there are additional, optional parameters that you can define but are not needed at the moment. Exercise 6d:
How many h3k4me3 peaks are part of the overlap?
# Extract the unique peaks unique_peaks <- unique(___(overlap)) # How many of these unique peaks do you have? unique_peaks_count <- ___(___[unique_peaks]) # Print the resulting number unique_peaks_count
# Use the function queryHits()
# Extract the unique peaks unique_peaks <- unique(queryHits(overlap)) # How many of these unique peaks do you have? unique_peaks_count <- length(monocytes_h3k4me3[unique_peaks]) # Print the resulting number unique_peaks_count
grade_result( pass_if(~identical(.result, length(unique( queryHits(findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19)) )))) )
Exercise 6e:
Complete thedraw.pairwise.venn()
code below to visualize these counts in a venn diagram (code check).
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) # unique peaks unique_peaks <- unique(queryHits(overlap)) unique_peaks_count <- length(unique_peaks) # unique promoters unique_promoters <- unique(subjectHits(overlap)) unique_promoters_count <- length(unique_promoters) # common common_counts <- min(unique_promoters_count, unique_peaks_count)
# determine the number of common counts common_counts <- min(unique_promoters_count, unique_peaks_count) # call a new plotting area grid.newpage() # Plot the overlap in a venn diagram ___(area1=___, # total count for area 1 area2=___, # total count for area 2 cross.area=____, # count for the overlap. category=c("H3K4me3", "Promoters"), fill=c("red", "gray"), cat.cex=1.2)
## Use `length()` to calculate the values for area1 and area2
# For area1 you need to count the number of peaks in 'monocytes_h3k4me3'
# For area2 you need to count the number of promoters in 'promoters_chr19'
# ie area1 = length(monocytes_h3k4me3) area2 = length(promoters_chr19) # WARNING: the next hint is the solution!
# determine the number of common counts common_counts <- min(unique_promoters_count, unique_peaks_count) # call a new plotting area grid.newpage() # Plot the overlap in a venn diagram draw.pairwise.venn(area1=length(monocytes_h3k4me3), area2=length(promoters_chr19), cross.area=common_counts, category=c("H3K4me3", "Promoters"), fill=c("red", "gray"), cat.cex=1.2)
grade_code(correct = "Make sure you hit 'Run Code' to plot the venn diagram. You can ignore the text message that is printed below the plot.")
Now you have your visualization of the overlap among promoters and H3K4me3 peaks. Let's test the enrichment of H3K4me3 peaks in promoters.
Is there a significant enrichment of H3K4me3 peaks in promoters?
To answer this question, we compare the fraction of promoters with a H3K4me3 peak with the chromosome 19-wide fraction of promoters.
If H3K4me3 is not enriched at promoters, we would expect that the fraction of promoters with a H3K4me3 peak is in the same range as the fraction of promoters on chromosome 19. This is our null hypothesis of no enrichment.
As some promoters may overlap we reduced promoters_chr19
to non-overlapping intervals before we calculate the chromosome 19-wide fraction of promoters:
# obtain reduced promoter regions promoters_chr19_reduced <- reduce(promoters_chr19)
Similarly to exercise 3 we then extract the metadata from this object and write it into a data.frame:
# retrieve metadata and convert to dataframe promoters_metadata <- as.data.frame(mcols(promoters_chr19_reduced))
Exercise 7a: Finish the code below to (code check):
- Define a new variable "promoter_size" in
promoters_metadata
with the widths of the intervals inpromoters_chr19_reduced
.- Calculate the fraction of bps covered by
promoters_chr19_reduced
in chromosome 19 (chr19 =58617616 bp).
promoters_chr19_reduced <- reduce(promoters_chr19) # convert to a data.frame promoters_metadata <- as.data.frame(mcols(promoters_chr19_reduced))
# 1. define the variable 'promoter_size' promoters_metadata$promoter_size <- ___(promoters_chr19_reduced) # 2. calculate the fraction of bps covered 'promoters_chr19_reduced` promoters_metadata %>% summarise(total_bps = ___ , fraction_chr19 = ___ /58617616)
## Look back at exercise 5a for the function you need in step 1 or use the additional hints.
## you may want to use width() in step 1
## define `total_bp` as the sum of `promoter_size`
## in other words total_bps = sum(promoter_size)
# and define the fraction_chr19 as the fraction between 'total_bps' and 58617616
## in other words fraction_chr19=total_bps /58617616 ## WARNING: the next hint is the solution!
# define the variable 'promoter_size' promoters_metadata$promoter_size <- width(promoters_chr19_reduced) # calculate the fraction of bps covered 'promoters_chr19_reduced` promoters_metadata %>% summarise(total_bps = sum(promoter_size), fraction_chr19=total_bps /58617616)
grade_code(correct = "Run the code as well and submit the returend fraction in the box below." )
# Enter the fraction of promoters in chromosome 19, round of to 3 decimals
round(0.06179487,3)
grade_result( pass_if(~identical(.result,round(0.06179487,3))), correct = "Promoters make up 6% of this chromosome but appr. 66% of all promoters overlap with h3k4me3 peaks. This looks like a strong enrichment!")
You can test whether the observed fraction is indeed larger than expected with a binomial test, in r we can use the function
binom.test(x = [integer_values_for_successes_and_failures], p = [expected_success_probability], alternative = ["two.sided"/"less"/"greater"])
x
= vector with number of successes (= number of promoters with H3K4me3 peak) and number of failures (= number of promoters without H3K4me3 peak)p
= expected probability of success, in this case the fraction of promoters in chromosome 19. alternativea
= refers to the alternative hypothesis, must be one of "two.sided" (alternative can be "enriched" or "depleted"), "greater" (alternative is testing for "enrichment") or "less" (alternative is testing for "depletion"). Background: The binomial test is run when an experiment has two possible outcomes (i.e. success/failure) and you have an idea about what the probability of success is. Success in this case is overlap and our expectation is that 20% of the cases show overlap. The test calculates the probability of getting a desired outcome with a specific sample size.
Exercise 7b: Use a binomial test to test for enrichment of H3K4me3 in promoters.
- Determine the number of successes and failures (step 2, 3 and 4).
- Set
alternative = "greater"
because we test for enrichment and our alternative hypothesis is that the true probability is larger than the expected probability.
# 1. We repeat the overlap calculation from exercise 6b so that you don't have to scroll back: overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) # 2. Use the indexes from the overlap object to identify promoters with H3K4me3 peak promoters_with_h3k4me3 <- ____[unique(___(overlap))] # 3. Perform negative selection using the same indexes to find promoters without H3K4me3 peak # the - sign removes the lines with the respective indexes from the original object promoters_without_h3k4me3 <- ___[ -unique(___(overlap)) ] # 4. Define the x for your test x_test <- c(___, ___) # 5. Define the p for your test (round to 3 decimals) p_test <- __ # 6. Binomial test for enrichment of h3k4me3 peaks in promoters: binomtest_result <- binom.test(x = x_test, p = p_test, alternative = "greater" ) # 7. Report the test output binomtest_result
# do you need the subjectHits() or queryHits() function in step2 and 3?
# in step 4 you also need the length() function
# eg x_test <- c(length(promoters_with_h3k4me3), length(...))
# completely: x_test <- c(length(promoters_with_h3k4me3), length(promoters_without_h3k4me3))
# in step 5 you need the fraction you obtained in exercise 7a # # WARNING: the next hint contains the complete solultion!
# 1. We repeat the overlap calculation from exercise 6b so that you don't have to scroll back: overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) # 2. Use the indexes from the overlap object to identify promoters with H3K4me3 peak promoters_with_h3k4me3 <- promoters_chr19[unique(subjectHits(overlap)) ] # 3. Perform negative selection using the same indexes to find promoters without H3K4me3 peak # the - sign removes the lines with the respective indexes from the original object promoters_without_h3k4me3 <- promoters_chr19[ -unique(subjectHits(overlap)) ] # 4. Define the x for your test x_test <- c(length(promoters_with_h3k4me3), length(promoters_without_h3k4me3)) # 5. Define the p for your test (round to 3 decimals) p_test <- 0.062 # 6. binomial test for enrichment of h3k4me3 peaks in promoters: binomtest_result <- binom.test(x = x_test, p = p_test, alternative = "greater") # 7. report the test output binomtest_result
grade_code(correct = "Run the code as well and use the output to answer the question below.")
Exercise 7c Based on this test, do you conclude that H3K4me3 is enriched in promoter regions?
question("Is H3K4me3 enriched promoters?", answer("*Yes*", correct=T, message = "The binomial test shows a p-value < 2.2e-16 and a confidence interval that excludes the expected 0.062 We therefore reject the H0 of no enrichment. Realize that with the current test, we compared the number of overlaps and ignored the size of the overlap. Sidenote: There is still ongoing discussion of what is the best method to test for enrichment of genomic interals. This was a relative simple approach that gives a good indication. An alternative would have been to cut chromosome 19 in 200bp windows and determine whether you have more windows than expected covering promoters AND H3K4me3 peaks with a chi-square test."), answer("*No*", message = "Incorrect. Look at the p-value of the previous test.") )
Instead of looking only at the overlap with promoters, we can also calculate the distribution of H3K4me3 peaks over various genomic features. To achieve this with functions like findOverlaps
would require several, successive overlap analyses that would clutter up the code. Luckily, special packages have been developed for ChIP-seq analyses that perform this task. One of these packages is genomation
. We have installed and loaded that package by:
BiocManager::install("genomation") library(genomation)
You can use the following function to calculate the peak distribution of over exons, introns, promoter, intergenic regions:
annotateWithGeneParts(target = [peaks_as_GRanges], feature = [features_as_GRangesList])
An important difference with the analysis above is that annotateWithGeneParts()
uses transcript-level features. In exercise 6 and 7 we used gene-level features, ie. one promoter per gene. annotateWithGeneParts()
defines one promoter per transcript.
The features have been loaded from gencode (downloaded from the UCSC genome browser) and read into a GRangesList object with the genomation-function readTranscriptFeatures()
:
gencode_chr19 <- readTranscriptFeatures("data/encode/gencodev32.chr19.bed", unique.prom = TRUE, up.flank = 2000, down.flank = 200)
up.flank=
and down.flank=
parameters we defined the up- and downstream boundaries of promoters around the TSSs. Exercise 8a: Use annotateWithGeneParts() to determine the overlap between H3K4me3 peaks and promoters, exons, introns and intergenic regions (code check).
# import the gencode features for annotation with genomation (exercise 8) gencode_chr19 <- system.file("extdata", "week2", "encode", "gencodev32.chr19.bed", package = "learnr.proto") gencode_chr19 <- readTranscriptFeatures(gencode_chr19, up.flank = 2000, down.flank = 200)
# overlap of h3k4me3 with promoters, exons, introns and intergenic regions annotation_h3k4me3 <- ___(target = __, feature = __)
# Use target = monocytes_h3k4me3
# and feature = gencode_chr19
# solution annotation_h3k4me3 <- annotateWithGeneParts(target = monocytes_h3k4me3, feature = gencode_chr19)
grade_code()
You can visualize the result with:
plotTargetAnnotation(x = [annotateWithGeneParts_output], main = "....")
Exercise 8b: Plot the output of exercise 8a using plotTargetAnnotation().
gencode_chr19 <- system.file("extdata", "week2", "encode", "gencodev32.chr19.bed", package = "learnr.proto") gencode_chr19 <- readTranscriptFeatures(gencode_chr19, up.flank = 2000, down.flank = 200) annotation_h3k4me3 <- annotateWithGeneParts(target = monocytes_h3k4me3, feature = gencode_chr19)
# visualize the output of exercise 8a ____(x = ____, main = "H3K4me3 over gene parts")
# solution plotTargetAnnotation(x = annotation_h3k4me3, main = "H3K4me3 over gene parts")
grade_code(correct = "Hit `Run Code` to view your plot.")
Realize that this transcript-oriented analyses gives you slightly different results compared to the gene-centered one in exercise 6 and 7: In Exercise 6 and 7 we observed that H3K4me3 is found to overlap with 66.7% of promoters but here we detect overlap with 60% of promoters. For now, we will stick to the gene-centered values and look at how the presence of histone marks at the TSS is related to the expression of the downstream gene.
Can we observe a difference in gene expression between genes with and without H3K4me3 in their promoters?
To answer this question, we first need to obtain the genes associated with the promoters that have a H3K4me3 peak. To do so, we can use the indexes from the overlap object (exercise 6) to subset the original promoters_chr19
object and extract the corresponding gene IDs from the metadata of that object.
Exercise 9: How many different genes are associated with the H3K4me3-overlapping promoters?
- the objects
promoters_chr19
(exercise 6a) andoverlap
(exercise 6b) are loaded for you.- As we have gene-oriented promoters, we only have one promoter per gene (and you can count the number of unique promoters to answer this question).
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19)
# subset for promoters that are part of the overlap with H3K4me3 in monocytes promoters_with_h3k4me3 <- ___ # count the number of unique promoters __(__(promoters_with_h3k4me3))
## To get the promoters with H3K4me3 overlap, subset `promoters_chr19` with subjectHits(overlap)
## ie promoters_chr19[subjectHits(overlap)]
## obtain the unique promoters with unique(promoters_with_h3k4me3)
# use for counting length() ## WARNING: the next hint is the solution!
# the objects `overlap` and `promoters_chr19` are loaded for you # subset for promoters that are part of the overlap with H3K4me3 in monocytes promoters_with_h3k4me3 <- promoters_chr19[subjectHits(overlap)] # count the number of unique promoters length(unique(promoters_with_h3k4me3))
grade_result( pass_if(~identical(.result, length(unique(promoters_chr19[subjectHits(overlap), "gene_id"])) )))
Plot the expression of genes with and without H3K4me3 in their promoters. We obtained the gene expression levels from the BLUEPRINT data portal and filtered it for genes present on chromosome 19. These data are loaded in object quantification_chr19
.
Exercise 10a: Define a new variable
h3k4me3_promoter
inquantification_chr19
which groups genes based on their overlap (TRUE/FALSE) with H3K4me3 peaks (code check).
- First view the head of 'quantification_chr19' to identify the variable with the gene ID.
- Use a dplyr-function to define the new variable
h3k4me3_promoter
.- The GRanges object
promoters_with_h3k4me3
of exercise 9 is available to you, it holds the corresponding gene IDs in the column "gene_id" in the metadata.- The operator
%in%
returns a vector of TRUE / FALSE that indicates if a value in the object printed to the left of the%in%
is present in the object to the right of the%in%
.
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) promoters_with_h3k4me3 <- promoters_chr19[subjectHits(overlap)]
# The object quantification_chr19 holds the RNA-seq quantification for genes on chromosome 19. # 1. View the structure to identify the variable you will use in step 2. head(quantification_chr19)
# 2. Define the new variable `h3k4me3_promoter` quantification_chr19_new <- quantification_chr19 %>% ___(h3k4me3_promoter = ___ %in% promoters_with_h3k4me3$gene_id)
# You may want to use the function mutate()
# 2. Define the new variable `h3k4me3_promoter` using the variable you identified in step 1
# ie: mutate(h3k4me3_promoter = entrezgene_id %in% promoters_with_h3k4me3$gene_id) # the next hint is the solution!
# 2. Define the new variable `h3k4me3_promoter` quantification_chr19_new <- quantification_chr19 %>% mutate(h3k4me3_promoter = entrezgene_id %in% promoters_with_h3k4me3$gene_id)
grade_code()
Exercise 10b: Use ggplot() and geom_boxplot() to visualize the FPKM (y-axis) per 'h3k4me3_promoter' group (x-axis) (code check).
overlap <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) promoters_with_h3k4me3 <- promoters_chr19[subjectHits(overlap)] quantification_chr19_new <- quantification_chr19 %>% mutate(h3k4me3_promoter = entrezgene_id %in% promoters_with_h3k4me3$gene_id)
# Use ggplot() and geom_boxplot() to visualize the FPKM per gene group in the dataset `quantification_chr19_new` ggplot(data = ___)+ geom_boxplot(mapping = aes(x = __, y = __))+ theme_calc()
ggplot(data = quantification_chr19_new)+ geom_boxplot(mapping = aes(x = h3k4me3_promoter, y = FPKM))+ theme_calc()
grade_code( correct = "Make sure you also hit `Run Code` to view your plot.")
This doesn't look very informative, we mainly see the outliers! RNA-seq quantification has a broad range. We should therefore log-transoform the y-axis.
Exercise 10c: Repeat the plot of 10b but log10-transform the FPKM values on the y-axis (code check).
- Add 1 to all FPKM values to prevent taking a log10() of 0.
- Also add another 'layer', visualizing the same data but as jitter (points with a small amount of random variation that is usefull if you want to prevent plotting points on top of other points).
- Hit "Run Code" to view the resulting plot.
# Repeat the plot of 9b but lgo10-transform the y-axis ___(data = ___)+ geom_boxplot(mapping = aes(x = h3k4me3_promoter, y = ____)+ geom_jitter(mapping = aes(x = h3k4me3_promoter, y = ____), size = 0.75, alpha = 0.6)+ # size makes the dots smaller, alpha makes the points transparent, ensuring that you still see the boxplot. theme_calc()
# Incorporate the log10-transformation by adding `log()` to the y-axis variable
# Eg log(FPKM+1) # make sure that you match all "(" with ")"-signs # The next hint is the solution!
# Repeat the plot of 9b but log10-transform the y-axis ggplot(data = quantification_chr19_new)+ geom_boxplot(mapping = aes(x = h3k4me3_promoter, y = log(FPKM+1)))+ geom_jitter(mapping = aes(x = h3k4me3_promoter, y = log(FPKM+1)), size = 0.75, alpha = 0.6)+ theme_calc()
grade_code(correct = "Make sure you also hit `Run Code` to view your plot.")
There is a clear difference in gene expression between the two gene sets with presence of H3K4me3 being associated with higher expression although we also see genes that deviate from this trend. This can be due to the presence of other histone marks, the presence of transcription factors or even downstream, transcriptional mechanisms that control gene expression.
H3K27me3 can also be deposited at promoters. What trend do you observe there?
Exercise 11: Plot the expression of genes with and without H3K27me3 at their promoters (code check).
To do this, you must....
- Identify overlap between H3K27me3 peaks and promoters
- Obtain promoters that are reported in the overlap
- Define a new variable called "h3k27me3_promoter" in
quantification_chr19
that groups genes based on the overlap of their promoter with H3K27me3 peak. Instead of writing this into a new object, you can pipe the output data.frame immediately into step 4.- Plot the log(FPKM+1) group in 'h3k27me3_promoter' in a bpxplot.
- Use the previous exercises if you need hints to code these steps.
- Use the first chunk for testing sections of the code.
# 1. Identify the overlap between monocytes_h3k27me3 and promoters_chr19 overlap_h3k27me3_promoters <- ___(query = ___, subject = ___) # 2. Obtain promoters that are reported in the overlap promoters_with_h3k27me3 <- promoters_chr19[___(overlap_h3k27me3_promoters)] # 3. Define a "h3k27me3_promoter" in `quantification_chr19`, pipe the output in your ggplot command quantification_chr19 %>% ___(h3k27me3_promoter = ____) %>% ggplot(.) + geom_boxplot(mapping = aes(x = ___, y = ____))+ # make sure the () match! geom_jitter(mapping = aes(x = ___, y = ____), size = 0.75, alpha = 0.6)+ theme_calc()
# In step 1 you may want to use the function findOverlaps()
# I.e. findOverlaps(query = monocytes_h3k27me3, subject = ___)
# And in step 2 the function subjectHits()
# Use mutate to define the new variable 'h3k27me3_promoter' as you did in exercise 10a
# in exercise 10a you did: mutate(h3k4me3_promoter = entrezgene_id %in% promoters_with_h3k4me3$gene_id)
# Incorporate the log10-transformation by adding `log()` to the y-axis variable
# Eg log(FPKM+1) # make sure that you match all "(" with ")"-signs # The next hint is the solution!
# 1. identify the overlap between monocytes_h3k27me3 and promoters_chr19 overlap_h3k27me3_promoters <- findOverlaps(query = monocytes_h3k27me3, subject = promoters_chr19) # 2. obtain promoters that are reported in the overlap promoters_with_h3k27me3 <- promoters_chr19[subjectHits(overlap_h3k27me3_promoters)] # 3. define a "h3k27me3_promoter" in `quantification_chr19`, pipe the output in your ggplot command quantification_chr19 %>% mutate(h3k27me3_promoter = entrezgene_id %in% promoters_with_h3k27me3$gene_id) %>% ggplot(.) + geom_boxplot(mapping = aes(x = h3k27me3_promoter, y = log(FPKM+1)))+ geom_jitter(mapping = aes(x = h3k27me3_promoter, y = log(FPKM+1)), size = 0.75, alpha = 0.6)+ theme_calc()
grade_code(correct = "Make sure you also hit `Run Code` to view your plot.")
This time the trend is the other way around; presence of H3K27me3 is associated with lower gene expression but also in this case it is not a hard 1-to-1 relationship but a trend.
H3K4me3 and H3K27me3 can also co-occur. What gene expression level is associate with promoters that have both marks?
This time we have to make 4 categories of genes, depending on the overlap of promoters with H3K4me3 and/or H3K27me3 peaks. We have done this for you, using the dplyr-functions case_when()
within mutate()
and the objects 'promoters_with_only_h3k4me3' or 'promoters_with_only_h3k27me3' of exercise 9 and 11 resp.
quantification_chr19_new <- quantification_chr19 %>% mutate(promoter_mark = case_when( entrezgene_id %in% promoters_with_h3k4me3$gene_id & entrezgene_id %in% promoters_with_h3k27me3$gene_id ~ "bivalent", entrezgene_id %in% promoters_with_h3k4me3$gene_id & !(entrezgene_id %in% promoters_with_h3k27me3$gene_id) ~ "H3K4me3_only", !(entrezgene_id %in% promoters_with_h3k4me3$gene_id) & entrezgene_id %in% promoters_with_h3k27me3$gene_id ~ "H3K27me3_only", !(entrezgene_id %in% promoters_with_h3k4me3$gene_id | entrezgene_id %in% promoters_with_h3k27me3$gene_id) ~ "no_mark") )
overlap_h3k4me3_promoters <- findOverlaps(query = monocytes_h3k4me3, subject = promoters_chr19) promoters_with_h3k4me3 <- promoters_chr19[subjectHits(overlap_h3k4me3_promoters)] overlap_h3k27me3_promoters <- findOverlaps(query = monocytes_h3k27me3, subject = promoters_chr19) promoters_with_h3k27me3 <- promoters_chr19[subjectHits(overlap_h3k27me3_promoters)] quantification_chr19_new <- quantification_chr19 %>% mutate(promoter_mark = case_when( entrezgene_id %in% promoters_with_h3k4me3$gene_id & entrezgene_id %in% promoters_with_h3k27me3$gene_id ~ "bivalent", entrezgene_id %in% promoters_with_h3k4me3$gene_id & !(entrezgene_id %in% promoters_with_h3k27me3$gene_id) ~ "H3K4me3_only", !(entrezgene_id %in% promoters_with_h3k4me3$gene_id) & entrezgene_id %in% promoters_with_h3k27me3$gene_id ~ "H3K27me3_only", !(entrezgene_id %in% promoters_with_h3k4me3$gene_id | entrezgene_id %in% promoters_with_h3k27me3$gene_id) ~ "no_mark") )
Exercise 12a: How many genes do we have in each category?
____(quantification_chr19_new$___)
# Use the function table() to get the contingency table
# Ie table(quantification_chr19_new$___)
# The next hint is the solution!
table(quantification_chr19_new$promoter_mark)
grade_result(pass_if(~identical(.result, table(quantification_chr19_new$promoter_mark))))
Exercise 12b: Plot the gene expression as log10(FPKM+1) in a boxplot with extra geom_jittter, per group of genes and answer the question below (thus, there is no "submit answer" box).
# Print the boxplot of the log(FPKM+1) distribution per gene group ___(data = ___) + ___(mapping = aes(x = ___, y = log(FPKM+1)))+ geom_jitter(mapping = aes(x = ___, y = log(FPKM+1)), size = 0.75, alpha = 0.6)+ theme_calc()
ggplot(data = quantification_chr19_new) + geom_boxplot(mapping = aes(x = promoter_mark, y = log(FPKM+1)))+ geom_jitter(mapping = aes(x = promoter_mark, y = log(FPKM+1)), size = 0.75, alpha = 0.6)+ theme_calc()
question("Which of the following statments are supported by the table of exercise 12a and the boxplot of exercise 12b?", answer("Gene expression is associated with different histone mark (combinations).", correct=TRUE), answer("H3K4me3 defines promoters.", message = "H3K4me3 does not define promoters. Promoters are defined as the regions near the TSS."), answer("There are more promoters with chromatin markings than without, in chromosome 19 of monocytes.", correct=TRUE), answer("Bivalent promoters show equally low expression levels as promoters with H3K27me3 only"), answer("H3K4me3 is more often found at promoters than H3K27me3, in chromosome 19 of monocytes.", correct=TRUE, message = "As you can see, the specific combination of histone marks at the promoter is functionally associated with a different output in gene expression levels. Let's not restrict our analysis to 2 marks but examine the occurrence of all the different marks in our dataset at the promoter."), allow_retry=TRUE, random_answer_order=TRUE )
", "red")`
What if we want to quantify and plot all the different combinations of histone marks that are present at the promoters in our dataset? As we have ChIP-seq data of 6 different histone marks, we could theoretically detect 0, 1 or any combination of 2 to 6 marks resulting in 64 different outcomes! How could we ever visualize those results clearly? A Venn diagram based on 6 datasets is extremely difficult to read. See this example of common and unique genes found in the genome of 6 different crop species.
{width=50%}\
Ref: D’Hont, A., Denoeud, F., Aury, JM. et al. The banana (Musa acuminata) genome and the evolution of monocotyledonous plants. Nature 488, 213–217 (2012)
To experience how it is to read this graph, try to identify the largest gene group and the associated genomes in which this group of genes is found.
UpSet plots are an elegant alternative to the Venn diagram. Especially for cases when you want to visualize the intersection among more than 3 "sets" (also see this commentary in Nature Methods 2014 by Alexander Lex & Nils Gehlenborg).
An UpSet depicts the detected intersections (or 'combinations') as a matrix with barplots on the top and right side that show respectively the size of the combination and the size of the original sets (here you find the UpSet-version of the banana-Venn diagram shown above).
There are several R packages with functions to make an UpSet. You will use the one from the ComplexHeatmap
package, see here for the complete reference book of this package. We installed and loaded the package for you by running:
BiocManager::install("ComplexHeatmap") library(ComplexHeatmap)
Two functions are important to make an UpSet plot: make_comb_mat()
and UpSet()
. With make_comb_mat()
you prepare a so-called "combination matrix" that summarizes the number of elements that overlap among the provided sets. With UpSet()
you plot the UpSet plot of the combination matrix.
# Generate the combination matrix my_combination_matrix <- make_comb_mat([the_input_sets_as_list], ...) # Generate the UpSet plot UpSet(m = [combination_matrix], set_order = ... , comb_order = ... ,...)
[the_input_sets_as_list]
should be a list of sets. Each set in this list should have a name attached to it. m =
you provide the matrix generated by make_comb_matrix()
. set_order =
you can set a custom order to the sets ( = rows with barplot to the right of the plot). comb_order =
you can do the same for the combinations (depicted in the columns with a barplot at the top of the plot). Let's look at an example to clarify this further.
Here we generated 3 'sets' of letters sampled from the alphabet and we visualize the overlap among the sets with a Venn diagram and an UpSet plot.
# generate set 'set_a', 'set_b' and 'set_c' set_a <- sample(letters, 5) set_b <- sample(letters, 10) set_c <- sample(letters, 15) # print the sets set_a set_b set_c
{width=50%}\
The aim is to visualize the overlap among the sets. So how many letters are common to all 3 sets, how many to 2 and how many letters are unique to 1 of the sets?
First, we plot the overlap among the sets using a Venn diagram:
## make Venn diagram # call a new plotting area grid.newpage() # Plot the overlap in a Venn diagram # use the draw.triple.venn() similar to the draw.pairwise.venn() of ex 6e. draw.triple.venn( area1=length(set_a), area2=length(set_b), area3=length(set_c), n12 = sum(set_a %in% set_b), # The size of the intersection between 'set_a' and 'set_b' n23 = sum(set_b %in% set_c), # The size of the intersection between 'set_b' and 'set_c' n13 = sum(set_a %in% set_c), # The size of the intersection between 'set_a' and 'set_c' n123 = sum(set_a[set_a %in% set_b] %in% set_c), # The size of the intersection between 'set_a', 'set_b' and 'set_c' category=c("A", "B", "C"), fill=c("red", "green", "blue"), cat.cex=1.2)
{width=35%}\
Next, we plot the same data in an UpSet plot:
## Make UpSet plot # make a combination matrix (required for Upset plot) my_combination_matrix <- make_comb_mat(list(A = set_a, B = set_b, C = set_c)) # plot the matrix of combinations UpSet(my_combination_matrix, set_order = order(set_size(my_combination_matrix)))
\
In contrast to the Venn diagram you can easily identify the biggest combination and which sets it involves using the Upset plot.
quiz(caption = "", question("What is the size of the biggest combination in the example UpSet plot, depicted above?", answer("2"), answer("6", message = "This is the size of the letters unique to set C"), answer("7", correct = TRUE), answer("15", message = "This is the size of the biggest original set (set C)"), allow_retry=TRUE), question("Which sets are involved in this biggest combination?", answer("set A and set B"), answer("set A and set C"), answer("set B and set C", correct = TRUE), answer("set A, set B and set C"), allow_retry=TRUE) )
Now, let's return to our question. We want to quantify and visualize the combination of histone marks at promoters (in this case limited to promoters of chromosome 19).
You'll first need to obtain the combination matrix. In this case, that will be a list of 6 sets, one for each mark. Each set will have gene IDs that correspond to the genes on which promoter the mark is found. To get these sets you will use findOverlaps()
and subjectHits()
the get the indexes of the relevant promoters. You will then use these indexes to subset the promoters_chr19
object which has the gene IDs in the column "gene_id" in the metadata.
Instead of repeating this procedure for each object, we have written a function that includes all these steps. This function:
gr_peaks
and promoters_chr19
; promoters_chr19
that are part of the overlap; promoters_chr19
with these indexes;get_geneids <- function(gr_peaks) { overlap <- findOverlaps(query = gr_peaks , subject = promoters_chr19) # step 1 unique_indexes <- unique(subjectHits(overlap)) # step 2 promoter_selection <- promoters_chr19[unique_indexes] # step 3 gene_id_selection <- promoter_selection$gene_id # step 4 return(gene_id_selection) # step 5 }
This function get_geneids()
is a "custom" function (in other words, we wrote it ourselves). It takes as input one object, defined by the gr_peaks =
parameter. This should be a GRanges object with ChIP-seq peaks. It outputs a vector of gene IDs corresponding to the genes on which promoters the mark is found.
You can use the lapply()
function to apply a function to a each item in a list and return the output of each calculation again as a list:
output_list <- lapply(X = [input_list], FUN = [function_to_apply])
Exercise 13a: Apply
get_geneids()
to each item inmonocytes_list
. Write the output togeneids_per_mark
.
monocytes_list
is a list of all GRanges objects with peak data of all the different histone marks we have (H3K4me1, H3K4me3, H3K9me3, H3K27ac, H3K27me3, H3K36me3).
get_geneids <- function(gr_peaks) { overlap <- findOverlaps(query = gr_peaks , subject = promoters_chr19) # identify the overlap among x and promoters_chr19 unique_indexes <- unique(subjectHits(overlap)) # extract the unique indexes of promoters_chr19 with overlap promoter_selection <- promoters_chr19[unique_indexes] # extract the corresponding promoters gene_id_selection <- promoter_selection$gene_id # write the corresponding gene ids to a new object return(gene_id_selection) # return only the gene ids }
# Apply 'get_geneids()' to 'monocytes_list' geneids_per_mark <- ___(X = ____ FUN = ___)
# You need to use lapply() here
# eg: geneids_per_mark <- lapply(X = monocytes_list, __)
# and set the FUN parameter as FUN = get_geneids
geneids_per_mark <- lapply(X = monocytes_list, FUN = get_geneids)
grade_code()
Exercise 13b: Generate the combination matrix of
geneids_per_mark
.
get_geneids <- function(gr_peaks) { overlap <- findOverlaps(query = gr_peaks , subject = promoters_chr19) # identify the overlap among x and promoters_chr19 unique_indexes <- unique(subjectHits(overlap)) # extract the unique indexes of promoters_chr19 with overlap promoter_selection <- promoters_chr19[unique_indexes] # extract the corresponding promoters gene_id_selection <- promoter_selection$gene_id # write the corresponding gene ids to a new object return(gene_id_selection) # return only the gene ids } geneids_per_mark <- lapply(X = monocytes_list, FUN = get_geneids)
# Obtain the combination matrix of 'geneids_per_mark' my_combination_matrix <- ___(____)
# Review section 2.6.2 to identify the function you need.
my_combination_matrix <- make_comb_mat(geneids_per_mark)
grade_code()
You can make an UpSet pot with the combination matrix you just made.
Exercise 14: Plot the UpSet plot of
my_combination_matrix
.
get_geneids <- function(gr_peaks) { overlap <- findOverlaps(query = gr_peaks , subject = promoters_chr19) # identify the overlap among x and promoters_chr19 unique_indexes <- unique(subjectHits(overlap)) # extract the unique indexes of promoters_chr19 with overlap promoter_selection <- promoters_chr19[unique_indexes] # extract the corresponding promoters gene_id_selection <- promoter_selection$gene_id # write the corresponding gene ids to a new object return(gene_id_selection) # return only the gene ids } geneids_per_mark <- lapply(X = monocytes_list, FUN = get_geneids) my_combination_matrix <- make_comb_mat(geneids_per_mark)
# Generate the UpSet plot of `my_combination_matrix`.
# Review section 2.6.2 to identify the function you need.
UpSet(my_combination_matrix)
grade_code(correct = "Make sure you also hit 'Run Code' to view the resulting plot.")
quiz(caption = "", question("How many promoters are covered by the combination of marks that is found most often at promoters on chromosome 19? Choose the number that is closest to your approximation.", answer("150"), answer("200"), answer("300"), answer("450", correct = TRUE), allow_retry=TRUE), question("Which histone marks are part of this combination?", answer("H3K4me3 and H3K4me1"), answer("H3K4me3, H3K4me1, H3K27ac and H3K36me3"), answer("H3K4me3, H3K4me1 and H3K27me3"), answer("H3K4me3, H3K4me1 and H3K27ac", correct = TRUE, message = "Observe that your Upset plot contains a lot of combinations that rather very infrequent. In exercise 15 you will therefore filter the combination matrix to only contain combinations of a size of 60 or more."), allow_retry=TRUE, random_answer_order = TRUE) )
You can filter a combination matrix for combinations of a particular size using the following code:
new_combination_matrix <- original_combination_matrix[comb_size(original_combination_matrix) > [threshold]]
>
for greater than
but you can also use other logical operators. comb_size()
functions finds the sizes of all the combinations in the given combination matrix. Exercise 15: Repeat the plot of exercise 14 but this time filter
my_combination_matrix
for combinations that have more than 60 genes.
## 1. Filter `my_combination_matrix` my_filtered_combination_matrix <- ____ ## 2. Generate the UpSet plot of `my_filtered_combination_matrix`. # in `top_annotation=` we changed the height of the top barplot. ___(___, top_annotation = upset_top_annotation(my_filtered_combination_matrix, height = unit(5, "cm")))
# subset my_combination_matrix for combinations of a size above 60
# Thus, subset using: comb_size(my_combination_matrix) >60
# Step 1 completely: my_filtered_combination_matrix <- my_combination_matrix[comb_size(my_combination_matrix) >60]
# Make the UpSet plot as in exercise 14
my_filtered_combination_matrix <- my_combination_matrix[comb_size(my_combination_matrix) > 60] UpSet(my_filtered_combination_matrix, top_annotation = upset_top_annotation(my_filtered_combination_matrix, height = unit(5, "cm")))
grade_code(correct = "Also hit 'Run Code' to view the UpSet plot.")
quiz(caption = "", question("How many promoters are covered by the combination H3K4me1, H3K4me3, H3K27ac and H3K27me3? Choose the number that is closest to you approximation.", answer("50", correct = TRUE, message = "The number of promoters with these four marks is 66 according to this analysis."), answer("100"), answer("150"), answer("200"), allow_retry=TRUE), question("What do you expect the gene expression level of the corresponding genes to be?", answer("Sky high; among the highest of genes."), answer("Among the lowest of all genes.", correct = TRUE), allow_retry=TRUE) )
In the plot below we added gene expression as FPKM of the corresponding genes as boxplots.
quiz(caption = "", question("Which column (labeled A -F at the bottom) represents polycomb-silenced genes?", answer("A", correct = TRUE), answer("B"), answer("C"), answer("D"), answer("E"), answer("F"), allow_retry=TRUE), question("Which columns represent candiate bivalent promoters with polycomb-mediated silencing?", answer("A"), answer("B"), answer("C", correct = TRUE), answer("D"), answer("E", correct = TRUE, message = "Notice that you also have H3K4me3/H3K9me3 bivalent promoters in column D. Also notice the difference in gene expression among genes of column C and E. Column E may contain promoters that for only a few bps overlap with a H3K27me3 peak and not with the majority promoter. Alternatively, the H3K27me3 signal may be very low and H3K4me3 signals very high. We will look at the relationship between signal intensity and gene expression next week."), answer("F"), allow_retry=TRUE) )
# make promoters_chr19 with added FPKM promoters_chr19_new <- promoters_chr19[promoters_chr19$gene_id %in% quantification_chr19$entrezgene_id] quantification_chr19_new <- quantification_chr19[quantification_chr19$entrezgene_id %in% promoters_chr19_new$gene_id,] FPKM_promorder<- as.vector(NA) for(i in 1:length(promoters_chr19_new)) { FPKM_promorder[i] <- quantification_chr19_new[which(quantification_chr19_new[, "entrezgene_id"] == promoters_chr19_new$gene_id[i]),]$FPKM } promoters_chr19_new$FPKM <- FPKM_promorder rm(FPKM_promorder) ## new function get_geneids <- function(gr_peaks) { overlap <- findOverlaps(query = gr_peaks , subject = promoters_chr19_new) # identify the overlap among x and promoters_chr19_new unique_indexes <- unique(subjectHits(overlap)) # extract the unique indexes of promoters_chr19_new with overlap promoter_selection <- promoters_chr19_new[unique_indexes] # extract the corresponding promoters gene_id_selection <- promoter_selection$gene_id # write the corresponding gene ids to a new object return(gene_id_selection) # return only the gene ids } ## make first combi matrix geneids_per_mark <- lapply(X = monocytes_list, FUN = get_geneids) m1 <- make_comb_mat(geneids_per_mark) ## filter m1 m3 <- m1[comb_size(m1) >60] ## add average gene expression as boxplot below the combinations m3_comb_sets <- lapply(comb_name(m3), function(nm) extract_comb(m3, nm)) # add FPKM to comb sets m3_comb_sets <- lapply(m3_comb_sets, function(v) { new <- as.data.frame(v) colnames(new) <- "gene_id" new$FPKM <- NA for(i in 1:nrow(new)) { new[i, "FPKM"] <- promoters_chr19_new[which(promoters_chr19_new$gene_id == new[i, "gene_id"])]$FPKM } new }) ## plot new UpSet plot UpSet(m3, bottom_annotation = HeatmapAnnotation( "log(FPKM+1)" = anno_boxplot(lapply(m3_comb_sets, function(d) log(d$FPKM+1)), outline = FALSE), annotation_name_side = "left", height = unit(3, "cm"), label = anno_text(LETTERS[1:length(m3_comb_sets)], rot = 0)) ) UpSet(m3, bottom_annotation = HeatmapAnnotation( FPKM = anno_boxplot(lapply(m3_comb_sets, function(d) d$FPKM), outline = FALSE), annotation_name_side = "left", height = unit(3, "cm"), label = anno_text(LETTERS[1:length(m3_comb_sets)], rot = 0)) ) UpSet(m3, bottom_annotation = HeatmapAnnotation( mean(FPKM) = sapply(m3_comb_sets, function(d) mean(d$FPKM)), annotation_name_side = "left", height = unit(3, "cm"), label = anno_text(LETTERS[1:length(m3_comb_sets)], rot = 0)) )
You have one last challenge to go: Hit the
Print page
button to save a progress report of the tutorial (incl. your answers) as pdf to your local drive.
- It is important that you have visited each topic within the tutorial for all of the content to be populated.
- Thus, if you see empty sections in your pdf file, revisit each section and print the report again.
- Hit
Continue
after you saved your pdf file.
```{js print2pdf1, context="server"} $(document).on('shiny:inputchanged', function(event) { if (event.name === 'print2pdf') { window.print(); } });
```r actionButton("print2pdf", "Print page", style="opacity: .7; color: #000;")
You have studied peak locations, their overlap with features, and the association between the absence or presence of a (or a combination of) mark(s) at the promoter and gene expression.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.