knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
In this benchmark, we will use Repguide to predict genomic binding sites for a publically available set of 12 guideRNAs targeting LTR5_Hs long terminal repeats and compare the results to experimentally determined Cas9 ChIP-seq peak intensities [1].
We first create a hg38 guideSet and provide the guide sequences to map.
library(Repguide) library(BSgenome.Hsapiens.UCSC.hg38) library(tidyverse) # Path to RepeatMasker hg38 repeat annotation (provided, LTRs only) repeats_path <- system.file(package = 'Repguide', 'extdata', 'hg38_ucsc_rmsk_ltr.txt.gz') gs <- createGuideSet(genome = Hsapiens, # Hsapiens is the short version of BSgenome.Hsapiens.UCSC.hg38 tes = repeats_path) # can be either path to file or GRanges object) gs <- addTargets(gs, 'LTR5_Hs') # Import guideRNA sequences used in Fuentes et al. [1] against LTR5_Hs guides_path <- system.file(package = 'Repguide', 'extdata', 'fuentes_elife_2018_guides.fasta') guides_fuentes <- as.character(Biostrings::readDNAStringSet(guides_path)) gs_fuentes <- addGuides(gs, guides = guides_fuentes, n_mismatches = 3, min_Son = 0, max_Soff = Inf, gc_content = c(0,1), PAM = 'NGG')
We can extract predicted binding sites using the mappings
accessor and import provided dCAS9 ChIP peak coordinates from Fuentes et al. [1]. To facilitate comparison, we restrict our analysis to peaks on the main chromosome assemblies.
# Extract predicted binding sites pred_sites <- mappings(gs_fuentes) # Import experimental binding sites peaks_path <- system.file(package = 'Repguide', 'extdata', 'fuentes_elife_2018_dCas9_peaks.bed') exp_sites <- rtracklayer::import.bed(peaks_path) # Throw chromosomes not present in guideSet seqlevels(exp_sites, pruning.mode = 'coarse') <- seqlevels(pred_sites)
We then compare the absolute number and binding intensities of correctly predicted binding sites.
hits <- IRanges::findOverlaps(exp_sites, pred_sites) # Add logical whether or not peak was predicted by Repguide exp_sites$predicted <- 1:length(exp_sites) %in% hits@from # Add Repguide predicted binding score to ChIP peaks exp_sites$score_pred = 0 exp_sites[unique(hits@from)]$score_pred <- as.data.frame(hits) %>% mutate(score_pred = pred_sites[hits@to]$Sbind) %>% group_by(queryHits) %>% summarise(score_pred = sum(score_pred)) %>% pull(score_pred)
We finally percent rank transform the predicted and experimental scores and visualize the results using ggplot2
.
ggdata <- exp_sites %>% as.data.frame %>% mutate(score_pred = percent_rank(score_pred), score = percent_rank(score)) # Create plots p_peaks_pie <- ggdata %>% ggplot(aes(x = '', fill = predicted)) + geom_bar() + coord_polar("y", start = 0) + theme(legend.position = 'none', panel.border = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank()) p_score_boxplot <- ggdata %>% ggplot(aes(x = predicted, y = score, group = predicted, fill = predicted)) + geom_boxplot(outlier.shape = NA) + geom_jitter(size = 1, alpha = 0.1) + theme(axis.text.x = element_text(angle = 90)) + theme(legend.position = 'none') p_score_scatter <- ggdata %>% filter(predicted) %>% ggplot(aes(score, score_pred)) + geom_point(size = 1) # Combine plots cowplot::plot_grid(p_peaks_pie, p_score_boxplot, p_score_scatter, rel_widths = c(1.5, 1, 2), labels = 'auto', nrow = 1)
We can see that Repguide correctly predicted more than 3/4 of experimentally determined Cas9 peaks (a) and failed predictions generally showed weak ChIP enrichment (b). Moreover, computed binding scores were largely consistent with ChIP signal intensities (c), suggesting Repguide on and off-targetd scores can be interpreted quantitatively.
[1] Daniel R Fuentes, Tomek Swigut, Joanna Wysocka. Systematic perturbation of retroviral LTRs reveals widespread long-range effects on human gene regulation. eLife (2018).
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.