This package is currently is it's infancy. Don't expect any real functionality for a while...
You can use RunATAC to easily Plot insert size histograms Calculate fraction of reads in peaks (FrIP) Plot footprint profiles for motifs Generate V-plots of fragment sizes flanking transcription factor motifs Generate bigwig files of Tn5 insertions and nucleosome spanning fragments Perform motif-centric transcription factor occupancy predictions with the CENTIPEDE algorithm Generate tables of counts for multiple samples Perform differential accessibility testing * Generate browser snapshots for regions of interest
knitr::opts_chunk$set(echo = TRUE, cache = FALSE, results = "hide", comment = FALSE, warning = FALSE, message=FALSE)
library(devtools) devtools::install_github("SamBuckberry/RunATAC") install.packages('data.table') install.packages('magrittr') ## try http:// if https:// URLs are not supported source("https://bioconductor.org/biocLite.R") biocLite(c("Biostrings", "motifRG", "GenomicRanges", "IRanges", "GenomicAlignments", "rtracklayer", "Rsamtools"))
One of the first quality control steps of an ATAC-seq experiment often performed is the assessment of insert size distribution. As Tn5 DNA insertions in accessible chromatin are highly structured around nucleosomes Buenrostro et. al. 2013, a good ATAC-seq experiment should show an enrichment of fragment widths (the mapping distance between 5' read ends for paired-end reads) below 100 bases, and another peak of enrichment at ~200 bases, indicative of nucleosome spanning read fragments.
To inspect the insert sizes from aligned ATAC-seq reads, first load the fragment data into into a GRanges object using the read_atac_frags()
function. Here we will use a BAM file included in the RunATAC package. This BAM is from ATAC-seq on Saccharomyces cerevisiae and includes data only from chrIV.
library(RunATAC) bam_fl <- system.file("extdata", "chrIV.bam", package = "RunATAC") frags <- read_atac_frags(bam_file = bam_fl)
Plot the insert size distribution.
plot_insert_size(frags, xlim=c(0,600))
Another useful metric for evaluating the quality of an ATAC-seq experiment is the fraction of reads in peaks (FrIP), which can help you assess the ammount of background noise in the experiment.
First, we load the Tn5 insertion points from BAM file into GRanges object
ins <- read_atac_insertions(bam_fl)
Then we load a file of peaks in the MACS2 .narrowPeak format, however you can generate peaks using any peak caller and load these into a GRanges object from a BED format file.
peak_fl <- system.file("extdata", "chrIV_peaks.narrowPeak", package = "RunATAC") peaks <- read_bed(peak_fl)
And now we calculate the fraction of reads in peaks
calc_frip(signal = ins, peaks = peaks)
If the purpose of the ATAC-seq experiments is to perform differential accessibility testing, you would need to generate a table of read counts for each library.
library(GenomicAlignments) library(stringr) olap_counts <- summarizeOverlaps(features = peaks, reads = ins) olap_counts <- assays(olap_counts)$counts rownames(olap_counts) <- str_c(seqnames(peaks), start(peaks), sep = ":") %>% str_c(end(peaks), sep = "-")
As Tn5 has has a sequence preference Buenrostro et. al. 2013 it can be useful to calculate a Position Weight Matrix (PWM) for sequence content ±10 bp from Tn5 insertion points, so insertion counts at each position can be bias corrected.
First, we can inspect the di-nucleotide insertion frequencies and see that there is a distinct depletion of insertions in CG dinucleotide context.
library(BSgenome.Scerevisiae.UCSC.sacCer3) dinuc_freq <- calc_dinuc_freq(ins_gr = ins, genome = Scerevisiae) barplot(dinuc_freq$percentage, names=dinuc_freq$dinucleotide)
Now we construct a 21 bp positional weight matrix (PWM) for the Tn5 insertion positions and the flanking 10 bases.
ins_pwm <- calc_ins_pwm(ins_gr = ins, genome = Scerevisiae) ins_pwm
library(reshape2) library(ggplot2) dat <- melt(ins_pwm) gg <- ggplot(dat, aes(x = Var2, y = value, group=Var1, colour=Var1)) + geom_line() gg
As ATAC-seq data can provide information on both Tn5 accessible DNA and nucleosome positioning, it is useful to summarise the coverage depth of both insertions and fragment centres. For this purpose, the bigwig file format is very efficient for visualisation and fast data retrieval.
To convert BAM files to insertion point and fragment centre bigwig files:
bam_to_insertions_bw(bam_file = bam_fl, file = "inst/extdata/chrIV.ins.bigwig") bam_to_centres_bw(bam_file = bam_fl, file = "inst/extdata/chrIV.nuc.bigwig")
In this example, we will plot the Tn5 insertion signal around REB1 motifs in ATAC-seq peaks using a positional weight matrix (PWM).
Get the REB1 motif from the JASPAR database.
#source("https://bioconductor.org/biocLite.R") #biocLite(c("BSgenome.Scerevisiae.UCSC.sacCer3", "JASPAR2016", "TFBSTools")) library(JASPAR2016) library(TFBSTools) getMatrixByID tf <- getMatrixByID(JASPAR2016, ID=c("MA0363.1")) tf <- tf@profileMatrix
Get the positions of motif in peaks.
motif_pos <- motif_gr(gr = peaks, pwm = tf, genome = Scerevisiae)
Get the insertion data for the motif regions, for this we retreivie the data from the bigwig file of Tn5 insertions. This will return a data.frame of insertions for the 100 bases flanking each side of the motif matches.
ins_bw <- system.file("extdata", "chrIV.ins.bigwig", package = "RunATAC") dat <- range_summary(bigwig = ins_bw, gr = motif_pos, range = 100) dat[1:6, 1:6] plot(colSums(dat), type='l')
Then Calculate Tn5 insertion bias for the same regions
par(mfrow=c(2,1)) plot(colSums(ins_bias), type='l') plot(colSums(dat), type='l') ins_over_bias <- dat+1 / ins_bias plot(colSums(ins_over_bias)) lines(colSums(ins_bias)) ins_bw <- system.file("extdata", "chrIV.ins.bigwig", package = "RunATAC") frag_bw <- system.file("extdata", "chrIV.nuc.bigwig", package = "RunATAC") gr <- motif_pos bias_pwm <- ins_pwm genome <- Scerevisiae range <- 200 # Get fragment centres, tn5 insertions and insertion bias scores for regions flanking PWM matches in peaks. calc_motif_profile <- function(ins_bw, frag_bw, gr, ins_pwm, genome, range=100){ # Get the range summary for Tn5 insertions ins_dat <- range_summary(bigwig = ins_bw, gr = gr, range = range) # Get the range summary for the fragment centres frag_dat <- range_summary(bigwig = frag_bw, gr = gr, range = range) # Calculate the bias scores bias_dat <- calc_profile_bias(ins_pwm = ins_pwm, gr = gr, genome = genome, range = range) # Get data into list for export dat <- list(ins_dat=ins_dat, frag_dat=frag_dat, bias_dat=bias_dat) return(dat) } profile_dat <- calc_motif_profile(ins_bw = ins_bw, frag_bw = frag_bw, gr = motif_pos, ins_pwm = ins_pwm, genome = Scerevisiae, range = 200) plot_motif_profiles <- function(profile_dat){ range01 <- function(x){(x-min(x))/(max(x)-min(x))} ins_dat <- colSums(profile_dat[[1]]) %>% range01() frag_dat <- colSums(profile_dat[[2]]) %>% range01() bias_dat <- colSums(profile_dat[[3]]) %>% range01() plot(ins_dat) plot(frag_dat) }
gr <- motif_pos ins_bigwig <- ins_bw genome <- Scerevisiae calc_footprint <- function(ins_bigwig, gr, ins_pwm, genome, flank=100){ # Get the insertion data for the GRanges dat <- RunATAC::range_summary(bigwig = ins_bigwig, gr = gr, range = flank) # Calculate the insertion bias for regions regions_gr <- GenomicRanges::resize(gr, width = flank+(flank+1), fix = 'center') bias <- RunATAC::calc_ins_bias(ins_pwm = ins_pwm, regions_gr = regions_gr, genome = genome) bias <- do.call(rbind, bias) # undo the log2 transform bias <- exp(bias) # Calculate insertions / bias ins_over_bias <- dat / bias # data for plotting xdat <- -flank:flank ins_y <- colSums(dat) bias_y <- colSums(2^bias) norm_y <- ins_y / bias_y } library(reshape2) library(dplyr) library(ggplot2) mdat <- melt(dat) fp <- mdat %>% group_by(variable) %>% summarise(yy=mean(value)) fp$variable <- as.character(fp$variable) %>% as.numeric() gg <- ggplot(fp, aes(x = variable, y = yy, fill = 'blue', title="")) + #stat_smooth(span = 10) + #geom_area(aes(fill = 'blue', stat = "bin")) + geom_line(size=0.3) + xlab("Distance from motif") + ylab("Tn5 insertions") + ggtitle(title) + theme_bw() + theme(plot.background = element_blank(), panel.border = element_blank(), strip.background = element_rect(size = 0), panel.grid.minor = element_blank(), axis.line = element_line(), axis.text = element_text(size=8)) gg
Generate V-plot for motif regions
v_dat <- calc_v(frags_gr = frags, motif_pos_gr = motif_pos, flank = 200, max_frag = 400) plot_v(df = v_dat)
Predict motifs with TF occupancy using the CENTIPEDE algorithm
library(GenomicFeatures) source("https://bioconductor.org/biocLite.R") biocLite("TxDb.Scerevisiae.UCSC.sacCer3.sgdGene") library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) tss <- promoters(x = TxDb.Scerevisiae.UCSC.sacCer3.sgdGene, upstream = 1, downstream = 1) tss_profile <- range_summary(bigwig = ins_bw, gr = tss, aggregate = TRUE) plot(tss_profile, type='l')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.