knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, echo = TRUE )
The data shown are generated by calling methylation on nanopore reads using nanopolish. This output two .tsv files, one has frequency of methylation, the other has single molecule data. Here an example of part of the single molecule data
library(nanodsmf) library(ggplot2) library(magrittr) knitr::kable(head(MSssI_CpG),full_width=F)
The Nanopolish calculate methylation frequency script (https://github.com/jts/nanopolish/blob/master/scripts/calculate_methylation_frequency.py) sets positive log_lik_ratio > 0 to methylated, but excludes all positions with an absolute value of log_like_ratio < 2.5. To test how this works on my data i will plot histograms with the different datasets. The cutoff is show in red.
ggplot(MSssI_CpG,aes(x=log_lik_ratio)) + geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MSssI_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_CpG,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_CpG,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1)
Comparing the control to the methylated samples, 2.5 is probably an appropriate cutoff.
ggplot(MSssI_CpG, aes(x=as.factor(num_motifs),y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MSssI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs")
There is a clear increase in log likelihood when more site are present in samples that have been methylated on that motif, and a clear reduction of log likelihood when more sites are present in samples not methylated on that motif. Should some normalisation by the number of motifs should be carried out.
#normalise all log_lik_ratio by number of motifs MSssI_CpG$norm_log_lik_ratio<-MSssI_CpG$log_lik_ratio/MSssI_CpG$num_motifs MSssI_GpC$norm_log_lik_ratio<-MSssI_GpC$log_lik_ratio/MSssI_GpC$num_motifs MCviPI_CpG$norm_log_lik_ratio<-MCviPI_CpG$log_lik_ratio/MCviPI_CpG$num_motifs MCviPI_GpC$norm_log_lik_ratio<-MCviPI_GpC$log_lik_ratio/MCviPI_GpC$num_motifs Control_CpG$norm_log_lik_ratio<-Control_CpG$log_lik_ratio/Control_CpG$num_motifs Control_GpC$norm_log_lik_ratio<-Control_GpC$log_lik_ratio/Control_GpC$num_motifs ggplot(MSssI_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MSssI_GpC, aes(x=as.factor(num_motifs),y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs")
The boxplots show that normalising by the number of motifs serves to make the log likelihood distribution more similar. Therefore this would be a good strategy when splitting apart separate sites - can simply divide the log likelihood between the sites.
This also makes sense in terms of combined probabilities: A joint probability of independent events would be a multiplication of the individual probabilities and therefore an addition of the logs of the probabilities/likelihoods. If we were to simply assign the fragment probability to each of the constituent sites, that would inflate the methyltion score.
Note that the Nanopolish scripts to calculate methylation frequency first binarises methylation status and then passes this to the sub-sites when "split" is turned on.
ggplot(MSssI_CpG,aes(x=norm_log_lik_ratio)) + geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MSssI_GpC,aes(x=norm_log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_CpG,aes(x=norm_log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_GpC,aes(x=norm_log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_CpG,aes(x=norm_log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_GpC,aes(x=norm_log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1)
This change does not seem to significantly change the distribution of the values and $abs(norm_log_lik_ratio) > 2.5$ is probably still a good threshold.
The function splitMotifs() changes the tsv to contain exact start and end of the CG or GC motif (rather than of the whole sequence fragment surrounding it). For sequences taht contain more than one motif of a particular types, these will be split into multiple rows, with each row containing the exact start and end of the motif, and the log_lik_ratio normalised by the number of motifs in the sequence. The "sequence" and "motif_num" fields are not modified.
#split mutli-motif fragments and normalise log_lik_ratio MSssI_CpG<-splitMotifs(MSssI_CpG,"CG") MSssI_GpC<-splitMotifs(MSssI_GpC,"GC") MCviPI_CpG<-splitMotifs(MCviPI_CpG,"CG") MCviPI_GpC<-splitMotifs(MCviPI_GpC,"GC") Control_CpG<-splitMotifs(Control_CpG,"CG") Control_GpC<-splitMotifs(Control_GpC,"GC") ggplot(MSssI_CpG, aes(x=as.factor(num_motifs),y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MSssI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs") ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) + geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") + xlab("num_motifs")
ggplot(MSssI_CpG,aes(x=log_lik_ratio)) + geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MSssI_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_CpG,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(MCviPI_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_CpG,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) ggplot(Control_GpC,aes(x=log_lik_ratio))+ geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") + geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1)
Since the ttTi5605 PCR fragment is only 2.3 kb long we will create a matrix over the whole region. The function tsvToMethMat, takes a data table from the methylation called tsv that has been split into individual motifs, a genomic range, and the motif called in that tsv ("CG" or "GC"). It returns a matrix with reads in individual rows, and absolute positions of the motif in the genomic range as columns. Methylated positions have the value 1, non-methylated positions have the value 0 and inconclusive calls have the value NA.
# reload unsplit data data(MSssI_CpG) MSssI_CpG<-splitMotifs(MSssI_CpG,"CG") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(MSssI_CpG,ttTi5605gr,"CG")[[1]] # add markers for CG positions CGpos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]]) CGpos$y<-rep(-1,length(CGpos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605",ttTi5605gr,featureGRs=c(),myXlab="CpG position",featureLabel="TSS",title=NULL,baseFontSize=12) # do single molecule plot with positions of Cs marked. p+geom_point(data=CGpos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
As expected, most sites are accessible because this was naked DNA. The sites near the start and end of the amplicon were not called - they are probably not included in the region amplified by primers 28 and 32.
I will make the genomic range to be plotted smaller and do the same for all the data sets.
newGR<-ttTi5605gr GenomicRanges::start(newGR)<-400 GenomicRanges::end(newGR)<-2100 # reload unsplit data data(MSssI_CpG) tsv<-splitMotifs(MSssI_CpG,"CG") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MSssI_CpG", myXlab="CpG position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data data(MSssI_GpC) tsv<-splitMotifs(MSssI_GpC,"GC") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MSssI_GpC", myXlab="GpC position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")
# reload unsplit data data(MCviPI_CpG) tsv<-splitMotifs(MCviPI_CpG,"CG") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MCviPI_CpG", myXlab="CpG position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data data(MCviPI_GpC) tsv<-splitMotifs(MCviPI_GpC,"GC") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MCviPI_GpC", myXlab="GpC position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")
# reload unsplit data data(Control_CpG) tsv<-splitMotifs(Control_CpG,"CG") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="Control_CpG", myXlab="CpG position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data data(Control_GpC) tsv<-splitMotifs(Control_GpC,"GC") ttTi5605gr$ID<-"ttTi5605" methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]] # add markers for CG positions pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]]) pos$y<-rep(-1,length(pos$x)) p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="Control_GpC", myXlab="GpC position") # do single molecule plot with positions of Cs marked. p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")
In general the data looks ok: MSssI_CpG and MCviPI_GpC are both mostly accessible (naked DNA methylated with the appropriate enzyme).
The control GpC and CpG methylation are mostly black ("protected") because not methylated. However MSssI_GpC and MCviPI_CpG (where the DNA has been methylated by one enzyme and then methylation at the other motif was detected), are not looking great.
Normally I will want to extract multiple methMats (e.g. for multiple TSSs) from a single TSV. Here i test if it works.
spltgr<-c(ttTi5605gr,ttTi5605gr) GenomicRanges::start(spltgr)<-c(400,1001) GenomicRanges::end(spltgr)<-c(1000,2100) spltgr spltgr$ID<-c("first","second") data(MSssI_CpG) tsv<-splitMotifs(MSssI_CpG,"CG") methMats<-tsvToMethMat(tsv,spltgr,"CG") plotSingleMoleculesAmp(methMats[["first"]],"first", spltgr, title="MSssI_CpG", myXlab="CpG position in first GR") plotSingleMoleculesAmp(methMats[["second"]],"second", spltgr, title="MSssI_CpG", myXlab="CpG position in second GR")
Standard dSMF will want to merge the CG and GC matrices into a single methylation matrix.
spltgr<-c(ttTi5605gr,ttTi5605gr) GenomicRanges::start(spltgr)<-c(400,1001) GenomicRanges::end(spltgr)<-c(1000,2100) spltgr spltgr$ID<-c("first","second") data(MSssI_CpG) data(MSssI_GpC) tsvCG<-splitMotifs(MSssI_CpG,"CG") tsvGC<-splitMotifs(MSssI_GpC,"GC") g<-mergeCGandGCtsv(tsvCG,tsvGC,ttTi5605dna) methMats<-tsvToMethMat(g,spltgr,"CGorGC") methMatsCG<-tsvToMethMat(tsvCG,spltgr,"CG") methMatsGC<-tsvToMethMat(tsvGC,spltgr,"GC") plotSingleMoleculesAmp(methMats[["first"]],"first", spltgr, title="MSssI_CpG", myXlab="CpG position in first GR") plotSingleMoleculesAmp(methMats[["second"]],"second", spltgr, title="MSssI_CpG", myXlab="CpG position in second GR")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.