knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  echo = TRUE
)

Choosing log likelihood cutoff values

The data shown are generated by calling methylation on nanopore reads using nanopolish. This output two .tsv files, one has frequency of methylation, the other has single molecule data. Here an example of part of the single molecule data

library(nanodsmf)
library(ggplot2)
library(magrittr)
knitr::kable(head(MSssI_CpG),full_width=F)

The Nanopolish calculate methylation frequency script (https://github.com/jts/nanopolish/blob/master/scripts/calculate_methylation_frequency.py) sets positive log_lik_ratio > 0 to methylated, but excludes all positions with an absolute value of log_like_ratio < 2.5. To test how this works on my data i will plot histograms with the different datasets. The cutoff is show in red.

ggplot(MSssI_CpG,aes(x=log_lik_ratio)) +
  geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MSssI_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_CpG,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_CpG,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

Comparing the control to the methylated samples, 2.5 is probably an appropriate cutoff.

Does number of motifs affect the log_lik_ratio

ggplot(MSssI_CpG, aes(x=as.factor(num_motifs),y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")


ggplot(MSssI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

There is a clear increase in log likelihood when more site are present in samples that have been methylated on that motif, and a clear reduction of log likelihood when more sites are present in samples not methylated on that motif. Should some normalisation by the number of motifs should be carried out.

Normalising log_lik_ratio by number of motifs

#normalise all log_lik_ratio by number of motifs
MSssI_CpG$norm_log_lik_ratio<-MSssI_CpG$log_lik_ratio/MSssI_CpG$num_motifs
MSssI_GpC$norm_log_lik_ratio<-MSssI_GpC$log_lik_ratio/MSssI_GpC$num_motifs
MCviPI_CpG$norm_log_lik_ratio<-MCviPI_CpG$log_lik_ratio/MCviPI_CpG$num_motifs
MCviPI_GpC$norm_log_lik_ratio<-MCviPI_GpC$log_lik_ratio/MCviPI_GpC$num_motifs
Control_CpG$norm_log_lik_ratio<-Control_CpG$log_lik_ratio/Control_CpG$num_motifs
Control_GpC$norm_log_lik_ratio<-Control_GpC$log_lik_ratio/Control_GpC$num_motifs


ggplot(MSssI_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")


ggplot(MSssI_GpC, aes(x=as.factor(num_motifs),y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=norm_log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

The boxplots show that normalising by the number of motifs serves to make the log likelihood distribution more similar. Therefore this would be a good strategy when splitting apart separate sites - can simply divide the log likelihood between the sites.

This also makes sense in terms of combined probabilities: A joint probability of independent events would be a multiplication of the individual probabilities and therefore an addition of the logs of the probabilities/likelihoods. If we were to simply assign the fragment probability to each of the constituent sites, that would inflate the methyltion score.

Note that the Nanopolish scripts to calculate methylation frequency first binarises methylation status and then passes this to the sub-sites when "split" is turned on.

ggplot(MSssI_CpG,aes(x=norm_log_lik_ratio)) +
  geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MSssI_GpC,aes(x=norm_log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_CpG,aes(x=norm_log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_GpC,aes(x=norm_log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_CpG,aes(x=norm_log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_GpC,aes(x=norm_log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

This change does not seem to significantly change the distribution of the values and $abs(norm_log_lik_ratio) > 2.5$ is probably still a good threshold.

Plotting split motifs

The function splitMotifs() changes the tsv to contain exact start and end of the CG or GC motif (rather than of the whole sequence fragment surrounding it). For sequences taht contain more than one motif of a particular types, these will be split into multiple rows, with each row containing the exact start and end of the motif, and the log_lik_ratio normalised by the number of motifs in the sequence. The "sequence" and "motif_num" fields are not modified.

#split mutli-motif fragments and normalise log_lik_ratio
MSssI_CpG<-splitMotifs(MSssI_CpG,"CG")
MSssI_GpC<-splitMotifs(MSssI_GpC,"GC")
MCviPI_CpG<-splitMotifs(MCviPI_CpG,"CG")
MCviPI_GpC<-splitMotifs(MCviPI_GpC,"GC")
Control_CpG<-splitMotifs(Control_CpG,"CG")
Control_GpC<-splitMotifs(Control_GpC,"GC")


ggplot(MSssI_CpG, aes(x=as.factor(num_motifs),y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")


ggplot(MSssI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MSssI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(MCviPI_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("MCviPI_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_CpG, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_CpG") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")

ggplot(Control_GpC, aes(x=as.factor(num_motifs), y=log_lik_ratio, group=as.factor(num_motifs), fill=as.factor(num_motifs))) +
  geom_boxplot() + ggtitle("Control_GpC") + scale_fill_brewer(palette="Dark2",name="motifs") +
  xlab("num_motifs")
ggplot(MSssI_CpG,aes(x=log_lik_ratio)) +
  geom_histogram(bins=100) + xlim(-15,15) + ggtitle("MSssI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MSssI_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MSssI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_CpG,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(MCviPI_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("MCviPI_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_CpG,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_CpG") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

ggplot(Control_GpC,aes(x=log_lik_ratio))+
  geom_histogram(bins=100)+ xlim(-15,15) + ggtitle("Control_GpC") +
  geom_vline(xintercept = c(-2.5,2.5), linetype="dashed", color = "red", size=1) 

Creating a methylation matrix based on genomic ranges

Since the ttTi5605 PCR fragment is only 2.3 kb long we will create a matrix over the whole region. The function tsvToMethMat, takes a data table from the methylation called tsv that has been split into individual motifs, a genomic range, and the motif called in that tsv ("CG" or "GC"). It returns a matrix with reads in individual rows, and absolute positions of the motif in the genomic range as columns. Methylated positions have the value 1, non-methylated positions have the value 0 and inconclusive calls have the value NA.

# reload unsplit data
data(MSssI_CpG)
MSssI_CpG<-splitMotifs(MSssI_CpG,"CG")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(MSssI_CpG,ttTi5605gr,"CG")[[1]]
# add markers for CG positions
CGpos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]])
CGpos$y<-rep(-1,length(CGpos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605",ttTi5605gr,featureGRs=c(),myXlab="CpG position",featureLabel="TSS",title=NULL,baseFontSize=12)
# do single molecule plot with positions of Cs marked.
p+geom_point(data=CGpos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")

As expected, most sites are accessible because this was naked DNA. The sites near the start and end of the amplicon were not called - they are probably not included in the region amplified by primers 28 and 32.

I will make the genomic range to be plotted smaller and do the same for all the data sets.

newGR<-ttTi5605gr
GenomicRanges::start(newGR)<-400
GenomicRanges::end(newGR)<-2100
# reload unsplit data
data(MSssI_CpG)
tsv<-splitMotifs(MSssI_CpG,"CG")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MSssI_CpG", myXlab="CpG position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data
data(MSssI_GpC)
tsv<-splitMotifs(MSssI_GpC,"GC")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MSssI_GpC", myXlab="GpC position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")
# reload unsplit data
data(MCviPI_CpG)
tsv<-splitMotifs(MCviPI_CpG,"CG")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MCviPI_CpG", myXlab="CpG position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data
data(MCviPI_GpC)
tsv<-splitMotifs(MCviPI_GpC,"GC")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="MCviPI_GpC", myXlab="GpC position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")
# reload unsplit data
data(Control_CpG)
tsv<-splitMotifs(Control_CpG,"CG")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"CG")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("CG",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="Control_CpG", myXlab="CpG position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="CpG position")
# reload unsplit data
data(Control_GpC)
tsv<-splitMotifs(Control_GpC,"GC")
ttTi5605gr$ID<-"ttTi5605"
methMat<-tsvToMethMat(tsv,ttTi5605gr,"GC")[[1]]
# add markers for CG positions
pos<-data.frame(x=Biostrings::start(Biostrings::vmatchPattern("GC",ttTi5605dna))[[1]])
pos$y<-rep(-1,length(pos$x))
p<-plotSingleMoleculesAmp(methMat,"ttTi5605", newGR, title="Control_GpC", myXlab="GpC position")
# do single molecule plot with positions of Cs marked.
p+geom_point(data=pos,aes(x=x,y=y,color="red"),size=0.5,name="GpC position")

In general the data looks ok: MSssI_CpG and MCviPI_GpC are both mostly accessible (naked DNA methylated with the appropriate enzyme).

The control GpC and CpG methylation are mostly black ("protected") because not methylated. However MSssI_GpC and MCviPI_CpG (where the DNA has been methylated by one enzyme and then methylation at the other motif was detected), are not looking great.

Testing two GR:

Normally I will want to extract multiple methMats (e.g. for multiple TSSs) from a single TSV. Here i test if it works.

spltgr<-c(ttTi5605gr,ttTi5605gr)
GenomicRanges::start(spltgr)<-c(400,1001)
GenomicRanges::end(spltgr)<-c(1000,2100)
spltgr
spltgr$ID<-c("first","second")
data(MSssI_CpG)
tsv<-splitMotifs(MSssI_CpG,"CG")
methMats<-tsvToMethMat(tsv,spltgr,"CG")
plotSingleMoleculesAmp(methMats[["first"]],"first", spltgr, title="MSssI_CpG", myXlab="CpG position in first GR")
plotSingleMoleculesAmp(methMats[["second"]],"second", spltgr, title="MSssI_CpG", myXlab="CpG position in second GR")

merging CG and GC matrices

Standard dSMF will want to merge the CG and GC matrices into a single methylation matrix.

spltgr<-c(ttTi5605gr,ttTi5605gr)
GenomicRanges::start(spltgr)<-c(400,1001)
GenomicRanges::end(spltgr)<-c(1000,2100)
spltgr
spltgr$ID<-c("first","second")
data(MSssI_CpG)
data(MSssI_GpC)
tsvCG<-splitMotifs(MSssI_CpG,"CG")
tsvGC<-splitMotifs(MSssI_GpC,"GC")
g<-mergeCGandGCtsv(tsvCG,tsvGC,ttTi5605dna)
methMats<-tsvToMethMat(g,spltgr,"CGorGC")
methMatsCG<-tsvToMethMat(tsvCG,spltgr,"CG")
methMatsGC<-tsvToMethMat(tsvGC,spltgr,"GC")

plotSingleMoleculesAmp(methMats[["first"]],"first", spltgr, title="MSssI_CpG", myXlab="CpG position in first GR")
plotSingleMoleculesAmp(methMats[["second"]],"second", spltgr, title="MSssI_CpG", myXlab="CpG position in second GR")


CellFateNucOrg/nanodsmf documentation built on July 23, 2020, 4:57 p.m.