knitr::opts_chunk$set(echo = TRUE)
suppressMessages(library(EMclassifieR))

The data

DSMF data comes as matrices of reads x positions where each row refers to a single (usually paired-end) read and each column refers to a CpG or GpC position where methylation has been scored. The positions should be in relative coordinates (e.g. -250 to +250 around the TSS), so that matrices from different genes can be compared. Some example data is included with the package. First we read in a table with a list of the matrix filenames (including path). The important columns in this table are "filename" and "reads".

tablePath="csv/MatrixLog_relCoord_ampTSS.csv"
matTable<-read.csv(system.file("extdata", tablePath, package = "EMclassifieR",
                               mustWork=TRUE), stringsAsFactors=F)
head(matTable)

The matrices are in RDS format. For learning it is probably best to process them to remove any reads containing NAs.

i=1
dataMatrix<-readRDS(system.file("extdata", matTable$filename[i], package = "EMclassifieR", mustWork=TRUE))
dim(dataMatrix)
dataMatrix<-removeNArows(dataMatrix)
dim(dataMatrix)

# dataMatrix<-readRDS("~/Documents/MeisterLab/myPackages/EMclassifieR/vignettes/EMres_cosine/dS02-182_WBGene00015947_K2.rds")
# dataMatrix<-removeNArows(dataMatrix)
# 
# colourChoice=list(low="blue", mid="white", high="red", bg="white", lines="grey80")
# colourChoice=list(low="yellow", mid="grey20", high="red", bg="black", lines="grey80")
# 
# 
# p<-plotClassesSingleMolecule(dataMatrix,
#                                 xRange=c(-250,250), title="Reads by classes",
#                                 myXlab="CpG/GpC position",
#                                 featureLabel="TSS", baseFontSize=12,
#                                 segmentSize=5,
#                                 colourChoice=colourChoice)
# print(p)

Learning classes for single genes

To look at one gene at a time one can use the minimal matrix which includes only positions with a CpG or GpC motif in them.

k_range = 2:8      # Number of classes to be found
maxIterations = 100 # number of iterations of EM clustering to perform if it does not converge
convergenceError = 10e-6
numRepeats=10 # number of repeats of clustering each matrix (to account for fraction of methylation)
xRange=c(-250,250)
maxB=50 # Number of randomised matrices to generate
outPath="./EMres"
maxTasks=4
taskId=1
nThreads=4
setSeed=FALSE
distMetric=list(name="cosineDist",rescale=T)

if(!dir.exists(outPath)){
  dir.create(outPath)
}

#split table indicies into nTasks number of groups
taskSubList<-split(1:nrow(matTable),sort(1:nrow(matTable)%%maxTasks))

set.seed(200413)
for (i in taskSubList[[taskId]]){
#for (i in 1:nrow(matTable)) {
  regionName=matTable$region[i]
  sampleName=matTable$sample[i]
  outFileBase=paste(sampleName, regionName, sep="_")
  print(paste("Clustering", outFileBase))
  dataMatrix<-readRDS(system.file("extdata", matTable$filename[i], 
                              package = "EMclassifieR", mustWork=TRUE))
  dim(dataMatrix)
  dataMatrix<-removeNArows(dataMatrix,maxNAfraction=0.2)
  #dataMatrix<-recodeMatrixAsNumeric(dataMatrix)
  dim(dataMatrix)

  allClassMeans<-tryCatch(
    {
      print("running EM for a range of class numbers")
      runEMrangeClassNum(dataMatrix, k_range, convergenceError, maxIterations,
                     EMrepeats=numRepeats, outPath=outPath, xRange=xRange,
                     outFileBase=paste(sampleName, regionName, sep="_"),
                     doIndividualPlots=FALSE, distMetric=distMetric)

    },
      error=function(e){"Matrix not valid"}
  )

  if(is.list(allClassMeans)){
        saveRDS(allClassMeans,paste0(outPath,"/allClassMeans_",outFileBase,".rds"))
  } else {
        print(allClassMeans) # error message
  }

  clustMetrics<-tryCatch(
    {
        print("plotting clustering metrics for a range of class sizes")
        plotClusteringMetrics(dataMatrix, k_range, maxB, convergenceError,
            maxIterations, outPath, outFileBase, EMrep=NULL, nThreads=nThreads,
            setSeed=setSeed, distMetric=distMetric)
    },
    error=function(e){"Matrix not valid"}
  )
  if(length(clustMetrics)==1) {
    print(clustMetrics)
  }

  pcaPlots<-tryCatch(
    {
      print("plotting PCA of clusters")
      plotPCAofMatrixClasses(k_range, outPath, outFileBase)
    },
    error=function(e){"Matrix not valid"}
   )
  if(length(pcaPlots)==1) {
    print(pcaPlots)
  }

  umapPlots<-tryCatch( 
    {
      print("plotting UMAP of clusters")
      plotUMAPofMatrixClasses(k_range, outPath, outFileBase)
    },
    error=function(e){"Matrix not valid"}
   )
  if(length(umapPlots)==1) {
    print(umapPlots)
  }

}

Plotting classes with track data

dataMatrix<-readRDS(system.file("extdata", matTable$filename[i], package = "EMclassifieR", mustWork=TRUE))
regionName=matTable$region[i]
sampleName=matTable$sample[i]
outFileBase=paste(sampleName, regionName, sep="_")

allClassMeansList<-readRDS("/Users/semple/Documents/MeisterLab/myPackages/EMclassifieR/inst/extdata/rds/allClassMeans_dS02-182_WBGene00015947.rds")
numClasses=5
allClassMeans<-allClassMeansList[[numClasses]]
allClassMeans

ampTSS<-readRDS('/Users/semple/Documents/MeisterLab/myPackages/EMclassifieR/inst/extdata/rds/ampliconMaxTSSgr.RDS')


winSize=500
tssWin<-ampTSS
GenomicRanges::mcols(tssWin)$TSS<-GenomicRanges::start(tssWin)
tssWin<-GenomicRanges::resize(tssWin,width=winSize,fix="center")


names(GenomicRanges::mcols(tssWin))[1]<-"ID"
regionGR<-tssWin[i]
featureGR<-ampTSS[i]
regionName=matTable$region[i]
sampleName=matTable$sample[i]
outFileBase=paste(sampleName, regionName, sep="_")
#seqlevels(ampTSS)<-seqlevels(Celegans)
#seqinfo(ampTSS)<-seqinfo(Celegans)
#saveRDS(ampTSS,'/Users/semple/Documents/MeisterLab/myPackages/EMclassifieR/inst/extdata/rds/ampliconMaxTSSgr.RDS')

# genomeVer="WS275"
# genomeDir=paste0("~/Documents/MeisterLab/GenomeVer/",genomeVer)
# txdb<-AnnotationDbi::loadDb(paste0(genomeDir, 
#                                    "/annotations/c_elegans.PRJNA13758.",
#                                    genomeVer,  ".annotations.sqlite"))
# seqlevelsStyle(txdb)<-"ucsc"
# GenomeInfoDb::genome(txdb)<-GenomeInfoDb::genome(regionGR)
#library("TxDb.Celegans.UCSC.ce11.refGene")
txdb <- TxDb.Celegans.UCSC.ce11.refGene::TxDb.Celegans.UCSC.ce11.refGene
bigwigListFile<-"~/Documents/MeisterLab/sequencingData/public/bigwigFileList.txt"
#bigwigListFile<-"~/Documents/MeisterLab/sequencingData/public/bigwigFileList_kranz.txt"
strandedBwFile<-"~/Documents/MeisterLab/sequencingData/public/strandedBwFile.txt"
gr<-allClassMeansToGR(allClassMeans, regionGR)

dTrack<-Gviz::DataTrack(gr,name="dSMF Classes")
Gviz::plotTracks(dTrack, groups=rep(c(paste0("class",1:numClasses)), times=10),
                 type=c("a","p","confint"))

print(txdb)
outPath="./EMres"
grDevices::pdf(paste0(outPath,"/tracks_",
                          outFileBase,"_K",
                          numClasses, ".pdf"),
                   paper="a4", height=11, width=8)

plotClassMeansWithTracks(allClassMeans, regionGR, "middle", txdb, featureGR, 
                         bigwigListFile, strandedBwFile)
dev.off()

Multigene classification

First you must create a data set combining and equal amount of reads from different matrices

tablePath="csv/MatrixLog_relCoord_ampTSS.csv"
matTable<-read.csv(system.file("extdata", tablePath, package = "EMclassifieR",
                               mustWork=TRUE), stringsAsFactors=F)
matTable<-matTable[!is.na(matTable$filename),]

head(matTable)
multiGeneMat<-NULL
genesIncluded<-0
minReads=50
binSize=20
maxNAfraction=0.2

for(i in 1:nrow(matTable[matTable$sample=="dS02-182"])){
  regionName=matTable$region[i]
  sampleName=matTable$sample[i]
  outFileBase=paste(sampleName, regionName, sep="_")
  print(paste0("reading matrix number ",i))
  dataMatrix<-readRDS(system.file("extdata", matTable$filename[i], 
                                  package = "EMclassifieR", mustWork=TRUE))
  #Make sure matrix rows do not have too many NAs
  dataMatrix<-removeNArows(dataMatrix, maxNAfraction=maxNAfraction, removeAll0=T)
  subMatrix<-selectReadsFromMatrix(dataMatrix,minReads=minReads,
                                 addToReadName=outFileBase,
                                 preferBest=T)
  if(!is.null(subMatrix)){
    fullMatrix<-getFullMatrix(subMatrix)
    winMatrix<-prepareWindows(fullMatrix, binSize=binSize)
    genesIncluded<-genesIncluded+1
    if(is.null(multiGeneMat)){
      multiGeneMat<-winMatrix
    } else {
      multiGeneMat<-rbind(multiGeneMat,winMatrix)
    }
  }
}
print(paste(genesIncluded,"genes included in the multi gene matrix"))

#multiGeneMat<-rescale_minus1To1(multiGeneMat)
#multiGeneMat<-rescale_0To1(multiGeneMat)
#multiGeneMat<-recodeMatrixAsNumeric(multiGeneMat)

k_range = 2:8      # Number of classes to be found
maxIterations = 100 # number of iterations of EM clustering to perform if it does not converge
convergenceError = 10e-6
numRepeats=10 # number of repeats of clustering each matrix (to account for fraction of methylation)
xRange=c(-250,250)
maxB=50 # Number of randomised matrices to generate
outPath="./EMres_multigene_test_mi"
maxTasks=4
taskId=1
nThreads=4
setSeed=FALSE
#distMetric=list(name="cosineDist",valNA=0,rescale=F)
#distMetric=list(name="canberraDist",rescale=T)
#distMetric=list(name="correlationDist",valNA=0.5,rescale=T)
#distMetric=list(name="euclidean",valNA=0.5,rescale=T)
distMetric=list(name="mutualInformation",valNA=0.5,rescale=F)

if(!dir.exists(outPath)){
  dir.create(outPath)
}


set.seed(200413)

regionName="multiGene"
sampleName="dS02-182"
outFileBase=paste(sampleName, regionName, sep="_")
print(paste("Clustering", outFileBase))
dataMatrix<-multiGeneMat
dim(dataMatrix)
#dataMatrix<-removeNArows(dataMatrix,maxNAfraction=0)
dim(dataMatrix)

pdf(file=paste0("hist_w",binSize,"_NA",maxNAfraction,"_genes",genesIncluded,".pdf"),width=11, height=8, paper="a4r")
hist(rowSums(is.na(multiGeneMat)),main=paste0("winSize: ",binSize,", maxNA: ",maxNAfraction,
                                              ", genesIncluded:",genesIncluded),
     xlab="number of NA windows per molecule",xlim=c(0,500),col="blue")
dev.off()
allClassMeans<-tryCatch(
  {
    print("running EM for a range of class numbers")
    runEMrangeClassNum(dataMatrix, k_range, convergenceError, maxIterations,
                       EMrepeats=numRepeats, outPath=outPath, xRange=xRange,
                       outFileBase=paste(sampleName, regionName, sep="_"),
                       doIndividualPlots=FALSE, distMetric=distMetric)

  },
  error=function(e){"Matrix not valid"}
)

if(is.list(allClassMeans)){
  saveRDS(allClassMeans,paste0(outPath,"/allClassMeans_",outFileBase,".rds"))
} else {
  print(allClassMeans) # error message
}

clustMetrics<-tryCatch(
  {
    print("plotting clustering metrics for a range of class sizes")
    plotClusteringMetrics(dataMatrix, k_range, maxB, convergenceError,
                          maxIterations, outPath, outFileBase, EMrep=NULL,
                          nThreads=nThreads,
                          setSeed=setSeed, distMetric=distMetric)
  },
  error=function(e){"Matrix not valid"}
)
if(length(clustMetrics)==1) {
  print(clustMetrics)
}

pcaPlots<-tryCatch(
  {
    print("plotting PCA of clusters")
    plotPCAofMatrixClasses(k_range, outPath, outFileBase)
  },
  error=function(e){"Matrix not valid"}
)
if(length(pcaPlots)==1) {
  print(pcaPlots)
}

umapPlots<-tryCatch(
  {
    print("plotting UMAP of clusters")
    plotUMAPofMatrixClasses(k_range, outPath, outFileBase)
  },
  error=function(e){"Matrix not valid"}
)
if(length(umapPlots)==1) {
  print(umapPlots)
}

print("plotting classes per gene")
plotGenesPerClass(k_range, outPath, outFileBase)

# to use -1 to 1
#https://stats.stackexchange.com/questions/256917/can-an-unstandarized-beta-distribution-have-a-negative-domain

sorting reads of a matrix according to classMeans

allClassMeans<-readRDS(system.file("extdata", 
          "EMres/allClassMeans_dS02-182_multiGene.rds",
            package = "EMclassifieR", mustWork=T))

classes<-getClassMeansMatrix(allClassMeans,numClasses=6)



classes<-as.matrix(readRDS(paste0("../../../sequencingData/TCI_clusters/euc8-10_w30/classMeans_w30_euc.rds")))
classes<-classes[order(rowMeans(classes)),]
saveRDS(classes,paste0("../../../sequencingData/TCI_clusters/euc8-10_w30/classMeans_w30_euc_sort.rds"))
rowMeans(classes)
classes<-padEndToRange(classes,xRange=c(-250,250))
classes<-readRDS(paste0("../../../sequencingData/TCI_clusters/euc8-10_w30/classMeans_w30_euc_sort.rds"))

p<-plotClassMeans(classes,xRange=c(-250,250), facet=TRUE,
                         title="Class means",
                         myXlab="CpG/GpC position",featureLabel="TSS",
                         baseFontSize=12)

ggplot2::ggsave("../../../sequencingData/TCI_clusters/euc8-10_w30/classMeans_w30_euc_sort.pdf",p,
                height=11,width=4,device="pdf")

dataMatrix<-readRDS(system.file("extdata",
                                "EMres/dS02-182_multiGene_K6.rds",
                                package="EMclassifieR", mustWork=T))
dataMatrix<-readRDS(system.file("extdata",
                            "rds/relCoord_ampTSS/dS02-182_WBGene00009620.rds",
                                package="EMclassifieR", mustWork=T))

orderedMat<-dataMatrix
dataMatrix<-dataMatrix[sample(1:dim(dataMatrix)[1],replace=F),]

rownames(dataMatrix)<-gsub("__class.*$","",rownames(dataMatrix))

distMetric=list(name="euclidean",rescale=T) #list(name="euclidean")

dataOrderedByClass<-runClassLikelihoodRpts(dataMatrix, classes, 
                                numRepeats=20, outPath=".",
                                xRange=c(-250,250), outFileBase="test",
                                distMetric=distMetric, classesToPlot=NULL)

dataMatrix<-dataOrderedByClass[-grep("__class0[1,2,3,4]",row.names(dataOrderedByClass)),]

tablePath="csv/MatrixLog_relCoord_ampTSS.csv"
matTable<-read.csv(system.file("extdata", tablePath, package = "EMclassifieR",
                               mustWork=TRUE), stringsAsFactors=F)


jsemple19/EMclassifieR documentation built on Aug. 12, 2022, 2:57 p.m.