This document presents an example of the usage of the MLML2R
package for R.
The function MLML
provides maximum likelihood estimates (MLE) for 5-hmC and 5-mC levels using data from any combination of two of the methods: BS-seq, TAB-seq or oxBS-seq. The function also provides MLE when combining these three methods.
We will use the dataset from Li et al. (2016), which applied oxBS-seq and BS-seq to genomic DNA extracted from four human liver normal-tumor pairs and three human lung normal-tumor pairs (14 samples total). All libraries were sequenced to an average depth of 15.4 × per CpG cytosine. The dataset is available at GEO accession GSE70090.
Platform used: Illumina HiSeq 2000 (Homo sapiens).
To start this example we will need the following packages:
library(GEOquery)
which can be installed in R using the commands:
source("http://www.bioconductor.org/biocLite.R") biocLite(c("GEOquery"))
The processed data are deposited in GEO and can be downloaded by doing:
if(! file.exists("GSE70090/GSE70090_RAW.tar")) { getGEOSuppFiles("GSE70090") untar("GSE70090/GSE70090_RAW.tar", exdir = "GSE70090/data") head(list.files("GSE70090/data")) }
getGEOSuppFiles("GSE70090") untar("GSE70090/GSE70090_RAW.tar", exdir = "GSE70090/data") head(list.files("GSE70090/data", pattern = "data"))
Decompress the compressed files:
dataFiles <- list.files("GSE70090/data", pattern = "txt.gz$", full = TRUE) sapply(dataFiles, gunzip, overwrite = TRUE)
We need to identify the different samples from different methods: BS-conversion, oxBS-conversion. We can use the file names do extract this information.
files <- list.files("GSE70090/data") filesfull <- list.files("GSE70090/data",full=TRUE) tissue <- sapply(files,function(x) strsplit(x,"_")[[1]][2]) # tissue id <- sapply(files,function(x) strsplit(x,"_")[[1]][3]) # sample id tmp <- sapply(files,function(x) strsplit(x,"_")[[1]][4]) convMeth <- sapply(tmp, function(x) strsplit(x,"\\.")[[1]][1]) # DNA conversion method group <- ifelse(id %in% c("N1","N2","N3","N4"),"normal","tumor") id2 <- paste(tissue,id,sep="_") GSM <- sapply(files,function(x) strsplit(x,"_")[[1]][1]) # GSM pheno <- data.frame(GSM=GSM,tissue=tissue,id=id2,convMeth=convMeth,group=group,file=filesfull,stringsAsFactors = FALSE)
library(data.table) # selecting only lung samples phenoLung <- pheno[pheno$tissue=="lung",] # order to have all BS samples and then all oxBS samples phenoLung <- phenoLung[order(phenoLung$convMeth,phenoLung$id),] # BS files <- phenoLung$file[phenoLung$convMeth=="BS"] MethylatedBS <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("methylated_read_count")))) TotalBS <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("total_read_count")))) TotalBS[TotalBS==0] <- NA # 0 total counts = no information MethylatedBS[is.na(TotalBS)] <- NA # 0 total counts = no information UnMethylatedBS <- TotalBS - MethylatedBS # make sure rownames are the same across files tmp <- fread(files[1], data.table=FALSE, select=c("chr","position")) CpG <- paste(tmp[,1],tmp[,2],sep="-") rownames(MethylatedBS) <- CpG rownames(UnMethylatedBS) <- CpG colnames(MethylatedBS) <- phenoLung$id[phenoLung$convMeth=="BS"] colnames(UnMethylatedBS) <- phenoLung$id[phenoLung$convMeth=="BS"] # oxBS files <- phenoLung$file[phenoLung$convMeth=="oxBS"] MethylatedOxBS <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("methylated_read_count")))) TotalOxBS <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("total_read_count")))) TotalOxBS[TotalOxBS==0] <- NA MethylatedOxBS[is.na(TotalOxBS)] <- NA UnMethylatedOxBS <- TotalOxBS - MethylatedOxBS rownames(MethylatedOxBS) <- CpG rownames(UnMethylatedOxBS) <- CpG colnames(MethylatedOxBS) <- phenoLung$id[phenoLung$convMeth=="oxBS"] colnames(UnMethylatedOxBS) <- phenoLung$id[phenoLung$convMeth=="oxBS"] tmp1 <- which(rowMeans(is.na(MethylatedBS+UnMethylatedBS)) == 1) # CpGs with missing data for BS across all samples tmp2 <- which(rowMeans(is.na(MethylatedOxBS+UnMethylatedOxBS)) == 1) # CpGs with missing data for oxBS across all samples tmp <- union(tmp1,tmp2) # CpGs with missing data for all BS samples or all oxBS samples MethylatedBS <- MethylatedBS[-tmp,] UnMethylatedBS <- UnMethylatedBS[-tmp,] MethylatedOxBS <- MethylatedOxBS[-tmp,] UnMethylatedOxBS <- UnMethylatedOxBS[-tmp,] save(MethylatedBS,UnMethylatedBS,file="BS-seq.Rds") save(MethylatedOxBS,UnMethylatedOxBS,file="oxBS-seq.Rds")
MLML2R
packageAfter all the preprocessing procedures, we now can use the MLML2R
package to obtain the maximum likelihood estimates for the 5-hmC and 5-mC levels.
Getting the MLE estimates using PAVA:
load("oxBS-seq.Rds") load("BS-seq.Rds") library(matrixStats) TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS) tmp1 <- rowMins(TotalBS,na.rm=TRUE) TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS) tmp2 <- rowMins(TotalOxBS,na.rm=TRUE) aa <- which(tmp1>9 & tmp2>9) set.seed(2018) aa <- sample(aa,1000000) Tm = as.matrix(MethylatedBS[aa,]) Um = as.matrix(UnMethylatedBS[aa,]) Lm = as.matrix(UnMethylatedOxBS[aa,]) Mm = as.matrix(MethylatedOxBS[aa,]) rm(MethylatedBS,UnMethylatedBS,UnMethylatedOxBS,MethylatedOxBS) gc() library(MLML2R) start <- Sys.time() results_exact <- MLML(T.matrix = Tm, U.matrix = Um, L.matrix = Lm, M.matrix = Mm) print(Sys.time()-start) # 3.505538 secs save(results_exact,file="results_exact1.rds") # library(foreach) # library(doParallel) # # # Set-up parallel backend to use all but one available processors # no_cores <- detectCores()-1 # # cl <- makeCluster(no_cores) # # registerDoParallel(cl) # nCpGs <- dim(Tm)[1] # nSpecimens <- dim(Tm)[2] # # # Create container for the MLML results # results_exact <- array(NA,dim=c(nCpGs,nSpecimens,4)) # dimnames(results_exact) <- list( # rownames(Tm)[1:nCpGs], # colnames(Tm)[1:nSpecimens], c("mC","hmC","C","methods")) # # start <- Sys.time() # results_exact[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind, # .multicombine=TRUE, # .packages='MLML2R') %dopar% { # MLML(T.matrix = Tm[,i], # U.matrix = Um[,i], # L.matrix = Lm[,i], # M.matrix = Mm[,i]) # } # print(Sys.time()-start) # # stopCluster(cl) # save(results_exact,file="results_exact.rds")
Getting the MLE estimates using EM-algorithm:
load("oxBS-seq.Rds") load("BS-seq.Rds") library(matrixStats) TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS) tmp1 <- rowMins(TotalBS,na.rm=TRUE) TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS) tmp2 <- rowMins(TotalOxBS,na.rm=TRUE) aa <- which(tmp1>9 & tmp2>9) set.seed(2018) aa <- sample(aa,1000000) Tm = as.matrix(MethylatedBS[aa,]) Um = as.matrix(UnMethylatedBS[aa,]) Lm = as.matrix(UnMethylatedOxBS[aa,]) Mm = as.matrix(MethylatedOxBS[aa,]) rm(MethylatedBS,UnMethylatedBS,UnMethylatedOxBS,MethylatedOxBS) gc() # library(MLML2R) # # start <- Sys.time() # results_em <- MLML(T.matrix = Tm, # U.matrix = Um, # L.matrix = Lm, # M.matrix = Mm, # iterative = TRUE) # print(Sys.time()-start) # 7.671951 mins # save(results_em,file="results_em1.rds") source("testeMLML/MLML2R.R") start <- Sys.time() results_em <- MLML2(T.matrix = Tm, U.matrix = Um, L.matrix = Lm, M.matrix = Mm, iterative = TRUE) print(Sys.time()-start) # 9.888107 mins save(results_em,file="results_em1.rds") # library(foreach) # library(doParallel) # # # Set-up parallel backend to use all but one available processors # no_cores <- detectCores()-1 # # cl <- makeCluster(no_cores) # registerDoParallel(cl) # # # Create container for the MLML results # results_em <- array(NA,dim=c(nCpGs,nSpecimens,4)) # dimnames(results_em) <- list( # rownames(Tm)[1:nCpGs], # colnames(Tm)[1:nSpecimens], c("mC","hmC","C","methods")) # # start <- Sys.time() # # results_em[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind, # .packages='MLML2R') %dopar% { # MLML(T.matrix = Tm[,i], # U.matrix = Um[,i], # L.matrix = Lm[,i], # M.matrix = Mm[,i], # iterative = TRUE) # } # # print(Sys.time()-start) # # stopCluster(cl) # # # save(results_em,file="results_em.rds")
Plot of the results (we have 6 samples)
library(minfi) par(mfrow =c(1,3)) densityPlot(results_exact$hmC,main= "5-hmC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(results_exact$mC,main= "5-mC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(results_exact$C,main= "uC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
library(minfi) par(mfrow =c(1,3)) densityPlot(results_em1$hmC,main= "5-hmC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(results_em1$mC,main= "5-mC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(results_em1$C,main= "uC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
The naive approach to obtain 5-hmC levels is $\beta_{BS} - \beta_{OxBS}$. This approach results in negative values for the 5-hmC levels.
load("oxBS-seq.Rds") load("BS-seq.Rds") library(matrixStats) TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS) tmp1 <- rowMins(TotalBS,na.rm=TRUE) TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS) tmp2 <- rowMins(TotalOxBS,na.rm=TRUE) aa <- which(tmp1>9 & tmp2>9) set.seed(2018) aa <- sample(aa,1000000) beta_BS <- as.matrix(MethylatedBS[aa,]/(MethylatedBS[aa,]+UnMethylatedBS[aa,])) beta_OxBS <- as.matrix(MethylatedOxBS[aa,]/(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,])) hmC_naive <- beta_BS-beta_OxBS C_naive <- 1-beta_BS mC_naive <- beta_OxBS save(hmC_naive,mC_naive,C_naive,file="results_naive1.rds")
par(mfrow =c(1,3)) densityPlot(hmC_naive,main= "5-hmC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(mC_naive,main= "5-mC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(C_naive,main= "uC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
OxyBS
estimatesFor the specific case where only ox-BS and BS data are available, OxyBS
package from Houseman et al. (2016) can be use to obtain estimates.
library(OxyBS) # Methylated signals from the BS and oxBS arrays methBS <- MethylatedBS methOxBS <- MethylatedOxBS # Unmethylated signals from the BS and oxBS arrays unmethBS <- UnMethylatedBS unmethOxBS <- UnMethylatedOxBS # Calculate Total Signals signalBS <- methBS+unmethBS signalOxBS <- methOxBS+unmethOxBS # Calculate Beta Values betaBS <- methBS/signalBS betaOxBS <- methOxBS/signalOxBS #################################################### # 4. Apply fitOxBS function to preprocessed values #################################################### # Select the number of CpGs and Subjects to which the method will be applied nCpGs <- dim(unmethOxBS)[1] nSpecimens <- dim(unmethOxBS)[2] # Create container for the OxyBS results MethOxy <- array(NA,dim=c(nCpGs,nSpecimens,3)) dimnames(MethOxy) <- list( rownames(methBS)[1:nCpGs], colnames(methBS)[1:nSpecimens], c("C","5mC","5hmC")) # Process results (one array at a time, slow) #for(i in 1:nSpecimens){ #MethOxy[,i,] <-fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i]) #} library(foreach) library(doParallel) # Set-up parallel backend to use all but one available processors no_cores <- detectCores() cl <- makeCluster(no_cores) registerDoParallel(cl) start <- Sys.time() MethOxy[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind, .packages='OxyBS') %dopar% { fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i]) } print(Sys.time()-start) # stopCluster(cl)
save(MethOxy,file="MethOxy.rds")
load("MethOxy.rds")
Plot of the results (we have 4 replicates)
par(mfrow =c(1,3)) densityPlot(MethOxy[,,3],main= "5-hmC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(MethOxy[,,2],main= "5-mC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3))) densityPlot(MethOxy[,,1],main= "uC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
oxBS.MLE
estimatesENmix
package had the function oxBS.MLE
from Xu et al. (2016) that can be uses to obtain estimates for the specific case where ox-BS and BS data are available.
beta_BS <- as.matrix(MethylatedBS[aa,]/(MethylatedBS[aa,]+UnMethylatedBS[aa,])) beta_OxBS <- as.matrix(MethylatedOxBS[aa,]/(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,])) N_BS <- as.matrix(MethylatedBS[aa,]+UnMethylatedBS[aa,]) N_OxBS <- as.matrix(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,]) colnames(beta_BS) <- c("N1","N2","N3","T1","T2","T3") colnames(beta_OxBS) <- c("N1","N2","N3","T1","T2","T3") colnames(N_BS) <- c("N1","N2","N3","T1","T2","T3") colnames(N_OxBS) <- c("N1","N2","N3","T1","T2","T3") library(ENmix) oxBSMLEresults <- oxBS.MLE(beta.BS=beta_BS,beta.oxBS=beta_OxBS, N.BS=N_BS,N.oxBS=N_OxBS)
library(GGally) # data for replicate 1 is shown df <- data.frame(x = as.numeric(results_exact$hmC[,1]),y=as.numeric(results_em1$hmC[,1]), z = as.numeric(oxBSMLEresults[[2]][,1]),t=as.numeric(MethOxy[,1,3]),w=as.numeric(hmC_naive[,1])) df[1:10,] g <- ggpairs(df, title = " ", axisLabels = "show", columnLabels = c("MLML - exact","MLML - EM","oxBS.MLE","OxyBS","Naive")) g
library(ggplot2) ggplot(df,aes(x=x,y=z)) + geom_point(alpha = 0.3) + xlab("Exact MLE") + ylab("OxyBS") ggplot(df,aes(x=y,y=z)) + geom_point(alpha = 0.3) + xlab("EM") + ylab("OxyBS")
library(OxyBS) library(microbenchmark) library(MLML2R) source("testeMLML/MLML2R.R") signalBS <- MethylatedBS[aa,]+UnMethylatedBS[aa,] signalOxBS <- MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,] betaBS <- MethylatedBS[aa,]/signalBS betaOxBS <- MethylatedOxBS[aa,]/signalOxBS nCpGs <- dim(signalBS)[1] nSpecimens <- dim(signalBS)[2] MethOxy1 <- array(NA,dim=c(nCpGs,nSpecimens,3)) dimnames(MethOxy1) <- list( rownames(MethylatedBS)[1:nCpGs], colnames(MethylatedBS)[1:nSpecimens], c("C","5mC","5hmC")) oxyBS <- function() { for(i in 1:nSpecimens){ MethOxy1[,i,] <-fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i]) } } mbm = microbenchmark( EXACT = MLML2(T.matrix = Tm, U.matrix = Um, L.matrix = Lm, M.matrix = Mm), EM = MLML2(T.matrix = Tm, U.matrix = Um, L.matrix = Lm, M.matrix = Mm, iterative=TRUE), oxBSMLE = oxBS.MLE(beta.BS=beta_BS, beta.oxBS=beta_OxBS, N.BS=N_BS, N.oxBS=N_OxBS), oxyBS_res = oxyBS(), times=1)
save(mbm,file="mbm.rds")
load("mbm.rds")
| Method | Function | Package | Time (Seconds) |
|-----------|:-----------------------:|---------:|-------------------------------------------:|
| Iterative | MLML
(iterative=TRUE
) | MLML2R
| r round(mbm$time[mbm$expr=="EM"]/(1e9),3)
|
| Iterative | fitOxBS
| OxyBS
| r round(mbm$time[mbm$expr=="oxyBS_res"]/(1e9),3)
|
| Closed-form analytical | MLML
(iterative=FALSE
) | MLML2R
| r round(mbm$time[mbm$expr=="EXACT"]/(1e9),3)
|
| Closed-form analytical | oxBS.MLE
| ENmix
| r round(mbm$time[mbm$expr=="oxBSMLE"]/(1e9),3)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.