Introduction

This document presents an example of the usage of the MLML2R package for R.

The function MLML provides maximum likelihood estimates (MLE) for 5-hmC and 5-mC levels using data from any combination of two of the methods: BS-seq, TAB-seq or oxBS-seq. The function also provides MLE when combining these three methods.

Getting publicly available data

We will use the dataset from Li et al. (2016), which applied oxBS-seq and BS-seq to genomic DNA extracted from four human liver normal-tumor pairs and three human lung normal-tumor pairs (14 samples total). All libraries were sequenced to an average depth of 15.4 × per CpG cytosine. The dataset is available at GEO accession GSE70090.

Platform used: Illumina HiSeq 2000 (Homo sapiens).

To start this example we will need the following packages:

library(GEOquery)

which can be installed in R using the commands:

source("http://www.bioconductor.org/biocLite.R")
biocLite(c("GEOquery"))

The processed data are deposited in GEO and can be downloaded by doing:

if(! file.exists("GSE70090/GSE70090_RAW.tar"))
{
getGEOSuppFiles("GSE70090")
untar("GSE70090/GSE70090_RAW.tar", exdir = "GSE70090/data")
head(list.files("GSE70090/data"))
}
getGEOSuppFiles("GSE70090")
untar("GSE70090/GSE70090_RAW.tar", exdir = "GSE70090/data")
head(list.files("GSE70090/data", pattern = "data"))

Decompress the compressed files:

dataFiles <- list.files("GSE70090/data", pattern = "txt.gz$", full = TRUE)
sapply(dataFiles, gunzip, overwrite = TRUE)

We need to identify the different samples from different methods: BS-conversion, oxBS-conversion. We can use the file names do extract this information.

files <- list.files("GSE70090/data")
filesfull <- list.files("GSE70090/data",full=TRUE)
tissue <- sapply(files,function(x) strsplit(x,"_")[[1]][2]) # tissue
id <- sapply(files,function(x) strsplit(x,"_")[[1]][3]) # sample id
tmp <- sapply(files,function(x) strsplit(x,"_")[[1]][4]) 
convMeth <- sapply(tmp, function(x) strsplit(x,"\\.")[[1]][1]) # DNA conversion method
group <- ifelse(id %in% c("N1","N2","N3","N4"),"normal","tumor")
id2 <- paste(tissue,id,sep="_")
GSM <- sapply(files,function(x) strsplit(x,"_")[[1]][1]) # GSM
pheno <- data.frame(GSM=GSM,tissue=tissue,id=id2,convMeth=convMeth,group=group,file=filesfull,stringsAsFactors = FALSE)

Lung samples

library(data.table)

# selecting only lung samples
phenoLung <- pheno[pheno$tissue=="lung",]
# order to have all BS samples and then all oxBS samples
phenoLung <- phenoLung[order(phenoLung$convMeth,phenoLung$id),]

# BS
files <- phenoLung$file[phenoLung$convMeth=="BS"]

MethylatedBS    <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("methylated_read_count"))))

TotalBS    <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("total_read_count"))))

TotalBS[TotalBS==0] <- NA # 0 total counts = no information

MethylatedBS[is.na(TotalBS)] <- NA # 0 total counts = no information

UnMethylatedBS <- TotalBS - MethylatedBS


# make sure rownames are the same across files
tmp <- fread(files[1], data.table=FALSE, select=c("chr","position"))
CpG <- paste(tmp[,1],tmp[,2],sep="-")

rownames(MethylatedBS) <- CpG 
rownames(UnMethylatedBS) <- CpG

colnames(MethylatedBS) <- phenoLung$id[phenoLung$convMeth=="BS"]
colnames(UnMethylatedBS) <- phenoLung$id[phenoLung$convMeth=="BS"]

# oxBS
files <- phenoLung$file[phenoLung$convMeth=="oxBS"]

MethylatedOxBS    <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("methylated_read_count"))))

TotalOxBS    <- do.call(cbind,lapply(files,function(fn) fread(fn,data.table=FALSE,select=c("total_read_count"))))

TotalOxBS[TotalOxBS==0] <- NA 

MethylatedOxBS[is.na(TotalOxBS)] <- NA 

UnMethylatedOxBS <- TotalOxBS - MethylatedOxBS

rownames(MethylatedOxBS) <- CpG 
rownames(UnMethylatedOxBS) <- CpG

colnames(MethylatedOxBS) <- phenoLung$id[phenoLung$convMeth=="oxBS"]
colnames(UnMethylatedOxBS) <- phenoLung$id[phenoLung$convMeth=="oxBS"]

tmp1 <- which(rowMeans(is.na(MethylatedBS+UnMethylatedBS)) == 1) # CpGs with missing data for BS across all samples
tmp2 <- which(rowMeans(is.na(MethylatedOxBS+UnMethylatedOxBS)) == 1) # CpGs with missing data for oxBS across all samples
tmp <- union(tmp1,tmp2) # CpGs with missing data for all BS samples or all oxBS samples

MethylatedBS <- MethylatedBS[-tmp,]
UnMethylatedBS <- UnMethylatedBS[-tmp,]
MethylatedOxBS <- MethylatedOxBS[-tmp,]
UnMethylatedOxBS <- UnMethylatedOxBS[-tmp,]

save(MethylatedBS,UnMethylatedBS,file="BS-seq.Rds")
save(MethylatedOxBS,UnMethylatedOxBS,file="oxBS-seq.Rds")

Using the MLML2R package

After all the preprocessing procedures, we now can use the MLML2R package to obtain the maximum likelihood estimates for the 5-hmC and 5-mC levels.

Getting the MLE estimates using PAVA:

load("oxBS-seq.Rds")
load("BS-seq.Rds")


library(matrixStats) 

TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS)
tmp1 <- rowMins(TotalBS,na.rm=TRUE)
TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS)
tmp2 <- rowMins(TotalOxBS,na.rm=TRUE)
aa <- which(tmp1>9 & tmp2>9)

set.seed(2018)
aa <- sample(aa,1000000)

Tm = as.matrix(MethylatedBS[aa,])
Um = as.matrix(UnMethylatedBS[aa,])
Lm = as.matrix(UnMethylatedOxBS[aa,])
Mm = as.matrix(MethylatedOxBS[aa,])

rm(MethylatedBS,UnMethylatedBS,UnMethylatedOxBS,MethylatedOxBS)

gc()

library(MLML2R)

start <- Sys.time()
results_exact <- MLML(T.matrix = Tm, 
        U.matrix = Um, 
        L.matrix = Lm, 
        M.matrix = Mm)
print(Sys.time()-start) # 3.505538 secs
save(results_exact,file="results_exact1.rds")

# library(foreach)
# library(doParallel)
# 
# # Set-up parallel backend to use all but one available processors
# no_cores <- detectCores()-1
# 
# cl <- makeCluster(no_cores)
# 
# registerDoParallel(cl)
# nCpGs <- dim(Tm)[1]
# nSpecimens <- dim(Tm)[2]
# 
# # Create container for the MLML results
# results_exact <- array(NA,dim=c(nCpGs,nSpecimens,4))
# dimnames(results_exact) <- list(
#   rownames(Tm)[1:nCpGs],
#   colnames(Tm)[1:nSpecimens], c("mC","hmC","C","methods"))
# 
# start <- Sys.time()
# results_exact[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind,
#                                          .multicombine=TRUE,
#                                          .packages='MLML2R') %dopar% {
#   MLML(T.matrix = Tm[,i], 
#         U.matrix = Um[,i], 
#         L.matrix = Lm[,i], 
#         M.matrix = Mm[,i])
# }
# print(Sys.time()-start) # 
# stopCluster(cl)
# save(results_exact,file="results_exact.rds")

Getting the MLE estimates using EM-algorithm:

load("oxBS-seq.Rds")
load("BS-seq.Rds")

library(matrixStats) 

TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS)
tmp1 <- rowMins(TotalBS,na.rm=TRUE)
TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS)
tmp2 <- rowMins(TotalOxBS,na.rm=TRUE)
aa <- which(tmp1>9 & tmp2>9)

set.seed(2018)
aa <- sample(aa,1000000)

Tm = as.matrix(MethylatedBS[aa,])
Um = as.matrix(UnMethylatedBS[aa,])
Lm = as.matrix(UnMethylatedOxBS[aa,])
Mm = as.matrix(MethylatedOxBS[aa,])

rm(MethylatedBS,UnMethylatedBS,UnMethylatedOxBS,MethylatedOxBS)

gc()

# library(MLML2R)
# 
# start <- Sys.time()
# results_em <- MLML(T.matrix = Tm, 
#         U.matrix = Um, 
#         L.matrix = Lm, 
#         M.matrix = Mm,
#         iterative = TRUE)
# print(Sys.time()-start) # 7.671951 mins

# save(results_em,file="results_em1.rds")

source("testeMLML/MLML2R.R")
start <- Sys.time()
results_em <- MLML2(T.matrix = Tm, 
        U.matrix = Um, 
        L.matrix = Lm, 
        M.matrix = Mm,
        iterative = TRUE)
print(Sys.time()-start) # 9.888107 mins

save(results_em,file="results_em1.rds")


# library(foreach)
# library(doParallel)
# 
# # Set-up parallel backend to use all but one available processors
# no_cores <- detectCores()-1
# 
# cl <- makeCluster(no_cores)
# registerDoParallel(cl)
# 
# # Create container for the MLML results
# results_em <- array(NA,dim=c(nCpGs,nSpecimens,4))
# dimnames(results_em) <- list(
#   rownames(Tm)[1:nCpGs],
#   colnames(Tm)[1:nSpecimens], c("mC","hmC","C","methods"))
# 
# start <- Sys.time()
# 
# results_em[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind,
#                                       .packages='MLML2R') %dopar% {
#   MLML(T.matrix = Tm[,i], 
#         U.matrix = Um[,i], 
#         L.matrix = Lm[,i], 
#         M.matrix = Mm[,i],
#         iterative = TRUE)
# }
# 
# print(Sys.time()-start) # 
# stopCluster(cl)
# 
# 
# save(results_em,file="results_em.rds")

Plot of the results (we have 6 samples)

library(minfi)
par(mfrow =c(1,3))
densityPlot(results_exact$hmC,main= "5-hmC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(results_exact$mC,main= "5-mC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(results_exact$C,main= "uC estimates - MLML",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
library(minfi)
par(mfrow =c(1,3))
densityPlot(results_em1$hmC,main= "5-hmC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(results_em1$mC,main= "5-mC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(results_em1$C,main= "uC estimates - MLML (EM)",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))

Other methods to obtain the estimates

Naive estimates

The naive approach to obtain 5-hmC levels is $\beta_{BS} - \beta_{OxBS}$. This approach results in negative values for the 5-hmC levels.

load("oxBS-seq.Rds")
load("BS-seq.Rds")

library(matrixStats) 

TotalBS <- as.matrix(MethylatedBS+UnMethylatedBS)
tmp1 <- rowMins(TotalBS,na.rm=TRUE)
TotalOxBS <- as.matrix(MethylatedOxBS+UnMethylatedOxBS)
tmp2 <- rowMins(TotalOxBS,na.rm=TRUE)
aa <- which(tmp1>9 & tmp2>9)

set.seed(2018)
aa <- sample(aa,1000000)


beta_BS <- as.matrix(MethylatedBS[aa,]/(MethylatedBS[aa,]+UnMethylatedBS[aa,]))
beta_OxBS <- as.matrix(MethylatedOxBS[aa,]/(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,]))
hmC_naive <- beta_BS-beta_OxBS
C_naive <- 1-beta_BS
mC_naive <- beta_OxBS

save(hmC_naive,mC_naive,C_naive,file="results_naive1.rds")
par(mfrow =c(1,3))
densityPlot(hmC_naive,main= "5-hmC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(mC_naive,main= "5-mC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(C_naive,main= "uC estimates - naive",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))

OxyBS estimates

For the specific case where only ox-BS and BS data are available, OxyBS package from Houseman et al. (2016) can be use to obtain estimates.

library(OxyBS)

# Methylated signals from the BS and oxBS arrays
methBS <- MethylatedBS
methOxBS <- MethylatedOxBS
# Unmethylated signals from the BS and oxBS arrays
unmethBS <- UnMethylatedBS
unmethOxBS <- UnMethylatedOxBS

# Calculate Total Signals
signalBS <- methBS+unmethBS
signalOxBS <- methOxBS+unmethOxBS

# Calculate Beta Values
betaBS <- methBS/signalBS
betaOxBS <- methOxBS/signalOxBS

####################################################
# 4. Apply fitOxBS function to preprocessed values
####################################################

# Select the number of CpGs and Subjects to which the method will be applied 
nCpGs <- dim(unmethOxBS)[1]
nSpecimens <- dim(unmethOxBS)[2]

# Create container for the OxyBS results
MethOxy <- array(NA,dim=c(nCpGs,nSpecimens,3))
dimnames(MethOxy) <- list(
  rownames(methBS)[1:nCpGs],
  colnames(methBS)[1:nSpecimens], c("C","5mC","5hmC"))

# Process results (one array at a time, slow)
#for(i in 1:nSpecimens){
#MethOxy[,i,] <-fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i])
#}


library(foreach)
library(doParallel)

# Set-up parallel backend to use all but one available processors
no_cores <- detectCores()
cl <- makeCluster(no_cores)
registerDoParallel(cl)

start <- Sys.time()

MethOxy[,1:nSpecimens,] <- foreach(i = 1:nSpecimens, .combine=cbind, .packages='OxyBS') %dopar% {
  fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i])
}

print(Sys.time()-start) # 
stopCluster(cl)
save(MethOxy,file="MethOxy.rds")
load("MethOxy.rds")

Plot of the results (we have 4 replicates)

par(mfrow =c(1,3))
densityPlot(MethOxy[,,3],main= "5-hmC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(MethOxy[,,2],main= "5-mC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))
densityPlot(MethOxy[,,1],main= "uC estimates - OxyBS",cex.axis=1.4,cex.main=1.5,cex.lab=1.4,xlab="Proportion",sampGroups=c(rep("normal",3),rep("tumor",3)))

oxBS.MLE estimates

ENmix package had the function oxBS.MLE from Xu et al. (2016) that can be uses to obtain estimates for the specific case where ox-BS and BS data are available.

beta_BS <- as.matrix(MethylatedBS[aa,]/(MethylatedBS[aa,]+UnMethylatedBS[aa,]))
beta_OxBS <- as.matrix(MethylatedOxBS[aa,]/(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,]))
N_BS <- as.matrix(MethylatedBS[aa,]+UnMethylatedBS[aa,])
N_OxBS <- as.matrix(MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,])
colnames(beta_BS) <- c("N1","N2","N3","T1","T2","T3")
colnames(beta_OxBS) <- c("N1","N2","N3","T1","T2","T3")
colnames(N_BS) <- c("N1","N2","N3","T1","T2","T3")
colnames(N_OxBS) <- c("N1","N2","N3","T1","T2","T3")

library(ENmix)
oxBSMLEresults <- oxBS.MLE(beta.BS=beta_BS,beta.oxBS=beta_OxBS,
                           N.BS=N_BS,N.oxBS=N_OxBS)

Comparison of 5-hmC estimates from different methods

library(GGally)
# data for replicate 1 is shown
df <- data.frame(x = as.numeric(results_exact$hmC[,1]),y=as.numeric(results_em1$hmC[,1]),
                 z = as.numeric(oxBSMLEresults[[2]][,1]),t=as.numeric(MethOxy[,1,3]),w=as.numeric(hmC_naive[,1]))

df[1:10,]

g <- ggpairs(df, title = " ",
   axisLabels = "show", columnLabels = c("MLML - exact","MLML - EM","oxBS.MLE","OxyBS","Naive"))
g
library(ggplot2)
ggplot(df,aes(x=x,y=z)) + geom_point(alpha = 0.3) + xlab("Exact MLE") +
  ylab("OxyBS")

ggplot(df,aes(x=y,y=z)) + geom_point(alpha = 0.3) + xlab("EM") +
  ylab("OxyBS")

Comparison of processing times from different methods

library(OxyBS)
library(microbenchmark)
library(MLML2R)
source("testeMLML/MLML2R.R")
signalBS <- MethylatedBS[aa,]+UnMethylatedBS[aa,]
signalOxBS <- MethylatedOxBS[aa,]+UnMethylatedOxBS[aa,]
betaBS <- MethylatedBS[aa,]/signalBS
betaOxBS <- MethylatedOxBS[aa,]/signalOxBS

nCpGs <- dim(signalBS)[1]
nSpecimens <- dim(signalBS)[2]
MethOxy1 <- array(NA,dim=c(nCpGs,nSpecimens,3))
dimnames(MethOxy1) <- list(
  rownames(MethylatedBS)[1:nCpGs],
  colnames(MethylatedBS)[1:nSpecimens], c("C","5mC","5hmC"))

oxyBS <- function()
{
  for(i in 1:nSpecimens){
MethOxy1[,i,] <-fitOxBS(betaBS[,i],betaOxBS[,i],signalBS[,i],signalOxBS[,i])
  }
}



mbm = microbenchmark(
  EXACT = MLML2(T.matrix = Tm, 
               U.matrix = Um, 
               L.matrix = Lm, 
               M.matrix = Mm),
  EM = MLML2(T.matrix = Tm, 
               U.matrix = Um, 
               L.matrix = Lm, 
               M.matrix = Mm,
             iterative=TRUE),
  oxBSMLE = oxBS.MLE(beta.BS=beta_BS,
                     beta.oxBS=beta_OxBS,
                     N.BS=N_BS,
                     N.oxBS=N_OxBS),
  oxyBS_res = oxyBS(),
  times=1)
save(mbm,file="mbm.rds")
load("mbm.rds")

| Method | Function | Package | Time (Seconds) | |-----------|:-----------------------:|---------:|-------------------------------------------:| | Iterative | MLML (iterative=TRUE) | MLML2R | r round(mbm$time[mbm$expr=="EM"]/(1e9),3) | | Iterative | fitOxBS | OxyBS | r round(mbm$time[mbm$expr=="oxyBS_res"]/(1e9),3) | | Closed-form analytical | MLML (iterative=FALSE) | MLML2R | r round(mbm$time[mbm$expr=="EXACT"]/(1e9),3) | | Closed-form analytical | oxBS.MLE | ENmix | r round(mbm$time[mbm$expr=="oxBSMLE"]/(1e9),3) |



samarafk/MLML2R documentation built on Oct. 19, 2019, 6:04 p.m.