inst/doc/BioMMtutorial.R

## ----style, echo = FALSE, results = 'asis'------------------------------------
BiocStyle::markdown()

## ----global_options, include=FALSE-------------------------------------------------------------------------------------------------
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, fig.width=8, 
fig.height=8)
options(width=133) 

## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------
#  ## Do not execute if you have already installed BioMM.
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  # The following initializes usage of Bioc devel
#  BiocManager::install(version='devel')
#  BiocManager::install("BioMM")

## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------
#  install.packages("devtools")
#  library("devtools")
#  install_github("transbioZI/BioMM", build_vignettes=TRUE)

## ----loadPkg, eval=TRUE, results="hide"--------------------------------------------------------------------------------------------
library(BioMM)
library(BiocParallel) 
library(ranger)
library(rms)
library(glmnet)
library(e1071)
library(precrec)
library(vioplot)
library(CMplot)
library(imager)
library(topGO)
library(xlsx)

## ----studyData, eval=TRUE----------------------------------------------------------------------------------------------------------
## DNA methylation data 
methylData <- readRDS(system.file("extdata", "/methylData.rds", package="BioMM"))
# The first column is the label, and the rest are the features (e.g., CpGs) 
head(methylData[,1:4])
# 0: control and 1: patient
table(methylData[,1]) 
dim(methylData)

## Gene expression data
expData <- readRDS(system.file("extdata", "/expData.rds", package="BioMM"))
# The first column is the label, and the rest are the features (e.g., genes) 
head(expData[,1:4])
# 0: control and 1: patient
table(expData[,1]) 
dim(expData)

## ----annotationFile, eval=TRUE-----------------------------------------------------------------------------------------------------
## Load feature annotation data
featureAnno <- readRDS(system.file("extdata", "cpgAnno.rds", package="BioMM"))
# The mapping between CpGs and genes (i.e. entrezID or gene symbol)
head(featureAnno)
# total number of CpGs under investigation
str(unique(featureAnno[,1]))

## Reprocessed Gene ontological pathway annotation with 10 and 200 genes for each pathway
golist <- readRDS(system.file("extdata", "goDB.rds", package="BioMM")) 
## Number of investigated biological processes
length(golist)
str(golist[1:3])

## Reprocessed KEGG pathway annotation with 10 and 200 genes for each pathway
kegglist <- readRDS(system.file("extdata", "keggDB.rds", package="BioMM"))  
## Number of investigated KEGG pathways 
length(kegglist)
str(kegglist[1:3]) 


## ----pathlist, eval=TRUE-----------------------------------------------------------------------------------------------------------
## Feature annotation to pathways 
## Use 100 pathways to reduce the runtime for the downstream analysis. But if possible, please make sure to use all.
numPath <- 100

# GO pathway mapping using DNA methylation data
golistSub <- golist[seq_len(numPath)]
methylGOlist <- omics2pathlist(data=methylData, pathlistDB=golistSub, 
                               featureAnno=featureAnno, 
                               restrictUp=200, restrictDown=10, minPathSize=10) 
# KEGG pathway mapping using DNA methylation data
kegglistSub <- kegglist[seq_len(numPath)]
methylKEGGlist <- omics2pathlist(data=methylData, pathlistDB=kegglistSub, 
                                 featureAnno=featureAnno, 
                                 restrictUp=200, restrictDown=10, minPathSize=10) 

# GO pathway mapping using gene expression data
golistSub <- golist[seq_len(numPath)]
expGOlist <- omics2pathlist(data=expData, pathlistDB=golistSub, 
                            featureAnno=NULL, 
                            restrictUp=200, restrictDown=10, minPathSize=10) 
# KEGG pathway mapping using gene expression data
kegglistSub <- kegglist[seq_len(numPath)]
expKEGGlist <- omics2pathlist(data=expData, pathlistDB=kegglistSub, 
                              featureAnno=NULL, 
                              restrictUp=200, restrictDown=10, minPathSize=10) 


## ----BioMMrandForest4methylGO, eval=TRUE-------------------------------------------------------------------------------------------
## To reduce the runtime, only use a subset of DNA methylation data
## However, if possible, subsetting the data is not suggested.
trainData <- methylData[,1:3000]  
trainDataY <- trainData[,1]
testData <- NULL

## Model parameters
supervisedStage1=TRUE
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)   
core <- MulticoreParam(workers = 10) 

set.seed(123)
result <- BioMM(trainData=trainData, testData=NULL, 
                pathlistDB=golistSub, featureAnno, 
                restrictUp=200, restrictDown=10, minPathSize=10, 
                supervisedStage1, typePCA="regular", 
                resample1="BS", resample2="CV", dataMode="allTrain",
                repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1, 
                nfolds=10, FSmethod1=NULL, FSmethod2=NULL, 
                cutP1=0.05, cutP2=0.05, fdr2=NULL, 
                FScore=MulticoreParam(workers = 1), 
                classifier, predMode, paramlist, innerCore=core)

if (is.null(testData)) {
    metricCV <- getMetrics(dataY = trainDataY, predY = result)
    message("Cross-validation prediction performance:")
    print(metricCV)
} else if (!is.null(testData)){
    testDataY <- testData[,1]  
    metricCV <- getMetrics(dataY = trainDataY, cvYscore = result[[1]])
    metricTest <- getMetrics(dataY = testDataY, testYscore = result[[2]])
    message("Cross-validation performance:")
    print(metricCV)
    message("Test set prediction performance:")
    print(metricTest)
}


## ----BioMMrandForest4expKEGG, eval=TRUE--------------------------------------------------------------------------------------------
## to reduce the runtime, only use a subset of gene expression data
## However, if possible, subsetting the data is not suggested.
trainData <- expData[,1:3000] 
trainDataY <- trainData[,1] 
testData <- NULL 
## Only for gene expression data
featureAnno=NULL
## Model parameters
supervisedStage1=TRUE
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)   
core <- MulticoreParam(workers = 10) 

set.seed(123)
result <- BioMM(trainData=trainData, testData=NULL, 
                pathlistDB=kegglistSub, featureAnno, 
                restrictUp=200, restrictDown=10, minPathSize=10, 
                supervisedStage1, typePCA="regular", 
                resample1="BS", resample2="CV", dataMode="allTrain",
                repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1, 
                nfolds=10, FSmethod1=NULL, FSmethod2=NULL, 
                cutP1=0.05, cutP2=0.05, fdr2=NULL, 
                FScore=MulticoreParam(workers = 1), 
                classifier, predMode, paramlist, innerCore=core)

## Cross-validation is applied on the training data, therefore 'result' only returns the CV predicted score.
metricCV <- getMetrics(dataY = trainDataY, predY = result)
message("Cross-validation prediction performance:")
print(metricCV)

## ----stage2dataAprep, eval=TRUE----------------------------------------------------------------------------------------------------
## Define the omics type 
# omicsType <- "methylation"
omicsType <- "expression"
pathType <- "GO"
pathType <- "KEGG"
if (omicsType == "methylation" & pathType == "GO"){
    studylist <- methylGOlist
} else if (omicsType == "methylation" & pathType == "KEGG"){
    studylist <- methylKEGGlist
} else if (omicsType == "expression" & pathType == "GO"){
    studylist <- expGOlist
} else if (omicsType == "expression" & pathType == "KEGG"){
    studylist <- expKEGGlist
} else {
    stop("Wrong specified omicsType and pathType!")
} 

length(studylist)

## Model parameters 
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)   
core <- MulticoreParam(workers = 10) 

set.seed(123)
stage2dataA <- reconBySupervised(trainDataList=studylist, 
                   testDataList=NULL,
                   resample="BS", dataMode="allTrain",
                   repeatA=50, repeatB=1, nfolds=10,
                   FSmethod=NULL, cutP=0.05, fdr=NULL, 
                   FScore=MulticoreParam(workers = 1),
                   classifier, predMode, paramlist,
                   innerCore=core, outFileA=NULL, outFileB=NULL)
## Check stage-2 data
dim(stage2dataA)
print(table(stage2dataA[,1]))
head(stage2dataA[,1:4])


## ----stage2dataViz, eval=TRUE------------------------------------------------------------------------------------------------------
core <- MulticoreParam(workers = 10)   
fileName <- paste0(omicsType,"_", pathType, "_featuresVarExplained.png")
plotVarExplained(data=stage2dataA, posF=TRUE, binarize=FALSE, core=core, 
                 pathTitle=paste0(pathType, " pathways"), fileName)

plot(load.image(fileName)) 


## ----topPathFeatures, eval=TRUE, fig.show="hold"-----------------------------------------------------------------------------------
core <- MulticoreParam(workers = 1)   
rankMetric <- "negPlogit" 
filePrefix <- paste0(omicsType, "_", pathType, "_topPath_", rankMetric)
topPath <- plotRankedFeature(data=stage2dataA, 
                             posF=TRUE, topF=10, binarize=FALSE, 
                             blocklist=studylist,  
                             rankMetric=rankMetric, 
                             colorMetric="size",  core, 
                             pathTitle=paste0(pathType, " pathways"), 
                             fileName=paste0(filePrefix, ".png"))
plot(load.image(paste0(filePrefix, ".png")))   


## ----reportTopPath, eval=TRUE------------------------------------------------------------------------------------------------------
## Report the top pathways
if (pathType == "GO"){
    
  goterms = unlist(Term(GOTERM))  
  topGOID = gsub("\\.", ":", rownames(topPath))
  subTerm = goterms[is.element(names(goterms), topGOID)] 
  topIDmatch = subTerm[match(topGOID, names(subTerm))]  
  topPath <- data.frame(topPath, Description=topIDmatch)
  
} else if (pathType == "KEGG"){
  ## A matching list between KEGG ID and names. Data freezes on Aug 2020
  keggID2name <- readRDS(system.file("extdata", "/keggID2name202008.rds", 
                                     package="BioMM"))  
  keggSub <- keggID2name[is.element(keggID2name[,"ID"], rownames(topPath)),]
  topIDmatch <- keggSub[match(rownames(topPath), keggSub[,"ID"]),] 
  topPath <- data.frame(topPath, Description=topIDmatch[,"name"])
}

print(topPath) 
write.xlsx(topPath,file=paste0(filePrefix, ".xlsx"))
# write.table(topPath,file=paste0(filePrefix, ".txt"), sep="\t")


## ----cirPlot, eval=TRUE, fig.show="hold"-------------------------------------------------------------------------------------------
core <- MulticoreParam(workers = 10)   
pathID <- gsub("\\.", ":", rownames(topPath))
## The number of top pathways must be bigger than overall investigated pathways
pathSet <- studylist[is.element(names(studylist), pathID)]
pathMatch <- pathSet[match(pathID, names(pathSet))]
fileName <- paste0(omicsType, "_", pathType, "_SigRankBy_", rankMetric)

cirPlot4pathway(datalist=pathMatch, topPathID=names(pathMatch), core, fileName)



## ----sessioninfo, eval=TRUE--------------------------------------------------------------------------------------------------------
sessionInfo()

Try the BioMM package in your browser

Any scripts or data that you put into this service are public.

BioMM documentation built on Nov. 8, 2020, 11:04 p.m.