Nothing
#' Prediction by Machine Learning
#'
#' @description
#' Prediction by Machine Learning with different learners ( From 'mlr3' )
#' @param trainData The input training dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param testData The input test dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param predMode The prediction mode.Currently only supports 'probability' for binary classification tasks.
#' @param classifier Learners in mlr3
#' @param paramlist Learner parameters search spaces
#' @param inner_folds k-fold cross validation ( Only supported when testData = NULL )
#' @param seed Cross-Validation seed
#'
#' @return The predicted output for the test data.
#' @import mlr3verse
#' @importFrom mlr3 as_task_classif lrn rsmp msr resample as_task_regr
#' @export
#' @author Shunjie Zhang
#' @examples
#'library(mlr3verse)
#'library(caret)
#'library(BioM2)
#'data=MethylData_Test
#'set.seed(1)
#'part=unlist(createDataPartition(data$label,p=0.8))#Split data
#'predict=baseModel(trainData=data[part,1:10],
#' testData=data[-part,1:10],
#' classifier = 'svm')#Use 10 features to make predictions,Learner uses svm
#'
#'
baseModel=function ( trainData, testData, predMode = "probability",
classifier,paramlist=NULL, inner_folds=10,seed=10){
set.seed(seed)
predMode=match.arg(predMode)
if(!is.null(testData)){
if (colnames(trainData)[1] != "label") {
stop("The first column of the 'trainData' must be the 'label'!")
}
if (colnames(testData)[1] != "label") {
stop("The first column of the 'testData' must be the 'label'!")
}
if( predMode == 'probability'){
if(is.character(classifier)){
classifier=paste0('classif.',classifier,'')
model=lrn(classifier,predict_type = "prob")
}else{
model=classifier
}
trainData[,1]=as.factor(trainData[,1])
testData[,1]=as.factor(testData[,1])
trainData=as_task_classif(trainData,target='label')
testData=as_task_classif(testData,target='label')
if(!is.null(paramlist)){
sink(nullfile())
at = auto_tuner(
tuner = tnr("grid_search", resolution = 10, batch_size = 5),
learner = model,
search_space = paramlist,
resampling =rsmp("cv", folds =5),
measure = msr("classif.auc")
)
at$train(trainData)
model$param_set$values = at$tuning_result$learner_param_vals[[1]]
model$train(trainData)
sink()
predict=model$predict(testData)$prob[,2]
return(predict)
}else{
sink(nullfile())
model$train(trainData)
sink()
predict=model$predict(testData)$prob[,2]
return(predict)
}
}else if( predMode == 'regression'){
if(is.character(classifier)){
classifier=paste0('regr.',classifier,'')
model=lrn(classifier)
}else{
model=classifier
}
trainData[,1]=as.numeric(trainData[,1])
testData[,1]=as.numeric(testData[,1])
trainData=as_task_regr(trainData,target='label')
testData=as_task_regr(testData,target='label')
if(!is.null(paramlist)){
at = auto_tuner(
tuner = tnr("grid_search", resolution = 5, batch_size = 5),
learner = model,
search_space = paramlist,
resampling = rsmp("cv", folds =5),
measure = msr("regr.mae")
)
at$train(trainData)
model$param_set$values = at$tuning_result$learner_param_vals[[1]]
model$train(trainData)
predict=model$predict(testData)$response
return(predict)
}else{
model$train(trainData)
predict=model$predict(testData)$response
return(predict)
}
}
}
else if(is.null(testData)){
if (colnames(trainData)[1] != "label") {
stop("The first column of the 'trainData' must be the 'label'!")
}
if( predMode == 'probability'){
if(is.character(classifier)){
classifier=paste0('classif.',classifier,'')
model=lrn(classifier,predict_type = "prob")
}else{
model=classifier
}
trainData[,1]=as.factor(trainData[,1])
trainData=as_task_classif(trainData,target='label')
set.seed(seed)
sink(nullfile())
rr=resample(trainData, model, rsmp("cv", folds = inner_folds))$prediction()
sink()
re=as.data.frame(as.data.table(rr))[,c(1,5)]
re=re[order(re$row_ids),][,2]
return(re)
}else if(predMode == 'regression'){
classifier=paste0('regr.',classifier,'')
trainData=as_task_regr(trainData,target='label')
model=lrn(classifier)
set.seed(seed)
sink(nullfile())
rr=resample(trainData, model, rsmp("cv", folds = inner_folds))$prediction()
sink()
re=as.data.frame(as.data.table(rr))[,c(1,3)]
re=re[order(re$row_ids),][,2]
return(re)
}
}
}
#' Stage 1 Fearture Selection
#'
#' @param Stage1_FeartureSelection_Method Feature selection methods. Available options are
#' c(NULL, 'cor', 'wilcox.test', 'cor_rank', 'wilcox.test_rank').
#' @param data The input training dataset. The first column is the label.
#' @param cutoff The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1. Commonly used cutoffs are c(0.5, 0.1, 0.05, 0.01, etc.).
#' @param featureAnno The annotation data stored in a data.frame for probe
#' mapping. It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param pathlistDB_sub A list of pathways with pathway IDs and their
#' corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param MinfeatureNum_pathways The minimal defined pathway size after mapping your
#' own data to pathlistDB(KEGG database/GO database).
#' @param cores The number of cores used for computation.
#' @param verbose Whether to print running process information to the console
#'
#' @return A list of matrices with pathway IDs as the associated list member
#' names.
#' @import parallel
#' @importFrom stats wilcox.test
#' @export
#' @author Shunjie Zhang
#' @examples
#'
#' library(parallel)
#' data=MethylData_Test
#' feature_pathways=Stage1_FeartureSelection(Stage1_FeartureSelection_Method='cor',
#' data=data,cutoff=0,
#' featureAnno=MethylAnno,pathlistDB_sub=GO2ALLEGS_BP,cores=1)
#'
Stage1_FeartureSelection=function(Stage1_FeartureSelection_Method='cor',data=NULL,cutoff=NULL,
featureAnno=NULL,pathlistDB_sub=NULL,MinfeatureNum_pathways=10,cores=1,verbose=TRUE){
if(Sys.info()[1]=="Windows"){
cores=1
}
if(Stage1_FeartureSelection_Method=='cor'){
if(verbose)print(paste0(' Using << correlation >>',' ,and you choose cutoff:',cutoff))
Cor=stats::cor(data$label,data)
Cor=ifelse(Cor>0,Cor,-Cor)
names(Cor)=colnames(data)
Cor_names=names(Cor)
Cor_cutoff=Cor[which(Cor>cutoff)]
Cor_cutoff_names=names(Cor_cutoff)
MinfeatureNum_pathways2=MinfeatureNum_pathways+1
feature_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
if(length(id)>MinfeatureNum_pathways){
id2=id[which(id %in% Cor_cutoff_names)]
if(length(id2)<MinfeatureNum_pathways2){
a=Cor[id]
id2=names(a)[order(a,decreasing = T)[1:MinfeatureNum_pathways2]]
return(id2)
}else{
return(id2)
}
}else{
return(id)
}
} ,mc.cores=cores)
return(feature_pathways)
}else if(Stage1_FeartureSelection_Method=='wilcox.test'){
if(verbose)print(paste0(' Using << wilcox.test >>',' ,and you choose cutoff:',cutoff))
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
Cor=unlist(mclapply(1:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
names(Cor)=colnames(data)
Cor_names=names(Cor)
Cor_cutoff=Cor[which(Cor<cutoff)]
Cor_cutoff_names=names(Cor_cutoff)
MinfeatureNum_pathways2= MinfeatureNum_pathways+1
feature_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
if(length(id)> MinfeatureNum_pathways){
id2=id[which(id %in% Cor_cutoff_names)]
if(length(id2)< MinfeatureNum_pathways2){
a=Cor[id]
id2=names(a)[order(a,decreasing = F)[1:MinfeatureNum_pathways2]]
return(id2)
}else{
return(id2)
}
}else{
return(id)
}
} ,mc.cores=cores)
return(feature_pathways)
}else if(Stage1_FeartureSelection_Method=='cor_rank'){
if(verbose)print(paste0(' Using << correlation_rank >>',' ,and you choose cutoff:',cutoff))
Cor=stats::cor(data$label,data)
Cor=ifelse(Cor>0,Cor,-Cor)
len=length(Cor)*cutoff
names(Cor)=colnames(data)
Cor_names=names(Cor)
Cor_cutoff=order(Cor,decreasing=T)[1:len]
Cor_cutoff_names=names(Cor_cutoff)
feature_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
if(length(id)>10){
id2=id[which(id %in% Cor_cutoff_names)]
if(length(id2)<11){
a=Cor[id]
id2=names(a)[order(a,decreasing = T)[1:11]]
return(id2)
}else{
return(id2)
}
}else{
return(id)
}
} ,mc.cores=cores)
return(feature_pathways)
}else if(Stage1_FeartureSelection_Method=='wilcox.test_rank'){
if(verbose)print(paste0(' Using << wilcox.test_rank >>',' ,and you choose cutoff:',cutoff))
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
Cor=unlist(mclapply(1:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
len=length(Cor)*cutoff
names(Cor)=colnames(data)
Cor_names=names(Cor)
Cor_cutoff=order(Cor)[1:len]
Cor_cutoff_names=names(Cor_cutoff)
feature_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
if(length(id)>10){
id2=id[which(id %in% Cor_cutoff_names)]
if(length(id2)<11){
a=Cor[id]
id2=names(a)[order(a,decreasing = F)[1:11]]
return(id2)
}else{
return(id2)
}
}else{
return(id)
}
} ,mc.cores=cores)
return(feature_pathways)
}else{
feature_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
return(id)
} ,mc.cores=cores)
return(feature_pathways)
}
}
#' Stage 2 Fearture Selection
#'
#' @param Stage2_FeartureSelection_Method Feature selection methods. Available options are
#' c(NULL, 'cor', 'wilcox.test','cor_rank','wilcox.test_rank','RemoveHighcor', 'RemoveLinear').
#' @param data The input training dataset. The first column is the label.
#' @param label The label of dataset
#' @param cutoff The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1.
#' @param preMode The prediction mode. "Currently only supports 'probability' for binary classification tasks."
#' @param classifier Learners in mlr3
#' @param cores The number of cores used for computation.
#' @param verbose Whether to print running process information to the console
#'
#' @return Column index of feature
#' @export
#' @import parallel
#' @importFrom stats wilcox.test
#' @import caret
#' @author Shunjie Zhang
#'
#'
Stage2_FeartureSelection=function(Stage2_FeartureSelection_Method='RemoveHighcor',data=NULL,label=NULL,cutoff=NULL,preMode=NULL,classifier=NULL,verbose=TRUE,cores=1){
if(Sys.info()[1]=="Windows"){
cores=1
}
if(preMode=='probability' | preMode=='classification'){
if(Stage2_FeartureSelection_Method=='cor'){
if(!is.null(label)){
up=ifelse(classifier=='lda',0.999,100)
corr=sapply(1:length(data),function(x) stats::cor(data[[x]],label,method='pearson'))
if(verbose)print(paste0(' |> Final number of pathways >>>',length(which(corr>cutoff & corr < up)),'......Min correlation of pathways>>>',round(min(corr[which(corr > cutoff & corr < up)]),digits = 3)))
index=which(corr>cutoff & corr < up )
return(index)
}else{
data=do.call(rbind,data)
label=data[,1]
corr=sapply(2:ncol(data),function(x) stats::cor(data[,x],label,method='pearson'))
index=which(corr > cutoff)
if(verbose)print(paste0(' |> Final number of pathways >>> ',length(index),'......Min correlation of pathways>>>',round(min(corr[index]),digits = 3)))
index=index+1
return(index)
}
}else if(Stage2_FeartureSelection_Method=='cor_rank'){
if(!is.null(label)){
up=ifelse(classifier=='lda',0.999,100)
corr=sapply(1:length(data),function(x) stats::cor(data[[x]],label,method='pearson'))
if(cutoff < length(corr)){
index=order(corr,decreasing = T)[1:cutoff]
}else{
index=order(corr,decreasing = T)
}
#index=which(pvalue < cutoff)
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Min correlation of pathways>>>',round(min(corr[index]),digits = 3)))
return(index)
}else{
data=do.call(rbind,data)
label=data[,1]
corr=sapply(2:ncol(data),function(x) stats::cor(data[,x],label,method='pearson'))
if(cutoff < length(corr)){
index=order(corr,decreasing = T)[1:cutoff]
}else{
index=order(corr,decreasing = T)
}
if(verbose)print(paste0(' |> Final number of pathways >>> ',length(index),'......Min correlation of pathways>>>',round(min(corr[index]),digits = 3)))
index=index+1
return(index)
}
}else if(Stage2_FeartureSelection_Method=='wilcox.test'){
if(!is.null(label)){
data=do.call(cbind,data)
data=cbind(label=label,data)
data=as.data.frame(data)
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
pvalue=unlist(mclapply(2:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
corr=stats::cor(data$label,data[,-1])
index=which(pvalue < cutoff & corr>0)
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Max p-value of pathways>>>',round(max(pvalue[index]),digits = 3)))
return(index)
}else{
data=do.call(rbind,data)
data=as.data.frame(data)
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
pvalue=unlist(mclapply(2:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
corr=stats::cor(data$label,data[,-1])
index=which(pvalue < cutoff & corr>0)
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Max p-value of pathways>>>',round(max(pvalue[index]),digits = 3)))
index=index+1
return(index)
}
}else if(Stage2_FeartureSelection_Method=='wilcox.test_rank'){
if(!is.null(label)){
data=do.call(cbind,data)
data=cbind(label=label,data)
data=as.data.frame(data)
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
pvalue=unlist(mclapply(2:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
corr=stats::cor(data$label,data[,-1])
if(cutoff < length(which(corr>0))){
index=order(pvalue)
unindex=which(corr<0)
index=setdiff(index,unindex)
index=index[1:cutoff]
}else{
index=order(pvalue)
unindex=which(corr<0)
index=setdiff(index,unindex)
}
#index=which(pvalue < cutoff)
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Max p-value of pathways>>>',round(max(pvalue[index]),digits = 3)))
return(index)
}else{
data=do.call(rbind,data)
data=as.data.frame(data)
data_0=data[which(data$label==unique(data$label)[1]),]
data_1=data[which(data$label==unique(data$label)[2]),]
pvalue=unlist(mclapply(2:ncol(data),function(x) wilcox.test(data_0[,x],data_1[,x])$p.value,mc.cores=cores))
corr=stats::cor(data$label,data[,-1])
if(cutoff < length(which(corr>0))){
index=order(pvalue)
unindex=which(corr<0)
index=setdiff(index,unindex)
index=index[1:cutoff]
}else{
index=order(pvalue)
unindex=which(corr<0)
index=setdiff(index,unindex)
}
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Max p-value of pathways>>>',round(max(pvalue[index]),digits = 3)))
index=index+1
return(index)
}
}else if(Stage2_FeartureSelection_Method=='RemoveHighcor'){
if(!is.null(label)){
corr=sapply(1:length(data),function(x) stats::cor(data[[x]],label,method='pearson'))
data=do.call(cbind,data)
corm=stats::cor(data)
unindex=caret::findCorrelation(corm,cutoff =cutoff)
index=which(corr > 0)
index=setdiff(index,unindex)
if(verbose)print(paste0(' |> Final number of pathways >>> ',length(index),'......Min correlation of pathways>>>',round(min(corr[index]),digits = 3)))
return(index)
}else{
data=do.call(rbind,data)
label=data[,1]
corr=sapply(2:ncol(data),function(x) stats::cor(data[,x],label,method='pearson'))
index=which(corr > 0)
data=data[,-1]
corm=stats::cor(data)
unindex=caret::findCorrelation(corm,cutoff =cutoff)
index=setdiff(index,unindex)
index=index+1
return(index)
}
}else if(Stage2_FeartureSelection_Method=='RemoveLinear'){
if(!is.null(label)){
corr=sapply(1:length(data),function(x) stats::cor(data[[x]],label,method='pearson'))
data=do.call(cbind,data)
unindex=caret::findLinearCombos(data)$remove
index=which(corr > 0)
index=setdiff(index,unindex)
if(verbose)print(paste0(' |> Final number of pathways >>>',length(index),'......Min correlation of pathways>>>',round(min(corr[index]),digits = 3)))
return(index)
}else{
data=do.call(rbind,data)
label=data[,1]
corr=sapply(2:ncol(data),function(x) stats::cor(data[,x],label,method='pearson'))
index=which(corr > 0)
data=data[,-1]
unindex=caret::findLinearCombos(data)$remove
index=setdiff(index,unindex)
index=index+1
return(index)
}
}else{
if(!is.null(label)){
up=ifelse(classifier=='lda',0.99,100)
corr=sapply(1:length(data),function(x) stats::cor(data[[x]],label,method='pearson'))
if(verbose)print(paste0(' |> Final number of pathways >>>',length(order(corr,decreasing=T)[which(corr > 0 & corr < up)]),'......Min correlation of pathways>>>',round(min(corr[which(corr > 0 & corr < up)]),digits = 3)))
index=which(corr > 0 & corr < up )
return(index)
}else{
data=do.call(rbind,data)
label=data[,1]
corr=sapply(2:ncol(data),function(x) stats::cor(data[,x],label,method='pearson'))
index=which(corr > 0)
index=index+1
return(index)
}
}
}
if(preMode=='regression'){
}
}
#' Add unmapped probe
#'
#' @param train The input training dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param test The input test dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param Unmapped_num The number of unmapped probes.
#' @param Add_FeartureSelection_Method Feature selection methods. Available options are
#' c('cor', 'wilcox.test').
#' @param anno The annotation data stored in a data.frame for probe
#' mapping. It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param len The number of unmapped probes
#' @param cores The number of cores used for computation.
#' @param verbose Whether to print running process information to the console
#'
#' @return Matrix of unmapped probes
#' @export
#' @import parallel
#' @importFrom stats wilcox.test
#'
AddUnmapped=function(train=NULL,test=NULL,Unmapped_num=NULL,Add_FeartureSelection_Method='wilcox.test',anno=NULL,len=NULL,verbose=TRUE,cores=1){
requireNamespace("parallel")
if(Sys.info()[1]=="Windows"){
cores=1
}
if(is.null(Add_FeartureSelection_Method) | Add_FeartureSelection_Method=='wilcox.test'){
Unmapped_Train=train[,setdiff(colnames(train),anno$ID)]
Unmapped_Test=test[,setdiff(colnames(train),anno$ID)]
Unmapped_0=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[1]),]
Unmapped_1=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[2]),]
Unmapped_pvalue=unlist(mclapply(2:ncol(Unmapped_Train),function(x) wilcox.test(Unmapped_0[,x],Unmapped_1[,x])$p.value,mc.cores=cores))
if(is.null(Unmapped_num)){
Unmapped_num=len-1
}
if(length(Unmapped_pvalue)<Unmapped_num){
Unmapped_num=length(Unmapped_pvalue)
}
Unmapped_id=order(Unmapped_pvalue)[1:Unmapped_num]
Unmapped_id=Unmapped_id+1
Unmapped_Train=Unmapped_Train[,Unmapped_id]
Unmapped_Test=Unmapped_Test[,Unmapped_id]
Unmapped_Data=list('train'= Unmapped_Train,'test'= Unmapped_Test)
if(verbose)print(paste0(' |> Add Unmapped features==>>',length(Unmapped_id)))
return(Unmapped_Data)
}else if(Add_FeartureSelection_Method=='cor'){
Unmapped_Train=train[,setdiff(colnames(train),anno$ID)]
Unmapped_Test=test[,setdiff(colnames(train),anno$ID)]
Unmapped_pvalue=unlist(mclapply(2:ncol(Unmapped_Train),function(x) stats::cor(Unmapped_Train$label,Unmapped_Train[,x]),mc.cores=cores))
Unmapped_pvalue=ifelse(Unmapped_pvalue>0,Unmapped_pvalue,-Unmapped_pvalue)
if(is.null(Unmapped_num)){
Unmapped_num=len-1
}
if(length(Unmapped_pvalue)<Unmapped_num){
Unmapped_num=length(Unmapped_pvalue)
}
Unmapped_id=order(Unmapped_pvalue)[1:Unmapped_num]
Unmapped_id=Unmapped_id+1
Unmapped_Train=Unmapped_Train[,Unmapped_id]
Unmapped_Test=Unmapped_Test[,Unmapped_id]
Unmapped_Data=list('train'= Unmapped_Train,'test'= Unmapped_Test)
if(verbose)print(paste0(' |> Add Unmapped features==>>',length(Unmapped_id)))
return(Unmapped_Data)
}
}
#' Biologically Explainable Machine Learning Framework
#'
#' @param TrainData The input training dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param TestData The input test dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param pathlistDB A list of pathways with pathway IDs and their
#' corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param FeatureAnno The annotation data stored in a data.frame for probe
#' mapping. It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param resampling Resampling in mlr3verse.
#' @param nfolds k-fold cross validation ( Only supported when TestData = NULL )
#' @param classifier Learners in mlr3
#' @param paramlist Learner parameters search spaces
#' @param predMode The prediction mode. Currently only supports 'probability' for binary classification tasks.
#' @param PathwaySizeUp The upper-bound of the number of genes in each
#' biological pathways.
#' @param PathwaySizeDown The lower-bound of the number of genes in each
#' biological pathways.
#' @param MinfeatureNum_pathways The minimal defined pathway size after mapping your
#' own data to pathlistDB(KEGG database/GO database).
#' @param Add_UnMapped Whether to add unmapped probes for prediction
#' @param Unmapped_num The number of unmapped probes
#' @param Add_FeartureSelection_Method Feature selection methods.
#' @param Inner_CV Whether to perform a k-fold verification on the training set.
#' @param inner_folds k-fold verification on the training set.
#' @param Stage1_FeartureSelection_Method Feature selection methods.
#' @param cutoff The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1.
#' @param Stage2_FeartureSelection_Method Feature selection methods.
#' @param cutoff2 The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1.
#' @param classifier2 Learner for stage 2 prediction(if classifier2==NULL,then it is the same as the learner in stage 1.)
#' @param target Is it used to predict or explore potential biological mechanisms?
#' Available options are c('predict', 'pathways').
#' @param p.adjust.method p-value adjustment method.(holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
# "fdr", "none")
#' @param save_pathways_matrix Whether to output the path matrix file
#' @param cores The number of cores used for computation.
#' @param seed Cross-Validation seed
#' @param verbose Whether to print running process information to the console
#'
#' @return A list containing prediction results and prediction result evaluation
#' @export
#' @import ROCR
#' @import caret
#' @importFrom utils head data
#' @importFrom stats wilcox.test p.adjust cor.test
#' @examples
#'
#'
#'
#' library(mlr3verse)
#' library(caret)
#' library(parallel)
#' library(BioM2)
#' data=MethylData_Test
#' set.seed(1)
#' part=unlist(createDataPartition(data$label,p=0.8))
#' Train=data[part,]
#' Test=data[-part,]
#' pathlistDB=GO2ALLEGS_BP
#' FeatureAnno=MethylAnno
#'
#'
#' pred=BioM2(TrainData = Train,TestData = Test,
#' pathlistDB=pathlistDB,FeatureAnno=FeatureAnno,
#' classifier='svm',nfolds=5,
#' PathwaySizeUp=25,PathwaySizeDown=20,MinfeatureNum_pathways=10,
#' Add_UnMapped='Yes',Unmapped_num=300,
#' Inner_CV='None',inner_folds=5,
#' Stage1_FeartureSelection_Method='cor',cutoff=0.3,
#' Stage2_FeartureSelection_Method='None',
#' target='predict',cores=1
#' )#(To explore biological mechanisms, set target=‘pathways’)
#'
#'
#'
BioM2=function(TrainData=NULL,TestData=NULL,pathlistDB=NULL,FeatureAnno=NULL,resampling=NULL,nfolds=5,classifier='liblinear',
paramlist=NULL,predMode = "probability",
PathwaySizeUp=200,PathwaySizeDown=20,MinfeatureNum_pathways=10,
Add_UnMapped=TRUE,Unmapped_num=300,Add_FeartureSelection_Method='wilcox.test',
Inner_CV=TRUE,inner_folds=10,
Stage1_FeartureSelection_Method='cor',cutoff=0.3,
Stage2_FeartureSelection_Method='RemoveHighcor',cutoff2=0.95,classifier2=NULL,
target='predict',p.adjust.method='fdr',save_pathways_matrix=FALSE,cores=1,seed=10,verbose=TRUE){
if(verbose)print('===================BioM2==================')
if(Sys.info()[1]=="Windows"){
cores=1
}
if(is.null(names(pathlistDB)) | sum(is.na(names(pathlistDB)))>0 ){
stop("The name in pathlistDB cannot be NA or NULL.")
}
prediction=list()
FeatureAnno$ID=gsub('[\\.\\_\\-]','',FeatureAnno$ID)
colnames(TrainData)=gsub('[\\.\\_\\-]','',colnames(TrainData))
if(is.null(TestData)){
list_pathways=list()
Record=data.frame(resampling_id=1:nfolds,learner_name=1:nfolds,AUC=1:nfolds,ACC=1:nfolds,PCCs=1:nfolds)
Resampling=caret::createFolds(TrainData$label,k=nfolds)
if(!is.null(resampling)){
nfolds=resampling$param_set$values$folds
}
T1=Sys.time()
for(xxx in 1:nfolds){
if(verbose)print(paste0('<<<<<-----Start-----','Resampling(CV',',folds=',nfolds,')-No.',xxx,'----->>>>>'))
if(verbose)print('Step1: ReadData')
t1=Sys.time()
if(is.null(resampling)){
trainData=TrainData[unlist(Resampling[-xxx]),]
testData=TrainData[unlist(Resampling[xxx]),]
}else{
trainData=TrainData[resampling$train_set(xxx),]
testData=TrainData[resampling$test_set(xxx),]
}
geneNum_pathways=sapply(1:length(pathlistDB),function(i) length(pathlistDB[[i]]))
pathlistDB_sub=pathlistDB[which(geneNum_pathways > PathwaySizeDown & geneNum_pathways < PathwaySizeUp )]
featureAnno=FeatureAnno[FeatureAnno$ID %in% colnames(trainData),]
if(verbose)print(paste0(' |>Total number of pathways==>>',length(pathlistDB_sub)))
if(verbose)print('Step2: FeartureSelection-features')
feature_pathways=Stage1_FeartureSelection(Stage1_FeartureSelection_Method=Stage1_FeartureSelection_Method,data=trainData,cutoff=cutoff,
featureAnno=featureAnno,pathlistDB_sub=pathlistDB_sub,MinfeatureNum_pathways=MinfeatureNum_pathways,cores=cores,verbose=verbose)
lens=sapply(1:length(feature_pathways),function(x) length(feature_pathways[[x]]))
if(verbose)print('Step3: MergeData')
trainDataList=mclapply(1:length(feature_pathways),function(x) trainData[,feature_pathways[[x]]] ,mc.cores=cores)
testDataList=mclapply(1:length(feature_pathways),function(x) testData[,feature_pathways[[x]]] ,mc.cores=cores)
names(trainDataList)=names(pathlistDB_sub)
names(testDataList)=names(pathlistDB_sub)
trainDataList=trainDataList[which(lens>MinfeatureNum_pathways)]
testDataList=testDataList[which(lens>MinfeatureNum_pathways)]
featureNum_pathways=sapply(1:length(trainDataList),function(i2) length(trainDataList[[i2]]))
if(verbose)print(paste0(' |> Total number of selected pathways==>>',length(trainDataList)))
if(verbose)print(paste0(' |> Min features number of pathways==>>',min(featureNum_pathways)-1,'.......','Max features number of pathways==>>',max(featureNum_pathways)-1))
#(PredictPathways)
if(target=='pathways'){
if(verbose)print('Step4: PredictPathways')
test=mclapply(1:length(testDataList),function(i6) baseModel(trainData =trainDataList[[i6]],testData =testDataList[[i6]],predMode = predMode,
classifier = classifier,paramlist=paramlist,seed=seed ),mc.cores=cores)
corr=sapply(1:length(testDataList),function(x) stats::cor(test[[x]],testDataList[[x]]$label,method='pearson'))
newtest=do.call(cbind, test)
colnames(newtest)=names(testDataList)
newtest=cbind(label=testDataList[[1]]$label,newtest)
list_pathways[[xxx]]=newtest
if(verbose)print(paste0(' |>min correlation of pathways=====>>>',round(min(corr),digits = 3),'......','max correlation of pathways===>>>',round(max(corr),digits = 3)))
if(verbose)print(' <<< PredictPathways Done! >>> ')
t2=Sys.time()
if(verbose)print(t2-t1)
if(verbose)print('---------------------####################------------------')
}else{
#(Reconstruction )
if(verbose)print('Step4: Reconstruction')
if(Inner_CV==TRUE){
if(verbose)print(' |> Using Inner CV ~ ~ ~')
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =NULL,predMode =predMode,classifier = classifier,inner_folds=inner_folds,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode =predMode,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}else{
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =trainDataList[[i4]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}
if(verbose)print(' <<< Reconstruction Done! >>> ')
#(FeartureSelection-pathways)
if(verbose)print('Step5: FeartureSelection-pathways')
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method=Stage2_FeartureSelection_Method,data=train,
label=trainDataList[[1]]$label,cutoff=cutoff2,preMode='probability',classifier =classifier,verbose=verbose,cores=cores)
newtrain=do.call(cbind,train[index])
colnames(newtrain)=names(trainDataList)[index]
newtrain=cbind(label=trainDataList[[1]]$label,newtrain)
newtest=do.call(cbind,test[index])
colnames(newtest)=names(trainDataList)[index]
newtest=cbind(label=testDataList[[1]]$label,newtest)
colnames(newtest)=gsub(':','',colnames(newtest))
colnames(newtrain)=gsub(':','',colnames(newtrain))
if(Add_UnMapped==TRUE){
if(Unmapped_num==0){
if(verbose)print(paste0(' |> Merge PathwayFeature and AddFeature ==>>',ncol(newtrain)))
}else{
Unmapped_Data=AddUnmapped(train=trainData,test=testData,Add_FeartureSelection_Method=Add_FeartureSelection_Method,
Unmapped_num=Unmapped_num,len=ncol(newtrain),anno=featureAnno,verbose=verbose,cores=cores)
newtrain=cbind(newtrain,Unmapped_Data$train)
newtest=cbind(newtest,Unmapped_Data$test)
if(verbose)print(paste0(' |> Merge PathwayFeature and AddFeature ==>>',ncol(newtrain)))
}
}
#(Predict and Metric)
if(verbose)print('Step6: Predict and Metric')
if(is.null(classifier2)){
classifier2=classifier
}
result=baseModel(trainData=newtrain,testData=newtest,predMode ='probability',classifier = classifier2)
prediction_part=data.frame(sample=rownames(testData),prediction=result)
prediction[[xxx]]=prediction_part
names(prediction)[xxx]=paste('Resample No.',xxx)
testDataY=testDataList[[1]]$label
pre=ifelse(result>0.5,1,0)
Record[xxx,1]=xxx
Record[xxx,2]=ifelse(is.character(classifier),classifier,classifier$id)
Record[xxx,5]=stats::cor(testDataY,result,method='pearson')
Record[xxx,3]=ROCR::performance(ROCR::prediction(result,testDataY),'auc')@y.values[[1]]
accuracy_class1 <- sum(pre[testDataY == 1] == 1) / sum(testDataY == 1)
accuracy_class0 <- sum(pre[testDataY == 0] == 0) / sum(testDataY == 0)
Record[xxx,4]=(accuracy_class1 + accuracy_class0) / 2
if(verbose)print(paste0('######Resampling NO.',xxx,'~~~~',ifelse(is.character(classifier),classifier,classifier$id),'==>','AUC:',round(Record[xxx,3],digits = 3),' ','BAC:',round(Record[xxx,4],digits = 3),' ','PCCs:',round(Record[xxx,5],digits = 3)))
t2=Sys.time()
if(verbose)print(t2-t1)
if(verbose)print('---------------------####################------------------')
}
#(Comprehensive Assessment)
if(xxx==nfolds){
if(verbose)print('-----------------------------------------------------------')
if(verbose)print('------------========<<<< Completed! >>>>======-----------')
if(verbose)print('-----------------------------------------------------------')
if(target=='pathways'){
ColN=Reduce(intersect,lapply(1:nfolds,function(x) colnames(list_pathways[[x]])))
list_pathways=lapply(1:nfolds,function(x) list_pathways[[x]][,ColN])
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method=Stage2_FeartureSelection_Method,data=list_pathways,
label=NULL,cutoff=cutoff2,preMode='probability',classifier =classifier,cores=cores)
matrix_pathways=do.call(rbind,list_pathways)
matrix_pathways=matrix_pathways[,c(1,index)]
rownames(matrix_pathways)=rownames(TrainData)[unlist(Resampling)]
if(save_pathways_matrix==TRUE){
saveRDS(matrix_pathways,'pathways_matrix.rds')
if(verbose)print(' |||==>>> Save the Pathways-Matrix ')
}
pathways_result=data.frame(
id=colnames(matrix_pathways)[2:ncol(matrix_pathways)],
cor=sapply(2:ncol(matrix_pathways),function(i) cor.test(matrix_pathways[,1],matrix_pathways[,i])$estimate),
p.value=sapply(2:ncol(matrix_pathways),function(i) cor.test(matrix_pathways[,1],matrix_pathways[,i])$p.value)
)
pathways_result$adjust_p.value=p.adjust(pathways_result$p.value,method=p.adjust.method)
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
GO_anno=GO_Ancestor[,1:2]
GO_anno=GO_anno[-which(duplicated(GO_anno)),]
colnames(GO_anno)=c('id','term')
pathways_result2=merge(pathways_result,GO_anno,by='id')
id=which(pathways_result$id %in% setdiff(pathways_result$id,pathways_result2$id))
pathways_result$term=rep('',nrow(pathways_result))
pathways_result=rbind(pathways_result2,pathways_result[id,])
pathways_result=pathways_result[order(pathways_result$cor,decreasing = TRUE),]
if(verbose)print(head(pathways_result))
final=list('PathwaysMatrix'= matrix_pathways,'PathwaysResult'= pathways_result)
T2=Sys.time()
if(verbose)print(T2-T1)
if(verbose)print('######------- Well Done!!!-------######')
return(final)
}else{
T2=Sys.time()
if(verbose)print(paste0('{|>>>=====','Learner: ',ifelse(is.character(classifier),classifier,classifier$id),'---Performance Metric---==>>','AUC:',round(mean(Record$AUC),digits = 3),' ','ACC:',round(mean(Record$ACC),digits = 3),' ','PCCs:',round(mean(Record$PCCs),digits = 3),'======<<<|}'))
if(verbose)print(Record)
final=list('Prediction'=prediction,'Metric'=Record,'TotalMetric'=c('AUC'=round(mean(Record$AUC),digits = 3),'ACC'=round(mean(Record$ACC),digits = 3),'PCCs'=round(mean(Record$PCCs),digits = 3)))
if(verbose)print(T2-T1)
if(verbose)print('######------- Well Done!!!-------######')
return(final)
}
}
}
}else{
colnames(TestData)=colnames(TrainData)
Record=data.frame(learner_name=1,AUC=1,ACC=1,PCCs=1)
if(verbose)print('Step1: ReadData')
T1=Sys.time()
trainData=TrainData
testData=TestData
geneNum_pathways=sapply(1:length(pathlistDB),function(i) length(pathlistDB[[i]]))
pathlistDB_sub=pathlistDB[which(geneNum_pathways > PathwaySizeDown & geneNum_pathways < PathwaySizeUp )]
featureAnno=FeatureAnno[FeatureAnno$ID %in% colnames(trainData),]
if(verbose)print(paste0(' |>Total number of pathways==>>',length(pathlistDB_sub)))
if(verbose)print('Step2: FeartureSelection-features')
feature_pathways=Stage1_FeartureSelection(Stage1_FeartureSelection_Method=Stage1_FeartureSelection_Method,data=trainData,cutoff=cutoff,
featureAnno=featureAnno,pathlistDB_sub=pathlistDB_sub,MinfeatureNum_pathways=MinfeatureNum_pathways,cores=cores,verbose=verbose)
lens=sapply(1:length(feature_pathways),function(x) length(feature_pathways[[x]]))
if(verbose)print('Step3: MergeData')
trainDataList=mclapply(1:length(feature_pathways),function(x) trainData[,feature_pathways[[x]]] ,mc.cores=cores)
testDataList=mclapply(1:length(feature_pathways),function(x) testData[,feature_pathways[[x]]] ,mc.cores=cores)
names(trainDataList)=names(pathlistDB_sub)
names(testDataList)=names(pathlistDB_sub)
trainDataList=trainDataList[which(lens>MinfeatureNum_pathways)]
testDataList=testDataList[which(lens>MinfeatureNum_pathways)]
featureNum_pathways=sapply(1:length(trainDataList),function(i2) length(trainDataList[[i2]]))
if(verbose)print(paste0(' |> Total number of selected pathways==>>',length(trainDataList)))
if(verbose)print(paste0(' |> Min features number of pathways==>>',min(featureNum_pathways)-1,'.......','Max features number of pathways==>>',max(featureNum_pathways)-1))
#(PredictPathways)
if(target=='pathways'){
if(verbose)print('Step4: PredictPathways')
if(Inner_CV ==TRUE){
if(verbose)print(' |> Using Inner CV ~ ~ ~')
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =NULL,predMode =predMode,classifier = classifier,inner_folds=inner_folds,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode =predMode,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}else{
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =trainDataList[[i4]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}
if(verbose)print('Step5: FeartureSelection-pathways')
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method=Stage2_FeartureSelection_Method,data=train,
label=trainDataList[[1]]$label,cutoff=cutoff2,preMode='probability',classifier =classifier,verbose=verbose,cores=cores)
newtest=do.call(cbind, test[index])
colnames(newtest)=names(testDataList)[index]
newtest=cbind(label=testDataList[[1]]$label,newtest)
corr=stats::cor(newtest[,1],newtest[,-1])
matrix_pathways=newtest
rownames(matrix_pathways)=rownames(TestData)
if(verbose)print(paste0(' |>min correlation of pathways=====>>>',round(min(corr),digits = 3),'......','max correlation of pathways===>>>',round(max(corr),digits = 3)))
if(verbose)print(' <<< PredictPathways Done! >>> ')
if(save_pathways_matrix==TRUE){
saveRDS(matrix_pathways,'pathways_matrix.rds')
if(verbose)print(' |||==>>> Save the Pathways-Matrix ')
}
pathways_result=data.frame(
id=colnames(matrix_pathways)[2:ncol(matrix_pathways)],
cor=sapply(2:ncol(matrix_pathways),function(i) cor.test(matrix_pathways[,1],matrix_pathways[,i])$estimate),
p.value=sapply(2:ncol(matrix_pathways),function(i) cor.test(matrix_pathways[,1],matrix_pathways[,i])$p.value)
)
pathways_result$adjust_p.value=p.adjust(pathways_result$p.value,method=p.adjust.method)
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
GO_anno=GO_Ancestor[,1:2]
GO_anno=GO_anno[-which(duplicated(GO_anno)),]
colnames(GO_anno)=c('id','term')
pathways_result2=merge(pathways_result,GO_anno,by='id')
id=which(pathways_result$id %in% setdiff(pathways_result$id,pathways_result2$id))
pathways_result$term=rep('',nrow(pathways_result))
pathways_result=rbind(pathways_result2,pathways_result[id,])
pathways_result=pathways_result[order(pathways_result$cor,decreasing = TRUE),]
if(verbose)print(head(pathways_result))
final=list('PathwaysMatrix'= matrix_pathways,'PathwaysResult'= pathways_result)
T2=Sys.time()
if(verbose)print(T2-T1)
if(verbose)print('######------- Well Done!!!-------######')
return(final)
}else{
#(Reconstruction )
if(verbose)print('Step4: Reconstruction')
if(Inner_CV ==TRUE){
if(verbose)print(' |> Using Inner CV ~ ~ ~')
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =NULL,predMode =predMode,classifier = classifier,inner_folds=inner_folds,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode =predMode,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}else{
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =trainDataList[[i4]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode = predMode ,classifier = classifier,paramlist=paramlist,seed=seed),mc.cores=cores)
}
if(verbose)print(' <<< Reconstruction Done! >>> ')
#(FeartureSelection-pathways)
if(verbose)print('Step5: FeartureSelection-pathways')
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method=Stage2_FeartureSelection_Method,data=train,
label=trainDataList[[1]]$label,cutoff=cutoff2,preMode='probability',classifier =classifier,verbose=verbose,cores=cores)
newtrain=do.call(cbind,train[index])
colnames(newtrain)=names(trainDataList)[index]
newtrain=cbind(label=trainDataList[[1]]$label,newtrain)
newtest=do.call(cbind,test[index])
colnames(newtest)=names(trainDataList)[index]
newtest=cbind(label=testDataList[[1]]$label,newtest)
if(Add_UnMapped==TRUE){
if(Unmapped_num==0){
if(verbose)print(paste0(' |> Merge PathwayFeature and AddFeature ==>>',ncol(newtrain)))
}else{
Unmapped_Data=AddUnmapped(train=trainData,test=testData,Add_FeartureSelection_Method=Add_FeartureSelection_Method,
Unmapped_num=Unmapped_num,len=ncol(newtrain),anno=featureAnno,verbose=verbose,cores=cores)
newtrain=cbind(newtrain,Unmapped_Data$train)
newtest=cbind(newtest,Unmapped_Data$test)
if(verbose)print(paste0(' |> Merge PathwayFeature and AddFeature ==>>',ncol(newtrain)))
}
}
rownames(newtrain)=rownames(trainData)
rownames(newtest)=rownames(testData)
st2_train=newtrain
st2_test=newtest
colnames(newtest)=gsub(':','',colnames(newtest))
colnames(newtrain)=gsub(':','',colnames(newtrain))
#(Predict and Metric)
if(verbose)print('Step6: Predict and Metric')
if(is.null(classifier2)){
classifier2=classifier
}
result=baseModel(trainData=newtrain,testData=newtest,predMode ='probability',classifier = classifier2)
predict=data.frame(sample=rownames(testData),prediction=result)
testDataY=testDataList[[1]]$label
pre=ifelse(result>0.5,1,0)
Record[1,1]=ifelse(is.character(classifier),classifier,classifier$id)
Record[1,4]=stats::cor(testDataY,result,method='pearson')
Record[1,2]=ROCR::performance(ROCR::prediction(result,testDataY),'auc')@y.values[[1]]
accuracy_class1 <- sum(pre[testDataY == 1] == 1) / sum(testDataY == 1)
accuracy_class0 <- sum(pre[testDataY == 0] == 0) / sum(testDataY == 0)
Record[1,3]=(accuracy_class1 + accuracy_class0) / 2
if(verbose)print(paste0('######~~~~',ifelse(is.character(classifier),classifier,classifier$id),'==>','AUC:',round(Record[1,2],digits = 3),' ','BAC:',round(Record[1,3],digits = 3),' ','PCCs:',round(Record[1,4],digits = 3)))
final=list('Stage2_train'=st2_train,'Stage2_test'=st2_test,'Prediction'=predict,'Metric'=Record)
T2=Sys.time()
if(verbose)print(T2-T1)
if(verbose)print('######------- Well Done!!!-------######')
return(final)
}
}
}
#' BioM2 Hyperparametric Combination
#'
#' @param TrainData The input training dataset. The first column
#' is the label or the output. For binary classes,
#' 0 and 1 are used to indicate the class member.
#' @param pathlistDB A list of pathways with pathway IDs and their
#' corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param FeatureAnno The annotation data stored in a data.frame for probe
#' mapping. It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param resampling Resampling in mlr3verse.
#' @param nfolds k-fold cross validation ( Only supported when TestData = NULL )
#' @param classifier Learners in mlr3
#' @param predMode The prediction mode. Currently only supports 'probability' for binary classification tasks.
#' @param PathwaySizeUp The upper-bound of the number of genes in each
#' biological pathways.
#' @param PathwaySizeDown The lower-bound of the number of genes in each
#' biological pathways.
#' @param MinfeatureNum_pathways The minimal defined pathway size after mapping your
#' own data to pathlistDB(KEGG database/GO database).
#' @param Add_UnMapped Whether to add unmapped probes for prediction
#' @param Unmapped_num The number of unmapped probes
#' @param Add_FeartureSelection_Method Feature selection methods.
#' @param Inner_CV Whether to perform a k-fold verification on the training set.
#' @param inner_folds k-fold verification on the training set.
#' @param Stage1_FeartureSelection_Method Feature selection methods.
#' @param stage1_cutoff The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1.
#' @param Stage2_FeartureSelection_Method Feature selection methods.
#' @param stage2_cutoff The cutoff used for feature selection threshold. It can be any value
#' between 0 and 1.
#' @param classifier2 Learner for stage 2 prediction(if classifier2==NULL,then it is the same as the learner in stage 1.)
#' @param cores The number of cores used for computation.
#' @param verbose Whether to print running process information to the console
#'
#'
#' @return A data frame contains hyperparameter results
#' @export
#' @import ROCR
#' @import caret
#' @importFrom utils head
#' @importFrom stats wilcox.test
#'
HyBioM2=function(TrainData=NULL,pathlistDB=NULL,FeatureAnno=NULL,resampling=NULL,nfolds=5,classifier='liblinear', predMode = "probability",
PathwaySizeUp=200,PathwaySizeDown=20,MinfeatureNum_pathways=10,
Add_UnMapped=TRUE,Add_FeartureSelection_Method='wilcox.test',Unmapped_num=300,
Inner_CV=TRUE,inner_folds=10,
Stage1_FeartureSelection_Method='cor',stage1_cutoff=0.3,
Stage2_FeartureSelection_Method='RemoveHighcor',stage2_cutoff=0.8,
classifier2=NULL,cores=1,verbose=TRUE){
re=list()
if(Sys.info()[1]=="Windows"){
cores=1
}
if(verbose)print('===================HyBioM2==================')
for(c1 in 1:length(classifier)){
stage1_learner=classifier[[c1]]
HOPE=list()
t1=Sys.time()
for(luck in 1:length(stage1_cutoff)){
set.seed(666)
cutoff=stage1_cutoff[luck]
Resampling=createFolds(TrainData$label,k=nfolds)
final=list()
pr=sum(TrainData$label==1)>sum(TrainData$label==0)
for(xxx in 1:nfolds){
#print('Step1: ReadData')
trainData=TrainData[unlist(Resampling[-xxx]),]
testData=TrainData[unlist(Resampling[xxx]),]
geneNum_pathways=sapply(1:length(pathlistDB),function(i) length(pathlistDB[[i]]))
pathlistDB_sub=pathlistDB[which(geneNum_pathways > PathwaySizeDown & geneNum_pathways < PathwaySizeUp )]
#print(paste0(' |>Total number of pathways==>>',length(pathlistDB_sub)))
#print('Step2: FeartureSelection-features')
if(Stage1_FeartureSelection_Method=='cor'){
#print(paste0(' Using << correlation >>',' ,and you choose cutoff:',cutoff))
Cor=stats::cor(trainData$label,trainData)
Cor=ifelse(Cor>0,Cor,-Cor)
names(Cor)=colnames(trainData)
Cor_names=names(Cor)
Cor_cutoff=Cor[which(Cor>cutoff)]
Cor_cutoff_names=names(Cor_cutoff)
}else{
#print(paste0(' Using << wilcox.test >>',' ,and you choose cutoff:',cutoff))
train_0=trainData[which(trainData$label==0),]
train_1=trainData[which(trainData$label==1),]
Cor=unlist(mclapply(1:ncol(trainData),function(x) wilcox.test(train_0[,x],train_1[,x])$p.value,mc.cores=cores))
names(Cor)=colnames(trainData)
Cor_names=names(Cor)
Cor_cutoff=Cor[which(Cor<cutoff)]
Cor_cutoff_names=names(Cor_cutoff)
}
featureAnno=FeatureAnno[FeatureAnno$ID %in% colnames(trainData),]
MinfeatureNum_pathways2=MinfeatureNum_pathways+1
featureNum_pathways=mclapply(1:length(pathlistDB_sub),function(x){
id=c('label',featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB_sub[[x]])])
if(length(id)>MinfeatureNum_pathways2){
id2=id[which(id %in% Cor_cutoff_names)]
if(length(id2)<MinfeatureNum_pathways2){
a=Cor[id]
if(Stage1_FeartureSelection_Method=='cor'){
id2=names(a)[order(a,decreasing = T)[1:MinfeatureNum_pathways2]]
}else{
id2=names(a)[order(a,decreasing = F)[1:MinfeatureNum_pathways2]]
}
return(id2)
}else{
return(id2)
}
}else{
return(id)
}
} ,mc.cores=cores)
lens=sapply(1:length(featureNum_pathways),function(x) length(featureNum_pathways[[x]]))
#print('Step3: MergeData')
trainDataList=mclapply(1:length(featureNum_pathways),function(x) trainData[,featureNum_pathways[[x]]] ,mc.cores=cores)
testDataList=mclapply(1:length(featureNum_pathways),function(x) testData[,featureNum_pathways[[x]]] ,mc.cores=cores)
names(trainDataList)=names(pathlistDB_sub)
names(testDataList)=names(pathlistDB_sub)
trainDataList=trainDataList[which(lens>MinfeatureNum_pathways)]
testDataList=testDataList[which(lens>MinfeatureNum_pathways)]
featureNum_pathways2=sapply(1:length(trainDataList),function(i2) length(trainDataList[[i2]]))
#print(paste0(' |> Total number of selected pathways==>>',length(trainDataList)))
#print(paste0(' |> Min features number of pathways==>>',min(featureNum_pathways2)-1,'.......','Max features number of pathways==>>',max(featureNum_pathways2)-1))
#(FeartureSelection-pathways)
#print('Step4: Reconstruction')
if(Inner_CV){
#print(' |> Using Inner CV ~ ~ ~')
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =NULL,predMode =predMode,classifier = classifier,inner_folds=inner_folds),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode =predMode,classifier = classifier),mc.cores=cores)
#pred=mclapply(1:length(trainDataList),function(i4) baseModel2(trainData =trainDataList[[i4]],testData =testDataList[[i4]],classifier = stage1_learner,inner_folds=inner_folds),mc.cores=cores)
#train=lapply(1:length(pred),function(x) pred[[x]]$predtrain)
#test=lapply(1:length(pred),function(x) pred[[x]]$predtest)
}else{
train=mclapply(1:length(trainDataList),function(i4) baseModel(trainData =trainDataList[[i4]],testData =trainDataList[[i4]],predMode = predMode ,classifier = stage1_learner),mc.cores=cores)
test=mclapply(1:length(testDataList),function(i5) baseModel(trainData =trainDataList[[i5]],testData =testDataList[[i5]],predMode = predMode ,classifier = stage1_learner),mc.cores=cores)
}
#print(' <<< Reconstruction Done! >>> ')
#(FeartureSelection-pathways)
#print('Step5: FeartureSelection-pathways')
dbmap=unique(unlist(pathlistDB_sub))
annomap=unique(featureAnno$entrezID)
mapgene=intersect(annomap,dbmap)
map=featureAnno$ID[which(featureAnno$entrezID %in% mapgene)]
Unmapped_Train=trainData[,setdiff(colnames(trainData),map)]
Unmapped_Test=testData[,setdiff(colnames(trainData),map)]
#print(paste0('Unmapped_num: ',ncol(Unmapped_Train)))
Unmapped_0=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[1]),]
Unmapped_1=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[2]),]
if(Add_FeartureSelection_Method=='cor'){
Unmapped_pvalue= abs(stats::cor(Unmapped_Train$label,Unmapped_Train[,-1]))
Unmapped_pvalue=1/Unmapped_pvalue
}else{
Unmapped_0=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[1]),]
Unmapped_1=Unmapped_Train[which(Unmapped_Train$label==unique(Unmapped_Train$label)[2]),]
Unmapped_pvalue=unlist(mclapply(2:ncol(Unmapped_Train),function(x) wilcox.test(Unmapped_0[,x],Unmapped_1[,x])$p.value,mc.cores=cores))
}
Record2=list()
for(ii in 1:length(stage2_cutoff)){
if(stage2_cutoff[ii]==0){
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method='None',data=train,
label=trainDataList[[1]]$label,cutoff=stage2_cutoff[ii],preMode='probability',classifier =classifier,verbose = FALSE,cores=cores)
}else{
index=Stage2_FeartureSelection(Stage2_FeartureSelection_Method=Stage2_FeartureSelection_Method,data=train,
label=trainDataList[[1]]$label,cutoff=stage2_cutoff[ii],preMode='probability',classifier =classifier,verbose = FALSE,cores=cores)
}
if(is.null(classifier2)){
stage2_learners=stage1_learner
}else{
stage2_learners=classifier2
}
n=length(Unmapped_num)*length(stage2_learners)
Record=data.frame(Unmapped_num=1:n,stage2_learner=1:n,AUC=1:n,
PCC=1:n,BAC=1:n,cutoff=1:n)
for(i in 1:length(Unmapped_num)){
newtrain=do.call(cbind,train[index])
colnames(newtrain)=names(trainDataList)[index]
newtrain=cbind(label=trainDataList[[1]]$label,newtrain)
newtest=do.call(cbind,test[index])
colnames(newtest)=names(trainDataList)[index]
newtest=cbind(label=testDataList[[1]]$label,newtest)
colnames(newtest)=gsub(':','',colnames(newtest))
colnames(newtrain)=gsub(':','',colnames(newtrain))
if(Add_UnMapped==TRUE & Unmapped_num[i] > 0 ){
if(length(Unmapped_pvalue)<Unmapped_num[i]){
Unmapped_Num=length(Unmapped_pvalue)
}else{
Unmapped_Num=Unmapped_num[i]
}
Unmapped_id=order(Unmapped_pvalue)[1:Unmapped_Num]
Unmapped_id=Unmapped_id+1
Unmapped_Train2=Unmapped_Train[,Unmapped_id]
Unmapped_Test2=Unmapped_Test[,Unmapped_id]
newtrain=cbind(newtrain,Unmapped_Train2)
newtest=cbind(newtest,Unmapped_Test2)
}
for(z in 1:length(stage2_learners)){
classifier_2=stage2_learners[[z]]
a=(i-1)*length(stage2_learners)+z
result=baseModel(trainData=newtrain,testData=newtest,predMode ='probability',classifier = classifier_2)
testDataY=testDataList[[1]]$label
pre=ifelse(result>0.5,1,0)
accuracy_class1 <- sum(pre[testDataY == 1] == 1) / sum(testDataY == 1)
accuracy_class0 <- sum(pre[testDataY == 0] == 0) / sum(testDataY == 0)
Record[a,5]=(accuracy_class1 + accuracy_class0) / 2
Record[a,1]=Unmapped_num[i]
Record[a,2]=ifelse(is.character(classifier_2),classifier_2,classifier_2$id)
Record[a,4]=stats::cor(testDataY,result,method='pearson')
Record[a,3]=ROCR::performance(ROCR::prediction(result,testDataY),'auc')@y.values[[1]]
Record[a,6]=stage2_cutoff[ii]
}
}
Record2[[ii]]=Record
}
Record2=do.call(rbind,Record2)
final[[xxx]]=Record2
}
hope=do.call(rbind,final)
hope=aggregate(hope[,c(3:5)],by=list(Unmapped_num=hope$Unmapped_num,stage2_cutoff=hope$cutoff,stage2_learner=hope$stage2_learner),mean)
hope$stage1_cutoff=stage1_cutoff[luck]
#if(verbose)print(hope)
#print(paste0('stage1_cutoff : ',cutoff))
#print(luck)
HOPE[[luck]]=hope
}
HOPE=do.call(rbind,HOPE)
HOPE$stage1_learner=ifelse(is.character(stage1_learner),stage1_learner,stage1_learner$id)
HOPE=HOPE[,c('stage1_learner','stage2_learner','stage1_cutoff','stage2_cutoff','Unmapped_num','AUC','BAC','PCC')]
if(verbose)print(HOPE)
re[[c1]]=HOPE
t2=Sys.time()
if(verbose)print(t2-t1)
if(verbose)print(' ')
}
re=do.call(rbind,re)
re=re[order(re$AUC,decreasing = T),]
return(re)
}
#' Find suitable parameters for partitioning pathways modules
#'
#' @param pathways_matrix A pathway matrix generated by the BioM2( target='pathways') function.
#' @param control_label The label of the control group ( A single number, factor, or character )
#' @param minModuleSize minimum module size for module detection. Detail for WGCNA::blockwiseModules()
#' @param mergeCutHeight dendrogram cut height for module merging. Detail for WGCNA::blockwiseModules()
#' @param minModuleNum Minimum total number of modules detected
#' @param power soft-thresholding power for network construction. Detail for WGCNA::blockwiseModules()
#' @param exact Whether to divide GO pathways more accurately (work when ancestor_anno=NULL)
#' @param ancestor_anno Annotations for ancestral relationships (like data('GO_Ancestor') )
#'
#' @return A list containing recommended parameters
#' @export
#' @importFrom utils data
#' @importFrom stats sd aggregate
#' @importFrom WGCNA pickSoftThreshold blockwiseModules
#'
#'
#'
#'
#'
FindParaModule=function(pathways_matrix=NULL,control_label=0,minModuleSize = seq(10,20,5),mergeCutHeight=seq(0,0.3,0.1),minModuleNum=5,power=NULL,exact=TRUE,ancestor_anno=NULL){
if('package:WGCNA' %in% search()){
final=list()
if(exact==FALSE & is.null(ancestor_anno)){
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
anno=GO_Ancestor
}else if(exact==TRUE & is.null(ancestor_anno)){
GO_Ancestor_exact=NA
data("GO_Ancestor_exact",envir = environment())
anno=GO_Ancestor_exact
}else{
anno=ancestor_anno
}
data=as.data.frame(pathways_matrix)
label=data.frame(ID=rownames(data),label=data$label)
data=data[data$label==control_label,]
data=data[,which(colnames(data) %in% anno$GO)]
if(is.null(power)){
powers = c(c(1:10), seq(from = 12, to=20, by=1))
sft = pickSoftThreshold(data, powerVector = powers, verbose = 0)
if(is.na(sft$powerEstimate)){
return('Could not find a proper powers , Please give a power by youself .')
}else{
power=sft$powerEstimate
message('Find the proper power!')
}
}
if(length(minModuleSize)==1 & length(mergeCutHeight)==1){
stop('Length of both minModuleSize and mergeCutHeight must greater than 1')
}else{
Num_module=minModuleSize
result_list=list()
for(xxx in 1:length(Num_module)){
cutoff=mergeCutHeight
n=length(cutoff)
result=data.frame(mergeCutHeight=1:n,Number_clusters=1:n,Mean_number_pathways=1:n,Mean_Fraction=1:n,Sd_Fraction=1:n,minModuleSize=1:n)
for(ii in 1:length(cutoff)){
net = blockwiseModules(data, power = power,
TOMType = "unsigned", minModuleSize = Num_module[xxx],
reassignThreshold = 0, mergeCutHeight = cutoff[ii],
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = F,
saveTOMFileBase = "femaleMouseTOM",
verbose = 0)
cluster=data.frame(ID=names(net$colors),cluster=net$colors)
cluster$cluster=cluster$cluster+1
cluster_list=list()
faction=vector()
numder_pathways=vector()
for(i in 1:max(cluster$cluster)){
new_anno=anno[which(anno$GO %in% cluster[which(cluster$cluster==i),]$ID),]
sum=length(which(cluster$cluster==i))
numder_pathways[i]=sum
term=unique(new_anno$Ancestor)
prop=sapply(1:length(term),function(x) length(which(new_anno$Ancestor==term[x]))*100/sum)
a=data.frame(Term=term[which.max(prop)],Fraction=max(prop))
faction[i]=a$Fraction
a$Fraction=paste0(round(a$Fraction,2),'%')
if(length(which(cluster$cluster==i))>5 & length(which(cluster$cluster==i))<300 ){
cluster_list[[i]]=a
}else{
cluster_list[[i]]=NA
}
names(cluster_list)[i]=paste0('cluster_NO.',i,'==>> ','There are ',sum,' pathways')
}
result[ii,1]=cutoff[ii]
result[ii,4]=mean(faction[which(!is.na(cluster_list))])
result[ii,5]=sd(faction[which(!is.na(cluster_list))])
result[ii,3]=mean(numder_pathways[which(numder_pathways<300)])
result[ii,2]=length(which(!is.na(cluster_list)))
result[ii,6]= Num_module[xxx]
}
result_list[[xxx]]=result
}
result=do.call(rbind,result_list)
result=result[which(result$Number_clusters > minModuleNum),]
d=aggregate(result$Mean_Fraction,by=list(minModuleSize=result$minModuleSize),max)
colnames(d)[2]='Mean_Fraction'
}
best_minModuleSize=d[which.max(d$Mean_Fraction),]$minModuleSize
message(paste0('The best minModuleSize is:',best_minModuleSize))
size=result[result$minModuleSize==best_minModuleSize,]
best_mergeCutHeight=size[which.max(size$Mean_Fraction),]$mergeCutHeight
message(paste0('The best mergeCutHeight is:', best_mergeCutHeight))
Para=c(power,best_minModuleSize,best_mergeCutHeight)
names(Para)=c('power','ModuleSize','mergeCutHeight')
final[[1]]=result
final[[2]]=Para
names(final)=c('TotalResult','BestParameter')
return(final)
message('Completed!')
}else{
message('If you want to use this function, please install and load the WGCNA package')
}
}
#' Delineate differential pathway modules with high biological interpretability
#'
#' @param pathways_matrix A pathway matrix generated by the BioM2( target='pathways') function.
#' @param control_label The label of the control group ( A single number, factor, or character )
#' @param power soft-thresholding power for network construction. Detail for WGCNA::blockwiseModules()
#' @param minModuleSize minimum module size for module detection. Detail for WGCNA::blockwiseModules()
#' @param mergeCutHeight dendrogram cut height for module merging. Detail for WGCNA::blockwiseModules()
#' @param cutoff Thresholds for Biological Interpretability Difference Modules
#' @param MinNumPathways Minimum number of pathways included in the biologically interpretable difference module
#' @param p.adjust.method p-value adjustment method.(holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
# "fdr", "none")
#' @param exact Whether to divide GO pathways more accurately (work when ancestor_anno=NULL)
#' @param ancestor_anno Annotations for ancestral relationships (like data('GO_Ancestor') )
#'
#' @return A list containing differential module results that are highly biologically interpretable
#' @export
#' @importFrom WGCNA pickSoftThreshold blockwiseModules moduleEigengenes
#' @importFrom stats wilcox.test p.adjust
#' @importFrom utils data
PathwaysModule=function(pathways_matrix=NULL,control_label=NULL,power=NULL,minModuleSize=NULL,mergeCutHeight=NULL,
cutoff=70,MinNumPathways=5,p.adjust.method='fdr',exact=TRUE,ancestor_anno=NULL){
if('package:WGCNA' %in% search()){
if(exact==FALSE & is.null(ancestor_anno)){
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
anno=GO_Ancestor
}else if(exact==TRUE & is.null(ancestor_anno)){
GO_Ancestor_exact=NA
data("GO_Ancestor_exact",envir = environment())
anno=GO_Ancestor_exact
}else{
anno=ancestor_anno
}
final=list()
data=as.data.frame(pathways_matrix)
label=data.frame(ID=rownames(data),label=data$label)
data=data[data$label==control_label,]
data=data[,which(colnames(data) %in% anno$GO)]
if(is.null(power)){
powers = c(1:30)
sft = pickSoftThreshold(data, powerVector = powers, verbose = 0)
if(is.na(sft$powerEstimate)){
return('Could not find a proper powers , Please give a power by youself .')
}else{
power=sft$powerEstimate
message('Find the proper power!')
}
}
net = blockwiseModules(data, power = power,
TOMType = "unsigned", minModuleSize = minModuleSize,
reassignThreshold = 0, mergeCutHeight = mergeCutHeight,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = F,
verbose = 0)
cluster=data.frame(ID=names(net$colors),cluster=net$colors)
cluster_list=list()
faction=vector()
for(i in 0:max(cluster$cluster)){
new_anno=anno[which(anno$GO %in% cluster[which(cluster$cluster==i),]$ID),]
sum=length(which(cluster$cluster==i))
term=unique(new_anno$Ancestor)
prop=sapply(1:length(term),function(x) length(which(new_anno$Ancestor==term[x]))*100/sum)
a=data.frame(Term=term[which.max(prop)],Fraction=max(prop))
faction[i+1]=a$Fraction
a$Fraction=paste0(round(a$Fraction,2),'%')
if(length(which(cluster$cluster==i))>5 & length(which(cluster$cluster==i))<1000 ){
cluster_list[[i+1]]=a
}else{
cluster_list[[i+1]]=NA
}
names(cluster_list)[i+1]=paste0('Module_NO.',i,'==>> ','There are ',sum,' pathways')
}
data=as.data.frame(pathways_matrix)
label=data.frame(ID=rownames(data),label=data$label)
data=data[,c(1,which(colnames(data) %in% anno$GO))]
ALL_eigengene=moduleEigengenes(data[,-1],net$colors)$eigengenes
ALL_eigengene$label=data$label
ALL_eigengene=ALL_eigengene[,c(ncol(ALL_eigengene),(1:(ncol(ALL_eigengene)-1)))]
Cor=data.frame(module=rownames(as.data.frame(stats::cor(ALL_eigengene)))[-1],cor=as.data.frame(stats::cor(ALL_eigengene))[-1,1])
ALL_eigengene_0=ALL_eigengene[ALL_eigengene$label==0,]
ALL_eigengene_1=ALL_eigengene[ALL_eigengene$label==1,]
pvalue=unlist(lapply(1:ncol(ALL_eigengene),function(x) wilcox.test(ALL_eigengene_0[,x],ALL_eigengene_1[,x])$p.value))
n=ncol(ALL_eigengene)-1
adjust=data.frame(module=colnames(ALL_eigengene),pvalue=pvalue,adjust_pvalue=p.adjust(pvalue,method=p.adjust.method))
adjust=adjust[-which(adjust$module=='label'),]
Num_pathways=as.vector(table(net$colors))
names(Num_pathways)=NULL
module=data.frame(module=paste0('ME',names(table(net$colors))),
Num_pathways=Num_pathways,
Fraction=faction)
result=merge(module,adjust,by='module')
result=merge(result,Cor,by='module')
message(paste0('Filter Module those Fraction less than ',cutoff))
id=which(result$adjust_pvalue < 0.05 & result$Fraction >= cutoff & result$Num_pathways >= MinNumPathways )
Result=result[id,]
Result=Result[order(Result$adjust_pvalue),]
final[[1]]=cluster
final[[2]]=result
final[[3]]=Result
final[[4]]=pathways_matrix
names(final)=c('ModuleResult','RAW_PathwaysModule','DE_PathwaysModule','Matrix')
message('Completed!')
return(final)
}else{
message('If you want to use this function, please install and load the WGCNA package')
}
}
#' Display biological information within each pathway module
#'
#' @param obj Results produced by PathwaysModule()
#' @param ID_Module ID of the diff module
#' @param exact Whether to divide GO pathways more accurately (work when ancestor_anno=NULL)
#' @param ancestor_anno Annotations for ancestral relationships (like data('GO_Ancestor') )
#'
#' @return List containing biologically specific information within the module
#' @export
#' @importFrom utils data
#'
ShowModule=function(obj=NULL,ID_Module=NULL,exact=TRUE,ancestor_anno=NULL){
i=ID_Module
if(exact==FALSE & is.null(ancestor_anno)){
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
anno=GO_Ancestor
}else if(exact==TRUE & is.null(ancestor_anno)){
GO_Ancestor_exact=NA
data("GO_Ancestor_exact",envir = environment())
anno=GO_Ancestor_exact
}else{
anno=ancestor_anno
}
cluster=obj$ModuleResult
final=list()
for(x in 1:length(i)){
new_anno=anno[which(anno$GO %in% cluster[which(cluster$cluster==i[x]),]$ID),]
sum=length(which(cluster$cluster==i[x]))
term=unique(new_anno$Ancestor)
prop=sapply(1:length(term),function(x) length(which(new_anno$Ancestor==term[x]))*100/sum)
a=new_anno[new_anno$Ancestor==term[which.max(prop)],]
b=setdiff(cluster[which(cluster$cluster==i[x]),]$ID,a$GO)
if(length(b)>0){
b2=anno[anno$GO %in% b,]
if(length(which(duplicated(b2$GO)))==0){
a=rbind(a,b2)
}else{
b2=b2[-which(duplicated(b2$GO)),]
a=rbind(a,b2)
}
}
final[[x]]=a
names(final)[x]=paste0('ME',i[x])
}
return(final)
}
#' Visualisation of the results of the analysis of the pathway modules
#'
#' @param BioM2_pathways_obj Results produced by BioM2(,target='pathways')
#' @param FindParaModule_obj Results produced by FindParaModule()
#' @param ShowModule_obj Results produced by ShowModule()
#' @param PathwaysModule_obj Results produced by PathwaysModule()
#' @param exact Whether to divide GO pathways more accurately (work when ancestor_anno=NULL)
#' @param ancestor_anno Annotations for ancestral relationships (like data('GO_Ancestor') )
#' @param type_text_table Whether to display it in a table
#' @param text_table_theme The topic of this table.Detail for ggtexttable()
#' @param n_neighbors The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
#' Larger values result in more global views of the manifold, while smaller values result in more local data being preserved.
#' In general values should be in the range 2 to 100.
#' @param spread The effective scale of embedded points. In combination with min_dist, this determines how clustered/clumped the embedded points are.
#' @param min_dist The effective minimum distance between embedded points.
#' Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together,
#' while larger values will result on a more even dispersal of points.
#' The value should be set relative to the spread value,
#' which determines the scale at which embedded points will be spread out.
#' @param size Scatter plot point size
#' @param target_weight Weighting factor between data topology and target topology.
#' A value of 0.0 weights entirely on data, a value of 1.0 weights entirely on target.
#' The default of 0.5 balances the weighting equally between data and target.
#' Only applies if y is non-NULL.
#' @param alpha Alpha for ellipse specifying the transparency level of fill color. Use alpha = 0 for no fill color.
#' @param ellipse logical value. If TRUE, draws ellipses around points.
#' @param ellipse.alpha Alpha for ellipse specifying the transparency level of fill color. Use alpha = 0 for no fill color.
#' @param theme Default:theme_base(base_family = "serif")
#' @param width image width
#' @param height image height
#' @param save_pdf Whether to save images in PDF format
#' @param volin Can only be used when PathwaysModule_obj exists. ( Violin diagram )
#' @param control_label Can only be used when PathwaysModule_obj exists. ( Control group label )
#' @param module Can only be used when PathwaysModule_obj exists.( PathwaysModule ID )
#' @param cols palette (vector of colour names)
#'
#' @return a ggplot2 object
#' @export
#' @import ggplot2
#' @import htmlwidgets
#' @import ggsci
#' @import CMplot
#' @import uwot
#' @import webshot
#' @import wordcloud2
#' @import ggpubr
#' @import ggthemes
#' @importFrom utils data
#' @importFrom stats aggregate quantile
#' @importFrom ggstatsplot ggbetweenstats
#'
VisMultiModule=function(BioM2_pathways_obj=NULL,FindParaModule_obj=NULL,ShowModule_obj=NULL,PathwaysModule_obj=NULL,exact=TRUE,ancestor_anno=NULL,
type_text_table=FALSE,text_table_theme=ttheme('mOrange'),
volin=FALSE,control_label=0,module=NULL,cols=NULL,
n_neighbors = 8,spread=1,min_dist =2,target_weight = 0.5,
size=1.5,alpha=1,ellipse=TRUE,ellipse.alpha=0.2,theme=ggthemes::theme_base(base_family = "serif"),
save_pdf=FALSE,width =7, height=7){
if(is.null(cols)){
cols = pal_d3("category20",alpha=alpha)(20)
}
if(!is.null(BioM2_pathways_obj)){
if(exact==FALSE & is.null(ancestor_anno)){
GO_Ancestor=NA
data("GO_Ancestor",envir = environment())
anno=GO_Ancestor
}else if(exact==TRUE & is.null(ancestor_anno)){
GO_Ancestor_exact=NA
data("GO_Ancestor_exact",envir = environment())
anno=GO_Ancestor_exact
}else{
anno=ancestor_anno
}
if(type_text_table){
Result=BioM2_pathways_obj$PathwaysResult
Result=Result[1:10,]
colnames(Result)=c('ID','Correlation','Pvalue','P-adjusted','Description')
pic=ggtexttable(Result, rows = NULL,theme=text_table_theme)
if(save_pdf){
pic
ggsave('PathwaysResult_Table.pdf',width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}else{
pathways=BioM2_pathways_obj$PathwaysResult[,c('id','p.value')]
colnames(pathways)=c('SNP','pvalue')
new_anno=anno[anno$GO %in% pathways$SNP,]
tb=table(new_anno$Ancestor)
if(length(which(tb>20))>10){
Names= names(tb[order(tb,decreasing=T)[1:8]])
}else{
Names=names(which(table(new_anno$Ancestor)>20))
}
new_anno=new_anno[which(new_anno$Ancestor %in% Names),]
new_anno=new_anno[,c('GO','Ancestor')]
colnames(new_anno)=c('SNP','Chromosome')
Anno=merge(new_anno,pathways)
a=names(which(table(Anno$Chromosome)>250))
a2=names(which(table(Anno$Chromosome)<250))
if(length(a)==0){
Anno=Anno[Anno$Chromosome %in% a2,]
}else{
a2=names(which(table(Anno$Chromosome)<250))
A=lapply(1:length(a),function(x){
Anno2=Anno[Anno$Chromosome %in% a[x],]
c1<- quantile(Anno2$pvalue, probs = 0.03)
Anno3=Anno2[Anno2$pvalue < c1 ,]
Anno4=Anno2[Anno2$pvalue > c1 ,]
Anno4=Anno4[sample(1:nrow(Anno4),150,replace = F),]
o=rbind(Anno3,Anno4)
})
A=do.call(rbind,A)
A2=Anno[Anno$Chromosome %in% a2,]
Anno=rbind(A,A2)
}
Anno$Position=sample(1:100000,nrow(Anno))
Anno=Anno[,c(1,2,4,3)]
if(save_pdf){
pic=CMplot(Anno,plot.type="c",
threshold=c(0.001,0.05)/nrow(pathways),threshold.col=c('red','orange'),
multracks=FALSE, H=2,axis.cex=2,chr.den.col=NULL,col=cols,
r=2.5,lab.cex=1.7,
file.output=T,file='pdf',height=height, width=width)
return(pic)
}else{
pic=CMplot(Anno,plot.type="c",
threshold=c(0.001,0.05)/nrow(pathways),threshold.col=c('red','orange'),
r=2,lab.cex=1.7,
multracks=FALSE, chr.den.col=NULL,H=2,axis.cex=1.7,col=cols,file.output=F)
return(pic)
}
}
}
if(!is.null(FindParaModule_obj)){
if(type_text_table){
Result=FindParaModule_obj$TotalResult
Result[,3:5]=round(Result[,3:5],2)
pic=ggtexttable(Result, rows = NULL, theme=text_table_theme)
if(save_pdf){
pic
ggsave('ParameterSelection_Table.pdf',width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}else{
result=FindParaModule_obj$TotalResult
result$minModuleSize=as.character(result$minModuleSize)
pic=ggpubr::ggline(result,
size=size,
x = "mergeCutHeight",
y = "Mean_Fraction",
linetype = "minModuleSize",
shape = "minModuleSize",
color = "minModuleSize",
title = "Parameter Selection",
xlab = "mergeCutHeight",
ylab = "Mean_Fraction",
palette = cols)+theme
if(save_pdf){
pic
ggsave('ParameterSelection.pdf',width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}
}
if(!is.null(ShowModule_obj)){
if(type_text_table){
NAME=names(ShowModule_obj)
output=paste0(NAME,'_Table.pdf')
p<-"
Result=ShowModule_obj[[xxx]]
colnames(Result)=c('GO','Description','Ancestor','AncestorGO')
pic=ggtexttable(Result, rows = NULL, theme=text_table_theme)
if(save_pdf){
pic
ggsave(output[xxx],width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
"
y=sapply(1:length(NAME),function(x) gsub('xxx',x,p))
pic=eval(parse(text =y))
return(pic)
}else{
NAME=names(ShowModule_obj)
output=paste0(NAME,'_WordCloud.png')
p<-"
words=ShowModule_obj[[NAME[xxx]]]$Name
engine <- worker()
segment <- segment(words, engine)
wordfreqs <- freq(segment)
wordf <- wordfreqs[order(wordfreqs$freq,decreasing = T),]
rm=c('of','in','by','for','via','process','regulation','lengthening','to','production',
'mediated','signaling','1-')
if( length(which(wordf$char %in% rm))==0){
wordf=wordf
}else{
wordf=wordf[-which(wordf$char %in% rm),]
}
num=ceiling(nrow(wordf)/4)
colors=rep('darkseagreen', nrow(wordf))
colors[1:4]='darkorange'
#wordf$freq=wordf$freq/max(wordf$freq)
if(nrow(wordf)>=40){
size=0.7
}else if(nrow(wordf)>=20 & nrow(wordf)<=40){
size=0.6
}else if(nrow(wordf)<20){
size=0.8
}
print(paste0(NAME[xxx],' ',nrow(wordf),' ',size))
my_graph <-wordcloud2(wordf,shape = 'circle',color = colors,backgroundColor =ba,
minRotation = -pi/8, maxRotation = pi/8,
shuffle = F,rotateRatio = 1,size=size)
if(save_pdf){
my_graph
saveWidget(my_graph,'tmp.html',selfcontained = F)
webshot('tmp.html',output[xxx], delay =6)
#return(my_graph)
}else{
my_graph
return(my_graph)
}
"
y=sapply(1:length(NAME),function(x) gsub('xxx',x,p))
pic=eval(parse(text =y))
return(pic)
}
}
if(!is.null(PathwaysModule_obj)){
if(type_text_table){
Result=PathwaysModule_obj$DE_PathwaysModule[,-4]
Result$Fraction=round(Result$Fraction,2)
Result$cor=round(Result$cor,2)
colnames(Result)=c('Modules','Num_Pathways','Fraction','P-adjusted','Correlation')
pic=ggtexttable(Result, rows = NULL, theme=text_table_theme)
if(save_pdf){
pic
ggsave('PathwaysModule_Table.pdf',width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}else if(volin){
data=PathwaysModule_obj$Matrix[,PathwaysModule_obj$ModuleResult$ID]
data=moduleEigengenes(data,PathwaysModule_obj$ModuleResult$cluster)$eigengenes
data$label=PathwaysModule_obj$Matrix[,'label']
#data$label=ifelse(data$label==control_label,'Control','Case')
colnames(data)[which(colnames(data)==paste0('ME',module))]='y'
label=NA
pic=ggstatsplot::ggbetweenstats(
data=data,
x = label,
y = y,
centrality.plotting =F,
point.args = list(position = ggplot2::position_jitterdodge(dodge.width = 0.6), alpha =
0.6, size = 3, stroke = 0, na.rm = TRUE),
type = "nonparametric",
p.adjust.method ='fdr'
)+ labs(
x = "Phenotype",
y = "Module EigenPathways",
#title = 'Distribution of Module Eigengenes across Phenotype',
title = paste0('Module',module)
) +
theme(
# This is the new default font in the plot
text = element_text(family = "serif", size = 8, color = "black"),
plot.title = element_text(
family = "serif",
size = 20,
face = "bold",
color = "#2a475e"
),
plot.subtitle = element_text(
family = "serif",
size = 15,
face = "bold",
color="#1b2838"
),
plot.title.position = "plot", # slightly different from default
axis.text = element_text(size = 10, color = "black"),
axis.title = element_text(size = 12)
)+
theme(
axis.ticks = element_blank(),
axis.line = element_line(colour = "grey50"),
panel.grid = element_line(color = "#b4aea9"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed"),
panel.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4"),
plot.background = element_rect(fill = "#fbf9f4", color = "#fbf9f4")
)
if(save_pdf){
pic
ggsave(paste0('PathwaysModule_ME',module,'_VolinPlot.pdf'),width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}else{
cluster=PathwaysModule_obj$ModuleResult
cluster$cluster=paste0('ME',cluster$cluster)
Result=PathwaysModule_obj$DE_PathwaysModule
if(nrow(Result)>10){
Result=Result[1:10,]
}
meta=cluster[cluster$cluster %in% Result$module,]
data=as.data.frame(PathwaysModule_obj$Matrix)
pdata=data[,meta$ID]
test=as.data.frame(t(pdata))
test$ID=rownames(test)
test=merge(test,meta,by='ID')
rownames(test)=test$ID
test=test[,-1]
test$cluster=as.factor(test$cluster)
test_umap <- uwot::umap(test, n_neighbors = n_neighbors,spread=spread,min_dist = min_dist,
y = test$cluster, target_weight = target_weight)
test_umap <- as.data.frame(test_umap)
test_umap$Modules=test$cluster
pic=ggpubr::ggscatter(test_umap,
x='V1',
y='V2',
size = size,
#title ="Highly Biologically Explainable Differential Module",
subtitle="A UMAP visualization",
color = "Modules",
alpha = alpha,
ellipse = ellipse,
ellipse.alpha=ellipse.alpha,
ellipse.type="norm",
palette =cols,
xlab = "UMAP_1",
ylab = "UMAP_2",
)+theme
if(save_pdf){
pic
ggsave('PathwaysModule_UMAP.pdf',width =width, height =height)
return(pic)
}else{
pic
return(pic)
}
}
}
}
#' Visualisation of significant pathway-level features
#'
#' @param BioM2_pathways_obj Results produced by BioM2(,target='pathways')
#' @param pathlistDB A list of pathways with pathway IDs and their corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param top Number of significant pathway-level features visualised
#' @param p.adjust.method p-value adjustment method.(holm", "hochberg", "hommel",
#' "bonferroni", "BH", "BY","fdr","none")
#' @param alpha The alpha transparency, a number in (0,1). Detail for scale_fill_viridis()
#' @param begin The (corrected) hue in (0,1) at which the color map begins. Detail for scale_fill_viridis().
#' @param end The (corrected) hue in (0,1) at which the color map ends. Detail for scale_fill_viridis()
#' @param option A character string indicating the color map option to use. Detail for scale_fill_viridis()
#' @param seq Interval of x-coordinate
#'
#' @return a ggplot2 object
#' @export
#' @import ggplot2
#' @import viridis
#' @importFrom stats p.adjust
PlotPathFearture=function(BioM2_pathways_obj=NULL,pathlistDB=NULL,top=10,p.adjust.method='none',begin=0.1,end=0.9,alpha=0.9,option='C',seq=1){
data=BioM2_pathways_obj$PathwaysResult
geneNum_pathways=sapply(1:length(pathlistDB),function(i) length(pathlistDB[[i]]))
pathlistDB=pathlistDB[which(geneNum_pathways > 20 & geneNum_pathways < 200 )]
data_top=data[1:top,]
if(p.adjust.method=='none'){
data_top$val=-log10(data_top$p.value)
}else{
data_top$val=-log10(p.adjust(data_top$p.value,method = p.adjust.method))
}
data_top=data_top[order(data_top$val),]
data_top$size=sapply(1:nrow(data_top),function(x) length(pathlistDB[[data_top$id[x]]]))
data_top$id=factor(data_top$id,levels = data_top$id)
max=max(data_top$val)+1
val=NA
id=NA
size=NA
term=NA
pic=ggplot(data_top) +
geom_col(aes(val, id,fill=size), width = 0.6)+
scale_fill_viridis(alpha=alpha,begin=begin,end=end,direction = -1,
name='size',option = option)+
labs(
x = '-log(p) ',
y = NULL,
title = paste0('Top ',top,' Pathway-Level Features'),
shape='-log(P-value)'
)+
scale_x_continuous(
limits = c(0, max),
breaks = seq(0, max, by = seq),
expand = c(0, 0),
position = "top"
) +
scale_y_discrete(expand = expansion(add = c(0, 0.5))) +
theme(
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "#A8BAC4", linewidth = 0.3),
axis.ticks.length = unit(0, "mm"),
#axis.title = element_blank(),
axis.line.y.left = element_line(color = "black"),
axis.text.y = element_text(family = "serif", size = 12,face = 'bold',colour='black'),
axis.text.x = element_text(family = "serif", size = 13,colour='black'),
axis.title = element_text(size = 18,family = "serif",face = 'bold.italic',vjust = 5),
plot.title = element_text(size = 20,family = "serif",face = 'bold'),
legend.text = element_text(family = "serif",face = 'bold'),
legend.title = element_text(size=13,family = "serif",face = 'bold'),
)+
geom_text(
data = data_top,
aes(0, y = id, label = term),
hjust = 0,
nudge_x = 0.5,
colour = "white",
family = "serif",
size = 6
)
return(pic)
}
#' Visualisation Original features that make up the pathway
#'
#' @param data The input omics data
#' @param pathlistDB A list of pathways with pathway IDs and their corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param FeatureAnno The annotation data stored in a data.frame for probe mapping.
#' It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param PathNames A vector.A vector containing the names of pathways
#' @param p.adjust.method p-value adjustment method.(holm", "hochberg", "hommel",
#' "bonferroni", "BH", "BY","fdr","none")
#' @param save_pdf Whether to save images in PDF format
#' @param alpha The alpha transparency, a number in (0,1).
#' @param cols palette (vector of colour names)
#'
#' @return a plot object
#' @export
#' @import ggplot2
#' @import CMplot
#' @import ggsci
#' @importFrom stats p.adjust
PlotPathInner=function(data=NULL,pathlistDB=NULL,FeatureAnno=NULL,PathNames=NULL,
p.adjust.method='none',save_pdf=FALSE,alpha=1,cols=NULL){
Result=list()
if(is.null(cols)){
cols = pal_d3("category20",alpha=alpha)(20)
}
featureAnno=FeatureAnno[FeatureAnno$ID %in% colnames(data),]
for(i in 1:length(PathNames)){
cpg_id=featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB[[PathNames[i]]])]
cpg_data=data[,c('label',cpg_id)]
cpg_0=cpg_data[which(cpg_data$label==unique(cpg_data$label)[1]),]
cpg_1=cpg_data[which(cpg_data$label==unique(cpg_data$label)[2]),]
pvalue=unlist(lapply(2:ncol(cpg_data),function(x) wilcox.test(cpg_0[,x],cpg_1[,x])$p.value))
if(p.adjust.method=='none'){
result=data.frame(SNP=cpg_id,Chromosome=rep(PathNames[i],length(cpg_id)),
Position=sample(1:100000,length(cpg_id)),pvalue=pvalue)
}else{
result=data.frame(SNP=cpg_id,Chromosome=rep(PathNames[i],length(cpg_id)),
Position=sample(1:100000,length(cpg_id)),pvalue=p.adjust(pvalue,method = p.adjust.method))
}
Result[[i]]=result
}
Result=do.call(rbind,Result)
pic=CMplot(Result,plot.type="c",
threshold=c(0.001,0.05),threshold.col=c('red','orange'),
multracks=FALSE, H=2,axis.cex=2,chr.den.col=NULL,col=cols,
r=2.5,lab.cex=2,outward = TRUE,signal.cex=2, signal.pch = 18,
file.output=save_pdf,file='pdf',height=13, width=13)
return(pic)
}
#' Correlalogram for Biological Differences Modules
#'
#' @param PathwaysModule_obj Results produced by PathwaysModule()
#' @param alpha The alpha transparency, a number in (0,1). Detail for scale_fill_viridis()
#' @param begin The (corrected) hue in (0,1) at which the color map begins. Detail for scale_fill_viridis().
#' @param end The (corrected) hue in (0,1) at which the color map ends. Detail for scale_fill_viridis()
#' @param option A character string indicating the color map option to use. Detail for scale_fill_viridis()
#' @param family calligraphic style
#' @return a ggplot object
#' @export
#' @import ggplot2
#' @importFrom ggstatsplot ggcorrmat
#' @import viridis
#'
#'
#'
PlotCorModule=function(PathwaysModule_obj=NULL,
alpha=0.7,begin=0.2,end=0.9,option="C",family="serif"){
colors=PathwaysModule_obj$ModuleResult$cluster
names(colors)=PathwaysModule_obj$ModuleResult$ID
ALL_eigengene=moduleEigengenes(PathwaysModule_obj$Matrix[,names(colors)],colors)$eigengenes
data=ALL_eigengene[,PathwaysModule_obj$DE_PathwaysModule$module]
pic=ggcorrmat(
data = data,
type = "nonparametric",
ggcorrplot.args = list(show.legend =T,pch.cex=10),
#title = "Correlalogram for Biological Differences Modules",
subtitle = " ",
caption = " "
)+theme(
# This is the new default font in the plot
text = element_text(family = family, size = 8, color = "black"),
plot.title = element_text(
family = family,
size = 20,
face = "bold",
color = "black"
),
plot.subtitle = element_text(
family = family,
size = 15,
face = "bold",
color="#1b2838"
),
plot.title.position = "plot", # slightly different from default
axis.text.x = element_text(size = 12, color = "black"),
axis.text.y = element_text(size = 12, color = "black"),
axis.title = element_text(size = 15)
)+scale_fill_viridis(alpha=alpha,begin=begin,end=end,direction = -1,
name='correlation',option = option)+
theme(legend.title =element_text(size = 10, color = "black"),
legend.text = element_text(size = 10,color = "black"))
pic$labels$caption=NULL
return(pic)
}
#' Network diagram of pathways-level features
#'
#' @param data The input omics data
#' @param pathlistDB A list of pathways with pathway IDs and their corresponding genes ('entrezID' is used).
#' For details, please refer to ( data("GO2ALLEGS_BP") )
#' @param FeatureAnno The annotation data stored in a data.frame for probe mapping.
#' It must have at least two columns named 'ID' and 'entrezID'.
#' (For details, please refer to data( data("MethylAnno") )
#' @param PathNames A vector.A vector containing the names of pathways
#' @param cutoff Threshold for correlation between features within a pathway
#' @param num The first few internal features of each pathway that are most relevant to the phenotype
#' @param BioM2_pathways_obj Results produced by BioM2()
#'
#' @return a ggplot object
#' @export
#' @import ggplot2
#' @import ggnetwork
#' @import igraph
#' @import ggsci
#' @import ggforce
#'
PlotPathNet=function(data=NULL,BioM2_pathways_obj=NULL,FeatureAnno=NULL,pathlistDB=NULL,PathNames=NULL,
cutoff=0.2,num=10){
featureAnno=FeatureAnno[FeatureAnno$ID %in% colnames(data),]
sub=list()
i=1
for(i in 1:length(PathNames)){
cpg_id=featureAnno$ID[which(featureAnno$entrezID %in% pathlistDB[[PathNames[i]]])]
cpg_data=data[,c('label',cpg_id)]
COR=stats::cor(cpg_data$label,cpg_data[,-1])
COR=ifelse(COR>0,COR,-COR)
names(COR)=cpg_id
len=length(which(COR>cutoff))
if(len>10 & len<num){
n=names(COR)[which(COR>cutoff)]
}else if(len >= num){
n=names(COR)[order(COR,decreasing = T)][1:num]
}else{
n=names(COR)[order(COR,decreasing = T)][1:10]
}
sub[[i]]=data[,n]
names(sub)[i]=PathNames[i]
}
result=list()
a=lapply(1:10, function(x){
data.frame(
label=colnames(sub[[x]]),
value=rep(names(sub)[x],length(sub[[x]]))
)
})
x=NA
y=NA
xend=NA
yend=NA
same.conf=NA
conf=NA
names(sub)=NULL
result$vertices=do.call(rbind,a)
comid=unique(result$vertices$label[duplicated(result$vertices$label)])
result$vertices=result$vertices[!duplicated(result$vertices$label),]
rownames(result$vertices)=result$vertices$label
nonid=setdiff(result$vertices$label,comid)
dd=do.call(cbind,sub)
dd=dd[,!duplicated(colnames(dd))]
cor_matrix <- stats::cor(dd)
upper_tri <- cor_matrix[upper.tri(cor_matrix)]
n <- nrow(cor_matrix)
upper_tri_matrix <- matrix(0, n, n)
upper_tri_matrix[upper.tri(upper_tri_matrix)] <- upper_tri
cor_matrix=upper_tri_matrix
colnames(cor_matrix)=colnames(dd)
rownames(cor_matrix)=colnames(dd)
df <- data.frame(from = character(n^2), to = character(n^2), Correlation = numeric(n^2))
count <- 1
for (i in 1:n) {
for (j in i:n) {
df[count, "from"] <- rownames(cor_matrix)[i]
df[count, "to"] <- rownames(cor_matrix)[j]
df[count, "Correlation"] <- cor_matrix[i, j]
count <- count + 1
}
}
df$Correlation=ifelse(df$Correlation>0,df$Correlation,-df$Correlation)
df=df[df$Correlation>0,]
DF=df[df$Correlation> 0.1,]
DF$same.conf=ifelse(result$vertices[DF$from,"value"]==result$vertices[DF$to,"value"],1,0)
DF1=DF[DF$same.conf==1,]
pname=BioM2_pathways_obj$PathwaysResult$id[1:10]
cor_matrix <- stats::cor(BioM2_pathways_obj$PathwaysMatrix[,pname])
upper_tri <- cor_matrix[upper.tri(cor_matrix)]
n <- nrow(cor_matrix)
upper_tri_matrix <- matrix(0, n, n)
upper_tri_matrix[upper.tri(upper_tri_matrix)] <- upper_tri
cor_matrix=upper_tri_matrix
colnames(cor_matrix)=colnames(BioM2_pathways_obj$PathwaysMatrix[,pname])
rownames(cor_matrix)=colnames(BioM2_pathways_obj$PathwaysMatrix[,pname])
df <- data.frame(from = character(n^2), to = character(n^2), Correlation = numeric(n^2))
count <- 1
for (i in 1:n) {
for (j in i:n) {
df[count, "from"] <- rownames(cor_matrix)[i]
df[count, "to"] <- rownames(cor_matrix)[j]
df[count, "Correlation"] <- cor_matrix[i, j]
count <- count + 1
}
}
df$Correlation=ifelse(df$Correlation>0,df$Correlation,-df$Correlation)
DF0=df[df$Correlation> 0,]
DF0=DF0[DF0$Correlation>quantile(DF0$Correlation, probs = 0.75),]
map=result$vertices[which(!duplicated(result$vertices$value)),]
rownames(map)=map$value
DF0$from=map[DF0$from,]$label
DF0$to=map[DF0$to,]$label
DF0$same.conf=rep(0,nrow(DF0))
DF=rbind(DF1,DF0)
result$edges=DF
fb.igra=graph_from_data_frame(result$edges[,1:2],directed = FALSE)
V(fb.igra)$conf=result$vertices[V(fb.igra)$name, "value"]
E(fb.igra)$same.conf=result$edges$same.conf
E(fb.igra)$lty=ifelse(E(fb.igra)$same.conf == 1, 1, 2)
pic<-ggplot(ggnetwork(fb.igra), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(linetype= as.factor(same.conf)),
#arrow = arrow(length = unit(6, "pt"), type = "closed") #if directed
color = "grey50",
curvature = 0.2,
alpha=0.8,
ncp=10,
linewidth=0.7
) +
geom_nodes(aes(color = conf),
size = 7,
alpha=0.5) +
scale_color_brewer("Pathways",
palette = 'Paired') +
scale_linetype_manual(values = c(2,1)) +
guides(linetype = "none") +
theme_blank()+
geom_nodes(aes(color = conf),
size = 4)+labs(title = 'Network Diagram of TOP 10 Pathway-Level Features')+
theme(legend.text = element_text(family = 'serif',face = 'bold.italic',color = 'grey15'),
legend.title = element_text(family = 'serif',face = 'bold'),
plot.title = element_text(family = 'serif',face = 'bold'))+
geom_mark_ellipse(
aes(fill=conf,label =conf),
alpha = 0.2,
show.legend = F
)+scale_fill_brewer(palette = 'Paired')+xlim(-0.05,1.05)+ylim(-0.05,1.05)
return(pic)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.