#' @title Conduct LFQ and assess performance of one specific LFQ workflow.
#' @description The ProteoLFQ enables the label-free quantification of proteomic
#' data and the performance assessment of each LFQ workflow from multiple perspectives. This
#' tool function gives the results based on each LFQ workflow and the criteria preferred and selected by the users.
#' For function definitions and descriptions please use "??ProteoLFQ" command in R.
#' @param data_q This input file should be numeric type except the first and second column containing the names and label (control or case) of the studied samples, respectively. The intensity data should be provided in this input file with the following order: samples in row and proteins/peptides in column. Missing values (NA) of protein intensity are allowed.
#' @param selectFile Input the name of your preferred strategies. The second column must hold one workflow per row, encoded as five dash-separated codes "tra-cen-sca-nor-imp" (transformation, centering, scaling, normalization, imputation). Sample data of this data type is in the working directory (in github) "idrblab/EVALFQ/data/selectworkflows.rda".
#' @param Ca Criterion (a): precision of LFQ based on the proteomes among replicates (Proteomics. 15:3140-51, 2015). If set 1, the user chooses to assess LFQ workflows using Criterion (a). If set 0, the user excludes Criterion (a) from performance assessment. The default setting of this value is "1".
#' @param Cb Criterion (b): classification ability of LFQ between distinct sample groups (Nat Biotechnol. 28:83-9, 2010). If set 1, the user chooses to assess LFQ workflows using Criterion (b). If set 0, the user excludes Criterion (b) from performance assessment. The default setting of this value is "1".
#' @param Cc Criterion (c): differential expression analysis by reproducibility-optimization (Nat Biotechnol. 32:896-902, 2014). If set 1, the user chooses to assess LFQ workflows using Criterion (c). If set 0, the user excludes Criterion (c) from performance assessment. The default setting of this value is "1".
#' @param Cd Criterion (d): reproducibility of the identified protein markers among different datasets (Mol Biosyst. 11:1235-40, 2015). If set 1, the user chooses to assess LFQ workflows using Criterion (d). If set 0, the user excludes Criterion (d) from performance assessment. Criterion (d) is only evaluated when there are at least 20 samples. The default setting of this value is "1".
#' @return Invisibly, a named list of lists (\code{Fpcv}, \code{Fpmad}, \code{Faccuracy}, \code{Fbar1num}, \code{Fbaronummean}, \code{Fbarosd}, \code{Fbarorsd}, \code{Fbaro_rsdr_to_bar1num}, \code{Fscore}). Each inner list holds one value per assessed workflow, keyed by the concatenated workflow codes. The per-criterion values are also printed while the function runs.
#' @import utils stats
#' @import metabolomics
#' @import affy vsn
#' @import MASS limma
#' @import ProteoMM ROTS
#' @importFrom grDevices colorRampPalette dev.off pdf
#' @useDynLib EVALFQ
#' @importFrom Rcpp sourceCpp
#' @rawNamespace import(dplyr, except=c(filter,lag,select,combine))
#' @rawNamespace import(gplots, except=lowess)
#' @importFrom pcaMethods pca
#' @importFrom pcaMethods completeObs
#' @import impute
#' @usage lfqevalupart(data_q, selectFile, Ca="1", Cb="1", Cc="1", Cd="1")
#' @export lfqevalupart
lfqevalupart <- function(data_q, selectFile, Ca="1", Cb="1", Cc="1", Cd="1"){
  # Input layout: samples in rows, features in columns.

  # ---- step dispatchers ------------------------------------------------------
  # Each dispatcher maps a workflow code to the matching processing routine and
  # raises an error on an unknown code, so the surrounding try() skips the
  # workflow instead of silently continuing with NULL.

  # Transformation step.
  trans <- function(data, n) {
    switch(
      as.character(n),
      "1" = Box_Cox(data),
      "2" = log2(data),
      "3" = data,
      stop("unknown transformation code: ", n, call. = FALSE)
    )
  }

  # Centering step.
  cents <- function(data, n) {
    switch(
      as.character(n),
      "1" = MEC(data),
      "2" = MDC(data),
      "3" = data,
      stop("unknown centering code: ", n, call. = FALSE)
    )
  }

  # Scaling step; returns the factor the centered data is divided by.
  # NOTE(review): `name_sca` below lists "RAN" at position 4 and "VAS" at
  # position 5, whereas code 4 calls VAST1 and code 5 calls RANGE1 here.
  # One of the two orders looks swapped -- confirm which is intended.
  scals <- function(data, n) {
    switch(
      as.character(n),
      "1" = 1,
      "2" = AUTO1(data),
      "3" = PARETO1(data),
      "4" = VAST1(data),
      "5" = RANGE1(data),
      stop("unknown scaling code: ", n, call. = FALSE)
    )
  }

  # Normalization step; every alternative returns the transposed result.
  # `label` is only consumed by the EIGENMS alternative.
  normalise <- function(data, n, label) {
    switch(
      as.character(n),
      "1" = t(fastlo(as.matrix(data))),
      "2" = t(EIGENMS(data, label)),
      "3" = t(LOWESS(data)),
      "4" = t(SMAD(data)),
      "5" = t(MEAN(data)),
      "6" = t(MEDIAN(data)),
      "7" = t(data),
      "8" = t(PQN(data)),
      "9" = t(QUANTILE(as.matrix(data))),
      "10" = t(RLR1(data)),
      "11" = t(MSTUS(data)),
      "12" = t(TMM(data)),
      "13" = t(VSN(as.matrix(data))),
      stop("unknown normalization code: ", n, call. = FALSE)
    )
  }

  # Imputation step, applied to the filtered data passed in as `data`.
  impute_data <- function(data, n) {
    switch(
      as.character(n),
      "1" = data,
      "2" = t(back(data)),
      "3" = t(bpca(data, nPcs = 3)),
      "4" = t(censor(data)),
      "5" = t(knn(data, k = 10)),
      "6" = t(svdm(data, nPcs = 3)),
      "7" = t(zero(data)),
      stop("unknown imputation code: ", n, call. = FALSE)
    )
  }

  # ---- Criterion (d) helpers -------------------------------------------------

  # Consistency score of the top-ranked features across random sample folds.
  # `control.x` / `case.x` hold features in rows and samples in columns.
  consistency <- function(control.x, case.x, fold = 5, top = 20) {
    fold_ids <- seq_len(fold)
    # split() recycles the fold ids over the shuffled sample indices; the
    # resulting recycling warning is expected and can be ignored.
    control_folds <- split(sample(seq_len(ncol(control.x))), fold_ids)
    case_folds <- split(sample(seq_len(ncol(case.x))), fold_ids)

    ranked <- vector("list", fold)
    for (f in fold_ids) {
      ctrl_idx <- control_folds[[f]]
      case_idx <- case_folds[[f]]
      com.x <- cbind(control.x[, ctrl_idx, drop = FALSE],
                     case.x[, case_idx, drop = FALSE])
      design <- cbind(Grp1 = 1,
                      Grp2vs1 = c(rep(0, length(ctrl_idx)),
                                  rep(1, length(case_idx))))
      fit <- limma::eBayes(limma::lmFit(com.x, design))
      ranking <- rownames(limma::topTable(fit, coef = 2, number = nrow(com.x)))
      # head() instead of [1:top] so that fewer than `top` features do not
      # introduce NA entries that would count as a shared "feature".
      ranked[[f]] <- head(ranking, top)
    }
    names(ranked) <- LETTERS[fold_ids]

    venn <- overLapper(setlist = ranked, sep = "", type = "vennsets")$Venn_List
    # Set names are concatenated without separator, so the number of
    # characters equals the number of folds sharing the features.
    n_sets <- nchar(names(venn))
    shared <- n_sets >= 2
    sum(2^(n_sets[shared] - 2) * lengths(venn)[shared])
  }

  # Average consistency score over `repeats` random fold assignments.
  stabel.score <- function(control.x, case.x, repeats = 20, fold = 5, top = 10) {
    total <- 0
    for (r in seq_len(repeats)) {
      total <- total + consistency(control.x, case.x, fold, top)
    }
    total / repeats
  }

  failed <- function(x) inherits(x, "try-error")

  # ---- input preparation -----------------------------------------------------
  name_tra <- c("BOX", "LOG", "NON")
  name_cen <- c("MEC", "MDC", "NON")
  name_sca <- c("NON", "ATO", "PAR", "RAN", "VAS")
  name_nor <- c("CYC", "EIG", "LOW", "MAD", "MEA", "MED", "NON", "PQN", "QUA",
                "RLR", "TIC", "TMM", "VSN")
  name_imp <- c("NON", "BAK", "BPC", "CEN", "KNN", "SVD", "ZER")

  dataa <- data_q
  rownames(dataa) <- dataa[, 1]
  # Kept constant for the whole run; the factor version is only used when
  # building the labelled data frame.
  label <- dataa[, 2]
  label_factor <- as.factor(as.character(label))
  # Features in rows, samples in columns from here on.
  train_data <- data.matrix(data.frame(t(dataa[, -(1:2)])), rownames.force = NA)

  results <- list(
    Fpcv = list(), Fpmad = list(), Faccuracy = list(),
    Fbar1num = list(), Fbaronummean = list(), Fbarosd = list(),
    Fbarorsd = list(), Fbaro_rsdr_to_bar1num = list(), Fscore = list()
  )
  n_assessed <- 0

  # ---- one pass per requested workflow ---------------------------------------
  for (s in seq_len(nrow(selectFile))) {
    workflow <- as.character(selectFile[s, 2])
    codes <- unlist(strsplit(workflow, "-"))
    tra <- codes[1]; cen <- codes[2]; sca <- codes[3]; nor <- codes[4]; imp <- codes[5]
    message(sprintf("tra:%s; ", tra), sprintf("cen:%s; ", cen), sprintf("sca:%s; ", sca), sprintf("nor:%s; ", nor), sprintf("imp:%s; ", imp))
    if (anyNA(codes[1:5])) {
      message("Workflow '", workflow, "' is not of the form tra-cen-sca-nor-imp, skipped.")
      next
    }

    transformed <- try(trans(train_data, tra))
    if (failed(transformed)) next

    # Untransformed data is always processed without centering and scaling
    # and normalized with VSN, whatever the remaining codes say.
    if (tra == "3") {
      cen <- "3"
      sca <- "1"
      nor <- "13"
    }

    transformed[is.infinite(data.matrix(transformed))] <- NA
    centered <- try(cents(transformed, cen))
    if (failed(centered)) next
    scal_factor <- try(scals(transformed, sca))
    if (failed(scal_factor)) next
    scaled <- try(centered / scal_factor)
    if (failed(scaled)) next
    scaled[is.nan(scaled)] <- NA
    scaled[is.infinite(scaled)] <- NA

    normalized_data <- try(normalise(scaled, nor, label))
    if (failed(normalized_data)) next
    filtered <- try(Basicfilter(normalized_data, label, g1 = 2, g2 = 2))
    if (failed(filtered)) next

    if (imp == "1") {
      imputed_data <- filtered
    } else {
      imputed_data <- try(impute_data(filtered, imp))
      if (failed(imputed_data)) next
    }

    ##### Feature Selection
    n_assessed <- n_assessed + 1
    rots.out <- try(ROTS(data = t(imputed_data), groups = as.character(label),
                         B = 200, K = 500, seed = 1234, log = FALSE))
    if (failed(rots.out)) next

    # Column 1 is the label, feature k sits in column k + 1.
    im.data <- data.frame(label = label_factor, imputed_data)

    key <- paste0(tra, cen, sca, nor, imp)
    # Banner shows the workflow that was effectively applied.
    banner <- paste(
      "Assessing", paste0(n_assessed, "."),
      paste0(name_tra[as.numeric(tra)], "-", name_cen[as.numeric(cen)], "-",
             name_sca[as.numeric(sca)], "-", name_nor[as.numeric(nor)], "-",
             name_imp[as.numeric(imp)])
    )

    #(a) Precision of LFQ Based on the Proteomes among Replicates
    if (Ca == 1) {
      cat(paste(banner, "Under Criteria A: Precision"), "\n")
      result <- PCV1(im.data)
      pcv <- round(1000 * mean(as.numeric(result[, 3]))) / 1000
      pmad <- try(mean(PMAD(im.data)))
      # A failing PMAD abandons the remaining criteria of this workflow.
      if (failed(pmad)) next
      results$Fpcv[[key]] <- pcv
      results$Fpmad[[key]] <- pmad
      print(pmad)
    } else {
      message("'Criteria A: Precision' cannot be evaluated, Please Check!")
    }

    #(b) Classification Ability of LFQ between Distinct Sample Groups
    if (Cb == 1) {
      cat(paste(banner, "Under Criteria B: Classification.Ability"), "\n")
      significant <- which(rots.out$FDR < 0.05)
      if (length(significant) <= 10) {
        # Fall back to the (at most) 20 smallest p-values; + 1 skips the
        # label column exactly like the FDR branch does.
        markerid <- head(order(rots.out$pvalue), 20) + 1
      } else {
        markerid <- significant + 1
      }
      clusters <- hclust(dist(im.data[, markerid, drop = FALSE]))
      clusterCut <- cutree(clusters, 2)
      tmatrix <- table(clusterCut, label)
      accuracy <- (tmatrix[1, 1] + tmatrix[2, 2]) / length(label)
      results$Faccuracy[[key]] <- accuracy
      print(accuracy)
    } else {
      message("'Criteria B: Classification.Ability' cannot be evaluated, Please Check!")
    }

    #(c) Differential Expression Analysis Based on Reproducibility-optimization
    if (Cc == 1) {
      cat(paste(banner, "Under Criteria C: Differential.Expression"), "\n")
      sdres <- affy::hist(rots.out$pvalue, breaks = seq(0, 1, 0.05))
      # First bin (p < 0.05) against the spread of all remaining bins.
      bar1num <- sdres$counts[1]
      baronummean <- mean(sdres$counts[-1])
      barosd <- sd(sdres$counts[-1])
      barorsd <- barosd / baronummean
      rsd_ratio <- barorsd / bar1num
      results$Fbar1num[[key]] <- bar1num
      results$Fbaronummean[[key]] <- baronummean
      results$Fbarosd[[key]] <- barosd
      results$Fbarorsd[[key]] <- barorsd
      results$Fbaro_rsdr_to_bar1num[[key]] <- rsd_ratio
      print(rsd_ratio)
    } else {
      message("'Criteria C: Differential.Expression' cannot be evaluated, Please Check!")
    }

    #(d) Reproducibility of the Identified Protein Markers among Different Datasets
    if (Cd == 1 && length(label) >= 20) {
      cat(paste(banner, "Under Criteria D: Reproducibility"), "\n")
      groups <- names(table(label))
      # NOTE(review): `imputed_data` appears to carry no label column (the
      # label is only prepended in `im.data`), so `-1` presumably drops the
      # first feature rather than a label. Kept unchanged -- confirm intent.
      control.x <- as.data.frame(t(imputed_data[label == groups[1], -1]))
      case.x <- as.data.frame(t(imputed_data[label == groups[2], -1]))
      score <- try(stabel.score(control.x, case.x, repeats = 200, fold = 5, top = 20))
      if (!failed(score)) results$Fscore[[key]] <- score
      print(score)
    } else {
      message("'Criteria.D-Reproducibility' cannot be evaluated, Please Check!")
    }
  }
  ###############################################End##########################################################################
  invisible(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.