R/xMSwrapper.XCMS.matchedFilter.R

Defines functions xMSwrapper.XCMS.matchedFilter

Documented in xMSwrapper.XCMS.matchedFilter

xMSwrapper.XCMS.matchedFilter <-
function(cdfloc, XCMS.outloc, xMSanalyzer.outloc,
         step.list=c(0.001,0.01,0.1), mz.diff.list=c(0.001,0.01,0.1), sn.thresh.list=c(10),
         max=50, bw=c(10), minfrac.val=0.5, minsamp=2, mzwid=0.25, sleep=0,
         retcor.family="symmetric", retcor.plottype="mdevden", groupval.method="medret",
         numnodes=2, run.order.file=NA, max.mz.diff=15, max.rt.diff=300,
         merge.eval.pvalue=0.2, mergecorthresh=0.7, deltamzminmax.tol=10, num_replicates=2,
         subs=NA, mz.tolerance.dbmatch=10, adduct.list=c("M+H"), samp.filt.thresh=0.70,
         feat.filt.thresh=50, cormethod="pearson", mult.test.cor=TRUE,
         missingvalue=0, ignore.missing=TRUE, sample_info_file=NA, refMZ=NA,
         refMZ.mz.diff=10, refMZ.time.diff=NA, void.vol.timethresh=30, replacezeroswithNA=TRUE,
         scoreweight=30, filepattern=".cdf", charge_type="pos", minexp.pct=0.1, syssleep=0.5)
{
suppressWarnings(sink(file=NULL))

	# Build a timestamped log file name from the current date and time
	x<-date()
	x<-strsplit(x,split=" ")
	targeted_feat_raw<-{}
	x1<-unlist(x)
	#fname<-paste(x1[2:5],collapse="_")
	#fname<-paste(x1[2:3],x1[5],x1[4],collapse="_")

	fname<-paste(x1[2:3],collapse="")
	fname<-paste(fname,x1[5],sep="")
	x1[4]<-gsub(x1[4],pattern=":",replacement="_")
	fname<-paste(fname,x1[4],sep="_")

	fname<-paste(xMSanalyzer.outloc,"/Log_",fname,".txt",sep="")
	print(paste("Program is running. Please check the logfile for runtime status: ",fname,sep=""))

	# Containers for per-parameter-setting results, merged results, and QC summaries
	data_rpd_all=new("list")
	data_rpd=new("list")
	union_list=new("list")
	feateval_list<-new("list")
	rsqres_list<-new("list")
	annot.res<-{}
	annotres_list<-new("list")
	best_score<-(-1000000)
	
	# Identify spectral files and, if a sample information file is provided,
	# check that the number of files matches the number of entries in the mapping
	if(is.na(sample_info_file)==FALSE)
	{
		sampleid_mapping<-read.table(sample_info_file,sep="\t",header=TRUE)

		if(is.na(cdfloc)==FALSE){
			l1<-list.files(cdfloc,pattern=filepattern)
			minexp<-minfrac.val*length(l1)
		}else{
			l1<-rep(1,num_replicates)
		}
		if(length(l1)!=dim(sampleid_mapping)[1] & (is.na(cdfloc)==FALSE))
		{
			num_mis_files<-dim(sampleid_mapping)[1]-length(l1)
			stop(paste("ERROR: Only ",length(l1)," spectral files were found. ",num_mis_files," files are missing.",sep=""))
		}
	}else{
		if(is.na(cdfloc)==FALSE){
			l1<-list.files(cdfloc,pattern=filepattern)
			minexp<-minfrac.val*length(l1)
		}else{
			l1<-rep(1,num_replicates)
		}
	}

if(length(l1)%%num_replicates>0)
{stop(paste("ERROR: Not all samples have ",num_replicates," replicates.",sep=""))
}
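# Note: downstream sample-level QC (evaluate.Samples) treats every num_replicates
# consecutive intensity columns as technical replicates of one sample, so spectral
# files are expected to sort such that each sample's replicates are adjacent.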

dir.create(XCMS.outloc,showWarnings=FALSE)
	dir.create(xMSanalyzer.outloc,showWarnings=FALSE)
	#subdir1<-paste(xMSanalyzer.outloc,"/Quality_assessment_files",sep="")
	#subdir2<-paste(xMSanalyzer.outloc,"/XCMS_filtered_data",sep="")
	#subdir3<-paste(xMSanalyzer.outloc,"/XCMS_with_xMSanalyzer_merged_data",sep="")
	
	
	sink(fname)
	print(sessionInfo())

	if(is.na(refMZ)==FALSE){
                        stddata<-read.table(refMZ,sep="\t",header=TRUE)
                        print(refMZ)
                        print(head(stddata))

                }else{
                        if(charge_type=="pos"){
                        data(example_target_list_pos)
                        stddata<-example_target_list_pos
                        }else{

                                if(charge_type=="neg"){
                                                data(example_target_list_neg)
                                                stddata<-example_target_list_neg
                                        }else{
                                                stop("Invalid option. \'charge_type\' should be \'pos\' or \'neg\'.")
                                                }
                                }
                }
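	# stddata now holds the reference/target m/z list; it is passed to
	# XCMS.align.matchedFilter below and reused in Stage 4 for targeted feature evaluation.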


        ############################################
        #1) Align profiles using the XCMS.align.matchedFilter wrapper function (xcms matchedFilter method)
	 if(is.na(XCMS.outloc)==TRUE)
        {
                stop("Undefined value for parameter, XCMS.outloc. Please define the XCMS output location.")

        }
         if(is.na(xMSanalyzer.outloc)==TRUE)
        {
                stop("Undefined value for parameter, xMSanalyzer.outloc. Please define the xMSanalyzer output location.")

        }


        if(is.na(cdfloc)==FALSE)
        {
                setwd(cdfloc)
                
                if(is.na(XCMS.outloc)==FALSE)
                {
                    
                    
                        data_rpd_all=XCMS.align.matchedFilter(cdfloc, XCMS.outloc, step.list = step.list, mz.diff.list = mz.diff.list,
                         sn.thresh.list = sn.thresh.list, max = max, bw.val = bw, minfrac.val = minfrac.val,
                         minsamp.val = minsamp, mzwid.val = mzwid, sleep.val = sleep, run.order.file = run.order.file,
                         subs = subs, retcor.family = retcor.family, retcor.plottype = retcor.plottype,
                         groupval.method = groupval.method,target.mz.list = stddata)
                    
                    
                    
                        
                        #data_rpd_all=XCMS.align.matchedFilter(cdfloc, XCMS.outloc,step.list, mz.diff.list, sn.thresh.list, max, bw, minfrac.val, minsamp, mzwid, sleep,
                        #run.order.file,subs, retcor.family, retcor.plottype,groupval.method)
			
                       
                }
                else
                {
                        stop("Undefined value for parameter, XCMS.outloc. Please define the output location.")
                }
        }
	
	setwd(XCMS.outloc)
	alignmentresults<-list.files(XCMS.outloc, "*.txt")
	print("Files found in XCMS output location:")
	print(alignmentresults)
	for(i in 1:length(alignmentresults)){
		data_rpd_all[[i]]<-read.table(paste(XCMS.outloc,"/",alignmentresults[i],sep=""),header=TRUE)
	}
	
	
	
	subdir1<-paste(xMSanalyzer.outloc,"/Stage1",sep="")   #QC of individual parameter settings
	subdir2<-paste(xMSanalyzer.outloc,"/Stage2",sep="")   #Data filtering
	subdir3<-paste(xMSanalyzer.outloc,"/Stage3a",sep="")  #Data merging/parameter optimization
	subdir3b<-paste(xMSanalyzer.outloc,"/Stage3b",sep="") #Final (pre-batch-effect correction) feature tables
	subdir4a<-paste(xMSanalyzer.outloc,"/Stage4a",sep="") #Raw QC: batch-effect evaluation, TIC, etc.
	
	
	dir.create(subdir1,showWarnings=FALSE)
	dir.create(subdir2,showWarnings=FALSE)
	dir.create(subdir3,showWarnings=FALSE)
	dir.create(subdir3b,showWarnings=FALSE)
	dir.create(subdir4a,showWarnings=FALSE)
	
	 if(is.na(sample_info_file)==FALSE)
    {
    	
    	subdir4b<-paste(xMSanalyzer.outloc,"/Stage4b",sep="")	 #Batch-effect corrected QC: batch effect eval, TIC, etc
	
		dir.create(subdir4b,showWarnings=FALSE)
		
	}
	
	if(is.na(adduct.list)==FALSE){
	subdir5<-paste(xMSanalyzer.outloc,"/Stage5",sep="")	 #Putative unprocessed annotations;
	

	dir.create(subdir5,showWarnings=FALSE)
	}
		bestscore<-(-1000000)
	
        {
                #stop("Undefined value for parameter, cdfloc. Please enter path of the folder where the CDF files to be processed are located.")
                #change location to the output folder
                setwd(XCMS.outloc)
                alignmentresults<-list.files(XCMS.outloc, "*.txt")
                
                if(length(data_rpd_all)>0)
                {
                          curdata_dim={}
                          # Feature reproducibility metric: PID for duplicates, CV for 3+ replicates
                          if(num_replicates==2){
                                  fileroot="_PID"
                          }else{
                                  if(num_replicates>2){
                                          fileroot="_CV"
                                  }else{
                                          fileroot=""
                                          stop("Need at least 2 technical replicates per sample.")
                                  }
                          }
                          #for(i in 1:length(alignmentresults))
                          
                          cat("\n")
                          print("*******xMSanalyzer Stage 1: QC evaluation of invidual parameters*******")
                          cat("\n")
                          for(i in 1:length(data_rpd_all))
                          {
                                  print("Dimensions of the feature table:")
                                  print(dim(data_rpd_all[[i]]))

                                  print(paste("**Evaluating XCMS results from parameter setting ",i,"**",sep=""))
                                  ############################################
                                  #2)Calculate pairwise correlation coefficients
                                  file_name=sapply(strsplit(alignmentresults[i],".txt"),head)
                                  #curdata=read.table(paste(XCMS.outloc,"/",alignmentresults[i],sep=""),header=TRUE)
                                  #curdata=check.mz.in.replicates(curdata)
                                  curdata=data_rpd_all[[i]]
                                  #############################################
                                  ############################################
                                  #3) Calculate Percent Intensity Difference
			                   if(num_replicates>1)
                                  {
							
								
								
								
								feat.eval.result=evaluate.Features(curdata, numreplicates=num_replicates,min.samp.percent=0.6,alignment.tool="XCMS",impute.bool=TRUE)
								cnames=colnames(feat.eval.result)
								feat.eval.result<-apply(feat.eval.result,2,as.numeric)
								feat.eval.result<-as.data.frame(feat.eval.result)
								feat.eval.result.mat=cbind(curdata[,c(1:8)],feat.eval.result)  
								feat.eval.outfile=paste(subdir1,"/",file_name,fileroot,"featureassessment.txt",sep="")					  
								#write results
								write.table(feat.eval.result.mat, feat.eval.outfile,sep="\t", row.names=FALSE)
								
                                curdata<-curdata[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
                                
                                feat.eval.result.mat<-feat.eval.result.mat[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
								
								#curdata<-cbind(curdata,feat.eval.result[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),])
								
								curdata<-as.data.frame(curdata)
								curdata<-replace(as.matrix(curdata),which(is.na(curdata)==TRUE),0)
								
                                if(is.na(deltamzminmax.tol)==FALSE){
                                    print("filtering by delta m/z")
                                    mz_min_max<-cbind(curdata[,2],curdata[,3])
                                    mz_min_max<-as.data.frame(mz_min_max)
                                    
                                    deltappm_res<-apply(mz_min_max,1,get_deltappm)
                                    
                                    curdata<-curdata[which(as.numeric(deltappm_res)<=deltamzminmax.tol),]
                                    feat.eval.result.mat<-feat.eval.result.mat[which(as.numeric(deltappm_res)<=deltamzminmax.tol),]
                                }
                                
                                feateval_list[[i]]<-feat.eval.result.mat
                                data_rpd[[i]]<-curdata
								
								  if(num_replicates>1)
								  {
									  print(paste("**calculating pairwise ",cormethod," correlation**",sep=""))

									  
									  
											# use a temporary variable so that rsqres_list can accumulate
											# the sample-level results from each parameter setting
											sample_eval_res<-evaluate.Samples(curdata, num_replicates, alignment.tool="XCMS", cormethod,missingvalue,ignore.missing)

											rsqres<-as.data.frame(sample_eval_res$cor.matrix)

											curdata<-as.data.frame(sample_eval_res$feature.table)
											snames<-colnames(curdata[,-c(1:8)])
											snames_1<-snames[seq(1,length(snames),num_replicates)]
											rownames(rsqres)<-snames_1
											pcor_outfile=paste(subdir1,"/",file_name,"_sampleassessment_usinggoodfeatures.txt",sep="")
											write.table(rsqres, pcor_outfile,sep="\t",row.names=TRUE)
											rsqres_list[[i]]<-rsqres
								  }
								  else
								  {
									  print("**skipping sample evaluataion as only one replicate is present**")
								  }
									
								if(num_replicates>2)
								{
									bad_samples<-which(rsqres$meanCorrelation<samp.filt.thresh)
								}else
								{
									bad_samples<-which(rsqres$Correlation<samp.filt.thresh)
								}
								
								if(length(bad_samples)>0){
									bad_sample_names<-snames_1[bad_samples]
									
									feat.eval.outfile=paste(subdir1,"/",file_name,"_badsamples_at_cor",samp.filt.thresh,".txt",sep="")
									bad_sample_names<-as.data.frame(bad_sample_names)
									colnames(bad_sample_names)<-paste("Samples with correlation between technical replicates <", samp.filt.thresh,sep="")
									write.table(bad_sample_names, file=feat.eval.outfile,sep="\t", row.names=FALSE)
								}
								
								bad_list={}
								if(length(bad_samples)>0)
								{
									for(n1 in 1:length(bad_samples))
									{	
										if(bad_samples[n1]>1)
										{
											bad_samples[n1]=bad_samples[n1]+(bad_samples[n1]-1)*(num_replicates-1)
										}
											
									}
									for(n1 in 1:num_replicates)
									{
										bad_list<-c(bad_list,(bad_samples+n1-1))
									}
									bad_list<-bad_list[order(bad_list)]
									
								}
								if(i>1){
										parent_bad_list<-intersect(parent_bad_list,bad_list)
									}
									else{
									    
									    parent_bad_list<-bad_list

									}
								
								
							
                                  }
				  else
				  {
					  print("**skipping feature evaluataion as only one replicate is present**")
				  }
                                
	       	}
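		# parent_bad_list now holds the replicate columns flagged as poorly correlated
		# under every parameter setting (intersection across settings); they are removed in Stage 2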
		
		cat("\n")
		 print("********Stage 2: Filtering results from each parameter setting based on sample and feature quality checks*******")
		 cat("\n")
		  for(i in 1:length(alignmentresults))
                          {
                                
				 
				 
				    file_name=sapply(strsplit(alignmentresults[i],".txt"),head)
                                  feat.eval.file=paste(subdir2,"/",file_name,"cor",samp.filt.thresh,fileroot,feat.filt.thresh,"filtereddata.txt",sep="")	
                                  #data_rpd[[i]]=read.table(feat.eval.file,header=TRUE)
				  
				  curdata<-data_rpd[[i]]
				 
				  feat.eval.result.mat<-feateval_list[[i]]
				  if(length(parent_bad_list)>0){
					  
					 
					  curdata<-curdata[,-c(parent_bad_list+8)]
					  #maxint<-apply(curdata[,-c(1:4,((dim(curdata)[2]-6):dim(curdata)[2]))],1,max)
					  maxint<-apply(curdata[,-c(1:8)],1,max)
					  badfeats<-which(maxint==0)
					  if(length(badfeats)>0){
						curdata<-curdata[-c(badfeats),]
						
						
						
						  feat.eval.result.mat<- feat.eval.result.mat[-c(badfeats),]
						  feateval_list[[i]]<-feat.eval.result.mat
					  }
					
					}
				 data_rpd[[i]]<-curdata[which(as.numeric(feat.eval.result.mat$median)<=feat.filt.thresh),]
				 
				 
				 
				 feat.eval.outfile=paste(subdir2,"/",file_name,"cor",samp.filt.thresh,fileroot,feat.filt.thresh,"filtereddata.txt",sep="")	
								
				 #write results
				 write.table(data_rpd[[i]], feat.eval.outfile,sep="\t", row.names=FALSE)
				 
				 feat.eval.outfile=paste(subdir1,"/",file_name,fileroot,"featureassessment.txt",sep="")					  
								#write results
								write.table(feateval_list[[i]], feat.eval.outfile,sep="\t", row.names=FALSE)
							
				 
			 }
                          ###########################################
                          #4) Merge two or more parameter settings
                          cat("\n")
                          print("*************Stage 3a: Merging features detected at different parameter settings********************")
                          cat("\n")
                          num_pairs=1
                          finalres={}
                          rnames={}
		         
                          for(i in 1:length(alignmentresults))
                          {
                                  file_name=sapply(strsplit(alignmentresults[i],".txt"),head)
                                
                                  #feat.eval.file=paste(xMSanalyzer.outloc,"/",file_name,"cor",samp.filt.thresh,fileroot,feat.filt.thresh,"filtereddata.txt",sep="")	
				  #data_rpd[[i]]=read.table(feat.eval.file,header=TRUE)
                                  a1=sapply(strsplit(as.character(alignmentresults[i]), "\\_thresh"), head, n=2)[2]
                                  a2=sapply(strsplit(as.character(a1), "\\_"), head, n=2)
                                  threshval=as.numeric(a2[1])
                                  stepval=sapply(strsplit(as.character(a2[2]), "\\."), head, n=2)[2]
                                  stepval=paste(".",stepval,sep="")
                                  stepval=as.numeric(stepval)
                                  a1=sapply(strsplit(as.character(alignmentresults[i]), "\\_mzdiff"), head, n=2)[2]
                                  a2=sapply(strsplit(as.character(a1[1]), "\\_max"), head, n=2)
                                  mzdiff=as.numeric(a2[1])
                                  a2=sapply(strsplit(as.character(a2[2]), "\\.txt"), head, n=2)
                                  max=as.numeric(a2[1])
                                  p1=paste(threshval,"_",stepval,"_",mzdiff,"_",max,sep="")

								# bool_num controls whether the (i,j) pair is merged:
								# the self-pair (i==j) is skipped unless only one parameter setting is available
								bool_num<-1

                                  for(j in i:length(alignmentresults))
                                  {
                                  	bool_num<-1
                                  	if(i==j){
                                  		if(length(alignmentresults)>1){
                                  			bool_num<-0
                                  		}else{
                                  			bool_num<-1
                                  		}
                                  	}
				                	#if(i!=j)
				                	 if(bool_num==1)
				    				  {
				          file_name=sapply(strsplit(alignmentresults[j],".txt"),head)
					
                                          
                                          
                                         
                                         # if(i!=j)
                                          {
                                                  p1_p2=paste("p",i,"_U_","p",j,sep="")
                                          }
                                          #else
                                          #{
                                       #           p1_p2="p1"
                                          #}
					  
					   feat.eval.A<-feateval_list[[i]]
					 feat.eval.B<-feateval_list[[j]]
					 feat.eval.A<-feat.eval.A[which(as.numeric(feat.eval.A$median)<=feat.filt.thresh),]
					 feat.eval.B<-feat.eval.B[which(as.numeric(feat.eval.B$median)<=feat.filt.thresh),]
					 
					 #data_m=merge.Results(data_rpd[[i]],data_rpd[[j]],feateval[[i]],feateval[[j]],max.mz.diff,max.rt.diff,merge.eval.method,merge.eval.pvalue,alignment.tool="XCMS",
					
					print(paste("Number of good quality features from setting ",i,":", dim(data_rpd[[i]])[1],sep=": "))
					if(i!=j)
                                          {
					print(paste("Number of good quality features from setting ",j,":",dim(data_rpd[[j]])[1],sep=": "))
						}
                                         data_m=merge.Results(data_rpd[[i]],data_rpd[[j]],feat.eval.A,feat.eval.B,max.mz.diff,max.rt.diff, merge.eval.pvalue,alignment.tool="XCMS",
					numnodes=numnodes, mult.test.cor,mergecorthresh,missingvalue)
					data_m<-unique(data_m)
					
					numcols<-dim(data_m)[2]



					 data_m_int<-data_m[,-c(1:8,(numcols-7):numcols)]
					numsamps<-dim(data_m_int)[2]/num_replicates
					
                                           numcols<-dim(data_m)[2]

if(is.na(minexp.pct)==FALSE)
{
    # Keep only features detected (non-missing) in at least minexp.pct of the intensity columns;
    # column 8 of data_m corresponds to NumPres.All.Samples
    minexp<-minexp.pct*dim(data_m_int)[2]

    if(length(which(data_m[,8]>=minexp))>0){
        data_m<-data_m[which(data_m[,8]>=minexp),]
    }else{
        stop(paste("No features have non-missing values in at least ",minexp," samples.",sep=""))
    }
}

					 data_m_int<-data_m[,-c(1:8,(numcols-7):numcols)]
					numsamps<-dim(data_m_int)[2]/num_replicates
					 
					maxint<-apply(data_m_int,1,function(x){max(x,na.rm=TRUE)})
					numpeaks<-lapply(1:dim(data_m_int)[1],function(j){length(which(is.na(data_m_int[j,])==FALSE))})
				
					 
					 
					
					 
					  union_list[[num_pairs]]<-data_m[,c(1:8)]
					
					#union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],numpeaks)
					union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],data_m$numgoodsamples)
					union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],data_m$median)
					#Qscore<-(as.numeric(data_m$numgoodsamples)/as.numeric(data_m$median+0.1))
					#Qscore<-100*(Qscore/numsamps)
					
					
					union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],data_m$Qscore)
					
					
					union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],maxint)
					
			
					union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],data_m_int)
					
					featinfo<-colnames(data_m[,c(1:7)])
					cnames<-colnames(data_m_int)
					merge.res.colnames<-c(featinfo,"NumPres.All.Samples","NumPres.Biological.Samples",paste("median",fileroot,sep=""),"Qscore","Max.Intensity",cnames)
					colnames(union_list[[num_pairs]])<-as.character(merge.res.colnames)
					
					
					
					  curres={}
					   curres=cbind(curres, p1_p2)
                                         
                      curres=cbind(curres, dim(union_list[[num_pairs]])[1])
                      curres=cbind(curres, mean(as.numeric(data_m$median)))
                      curres=cbind(curres, mean(as.numeric(data_m$Qscore)))
                      
                       # Parameter score: number of merged features minus scoreweight times the mean
                       # median PID/CV; a higher score favors more features with better reproducibility
                       #  curscore<-(dim(union_list[[num_pairs]])[1]-(scoreweight*mean(as.numeric(data_m$Qscore))))
                         curscore<-(dim(union_list[[num_pairs]])[1]-(scoreweight*mean(as.numeric(data_m$median))))
                                                           
                                          if(curscore>bestscore){
                                          	
                                          	bestscore<-curscore
                                          	best_i<-num_pairs
                                          	best_pair<-p1_p2
                                          }
                                          
                                          
                                          curres=cbind(curres,curscore)
                      
					  curres<-as.data.frame(curres)
                                          
                                          finalres=rbind(finalres,curres)
					  
					
                                             finalname=paste("XCMS_feature_list_at_", p1_p2,"cor",samp.filt.thresh,fileroot,feat.filt.thresh,".txt",sep="")
					  
                                          #Output merge results
                                          write.table(union_list[[num_pairs]],file=paste(subdir3,"/",finalname,sep=""), sep="\t",row.names=FALSE)
					  
							
								  num_pairs=num_pairs+1
                                     }
				  }
                          }
                          finalres<-as.data.frame(finalres)
                        
			
			  colnames(finalres)<-c("Parameter Combination", "Number of Features", "median PID/CV between sample replicates","mean Qscore (Quality score)","Parameter score")
                          write.table(finalres,file=paste(xMSanalyzer.outloc,"/Stage3a/xcms_with_xMSanalyzer_merge_summary.txt",sep=""), sep="\t", row.names=FALSE)
			 
			 
	print("Most optimal feature setting:")
    print(best_pair)
     cat("\n")
    #########################################################################
    
     cat("\n")
      print(paste("********Stage 3b: Generating final (pre-batcheffect correction) untargeted and targeted feature tables using ",best_pair," results******",sep=""))
      cat("\n")
    
    #rawQCeval/
   
  # pdf("Stage4a_QC_plots.pdf")
    #pdf(file=paste("subdir4a/Stage4a_QC_plots.pdf",sep=""))
    
    pdf(file=paste(subdir4a,"/Stage4a_QC_plots.pdf",sep=""))
 	d1<-union_list[[best_i]]
    
  if(is.na(refMZ)==FALSE){
			stddata<-read.table(refMZ,sep="\t",header=TRUE)
			print(refMZ)
			print(head(stddata))
			
		}else{
			if(charge_type=="pos"){
			data(example_target_list_pos)
			stddata<-example_target_list_pos
			}else{
				
				if(charge_type=="neg"){
						data(example_target_list_neg)
						stddata<-example_target_list_neg
					}else{
						stop("Invalid option. \'charge_type\' should be \'pos\' or \'neg\'.")
						}
				}
		}
  


  
			Sys.sleep(1)
			
		hist(d1$NumPres.All.Samples,main="Histogram NumPres.All.Samples",col="orange",xlab="Number of samples (including replicates) \n with non-zero intensity values", ylab="Number of features",cex.main=0.7)
			
			hist(d1$NumPres.Biological.Samples,main="Histogram NumPres.Biological.Samples \n (using all data)",col="orange",xlab="Number of biological samples \n with non-zero intensity values", ylab="Number of features",cex.main=0.7)
			#h1<-hist(d1$median_CV,main="Histogram median CV \n (using all data)",col="orange",xlab="median CV%", ylab="Number of features",cex.main=0.7)
			
			#h1<-hist(d1$median_CV,breaks=seq(0,max(d1$median_CV,na.rm=TRUE)+10,10),main="Histogram median CV \n (using all data)",col="orange",xlab="median CV%", ylab="Number of features",cex.main=0.7)
			
			if(num_replicates>2){
				h1<-hist(d1$median_CV,breaks=seq(0,max(d1$median_CV,na.rm=TRUE)+10,10),main="Histogram median CV \n (using all data)",col="orange",xlab="median CV%", ylab="Number of features",cex.main=0.7)
			}else{
				h1<-hist(d1$median_PID,breaks=seq(0,max(d1$median_PID,na.rm=TRUE)+10,10),main="Histogram median PID \n (using all data)",col="orange",xlab="median PID%", ylab="Number of features",cex.main=0.7)
			}
			
				
			hist(d1$Qscore,main="Histogram Qscore \n (using all data)",col="orange",xlab="Quality score", ylab="Number of features",cex.main=0.7)
			
			#pie(h1$counts,col=rainbow(length(h1$counts)),labels=h1$breaks,main="Pie chart of median CVs (%) \n using all features",cex.main=0.7)
			
			#pie(h1$counts,col=rainbow(length(h1$counts)),labels=h1$breaks,main="Pie chart of median CVs (%) \n using all features",cex.main=0.7)
			
			lab_text<-paste(h1$breaks,"-",h1$breaks+10,sep="")
			if(num_replicates>2){
			pie(h1$counts,col=rainbow(length(h1$counts)),labels=lab_text,main=paste("Pie chart of median CVs (%) \n using all features\n; average=",mean(d1$median_CV),sep=""),cex.main=0.7)
			}else{
				pie(h1$counts,col=rainbow(length(h1$counts)),labels=lab_text,main=paste("Pie chart of median PIDs (%) \n using all features\n; average=",mean(d1$median_PID),sep=""),cex.main=0.7)
			
				}
			
			d2<-d1[order(d1$time),]
			
			
			plot(d2$time,d2$Max.Intensity,main="TIC \n (using all data)",col="orange",xlab="Time (s)",ylab="Max intensity across all samples/profiles",cex.main=0.7)
			
			plot(d2$time,d2$mz,main="m/z vs Time \n (using all data)",col="orange",xlab="Time (s)",ylab="m/z",cex.main=0.7)
			
			plot(d2$time,d2$Qscore,main="Qscore vs Time \n (using all data)",col="orange",xlab="Time (s)",ylab="Qscore",cex.main=0.7)
			
			plot(d2$mz,d2$Qscore,main="Qscore vs m/z \n (using all data)",col="orange",xlab="m/z",ylab="Qscore",cex.main=0.7)
			
		
	max_numzeros<-dim(d1)[2]*1

	if(is.na(void.vol.timethresh)==FALSE){
		dfirst15<-d1[which(d1[,2]<void.vol.timethresh),]
	
	
	

		if(nrow(dfirst15)>1){
			
			
			
		ind1<-which(dfirst15[,12]==max(dfirst15[,12]))
		
		time_thresh<-dfirst15[ind1,2]
		
		time_thresh<-time_thresh-(0.30*time_thresh)
		
		#qplot(dfirst15[,2],dfirst15[,9],xlab="Time (s)", ylab="Max intensity across all samples")
		
		plot(dfirst15[,2],dfirst15[,9],xlab="Time (s)", ylab="Max intensity across all samples", main=paste("Estimated void volume time: ",time_thresh," s",sep=""))
		abline(v=time_thresh,col=4,lty=3)
		
		
		d1<-d1[which(d1$time>=time_thresh),]
        
        print("Estimated void volume time cutoff")
        print(time_thresh)
		   }
		d1_int<-round(d1[,-c(1:12)],0)
		d1<-cbind(d1[,c(1:12)],d1_int)
		rm(d1_int)
		
	
		#sfname<-paste(xMSanalyzer.outloc,"/stage5/feature_table_",best_pair,"_cor",samp.filt.thresh,fileroot,feat.filt.thresh,"filtered.txt",sep="")
		finalname<-paste("featuretable_",best_pair,"_cor",samp.filt.thresh,fileroot,feat.filt.thresh,"_voidtimefilt.txt",sep="")
		
		write.table(d1,file=paste(subdir3b,"/",finalname,sep=""),sep="\t",row.names=FALSE)
	   

	}else{
		
		
		finalname<-paste("feature_table_",best_pair,"_cor",samp.filt.thresh,fileroot,feat.filt.thresh,".txt",sep="")
		
		write.table(d1,file=paste(subdir3b,"/",finalname,sep=""),sep="\t",row.names=FALSE)
	   

		
		}

	
		cat("\n")
     print(paste("*********Stage 4a: Search for target features, performing QC evaluation using ",best_pair," results********",sep=""))
     cat("\n")
     
     Sys.sleep(1)
	
		
		print("Dim data after void time filtering")
		print(dim(d1))
	data_m<-d1[,-c(1:12)]
   
    if(replacezeroswithNA==TRUE){
		data_m<-replace(as.matrix(data_m),which(data_m==0),NA)
	}
	
	
	counna<-apply(data_m,1,function(x){length(which(is.na(x)==TRUE))})

	maxzeros<-1*dim(data_m)[2]
	
	if(length(which(counna<maxzeros))>0){
		data_m<-data_m[which(counna<maxzeros),]
	}

    maxint<-apply(data_m,1,function(x){max(x,na.rm=TRUE)})
   	maxint_ord<-order(maxint,decreasing=TRUE)
   	#[maxint_ord[1:5000]
   	
    X<-t(data_m) #[maxint_ord[1:2000],])
    X<-replace(as.matrix(X),which(is.na(X)==TRUE),0)
    
    tic.eval(d1[,-c(1:12)],outloc=subdir4a)
    
    feat.eval.result=evaluate.Features(d1[,-c(9:12)], numreplicates=num_replicates,min.samp.percent=0.6,alignment.tool="XCMS",impute.bool=TRUE)
			
    		
    Sys.sleep(1)
    s1<-"Stage 1 results: QC evaluation of invidual parameters from apLCMS"
s2<-"Stage 2 results: filtered results from each paramter setting based on sample and feature quality (CV within replicates) checks"
s3<-"Stage 3a results: merged results using stage 2 filtered files"
s3b<-"Stage 3b results: Final untargeted and targeted feature tables"
s4a<-"Stage 4a results: QC evaluation of targeted and untargeted data before batch-effect correction"

sm<-rbind(s1,s2,s3,s3b,s4a)

targeted_feat_raw<-eval.target.mz(dataA=d1[,-c(3:12)],refMZ=stddata,feature.eval.result=feat.eval.result,mzthresh=refMZ.mz.diff,timethresh=refMZ.time.diff,outloc=subdir4a,folderheader="raw")
	

    if(is.na(sample_info_file)==FALSE)
    {
    	sampleid_mapping<-read.table(sample_info_file,sep="\t",header=TRUE)
    	
    	sampleid_mapping[,1]<-gsub(sampleid_mapping[,1],pattern=filepattern,replacement="")
		cnames<-colnames(data_m)
		
		cnames<-gsub(cnames,pattern=filepattern,replacement="")
		colnames(data_m)<-cnames


    	batch_inf<-sampleid_mapping[,3]
    	
    	batch_labels<-as.factor(sampleid_mapping[,3])
    
    	l1<-levels(batch_labels)
     
    	    	   

		    pca.eval(X=X,samplelabels=batch_labels,filename="raw",ncomp=5,center=TRUE,scale=TRUE,legendlocation="topright",legendcex=0.5,outloc=subdir4a)
			 
			Sys.sleep(1)
			
			
										
			cnames=colnames(feat.eval.result)
		
			

			if(dim(sampleid_mapping)[2]<4){
   		    mod<-rep(1,dim(data_m)[2])
   		    }else{
   		    		mod<-sampleid_mapping[,-c(1:3)]
   		    	}
    
  	
    		dev.off()

    		Sys.sleep(1)
    
		     #######################################################################
		     cat("\n")
		    print(paste("**********Stage 4b: Processing data using ComBat and post-correction QC evaluation using ",best_pair," results*******",sep=""))
		      cat("\n")
     #pdf("Stage4b_QC_plots.pdf")
     #pdf(file=paste("subdir4b/Stage4b_QC_plots.pdf",sep=""))
     
     pdf(file=paste(subdir4b,"/Stage4b_QC_plots.pdf",sep=""))
		    ##################################################################
		
		    adjdata<-try(sva::ComBat(dat=data_m,batch=batch_inf,mod=mod,par.prior=TRUE),silent=TRUE)
		    
		    if(is(adjdata,"try-error")){
		 
		
				data_m1<-cbind(d1[,c(1:4)],data_m)
				
				#print(adjdata)
				
				print("sva::ComBat caused an error.")
				print(adjdata)
				
				print("Too many missing values can cause this error. Trying MetabComBat...")
				#adjdata<-MetabComBat(dat=data_m1,saminfo=sampleid_mapping,par.prior=T,filter=F,write=F)
		    	adjdata<-MetabComBat(dat=data_m1,saminfo=sampleid_mapping,par.prior=T,filter=F,write=F,prior.plots=F)
		    	adjdata<-adjdata[,-c(1:4)]
		    }
    
    				    
		    maxint<-apply(adjdata,1,function(x){max(x,na.rm=TRUE)})
		    
		    adjdata2<-replace(as.matrix(adjdata),which(is.na(adjdata)==TRUE),0)
		    adjdata2<-replace(as.matrix(adjdata2),which(adjdata2<0),0)
		    
		    adjdata2<-cbind(d1[,c(1:8)],adjdata2)
		    
		    
		    
		    expression_xls<-paste(subdir4b,"/ComBat_corrected_",best_pair,"_feature_table.txt",sep="")
		    
		    feat.eval.result=evaluate.Features(adjdata2, numreplicates=num_replicates,min.samp.percent=0.6,alignment.tool="XCMS",impute.bool=TRUE)
		    
			cnames=colnames(feat.eval.result)
										
			feat.eval.result<-apply(feat.eval.result,2,as.numeric)
			feat.eval.result<-as.data.frame(feat.eval.result)
			feat.eval.result.mat=cbind(adjdata2[,c(1:4)],feat.eval.result)  
			
			numpeaks<-apply(adjdata2[,-c(1:8)],1,countpeaks)						
		    
		    maxint<-apply(adjdata2[,-c(1:8)],1,function(x){max(x,na.rm=TRUE)})
		    
		    adjdata2<-cbind(adjdata2[,c(1:7)],numpeaks,feat.eval.result.mat$numgoodsamples,feat.eval.result.mat$median,feat.eval.result.mat$Qscore,maxint,adjdata2[,-c(1:8)])
		    
		    colnames(adjdata2)<-colnames(d1)
		   
		    # expression_xls already includes the subdir4b path
		    write.table(adjdata2,file=expression_xls,sep="\t",row.names=FALSE)
		    
		   # X<-t(adjdata2[maxint_ord[1:2000],-c(1:9)])
		   
		    tic.eval(adjdata2[,-c(1:12)],outloc=subdir4b)
    

		    X<-t(adjdata2[,-c(1:12)])
		   
		     
		    X<-as.matrix(X)
		    
		    X<-replace(as.matrix(X),which(is.na(X)==TRUE),0)
		    
		    pca.eval(X,samplelabels=batch_labels,filename="ComBat",ncomp=5,center=TRUE,scale=TRUE,legendlocation="topright",legendcex=0.5,outloc=subdir4b)
			    
		    
		    #eval.reference.metabs(dataA=adjdata2,stdData=stddata,mzthresh=10,outloc=getwd(),folderheader="ComBat")
			
			targeted_feat_combat<-eval.target.mz(dataA=adjdata2[,-c(3:12)],refMZ=stddata,feature.eval.result=feat.eval.result,mzthresh=refMZ.mz.diff,timethresh=refMZ.time.diff,outloc=subdir4b,folderheader="ComBat")
			
			s4b<-"Stage 4b results: Batch-effect evaluation post ComBat and QC evaluation of targeted data post batch-effect correction"
		sm<-rbind(sm,s4b)
			
				
	
	
		}else{
			
			adjdata2=NULL
			targeted_feat_combat<-NULL
			}
    

			 
			 dev.off()
			 
			  	 metlin.res={}
                 kegg.res={}
                      
					 #length(union_list[[num_pairs]]$mz
					 if(is.na(adduct.list)==FALSE){
					
					cat("\n")
					  print("*********Stage 5: Mapping m/z values to known metabolites using KEGG*********")
					 cat("\n")
					 
                     	annot.res<-feat.batch.annotation.KEGG(adjdata2,mz.tolerance.dbmatch,adduct.list,subdir5, numnodes=numnodes,syssleep=syssleep)
					    annotres_list<-annot.res
					    
					     s5<-"Stage 5 results: Annotation of features"
					    sm<-rbind(sm,s5)

                     }
				
			 
			 
			  print("*************Processing complete**********")

			  #print("*********Characterizing metabolites*********")
		}             
                else
                {
                        stop(paste("No files exist in",XCMS.outloc, "Please check the input value for cdfloc", sep=""))
                }
                    
                
                
        }
        
        suppressWarnings(sink(file=NULL))

sm<-as.data.frame(sm)
colnames(sm)<-"Output_description"
setwd(xMSanalyzer.outloc)
write.table(sm,file="Readme.txt",sep="\t",row.names=FALSE)

		print(paste("Processing is complete. Program results can be found at: ",xMSanalyzer.outloc,sep=""))
		
	 return(list("XCMS.merged.res"=union_list, "XCMS.ind.res"=data_rpd_all,"XCMS.ind.res.filtered"=data_rpd, "final.feat.table.annot"=annotres_list, "feat.eval.ind"=feateval_list, "sample.eval.ind"=rsqres_list,"final.feat.table.raw"=d1,"final.feat.table.combat"=adjdata2,
	 "final.targeted.feat.table.raw"=targeted_feat_raw,"final.targeted.feat.table.combat"=targeted_feat_combat))
}
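## Example call (a minimal sketch): the directory paths below are hypothetical
## placeholders and only a few commonly tuned arguments are shown; every argument
## not listed keeps the default defined in the function signature above. The call
## is wrapped in `if (FALSE)` so it is not executed when this file is sourced.
if (FALSE) {
    library(xMSanalyzer)
    xMSwrapper.XCMS.matchedFilter(
        cdfloc = "/path/to/cdf/files",                        # hypothetical folder of .cdf spectral files
        XCMS.outloc = "/path/to/XCMS_output",                 # hypothetical XCMS output folder
        xMSanalyzer.outloc = "/path/to/xMSanalyzer_output",   # hypothetical xMSanalyzer output folder
        step.list = c(0.01, 0.1),
        mz.diff.list = c(0.01, 0.1),
        num_replicates = 2,
        sample_info_file = NA,
        charge_type = "pos")
}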