R/homol.search.R

Defines functions homol.search

Documented in homol.search

homol.search <-
function(
	peaklist,
	isotopes,	
	elements = c("C", "H", "O"),
	use_C = FALSE,
	minmz = 5,
	maxmz = 120,
	minrt = -2,
	maxrt = 2,
	ppm = TRUE,
	mztol = 3.5,
    rttol = 0.5,
	minlength = 5,
	mzfilter = FALSE,
	vec_size = 3E6,
	mat_size = 3,
	R2 = .98,
	spar = .45,
	plotit = FALSE,
	deb = 0
){
  
  
  

    ##########################################################################
    # (0) Issue warnings: check arguments ####################################
	if(vec_size>2147483648) stop("vec_size too large - must be <= (2^31-1)")
	if(!is.logical(use_C)) stop("use_C must be logial.")
	if(!is.logical(ppm)) stop("ppm must be logial.")
    if(ppm==TRUE & mztol>100) cat("Too big mztol?")
    if(minlength<3) stop("invalid minlen argument. set minlen >= 3")
	if(!length(elements)) stop("specify elements")
	if(!is.numeric(mzfilter) & mzfilter[1]!="FALSE") stop("mzfilter must either be a numeric vector or set to FALSE")
	if(elements[1]!="FALSE") if(any(is.na(match(elements,isotopes[,1])))) stop("unknown elements")
	if(length(peaklist[1,])>3) stop("peaklist with > 3 columns not allowed")
	if(!is.numeric(peaklist[,1]) || !is.numeric(peaklist[,2]) || !is.numeric(peaklist[,3]) ) stop("peaklist columns not numeric")
	if(R2!=FALSE) if((R2<=0)||(R2>1)) stop("R2 must be either FALSE or 0<R2<=1")
	if(R2!=FALSE) if((spar<=0)||(spar>1)) stop("R2 must be either FALSE or 0<R2<=1")
	if(mztol <= 0){ # Set precision to digits of inputs
		min_char <- Inf
		for(i in 1:length(peaklist[,1])){
			n_char <- nchar(strsplit(as.character(peaklist[i,1]),".",fixed=TRUE)[[1]][2])
			if(n_char < min_char) min_char <- n_char
		}
		mztol <- (10 ^ (-min_char + 1))
		warning(paste0("zero mztol - mztol set to ",mztol," for numeric precision!"))	
	}
	if(rttol <= 0){ # Set precision to digits of inputs
		min_char<-Inf
		for(i in 1:length(peaklist[,3])){
			n_char<-nchar(strsplit(as.character(peaklist[i,3]),".",fixed=TRUE)[[1]][2])
			if(n_char<min_char){
				min_char<-n_char
			}
		}
		rttol <- (10^(-min_char+1))
		warning(paste0("zero rttol - rttol set to ", rttol, " for numeric precision!"))	
	}
	if(minrt > maxrt) stop("minrt cannot be larger than maxrt!")
	if(minmz > maxmz) stop("minmz cannot be larger than maxmz!")	
	if(mzfilter[1] != "FALSE"){
		if(!is.numeric(mzfilter)) stop("mzfilter must either be FALSE or (a vector of) numeric")
		mzfilter<-mzfilter[order(mzfilter)]
		if(any(mzfilter<0)) stop(" Negative mzfilter values found - abort.")
		if(min(mzfilter)<minmz) stop(" Minimum mzfilter value smaller than minmz - abort.")
		if(max(mzfilter)>maxmz) stop(" Maximum mzfilter value larger than maxmz - abort.")	
	}
	if(!is.data.frame(peaklist) & !is.matrix(peaklist) ) stop("peaklist must be a numeric data.frame")
	if(!is.matrix(peaklist)){
		peaklist<-data.matrix(peaklist)
	}
	if(length(mztol) == 1){
		if(ppm == TRUE){
			delmz <- c(mztol*max(peaklist[,1])/1e6);
			peaklist4 <- c(mztol*peaklist[,1]/1e6);
		}else{
			delmz <- mztol;
			peaklist4 <- rep(mztol,length(peaklist[,1]))
		}
		max_delmz <- max(delmz)
	}else{
		if(length(mztol) == length(peaklist[,1])){
			peaklist4 <- mztol
			delmz <- max(mztol)
		}else{
			stop("mztol: must be either one value or of length peaklist[,1]")
		}
	}
	minmz <- (minmz - delmz)
	maxmz <- (maxmz + delmz)
	inter <- interactive()
	##########################################################################
    # (1) retrieve feasible mass differences & all combinations thereof ######
	# (1.1) upper & lower mass defect / mass bound ###########################
 	##########################################################################
	cat("\n(1-3) Build bounds, trees & nearest neighbour path ... ");	
	if(elements[1] == FALSE){
		elements <- unique(as.character(isotopes[,1]))
	}
	delmass <- c();
	c_ratio <- c();
	for(i in 1:length(elements)){
		isos <- isotopes[as.character(isotopes[,1]) == elements[i],]
		delmass <- c(delmass,
			isos[isos[,3] == min(isos[,3]),3]
		);
		c_ratio <- c(c_ratio, unique(isos[,5]));
	}	
	c_ratio[c_ratio == 0] <- Inf # well...
	if(use_C){
		delmass<-( c( delmass-round(delmass,digits=0) ) / (delmass+(1/c_ratio*12)) );		
	}else{
		delmass<-( c( delmass-round(delmass,digits=0) ) / delmass );	
	}
	maxup <- c(max(delmass));
	maxdown <- c(min(delmass));
	#maxdown<-abs(min(delmass));
	##########################################################################
    # (2) Set internal data & parameters #####################################
	#maxmove<-c(max(maxup,maxdown))
	shift <- c(maxmz*(maxup-maxdown))
	#shift<-c(maxmz*(maxdown+maxup))
	mass_def <- c(peaklist[,1] - round(peaklist[,1]))	# calculate mass defect
	peaklist2<-peaklist[,-2]
	uplim<-c(mass_def-(peaklist2[,1]*maxup))			# upward mass defect shift - (m/z)/mass defect lower intercept
	uplim_tol<-abs(max(peaklist4)*maxup*2)				# upward mass defect shift - maximum tolerance
	#uplim_tol<-c(max(peaklist4)*maxup*2)				# upward mass defect shift - maximum tolerance
	downlim<-c(mass_def-(peaklist2[,1]*maxdown))		# downward mass defect shift - (m/z)/mass defect upper intercept
	#downlim<-c(mass_def+(peaklist2[,1]*maxdown))		# downward mass defect shift - (m/z)/mass defect upper intercept	
	downlim_tol<-abs(max(peaklist4)*maxdown*2)			# downward mass defect shift - maximum tolerance
	#downlim_tol<-c(max(peaklist4)*maxdown*2)			# downward mass defect shift - maximum tolerance
	peaklist2<-cbind(peaklist2,uplim,downlim)
	peaklist2<-as.matrix(peaklist2)
	max_delmz<-c(4*max(peaklist4)) 						# maximum m/z-distance gap - used for early aborting
	scaled<-c(abs(maxmz-minmz),abs(maxrt+minrt),shift)	# scaling used for nearest neighbour-search
	if(any(scaled==0)){
		scaled[scaled==0]<-1
		#stop("debug me on issue #5: scaled entry ==0 -> wrong parameters=")
	}
	peaklist3<-cbind(peaklist2[,c(1,2)],mass_def)
	peaklist3<-as.matrix(peaklist3)
	bounds<-matrix(ncol=2,nrow=4,0)						# store search bounds
	marked<-matrix(ncol=3,nrow=length(peaklist2[,1]),0) # first sweep = 1 -> do not set to 0!
	marked[,1]<--1;
	marked[,3]<-c(1:length(peaklist2[,1]))
	colnames(marked)<-c("sweep","done","ID")
	tupels<-matrix(nrow=vec_size,ncol=7,0)				# initially used to store triplets, then any n-tupels
	tupeldo<-c(1);											# where to write into tupels
	new_found<-rep(0,length(peaklist2[,1]))				# store newly detected m/z-differences
	new_found_ceiling_UA<-rep(0,length(peaklist2[,1])) 	# if mass defect rounding hits ceiling, upper area (UA)
	new_found_ceiling_LA<-rep(0,length(peaklist2[,1])) 	# if mass defect rounding hits ceiling, lower area (LA)	
	ceiled_UA<-FALSE
	ceiled_LA<-FALSE
	new_found_floor_UA<-rep(0,length(peaklist2[,1]))	# if mass defect rounding hits floor, upper area (UA)
	new_found_floor_LA<-rep(0,length(peaklist2[,1]))	# if mass defect rounding hits floor, lower area (LA)		
	floored_UA<-FALSE
	floored_LA<-FALSE	
	dist_ID<-c()										# point ID
	dist_dist<-c()										# store distance
	mz_last<-c(0)										# store last sweep`s m/z-value for correction
	a<-c(0)												# track over sweeps: how many points within bounds?
	use<-c(1)											# for nearest neighbour search
	along<-rep(0,length(peaklist2[,1]))					# for nearest neighbour search
	along[1]<-1											# for nearest neighbour search
	##########################################################################
	# (3) build kd-trees & run nearest-neighbour search ######################
	peakTree<-.Call("kdtree", 
			as.matrix(peaklist2),
			PACKAGE="nontarget"
	);
	if(!any(deb==4)){ # along nearest neighbour path ...
		peakTree3<-.Call("kdtree", 
				as.matrix(peaklist3),
				PACKAGE="nontarget"
		);			
		for(i in 2:length(peakTree3[,1])){	
			use2<-.Call("search_kdtree3", 
				peaklist3, 	# rows: c(m/z,RT,UB,LB)
				peakTree3,  # peaks - search tree
				use,
				scaled,	
				PACKAGE="nontarget"
			)[,1]
			if(use2<1) stop("debug me on issue #4 - use2=-1,<1")
			.Call("node_delete", 
				use,
				peaklist3,
				peakTree3,
				PACKAGE="nontarget"
			);
			along[i]<-use2
			use<-use2
		}
		if(sum(peakTree3[,3]!=0)>1){stop("debug me on issue #5: peakTree3 was not fully emptied in NN-search!")}
		if(any(peakTree3[,3]>1)){stop("debug me on issue #6: peakTree3 corrupted!")}		
	}else{ # ... or, for comparison, via random sampling
		along<-sample(
			seq(1,length(peaklist[,1]),1),
			length(peaklist[,1]),
			replace=FALSE
		)
	}
	cat("done.");	
	##########################################################################
	# (4) Sweep through nearest neighbour path ###############################
	cat("\n(4) Triplet extraction \n");	
	if(inter){pBar <- txtProgressBar( min = 0, max = length(peaklist2[,1]), style = 3 )}
	for(i in 1:length(peaklist2[,1])){

		if(inter){setTxtProgressBar(pBar,i,title = NULL, label = NULL)}
		use<-along[i]
		######################################################################
		# upper area sweep ###################################################
		bounds[1,1]<-(peaklist2[use,1]+minmz)
		bounds[1,2]<-(peaklist2[use,1]+maxmz) 
		bounds[2,1]<-(peaklist2[use,2]+minrt)
		bounds[2,2]<-(peaklist2[use,2]+maxrt)
		bounds[3,1]<-(peaklist2[use,3]-shift-uplim_tol)		# maxup LB
		bounds[3,2]<-(peaklist2[use,3]+uplim_tol) 			# maxup UB
		bounds[4,1]<-(peaklist2[use,4]-downlim_tol)			# maxdown LB
		bounds[4,2]<-(peaklist2[use,4]+shift+downlim_tol) 	# maxdown UB
		#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_1")}
		.Call("search_kdtree_homol", 
			peaklist2, 	# rows: c(m/z,RT,UB,LB)
			peakTree,  	# peaks - search tree
			bounds,
			marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
			i,			# sweep number
			new_found,	
			1, 			# clean new_found?
			PACKAGE="nontarget"
		)
		# -> mass defect rounding hits ceiling, upper area (_UA)? ############
		if( (mass_def[use]+(maxup*maxmz)+uplim_tol)>=.5 ){ 		# PLUS!
			bounds[3,1]<-(peaklist2[use,3]-shift-1-uplim_tol)	# maxup LB
			bounds[3,2]<-(peaklist2[use,3]-1+uplim_tol) 		# maxup UB
			bounds[4,1]<-(peaklist2[use,4]-1-downlim_tol)		# maxdown LB
			bounds[4,2]<-(peaklist2[use,4]+shift-1+downlim_tol) # maxdown UB
			#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_2")}			
			.Call("search_kdtree_homol", 
				peaklist2, 	# rows: c(m/z,RT,UB,LB)
				peakTree,  	# peaks - search tree
				bounds,
				marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
				i,			# sweep number
				new_found_ceiling_UA,	
				1, 			# clean new_found?
				PACKAGE="nontarget"
			)
			if(new_found_ceiling_UA[1]!=0){ceiled_UA<-TRUE}
		}
		# -> mass defect rounding hits floor, upper area (_UA)? ###############
		if((mass_def[use]+(maxdown*maxmz)-downlim_tol)<=-.5){ 	# PLUS!
			bounds[3,1]<-(peaklist2[use,3]-shift+1-uplim_tol)	# maxup LB
			bounds[3,2]<-(peaklist2[use,3]+1+uplim_tol) 		# maxup UB
			bounds[4,1]<-(peaklist2[use,4]+1-downlim_tol)		# maxdown LB
			bounds[4,2]<-(peaklist2[use,4]+shift+1+downlim_tol) # maxdown UB
			#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_3")}
			.Call("search_kdtree_homol", 
				peaklist2, 	# rows: c(m/z,RT,UB,LB)
				peakTree,  	# peaks - search tree
				bounds,
				marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
				i,			# sweep number
				new_found_floor_UA,	
				1, 			# clean new_found?
				PACKAGE="nontarget"
			)
			if(new_found_floor_UA[1]!=0){floored_UA<-TRUE}
		}
		# lower area sweep ###################################################
		bounds[1,1]<-(peaklist2[use,1]-maxmz)
		bounds[1,2]<-(peaklist2[use,1]-minmz) 
		bounds[2,1]<-(peaklist2[use,2]-maxrt)
		bounds[2,2]<-(peaklist2[use,2]-minrt)
		bounds[3,1]<-(peaklist2[use,3]-uplim_tol)			# maxup LB
		bounds[3,2]<-(peaklist2[use,3]+shift+uplim_tol) 	# maxup UB
		bounds[4,1]<-(peaklist2[use,4]-shift-downlim_tol)	# maxdown LB
		bounds[4,2]<-(peaklist2[use,4]+downlim_tol) 		# maxdown UB
		#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_4")}
		.Call("search_kdtree_homol", 
			peaklist2, 	# rows: c(m/z,RT,UB,LB)
			peakTree,  	# peaks - search tree
			bounds,
			marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
			i,			# sweep number
			new_found,	
			0, 			# clean new_found?
			PACKAGE="nontarget"
		)	
		# -> mass defect rounding hits floor, lower area (_LA)? #############
		if((mass_def[use]-(maxup*maxmz)-uplim_tol)<=-.5){ 		# MINUS!
			bounds[3,1]<-(peaklist2[use,3]+1-uplim_tol)			# maxup LB
			bounds[3,2]<-(peaklist2[use,3]+shift+1+uplim_tol) 	# maxup UB
			bounds[4,1]<-(peaklist2[use,4]-shift+1-downlim_tol)	# maxdown LB
			bounds[4,2]<-(peaklist2[use,4]+1+downlim_tol) 		# maxdown UB
			#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_5")}
			.Call("search_kdtree_homol", 
				peaklist2, 	# rows: c(m/z,RT,UB,LB)
				peakTree,  	# peaks - search tree
				bounds,
				marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
				i,			# sweep number
				new_found_floor_LA,	
				1, 			# clean new_found?
				PACKAGE="nontarget"
			)
			if(new_found_floor_LA[1]!=0){floored_LA<-TRUE}
		}
		# -> mass defect rounding hits ceiling, lower area (_LA)? ###########
		if((mass_def[use]-(maxdown*maxmz)+downlim_tol)>=.5){ 	# MINUS!
			bounds[3,1]<-(peaklist2[use,3]-1-uplim_tol)			# maxup LB
			bounds[3,2]<-(peaklist2[use,3]+shift-1+uplim_tol) 	# maxup UB
			bounds[4,1]<-(peaklist2[use,4]-shift-1-downlim_tol)	# maxdown LB
			bounds[4,2]<-(peaklist2[use,4]-1+downlim_tol) 		# maxdown UB
			#if(any(bounds[,1]>bounds[,2])){stop("debug me on wrong bounds_6")}
			.Call("search_kdtree_homol", 
				peaklist2, 	# rows: c(m/z,RT,UB,LB)
				peakTree,  	# peaks - search tree
				bounds,
				marked,		# only write to new_found those of marked[,1]!=(i-1) to omit last sweep`s new_found
				i,			# sweep number
				new_found_ceiling_LA,	
				1, 			# clean new_found?
				PACKAGE="nontarget"
			)
			if(new_found_ceiling_LA[1]!=0){ceiled_LA<-TRUE}
		}
		######################################################################
		# delete or update old points ########################################
		if(length(dist_ID)>0){
			stay<-(marked[dist_ID,1]==i)
			dist_ID<-dist_ID[stay]	
			dist_dist<-dist_dist[stay]		
			dist_dist<-(dist_dist+mz_last-peaklist2[use,1])
		}
		mz_last<-peaklist2[use,1] # for next=i+1 update
		######################################################################
		# add new points #####################################################
		if(new_found[1]!=0){
			many<-(sum(new_found!=0))
			a<-(a+many)
			dist_ID<-append(dist_ID,new_found[1:many])
			dist_dist<-append(dist_dist,
				peaklist2[new_found[1:many],1]-peaklist2[use,1]	
			)
		}
		if(ceiled_UA){
			many<-sum(new_found_ceiling_UA!=0)
			dist_ID<-append(dist_ID,new_found_ceiling_UA[1:many])
			dist_dist<-append(dist_dist,
				peaklist2[new_found_ceiling_UA[1:many],1]-peaklist2[use,1]	
			)	
			new_found_ceiling_UA[]<-0;
			ceiled_UA<-FALSE;
			a<-(a+many)
		}
		if(floored_UA){
			many<-sum(new_found_floor_UA!=0)
			dist_ID<-append(dist_ID,new_found_floor_UA[1:many])
			dist_dist<-append(dist_dist,
				peaklist2[new_found_floor_UA[1:many],1]-peaklist2[use,1]	
			)	
			new_found_floor_UA[]<-0;
			floored_UA<-FALSE;		
			a<-(a+many)
		}		
		if(floored_LA){
			many<-sum(new_found_floor_LA!=0)
			dist_ID<-append(dist_ID,new_found_floor_LA[1:many])
			dist_dist<-append(dist_dist,
				peaklist2[new_found_floor_LA[1:many],1]-peaklist2[use,1]	
			)	
			new_found_floor_LA[]<-0;
			floored_LA<-FALSE;		
			a<-(a+many)
		}
		if(ceiled_LA){
			many<-sum(new_found_ceiling_LA!=0)
			dist_ID<-append(dist_ID,new_found_ceiling_LA[1:many])
			dist_dist<-append(dist_dist,
				peaklist2[new_found_ceiling_LA[1:many],1]-peaklist2[use,1]	
			)	
			new_found_ceiling_LA[]<-0;
			ceiled_LA<-FALSE;
			a<-(a+many)
		}
		if(deb>4){ # return queried peaks for peak ID == deb (must be >4 as deb used otherwise below)
			if(use==deb){
				return(dist_ID)
			}
		}
		if(length(dist_ID)>1){
			##################################################################
			# resort to new centre point i ###################################
			ord<-order(abs(dist_dist)) # decreasing=FALSE!
			dist_ID<-dist_ID[ord]
			dist_dist<-dist_dist[ord]
			##################################################################
			# find triplets ##################################################
			.Call("homol_triplet", 
				peaklist3,
				dist_ID,
				dist_dist,
				tupels, # = triplets
				tupeldo,
				peaklist4,
				use, # = current point
				max_delmz,
				rttol,
				PACKAGE="nontarget"
			);
			if(tupeldo>=vec_size){stop("\n Maximum number of tupels reached. increase vec_size")}
			##################################################################
		}	
		######################################################################
	}
	if(inter){close(pBar)}	
	tupeldo<-(tupeldo-1)
	if(tupeldo==0){stop("no series detected")}
	tupels<-tupels[1:tupeldo,,drop=FALSE]
	tupels<-tupels[tupels[,1]!=0,,drop=FALSE] # backup - if tupeldo fails 
	tupels<-tupels[order(tupels[,4]),,drop=FALSE]
	if(plotit==1){
		######################################################################	
		# path in m/z vs. mass defect ########################################
		plot.new()
		plot.window(xlim=c(min(peaklist2[,1]),max(peaklist2[,1])),ylim=c(min(mass_def),max(mass_def)))
		points(
			peaklist2[,1],mass_def,
			pch=19,cex=1,col="black"
		)
		title(xlab="m/z",ylab="Mass defect",main="Nearest neighbour path")
		box();axis(1);axis(2)
		for(j in 2:length(along)){
			lines(
				c(peaklist2[along[j],1],peaklist2[along[j-1],1]),c(mass_def[along[j]],mass_def[along[j-1]]),
				,col="black",lwd=.5)		
		}
		abline(b=maxup,a=(0-(min(peaklist2[,1])*maxup))
			,col="darkgreen",lwd=2)
		abline(b=maxdown,a=(0-(min(peaklist2[,1])*maxdown))
			,col="red",lwd=2)		
		Sys.sleep(5)
		######################################################################	
		plot.new()
		plot.window(xlim=c(min(peaklist2[,1]),max(peaklist2[,1])),ylim=c(min(peaklist2[,2]),max(peaklist2[,2])))
		points(
			peaklist2[,1],peaklist2[,2],
			pch=19,cex=.1,col="lightgrey"
		)
		title(xlab="m/z",ylab="RT",main="Nearest neighbour path")
		box();axis(1);axis(2)
		for(j in 2:length(along)){
			lines(
				c(peaklist2[along[j],1],peaklist2[along[j-1],1]),c(peaklist2[along[j],2],peaklist2[along[j-1],2]),
				,col="black",lwd=.5)
		}
		Sys.sleep(5)
		######################################################################	
	}
	cat("   done.");
	##########################################################################
	# (5) Apply mzfilter #####################################################
	if( mzfilter[1]!="FALSE" ){
		cat("\n(5) Apply mzfilter ... ");	
		keep<-rep(FALSE,tupeldo)
		for(k in 1:length(mzfilter)){
			keep[	
				tupels[,4]<=mzfilter[k] &
				tupels[,5]>=mzfilter[k]	
			]<-TRUE
		}
		if(!any(keep)){stop("No homologues detected after mzfilter application")}
		tupels<-tupels[keep,,drop=FALSE]
		cat("done.");
	}else{
		cat("\n(5) Skip mzfilter. ");	
	}
	##########################################################################
	# (6) Combine triplets to n-tupel - possibly check for smoothness ########	
	cat("\n(6) Combine n-tupels, n(rejects),passed_on:");	
	HS<-list();
	HS_length<-3;
	found<-0;
	tupels<-cbind(tupels,seq(1,length(tupels[,1]),1)) # dummy column
	while(any(tupels[,1]!=0)){
	    cat(paste(" ",HS_length," - ",length(tupels[,1]),sep=""));
		keeper<-rep(0,length(tupels[,1]))
		if(length(tupels[,1])==1){break} # only one tupel left = nothing to combine
		if(length(tupels[,1])==0){stop("\n debug me, issue #1")}
		if(any(tupels[,1]<1)){stop("\n debug me, +issue #3")}
		merged_tupels<-.Call("combine_tuple", 
			tupels[,1:HS_length],
			tupels[,(HS_length+1):(HS_length+4)],
			keeper,
			mat_size,
			PACKAGE="nontarget" 
		)	
		merged_tupels<-merged_tupels[merged_tupels[,1]!=0,,drop=FALSE]	
		if(length(merged_tupels[,1])==0){
			HS[[HS_length]]<-tupels[keeper==0,,drop=FALSE]
			if(HS_length>=minlength){
				found<-c(found+length(HS[[HS_length]][,1]))
			}
			break;
		}
		lang<-length(merged_tupels[1,])
		keeper_2<-rep(1,length(merged_tupels[,1])) # used for spline smooth
		if(R2!=FALSE){
			reject<-0;
			for(i in 1:length(merged_tupels[,1])){
				x<-peaklist3[
					merged_tupels[i,1:(HS_length+1)],1
				]
				if(any(duplicated(x)) & (HS_length<4)){next} # bypass fitting issues
				y<-peaklist3[
					merged_tupels[i,1:(HS_length+1)],2
				]
				if(any(duplicated(y)) & (HS_length<4)){next} # bypass fitting issues
				b <- predict(stats::smooth.spline(x,y,cv=FALSE,spar=.45))
				SS_tot<-sum((y-mean(y))^2);
				if(SS_tot==0){next;} # no variance; e.g. under constant RT
				SS_res<-sum((y-b$y)^2);
				R2_i<-(1-(SS_res/SS_tot));
				if(R2_i<R2){
					keeper[merged_tupels[i,lang]]<-(keeper[merged_tupels[i,lang]]-1)
					keeper[merged_tupels[i,(lang-1)]]<-(keeper[merged_tupels[i,(lang-1)]]-1)
					keeper_2[i]<-0
					reject<-(reject+1)
				}
				if(plotit==2 & (HS_length>=minlength)){
					plot(x,y,pch=19,main=R2_i,xlab="m/z",ylab="RT");
					points(b$x,b$y,type="l",col="red");	
					Sys.sleep(1.4)
				}
			}
			cat(paste("(",reject,")",sep=""));
		}else{
			cat(paste("(0)",sep=""));			
		}
		HS[[HS_length]]<-tupels[keeper==0,,drop=FALSE]
		cat(",",sum(keeper!=0),sep="")
		if(HS_length>=minlength){
			found<-c(found+length(HS[[HS_length]][,1]))
		}
		HS_length<-(HS_length+1)
		merged_tupels<-merged_tupels[keeper_2==1,,drop=FALSE]
		merged_tupels<-merged_tupels[order(merged_tupels[,(HS_length+1)]),,drop=FALSE]
		if( length(merged_tupels[,1])==1 ){ # single-rowed?
			HS[[HS_length]]<-merged_tupels;
			if(HS_length>=minlength){
				found<-c(found+length(HS[[HS_length]][,1]))
			}			
			break;
		}
		tupels<-merged_tupels;	
	}
	if(any(deb==1)){return(HS)}
	if(length(HS)<minlength){
		stop("\n No homologue series detected. Check parameters (e.g., minlength) and units?");
	}
	cat(" - done.");
	##########################################################################	
	# (7) remove nested HS ###################################################
	if(!any(deb==2)){
		cat("\n(7) Remove nested HS:  \n");
		reject<-0;
		# membership of peaks in HS #########################################
		peakHS<-list(0)
		for(i in 1:length(peaklist[,1])){
			peakHS[[i]]<-matrix(ncol=2,nrow=0)
		}
		if(length(HS)>minlength){ # anything to do?		
			for(i in minlength:length(HS)){		
				if(length(HS[[i]])>0){
					HS[[i]][,length(HS[[i]][1,])]<-1 # use last column as marker for keeping a HS
					for(a in 1:length(HS[[i]][,1])){
						for(b in 1:i){
							peakHS[[ HS[[i]][a,b] ]]<-(
								rbind(peakHS[[ HS[[i]][a,b] ]],c(i,a))
							)
						}
					}
				}
			}
		}
		# check peak memberships of HS for subsets ##########################
		if(inter){pBar <- txtProgressBar( min = 0, max = length(peaklist2[,1]), style = 3 )}
		for(i in 1:length(peakHS)){
			if(inter){setTxtProgressBar(pBar,i,title = NULL, label = NULL)}
			if(length(peakHS[[i]])>2){
				ns<-unique(peakHS[[i]][,1])
				if(length(ns)>1){
					for(a in 1:(length(peakHS[[i]][,1])-1)){
						n<-peakHS[[i]][a,1]
						for(b in (a+1):length(peakHS[[i]][,1])){
							m<-peakHS[[i]][b,1]
							if(n<m){
								if( m>=((n*2)-1) ){ # series nesting requirement
									this<-HS[[n]][peakHS[[i]][a,2],1:n]
									that<-HS[[m]][peakHS[[i]][b,2],1:m]
									if(all(!is.na(match(this,that)))){
										k<-peakHS[[i]][a,2]
										HS[[n]][k,length(HS[[n]][1,])]<-0
										reject<-(reject+1)
										#stop()	
									}
									
								}
							}
						}
					}	
				}
			}
		}
		if(inter){close(pBar)}
		# remove unmarked HS ################################################
		found<-0
		if(length(HS)>minlength){ # anything to do?		
			for(i in minlength:length(HS)){		
				if(length(HS[[i]])>0){
					HS[[i]]<-(HS[[i]][HS[[i]][,length(HS[[i]][1,])]==1,,drop=FALSE])
					found<-(found+length(HS[[i]][,1]))
				}
			}
		}
		#####################################################################
		cat(paste(reject," cases - done.",sep=""));
	}else{
		cat("\n(7) Removal of nested HS skipped");	
	}
	##########################################################################	
	# (8) cluster similar HS #################################################
	if(any(deb==3)){
		cat("\n(7 b) Grouping of superjacent HS:  \n");
		# HS_IDs: table to translate HS list to a continous ID and back ######
		# use 2nd last column in HS to place a unique ID #####################
		# peakHS: membership of peaks in HS ##################################
		if(length(HS)<minlength){stop("length(HS)<minlength")}
		peakHS<-list(0)
		for(i in 1:length(peaklist[,1])){
			peakHS[[i]]<-numeric(0)
		}
		number<-1
		for(i in minlength:length(HS)){		
			if(length(HS[[i]])>0){
				HS[[i]][,i+5]<-seq(number,(number+length(HS[[i]][,1])-1),1)
				number<-(number+length(HS[[i]][,1]))
				for(a in 1:length(HS[[i]][,1])){
					for(b in 1:i){
						peakHS[[ HS[[i]][a,b] ]]<-(
							c(
								peakHS[[ HS[[i]][a,b] ]],
								HS[[i]][a,i+5]
							)
						)
					}
				}
			}	
		}
		HS_IDs<-matrix(nrow=(number-1),ncol=4,0)
		number<-1
		for(i in minlength:length(HS)){		
			if(length(HS[[i]])>0){
				HS_IDs[number:(number+length(HS[[i]][,1])-1),1]<-HS[[i]][,i+5];
				HS_IDs[number:(number+length(HS[[i]][,1])-1),2]<-i;
				HS_IDs[number:(number+length(HS[[i]][,1])-1),3]<-seq(1,length(HS[[i]][,1]),1)
				number<-(number+length(HS[[i]][,1]))
			}	
		}
		if(any(duplicated(HS_IDs))) stop("debug me on issue #6: duplicated IDs in similarity grouping of HS!")
		# get matrix with unique HSpairs #################################### 				
		at<-1;
		leng<-5000;
		from<-rep(0,leng);
		to<-rep(0,leng);
		for(i in 1:length(peakHS)){
			if(length(peakHS[[i]])>1){
				these<-peakHS[[i]]
				these<-these[order(these)]
				for(a in 1:(length(these)-1)){
					for(b in (a+1):length(these)){
						from[at]<-these[a]
						to[at]<-these[b]
						at<-(at+1)
						if(at>leng){ # extend the vector
							from<-c(from,rep(0,leng))
							to<-c(to,rep(0,leng))
							leng<-(leng+leng)
						}
					}
				}
			}
		}
		from<-from[from!=0]
		to<-to[to!=0]
		HSpairs<-unique(cbind(from,to))
		# derive similarities ###############################################
		sim<-rep(0,length(HSpairs[,1]))
		if(inter){pBar <- txtProgressBar( min = 0, max = length(HSpairs[,1]), style = 3 )}
		for(i in 1:length(HSpairs[,1])){
			if(inter){setTxtProgressBar(pBar,i,title = NULL, label = NULL)}
			# get model for peak set a
			n_a<-HS_IDs[HSpairs[i,1],2]
			m_a<-HS_IDs[HSpairs[i,1],3]
			peaks_a<-HS[[n_a]][m_a,1:n_a]
			x_a<-peaklist[peaks_a,1]
			if(any(duplicated(x_a)) ){next} # bypass fitting issues
			y_a<-peaklist[peaks_a,3]
			if(any(duplicated(y_a))){next} # bypass fitting issues			
			model_a <- smooth.spline(x_a,y_a,cv=FALSE,spar=.45)
			# get model for peak set b				
			n_b<-HS_IDs[HSpairs[i,2],2]
			m_b<-HS_IDs[HSpairs[i,2],3]
			peaks_b<-HS[[n_b]][m_b,1:n_b]
			x_b<-peaklist[peaks_b,1]
			if(any(duplicated(x_b)) ){next} # bypass fitting issues
			y_b<-peaklist[peaks_b,3]
			if(any(duplicated(y_b))){next}  # bypass fitting issues			
			model_b <- smooth.spline(x_b,y_b,cv=FALSE,spar=.45)		
			# peaks a done by model_b			
			predict_a<-predict(model_b,x_a)			
			SS_tot_a<-sum((y_a-mean(y_a))^2);			
			SS_res_a<-((y_a-predict_a$y)^2);	
			SS_res_a[!is.na(match(peaks_a,peaks_b))]<-0 # replace full peak matches
			SS_res_a<-sum(SS_res_a);
			R2_a<-(1-(SS_res_a/SS_tot_a));
			# peaks b done by model_a			
			predict_b<-predict(model_a,x_b)			
			SS_tot_b<-sum((y_b-mean(y_b))^2);			
			SS_res_b<-((y_b-predict_b$y)^2);		
			SS_res_b[!is.na(match(peaks_b,peaks_a))]<-0	# replace full peak matches
			SS_res_b<-sum(SS_res_b);
			R2_b<-(1-(SS_res_b/SS_tot_b));
			# store maximum similarity
			sim[i]<-max(c(R2_a,R2_b))
			# plot it!
			if(plotit==3){	
				plot.new()
				plot.window(xlim=c(min(c(x_a,x_b)),max(c(x_a,x_b))),ylim=c(min(c(y_a,y_b)),max(c(y_a,y_b))))
				box();axis(1);axis(2);
				points(x_a,y_a,pch=19,cex=.5,col="red")
				points(x_a,y_a,type="l",col="red")
				points(x_b,y_b,pch=19,cex=.5,col="blue")
				points(x_b,y_b,type="l",col="blue")	
				title(main=paste(round(R2_a,2)," vs. ",round(R2_b,2)))
				Sys.sleep(.3)
			}
		}
		if(inter){close(pBar)}
		HSpairs<-HSpairs[sim>=.9,,drop=FALSE]
		if(length(HSpairs[,1])>0){ # any pairs formed? ...
			from<-HSpairs[,1]
			to<-HSpairs[,2]
			# add non-paired HS to have them assigned as a group, too
			gotthose<-unique(c(from,to))
			singled<-HS_IDs[-gotthose,1]
			use<-c(rep(1,length(from)),rep(2,length(to)),rep(1,length(singled)))
			from2<-c(from,to,singled)
			to2<-c(to,from,singled)
			use<-use[order(from2,decreasing=FALSE)]
			to2<-to2[order(from2,decreasing=FALSE)]
			from2<-from2[order(from2,decreasing=FALSE)]
			groups<-.Call("metagroup",
				as.integer(from2),
				as.integer(to2),
				PACKAGE="nontarget" 
			)
			to2<-to2[use==1]
			from2<-from2[use==1]
			groups<-groups[use==1]
			HS_IDs[to2,4]<-groups
			HS_IDs[from2,4]<-groups
			if(any(HS_IDs[,4]==0)){stop("debug me on issue #7: ungrouped HS formed.")}
		}else{ # otherwise, insert a consecutive grouping ID
			# insert a consecutive grouping ID
			HS_IDs[,4]<-seq(1,length(HS_IDs[,4]),1);
			groups<-length(HS_IDs[,4]);
		}
		cat(paste(max(groups)," HS cluster formed - done.",sep=""));
	}else{
		# insert a consecutive grouping ID
		number<-1
		for(i in minlength:length(HS)){		
			if(length(HS[[i]])>0){
				HS[[i]][,i+5]<-seq(number,(number+length(HS[[i]][,1])-1),1)
				number<-(number+length(HS[[i]][,1]))
			}	
		}
		HS_IDs<-matrix(nrow=(number-1),ncol=4,0)
		number<-1
		for(i in minlength:length(HS)){		
			if(length(HS[[i]])>0){
				HS_IDs[number:(number+length(HS[[i]][,1])-1),1]<-HS[[i]][,i+5];
				HS_IDs[number:(number+length(HS[[i]][,1])-1),2]<-i;
				HS_IDs[number:(number+length(HS[[i]][,1])-1),3]<-seq(1,length(HS[[i]][,1]),1)
				number<-(number+length(HS[[i]][,1]))
			}	
		}
		if(any(duplicated(HS_IDs))){stop("debug me on issue #6: duplicated IDs in similarity grouping of HS!")}
		HS_IDs[,4]<-seq(1,length(HS_IDs[,4]),1);
	}
	##########################################################################	
	# (9) Generate data output ###############################################
	cat(paste("\n(8) Parse output for ",max(HS_IDs[,4])," homologue series and ",found," cluster ... ",sep=""));
	# generate HS group lists ################################################
	HS_groups<-list(0)
	for(i in 1:length(HS_IDs[,4])){
		HS_groups[[HS_IDs[i,4]]]<-numeric(0)
	}
	for(i in 1:length(HS_IDs[,4])){
		HS_groups[[HS_IDs[i,4]]]<-c(HS_groups[[HS_IDs[i,4]]],HS_IDs[i,1])
	}
	if(plotit==4){
		for(i in 1:length(HS_groups)){
			those<-HS_IDs[HS_groups[[i]],,drop=FALSE]
			peaks<-c()
			delmz<-c()
			for(j in 1:length(those[,1])){
				peaks<-c(peaks,
					HS[[those[j,2]]][those[j,3],1:those[j,2]]
				)
				delmz<-c(delmz,
					mean(HS[[those[j,2]]][those[j,3],(those[j,2]+1):(those[j,2]+2)])
				)
			}
			min_mz<-min(peaklist[peaks,1])
			max_mz<-max(peaklist[peaks,1])
			min_RT<-min(peaklist[peaks,3])
			max_RT<-max(peaklist[peaks,3])
			plot.new();
			plot.window(xlim=c(min_mz,max_mz),ylim=c(min_RT,max_RT));			
			title(xlab="m/z",ylab="RT",main=paste(length(those[,1]),"HS"))
			box();axis(1);axis(2);
			this<-round(delmz,digits=2);
			that<-levels(as.factor(this));
			colo<-rainbow(length(that))
			for(j in 1:length(those[,1])){
				peaks<-HS[[those[j,2]]][those[j,3],1:those[j,2]]
				points(peaklist[peaks,1],peaklist[peaks,3],
					col=colo[that==this[j]],pch=19,cex=0.5)
				points(peaklist[peaks,1],peaklist[peaks,3],
					col=colo[that==this[j]],type="l")			
			}
			Sys.sleep(.8)
		}
	}
    # generate peaklist with links & #########################################
    # generate component list with relevant m/z & RT increments ##############
    group1<-c();  # HS ID
    group2<-c();  # peak IDs
    group3<-c();  # mzincr
    group4<-c();  # retincr
    group5<-c();  # retmin
    group6<-c();  # retmax
	group7<-c();  # retdel
	group8<-c();  # HS cluster ID
	len<-length(peaklist[,1])
    getit1<-rep("0",len);      	# (1) HS ID
    getit2<-rep("0",len);      	# (2) level
    getit3<-rep("0",len);   	# (3) to which peak
    getit4<-rep("none",len);   	# (4) mean dmass
    getit5<-rep("none",len);    # (5) mean RT
    getit6<-rep("0",len);      	# (6) HS cluster ID	
	peakID<-list(0);
	atgroup<-1;
	for(a in minlength:length(HS)){
		if(length(HS[[a]])>0){ 		
			for(b in 1:length(HS[[a]][,1])){
				peakID[[atgroup]]<-numeric(0)
				listit<-list(0)
				atthat<-""
				meanmz<-mean(HS[[a]][b,((a+1):(a+2))]);
				meanRT<-mean(HS[[a]][b,((a+3):(a+4))]);
				minRT<-min(peaklist[HS[[a]][b,1:a],3]);
				maxRT<-max(peaklist[HS[[a]][b,1:a],3]);
				difRT<-(maxRT-minRT);
				for(k in 1:a){
					getit1[HS[[a]][b,k]]<-paste(getit1[HS[[a]][b,k]],"/",as.character(atgroup),sep="")
					getit2[HS[[a]][b,k]]<-paste(getit2[HS[[a]][b,k]],"/",as.character(k),sep="")
					if(k<a){
						getit3[HS[[a]][b,k]]<-paste(getit3[HS[[a]][b,k]],"/",as.character(HS[[a]][b,(k+1)]),sep="")
					}
					getit4[HS[[a]][b,k]]<-paste(getit4[HS[[a]][b,k]],"/",as.character(round(meanmz,digits=7)),sep="")
					getit5[HS[[a]][b,k]]<-paste(getit5[HS[[a]][b,k]],"/",as.character(round(meanRT,digits=4)),sep="")	
					getit6[HS[[a]][b,k]]<-paste(getit6[HS[[a]][b,k]],"/",as.character(HS_IDs[HS[[a]][b,a+5],4]),sep="")
					listit[[k]]<-HS[[a]][b,k];
					atthat<-paste(atthat,",",as.character(HS[[a]][b,k]),sep="");
				}
				peakID[[atgroup]]<-listit;
				group1<-c(group1,atgroup); 
				group2<-c(group2,substr(atthat,2,nchar(atthat)));
				group3<-c(group3,meanmz);
				group4<-c(group4,meanRT);
				group5<-c(group5,minRT);				
				group6<-c(group6,maxRT);				
				group7<-c(group7,difRT);
				group8<-c(group8,HS[[a]][b,a+5]);
				atgroup<-(atgroup+1);
			}
		}
	}
	for(i in 1:length(getit1)){
		if(getit1[i]!="0"){
			getit1[i]<-substr(getit1[i],3,nchar(getit1[i]))
			getit2[i]<-substr(getit2[i],3,nchar(getit2[i]))
			getit4[i]<-substr(getit4[i],6,nchar(getit4[i]))
			getit5[i]<-substr(getit5[i],6,nchar(getit5[i]))
			getit6[i]<-substr(getit6[i],3,nchar(getit6[i]))			
		}
		if(getit3[i]!="0"){
			getit3[i]<-substr(getit3[i],3,nchar(getit3[i]))
		}
	}
	grouped_samples<-data.frame(peaklist,seq(1,len,1),getit1,getit2,getit3,getit4,getit5,getit6,stringsAsFactors=FALSE);
	names(grouped_samples)<-c("mz","intensity","RT","peak ID","HS IDs","series level","to ID","m/z increment","RT increment","HS cluster");
	grouping<-data.frame(group1,group2,group3,group4,group5,group6,group6-group5,group8,stringsAsFactors=FALSE);
	names(grouping)<-c("HS IDs","peak IDs","m/z increment","RT increment","min. RT in series","max. RT in series","max.-min. RT","HS cluster");
	# store parameters used ##################################################
	parameters<-list(0)
	parameters[[1]]<-elements;
	parameters[[2]]<-use_C
	parameters[[3]]<-minmz
	parameters[[4]]<-maxmz
	parameters[[5]]<-minrt
	parameters[[6]]<-maxrt
	parameters[[7]]<-ppm	
	parameters[[8]]<-mztol
	parameters[[9]]<-rttol
	parameters[[10]]<-minlength
	parameters[[11]]<-vec_size
	parameters[[12]]<-spar
	parameters[[13]]<-R2
	parameters[[14]]<-plotit	
	names(parameters)<-c("elements","use_C","minmz","maxmz","minrt","maxrt","ppm","mztol","rttol","minlength","vec_size","spar","R2","plotit")
	cat("done.\n");
	##########################################################################		
	# store HS memberships per peak ##########################################
	homol<-list(grouped_samples,parameters,grouping,mzfilter,peakID,HS_groups);
	names(homol)<-c("Peaks in homologue series","Parameters","Homologue Series","m/z-Filter used","Peaks per level","HS cluster")
	##########################################################################
	return(homol); ###########################################################
	##########################################################################
	
}
blosloos/nontarget documentation built on June 2, 2022, 3:53 p.m.