R/pep.id.R

#' Mass match peak pairs from a pepmatched object. 
#' 
#' Mass match peak pairs from a pepmatched object to known databases. Can call inbuilt databases, but you can also use your own local database.
#' @author Rik Verdonck & Gerben Menschaert
#' @seealso \code{\link{generate_random_db}}, \code{\link{pep.massmatch}}
#' @param pepmatched     Input object of class \code{pepmatched}, generated by the pepmatch() function of this package.  
#' @param ID_thresh      Numeric. Maximal allowed mass difference in ppm for identification.
#' @param db             In case no preset database is chosen: a database (table) with the first 3 columns \code{"name","MW"} and \code{"sequence"} (as read in with the \code{\link{download_lpm_db}} function) The amino acid sequences have to obey rules when you want to use them for mass recalculation or generation of a decoy database. Amino acids have to be capitalized one letter codes. For details, see \code{\link{calculate_peptide_mass}}
#' @param dbpath        Character. In case a local database is used. Should be the filepath of a table, separated by dbseparator, with first tree columns: name, mass in Dalton and sequence. 
#' @param presetdb       A preset database. For the future (not ready yet)         
#' @param dbseparator    Character. Column separator of database. 
#' @param dbheader       Logical. Does the database have a header? Default FALSE
#' @param masscorrection Logical. Should masses be corrected on the basis of identifications? Caution should be payed here, since this will only work with a sufficiently high number of real matches. This rather sketchy feature should be considered something interesting to have a look at, rather than a compulsory part of the pipeline. Always also run without mass correction and compare! Default is FALSE. 
#' @param FDR            Logical. Test false discovery rate of peptide mass match identification by calculating a decoy database and look how many hits you get there. Uses \code{\link{generate_random_db}}. 
#' @param iterations     How many iterations of FDR should be ran? This might take some time for larger datasets. 
#' @param checkdb        Look if the masses and sequences in your database make sense. UNDER CONSTRUCTION!!!
#' @param graphics       Only applies when FDR is TRUE.   
#' @param cores          Interger. Number of cores that can be used on the computer for calculation. When >1, the packages foreach and doParallel will be used for multithreading. 
#' @param verbose        Logical. If TRUE, verbose output is generated during identifications. 
#' @import foreach
#' @import doParallel
#' @export
#' @exportClass pepmatched
#' @return An object of class \code{pepmatched} with added mass matches that can be used for subsequent analysis using labelpepmatch functions. This pepmatched object will also contain a couple of extras like detailed pep.id parameters, details about the FDR estimation, a list of identified peptides with their counts, and a deltavector with the mass shifts in case the mass correction has been applied.


### TO DO
# Make it a more generic peptide identification tool. 
# Checkdb parameter
# Link to a mascot server.  
# Model a normal mass distribution around every peptide in the database, and assign "chance" to every query peptide. Problem here is we don't know the sd etc for every MS device.   
# Set delta_m column to ppm 


pep.id <-
function    ( 
                            pepmatched, 
                            ID_thresh 		    = 	10,
                            db, 	# should be an object within the environment (e.g. a database read in with download_lpm_db)
                            presetdb            =   NA,     
                            dbpath				=   NA,     # should be a path to a table, separated by dbseparator, with first tree columns name, mass and sequence
                            dbseparator	        = 	",",
                            dbheader			= 	FALSE,
                            masscorrection		=	FALSE,  # Should masses be corrected on the basis of identifications? TRUE, FALSE
                            cores               =   1,
                            FDR                 =   TRUE,    # calculate a decoy database and look how many hits you get there.
                            iterations          =   100,     # How many iterations should the FDR run?
                            checkdb             =   F,       # look if the masses and sequences in your database make sense. UNDER CONSTRUCTION!!!
                            graphics            =   F,       # Only applies when FDR is TRUE                    
                            verbose             =   FALSE
                            )
                                       
{
runcount=nrow(pepmatched$design)
### First some controls before we get started!
### Here we can incorporate the function to also get just normal lists of peptides identified. 
if(class(pepmatched)!="pepmatched"){stop("ERROR: input is not of class 'pepmatched'")}

### Read in database
if (!is.na(presetdb))
{   
	db<-download_lpm_db(presetdb)
}else if(missing(db)==F){
	db<-db			
}else{
        if(missing(dbpath)){stop("ERROR: no database found")}
        db<-read.table(dbpath,sep=dbseparator,header=dbheader)
}

### for now, the mass correction function is still a bit wobbly, so throw a warning!
if(masscorrection==T){cat("\nwarning: mass correction function is sketchy\n")}


### Now we either loop the identify function over the runs, or do it in parallel
###	The output of pep.massmatch is a list (class lpm_pepmatched) that contains:
#	matchlist: dataframe like the one in a normal pepmatched object, this time with ID information columns
#	delta: the overall difference between theoretical and observed masses. If masscorrection == T this value is added to all observed masses before identification (one permutation only)
#	identified_peptides: a vector that contains all identified peptide names 
#	FDR_hits: an integer vector of length "iterations" with the counts of discoveries in mock databases. Is empty if FDR == F
#	FDR_summaryvector: mean,median,min,max,sd and sem of above mentioned FDR_hits vector. Is empty if FDR == F
#	FDR_precisionvector: a vector with all the precisions (theoretical - observed masses) of all FDR identifications. Is empty if FDR == F
#	dblist: a list with all the "iterations" databases. 

### This list is generated for each run, and the N lists are stored in IDlistlist

#######################################################################################################################################  
if(FDR==T && graphics==T){par(mfrow=c(runcount/2,2))}
if(cores==1)                                                        
{                                                                                                               
    IDlistlist<-foreach (run = 1:runcount) %do%                       
    {
		matchlist   <-pepmatched[[run]]     
		run=run                                                
		idlist      <-pep.massmatch(input=matchlist,db=db,ID_thresh=ID_thresh,masscorrection=masscorrection,FDR=FDR,iterations=iterations,checkdb=checkdb,graphics=graphics,verbose=verbose) # here I removed run=run, which seemed to be an old argument of the pep.massmatch function. See comments there.                         
    }                                                       
}else # parallel !                                                      
{                                                           
  cl<- parallel::makeCluster(cores)                                                    
  registerDoParallel(cl)                                    
  IDlistlist  <-foreach (run = 1:runcount) %dopar%                    
    { 
		matchlist   <-pepmatched[[run]]                                                      
		matchlist   <-pep.massmatch(input=matchlist,db=db,ID_thresh=ID_thresh,masscorrection=masscorrection,FDR=FDR,iterations=iterations,checkdb=checkdb,graphics=F,verbose=verbose)# here I removed run=run, which seemed to be an old argument of the pep.massmatch function. See comments there.                                                  
    }
  parallel::stopCluster(cl)
}                                                                      
#######################################################################################################################################  
#closeAllConnections()






### Some intermediate tests to work with... 
#print(summary(IDlistlist))
#print(summary(IDlistlist[[1]]))
#meh<<-IDlistlist[[1]]	# puts an IDlistlist object in global environment to have a look at. 

if(graphics==T && FDR==F){cat("Warning, the graphics option of this function only works if FDR is TRUE\n")}
if(graphics==T && FDR==T)
{
    for(run in 1:runcount)
    {
        numberofhits    <-IDlistlist[[run]]$FDR_hits
        FDR_mean        <-IDlistlist[[run]]$FDR_summaryvector$mean
        FDR_sd          <-IDlistlist[[run]]$FDR_summaryvector$sd
        hist(numberofhits, col="lightgreen", prob=TRUE, xlab="number of false positives", main=paste("FDR estimation for run ",pepmatched$design[run,1],sep=""))
        curve(dnorm(x, mean=FDR_mean, sd=FDR_sd),col="darkblue", lwd=2, add=TRUE, yaxt="n")
    }
}




### FOR EACH RUN DO:

### Replace old matchlists with updated matchlists that contain identifications
### Make a vector that contains all the mass differences between theoretical and observed mass (deltavector)
### And a vector that contains the names of all identified peptides


deltavector	<-NULL
idpep		<-NULL

for (i in 1:runcount)
{
	pepmatched[[i]]<-as.data.frame(IDlistlist[[i]][[1]])
	deltavector<-c(deltavector,IDlistlist[[i]][[2]])
	idpep<-c(idpep,IDlistlist[[i]][[1]]$pepID)
}
### Now idpep is just a vector with peptide names, many of which will occur multiple times.
### We now transform it into a nice little table that tells us which peptide has been found in how many runs... 
identified_peptides<-as.data.frame(table(idpep))



### If false discovery rate estimation is false, a couple of elements of the lpm_massmatched object mentioned above is empty 
### If false discovery rate estimation is true, we will merge the FDR components of the lpm_massmatched object for all runs
### We first do this once for the first run, to get all the formats right, and next loop it over runs 2:runcount. 
if(FDR==T)
{
	pep.id_FDR_summary	<-IDlistlist[[1]]$FDR_summaryvector
	N_hits				<-sum(IDlistlist[[1]]$matchlist$N_identifications)
	pep.id_FDR_summary	<-c("N_real_hits"=N_hits,pep.id_FDR_summary)
	dblist<-list()
	dblist[[1]]<-IDlistlist[[1]]$dblist
	FDR_precisionvectors<-list()
	FDR_precisionvectors[[1]]<-IDlistlist[[1]]$FDR_precisionvector

	for (i in 2:runcount)
	{
		N_hits				<-sum(IDlistlist[[i]]$matchlist$N_identifications)
		pep.id_FDR_summary<-cbind.data.frame(pep.id_FDR_summary,c("N_real_hits"=N_hits,IDlistlist[[i]]$FDR_summaryvector))
		dblist[[i]]<-IDlistlist[[i]]$dblist
		FDR_precisionvectors[[i]]<-IDlistlist[[i]]$FDR_precisionvector
	}
	pep.id_FDR_summary<-t(pep.id_FDR_summary)
	rownames(pep.id_FDR_summary)<-pepmatched$design$RunName

	### stick the proportions to the data frame. Just like with the FDR estimation in the pepmatch function
	pep.id_FDR_summary_prop<-round(pep.id_FDR_summary[,-1]/pep.id_FDR_summary[,1]*100,2)
	colnames(pep.id_FDR_summary_prop)<- c("meanprop", "medianprop", "minprop", "maxprop", "sdprop", "semprop")
	pep.id_FDR_summary<-cbind.data.frame(pep.id_FDR_summary,pep.id_FDR_summary_prop)
	
	dblist<-unlist(dblist,recursive=F)
}






pep.id_parameters<-data.frame("ID_thresh"=ID_thresh,"database"=dbpath, "masscorrection"=masscorrection)
if(FDR==T)
{
	pep.id_FDR_details<-list()
	pep.id_FDR_details[[1]]<-dblist
	names(pep.id_FDR_details)[[1]]<-"mockdatabaselist"
	pep.id_FDR_details[[2]]<-FDR_precisionvectors
	names(pep.id_FDR_details)[[2]]<-"FDR_precisionvectors"
}else{
	"pep.id_FDR_summary"=NULL
	"pep.id_FDR_details"=NULL
	}

identifications<-list(	"pep.id_parameters"=pep.id_parameters,
						"pep.id_FDR_summary"=pep.id_FDR_summary,
						"pep.id_FDR_details"=pep.id_FDR_details,
						"identified_peptides"=identified_peptides,
						"deltavector"=deltavector
						)
						
						
						
pepmatched<-append(pepmatched,identifications)
out<-pepmatched
class(out)<-"pepmatched"

return(out)

}
goat-anti-rabbit/labelpepmatch.R documentation built on May 17, 2019, 7:29 a.m.