#' Mass match peak pairs from a pepmatched object.
#'
#' Mass match peak pairs from a pepmatched object to known databases. Can call inbuilt databases, but you can also use your own local database.
#' @author Rik Verdonck & Gerben Menschaert
#' @seealso \code{\link{generate_random_db}}, \code{\link{pep.massmatch}}
#' @param pepmatched Input object of class \code{pepmatched}, generated by the pepmatch() function of this package.
#' @param ID_thresh Numeric. Maximal allowed mass difference in ppm for identification.
#' @param db In case no preset database is chosen: a database (table) with the first 3 columns \code{"name","MW"} and \code{"sequence"} (as read in with the \code{\link{download_lpm_db}} function) The amino acid sequences have to obey rules when you want to use them for mass recalculation or generation of a decoy database. Amino acids have to be capitalized one letter codes. For details, see \code{\link{calculate_peptide_mass}}
#' @param dbpath Character. In case a local database is used. Should be the filepath of a table, separated by dbseparator, with first tree columns: name, mass in Dalton and sequence.
#' @param presetdb A preset database. For the future (not ready yet)
#' @param dbseparator Character. Column separator of database.
#' @param dbheader Logical. Does the database have a header? Default FALSE
#' @param masscorrection Logical. Should masses be corrected on the basis of identifications? Caution should be payed here, since this will only work with a sufficiently high number of real matches. This rather sketchy feature should be considered something interesting to have a look at, rather than a compulsory part of the pipeline. Always also run without mass correction and compare! Default is FALSE.
#' @param FDR Logical. Test false discovery rate of peptide mass match identification by calculating a decoy database and look how many hits you get there. Uses \code{\link{generate_random_db}}.
#' @param iterations How many iterations of FDR should be ran? This might take some time for larger datasets.
#' @param checkdb Look if the masses and sequences in your database make sense. UNDER CONSTRUCTION!!!
#' @param graphics Only applies when FDR is TRUE.
#' @param cores Interger. Number of cores that can be used on the computer for calculation. When >1, the packages foreach and doParallel will be used for multithreading.
#' @param verbose Logical. If TRUE, verbose output is generated during identifications.
#' @import foreach
#' @import doParallel
#' @export
#' @exportClass pepmatched
#' @return An object of class \code{pepmatched} with added mass matches that can be used for subsequent analysis using labelpepmatch functions. This pepmatched object will also contain a couple of extras like detailed pep.id parameters, details about the FDR estimation, a list of identified peptides with their counts, and a deltavector with the mass shifts in case the mass correction has been applied.
### TO DO
# Make it a more generic peptide identification tool.
# Checkdb parameter
# Link to a mascot server.
# Model a normal mass distribution around every peptide in the database, and assign "chance" to every query peptide. Problem here is we don't know the sd etc for every MS device.
# Set delta_m column to ppm
pep.id <-
function (
pepmatched,
ID_thresh = 10,
db, # should be an object within the environment (e.g. a database read in with download_lpm_db)
presetdb = NA,
dbpath = NA, # should be a path to a table, separated by dbseparator, with first tree columns name, mass and sequence
dbseparator = ",",
dbheader = FALSE,
masscorrection = FALSE, # Should masses be corrected on the basis of identifications? TRUE, FALSE
cores = 1,
FDR = TRUE, # calculate a decoy database and look how many hits you get there.
iterations = 100, # How many iterations should the FDR run?
checkdb = F, # look if the masses and sequences in your database make sense. UNDER CONSTRUCTION!!!
graphics = F, # Only applies when FDR is TRUE
verbose = FALSE
)
{
runcount=nrow(pepmatched$design)
### First some controls before we get started!
### Here we can incorporate the function to also get just normal lists of peptides identified.
if(class(pepmatched)!="pepmatched"){stop("ERROR: input is not of class 'pepmatched'")}
### Read in database
if (!is.na(presetdb))
{
db<-download_lpm_db(presetdb)
}else if(missing(db)==F){
db<-db
}else{
if(missing(dbpath)){stop("ERROR: no database found")}
db<-read.table(dbpath,sep=dbseparator,header=dbheader)
}
### for now, the mass correction function is still a bit wobbly, so throw a warning!
if(masscorrection==T){cat("\nwarning: mass correction function is sketchy\n")}
### Now we either loop the identify function over the runs, or do it in parallel
### The output of pep.massmatch is a list (class lpm_pepmatched) that contains:
# matchlist: dataframe like the one in a normal pepmatched object, this time with ID information columns
# delta: the overall difference between theoretical and observed masses. If masscorrection == T this value is added to all observed masses before identification (one permutation only)
# identified_peptides: a vector that contains all identified peptide names
# FDR_hits: an integer vector of length "iterations" with the counts of discoveries in mock databases. Is empty if FDR == F
# FDR_summaryvector: mean,median,min,max,sd and sem of above mentioned FDR_hits vector. Is empty if FDR == F
# FDR_precisionvector: a vector with all the precisions (theoretical - observed masses) of all FDR identifications. Is empty if FDR == F
# dblist: a list with all the "iterations" databases.
### This list is generated for each run, and the N lists are stored in IDlistlist
#######################################################################################################################################
if(FDR==T && graphics==T){par(mfrow=c(runcount/2,2))}
if(cores==1)
{
IDlistlist<-foreach (run = 1:runcount) %do%
{
matchlist <-pepmatched[[run]]
run=run
idlist <-pep.massmatch(input=matchlist,db=db,ID_thresh=ID_thresh,masscorrection=masscorrection,FDR=FDR,iterations=iterations,checkdb=checkdb,graphics=graphics,verbose=verbose) # here I removed run=run, which seemed to be an old argument of the pep.massmatch function. See comments there.
}
}else # parallel !
{
cl<- parallel::makeCluster(cores)
registerDoParallel(cl)
IDlistlist <-foreach (run = 1:runcount) %dopar%
{
matchlist <-pepmatched[[run]]
matchlist <-pep.massmatch(input=matchlist,db=db,ID_thresh=ID_thresh,masscorrection=masscorrection,FDR=FDR,iterations=iterations,checkdb=checkdb,graphics=F,verbose=verbose)# here I removed run=run, which seemed to be an old argument of the pep.massmatch function. See comments there.
}
parallel::stopCluster(cl)
}
#######################################################################################################################################
#closeAllConnections()
### Some intermediate tests to work with...
#print(summary(IDlistlist))
#print(summary(IDlistlist[[1]]))
#meh<<-IDlistlist[[1]] # puts an IDlistlist object in global environment to have a look at.
if(graphics==T && FDR==F){cat("Warning, the graphics option of this function only works if FDR is TRUE\n")}
if(graphics==T && FDR==T)
{
for(run in 1:runcount)
{
numberofhits <-IDlistlist[[run]]$FDR_hits
FDR_mean <-IDlistlist[[run]]$FDR_summaryvector$mean
FDR_sd <-IDlistlist[[run]]$FDR_summaryvector$sd
hist(numberofhits, col="lightgreen", prob=TRUE, xlab="number of false positives", main=paste("FDR estimation for run ",pepmatched$design[run,1],sep=""))
curve(dnorm(x, mean=FDR_mean, sd=FDR_sd),col="darkblue", lwd=2, add=TRUE, yaxt="n")
}
}
### FOR EACH RUN DO:
### Replace old matchlists with updated matchlists that contain identifications
### Make a vector that contains all the mass differences between theoretical and observed mass (deltavector)
### And a vector that contains the names of all identified peptides
deltavector <-NULL
idpep <-NULL
for (i in 1:runcount)
{
pepmatched[[i]]<-as.data.frame(IDlistlist[[i]][[1]])
deltavector<-c(deltavector,IDlistlist[[i]][[2]])
idpep<-c(idpep,IDlistlist[[i]][[1]]$pepID)
}
### Now idpep is just a vector with peptide names, many of which will occur multiple times.
### We now transform it into a nice little table that tells us which peptide has been found in how many runs...
identified_peptides<-as.data.frame(table(idpep))
### If false discovery rate estimation is false, a couple of elements of the lpm_massmatched object mentioned above is empty
### If false discovery rate estimation is true, we will merge the FDR components of the lpm_massmatched object for all runs
### We first do this once for the first run, to get all the formats right, and next loop it over runs 2:runcount.
if(FDR==T)
{
pep.id_FDR_summary <-IDlistlist[[1]]$FDR_summaryvector
N_hits <-sum(IDlistlist[[1]]$matchlist$N_identifications)
pep.id_FDR_summary <-c("N_real_hits"=N_hits,pep.id_FDR_summary)
dblist<-list()
dblist[[1]]<-IDlistlist[[1]]$dblist
FDR_precisionvectors<-list()
FDR_precisionvectors[[1]]<-IDlistlist[[1]]$FDR_precisionvector
for (i in 2:runcount)
{
N_hits <-sum(IDlistlist[[i]]$matchlist$N_identifications)
pep.id_FDR_summary<-cbind.data.frame(pep.id_FDR_summary,c("N_real_hits"=N_hits,IDlistlist[[i]]$FDR_summaryvector))
dblist[[i]]<-IDlistlist[[i]]$dblist
FDR_precisionvectors[[i]]<-IDlistlist[[i]]$FDR_precisionvector
}
pep.id_FDR_summary<-t(pep.id_FDR_summary)
rownames(pep.id_FDR_summary)<-pepmatched$design$RunName
### stick the proportions to the data frame. Just like with the FDR estimation in the pepmatch function
pep.id_FDR_summary_prop<-round(pep.id_FDR_summary[,-1]/pep.id_FDR_summary[,1]*100,2)
colnames(pep.id_FDR_summary_prop)<- c("meanprop", "medianprop", "minprop", "maxprop", "sdprop", "semprop")
pep.id_FDR_summary<-cbind.data.frame(pep.id_FDR_summary,pep.id_FDR_summary_prop)
dblist<-unlist(dblist,recursive=F)
}
pep.id_parameters<-data.frame("ID_thresh"=ID_thresh,"database"=dbpath, "masscorrection"=masscorrection)
if(FDR==T)
{
pep.id_FDR_details<-list()
pep.id_FDR_details[[1]]<-dblist
names(pep.id_FDR_details)[[1]]<-"mockdatabaselist"
pep.id_FDR_details[[2]]<-FDR_precisionvectors
names(pep.id_FDR_details)[[2]]<-"FDR_precisionvectors"
}else{
"pep.id_FDR_summary"=NULL
"pep.id_FDR_details"=NULL
}
identifications<-list( "pep.id_parameters"=pep.id_parameters,
"pep.id_FDR_summary"=pep.id_FDR_summary,
"pep.id_FDR_details"=pep.id_FDR_details,
"identified_peptides"=identified_peptides,
"deltavector"=deltavector
)
pepmatched<-append(pepmatched,identifications)
out<-pepmatched
class(out)<-"pepmatched"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.