Nothing
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructor and readIBSpectra.
###
.check.columns <- function(identifications,data.ions=NULL,data.mass=NULL,allow.missing.columns=FALSE) {
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
PC <- .PROTEIN.COLS[.PROTEIN.COLS %in% colnames(identifications)]
missing.cols = c()
for (col in c('SPECTRUM','PEPTIDE','MODIFSTRING'))
if (!is.element(.SPECTRUM.COLS[col],SC))
missing.cols <- c(missing.cols,.SPECTRUM.COLS[col])
for (col in c('PROTEINAC'))
if (!is.element(.PROTEIN.COLS[col],PC))
missing.cols <- c(missing.cols,.PROTEIN.COLS[col])
if (!is.null(data.ions) && !is.null(data.mass)) {
if (is.null(rownames(data.ions)) || is.null(rownames(data.mass)))
stop("provide spectrum ids as rownames for data.ions and data.mass")
if (!identical(rownames(data.ions),rownames(data.mass)))
stop("data.ions and data.mass do not have identical rownames!")
}
# handle missing columns
if (length(missing.cols) > 0) {
msg <- paste("not all required columns in identifications, the following are missing: \n\t",
paste(missing.cols,collapse="\n\t"),
"\nData:\n")
#paste(capture.output(print(head(identifications))))
if (allow.missing.columns) {
warning(msg)
print(head(identifications))
identifications[,missing.cols] <- NA
} else {
message(msg)
print(head(identifications))
stop()
}
}
message(" data.frame columns OK")
return(identifications)
}
.get.quant <- function(identifications,colname,reporterTagNames) {
cols <- sprintf(colname,reporterTagNames)
if (!all(cols %in% colnames(identifications)))
stop(" Quantitative information missing: Supply columns '",paste(cols,collapse="', '"),"'.")
qdata <- unique(identifications[,c(.SPECTRUM.COLS['SPECTRUM'],cols)])
identifications <<- identifications[,-which(colnames(identifications) %in% cols)]
rownames(qdata) <- qdata[,.SPECTRUM.COLS['SPECTRUM']]
qdata <- as.matrix(qdata[,-1])
colnames(qdata) <- reporterTagNames
return(qdata)
}
.check.assayDataElements <- function(assayDataElements) {
for (elem in assayDataElements) {
if (is.null(rownames(elem)))
stop("provide rownames for all assayDataElements")
#if (!identical(rownames(elem),rownames(data.ions)))
# stop("all assayDataElements must have the same rownames")
#if (!identical(dim(data.ions),dim(elem)))
# stop("all assayDataElements must have the same dim")
}
}
.get.quant.elems <- function(assayDataElements,data.ions,data.mass,spectra.ids,fragment.precision,reporterTagMasses) {
if (is.null(assayDataElements))
assayDataElements <- list()
assayDataElements$ions <- data.ions
assayDataElements$mass <- data.mass
if (!identical(spectra.ids,rownames(data.ions))) {
id.n.quant <- intersect(spectra.ids,rownames(data.mass))
id.not.quant <- setdiff(spectra.ids,rownames(data.mass))
quant.not.id <- setdiff(rownames(data.mass),spectra.ids)
if (length(id.n.quant)==0) stop("No spectra could be matched between identification and quantifications.",
"If the search was preformed with Mascot, set readIBSpectra argument decode.titles=TRUE,\n",
" or in the properties file for report generation:\n",
" readIBSpectra.args=list(decode.titles=TRUE)")
if (length(quant.not.id) > 0)
message(" for ",length(quant.not.id)," spectra ",
"[",round(length(quant.not.id)*100/nrow(data.mass),2),"%],",
" quantitative information is available,\n",
" but no peptide-spectrum match. Spectrum titles: \n\t",
paste(quant.not.id[1:min(length(quant.not.id),2)],collapse=",\n\t"),", ...")
if (length(id.not.quant) > 0)
message(" for ",length(id.not.quant)," spectra ",
"[",round(length(id.not.quant)*100/length(spectra.ids),2),"%]",
" with assigned peptides,\n",
" no reporter intensities are available. spectrum titles: \n\t",
paste(id.not.quant[1:min(length(id.not.quant),2)],collapse=",\n\t"),", ...")
na.matrix <- matrix(NA,nrow=length(spectra.ids),ncol=ncol(data.mass),
dimnames=list(spectra.ids,colnames(data.mass)))
for (elem in names(assayDataElements)) {
tmp <- na.matrix
tmp[id.n.quant,] <- assayDataElements[[elem]][id.n.quant,]
assayDataElements[[elem]] <- tmp
}
}
# filter based on fragment precision
if (!is.null(fragment.precision)) {
min.masses <- reporterTagMasses - fragment.precision/2
max.masses <- reporterTagMasses + fragment.precision/2
for (i in seq_len(nrow(assayDataElements$mass))) {
bad.mass <- assayDataElements$mass[i,]<min.masses | assayDataElements$mass[i,] > max.masses
if (any(bad.mass,na.rm=TRUE)) {
for (elem in names(assayDataElements)) {
assayDataElements[[elem]][i,bad.mass] <- NA
}
}
}
}
assayDataElements$ions[which(assayDataElements$ions==0)] <- NA
assayDataElements$mass[which(assayDataElements$mass==0)] <- NA
if (all(apply(is.na(assayDataElements$mass),2,all))) stop("Unable to extract any reporter m/z values. Try setting 'fragment.precision' higher (current value: ",fragment.precision,"), or to NULL if no filtering is desired.")
if (all(apply(is.na(assayDataElements$ions),2,all))) stop("Unable to extract any reporter intensities.")
return(assayDataElements)
}
.get.varMetaData <- function(identifications) {
nn <- .SPECTRUM.COLS %in% colnames(identifications)
label.desc <- c(PEPTIDE='peptide sequence',
MODIFSTRING='modifications of peptide',
CHARGE='peptide charge state',
THEOMASS='theoretical peptide mass',
EXPMASS='experimental peptide mass',
EXPMOZ='experimental mass-to-charge ratio',
PRECURSOR.ERROR='precursor error',
PARENTINTENS='parent ion intensity',
RT='retention time',
DISSOCMETHOD='dissociation METHOD',
SPECTRUM='spectrum title',
SPECTRUM.QUANT='title of spectrum used for quantiation',
PRECURSOR.PURITY="precursor purity",
RAWFILE="raw file",NMC="nmc",
DELTASCORE="delta score",DELTASCORE.PEP="delta score peptide",
SCANS="scans",
SCANS.FROM="scans from",SCANS.TO="scans to",
MASSDELTA.ABS="massdelta (abs)",MASSDELTA.PPM="massdelta (ppm)",
SEARCHENGINE='protein search engine',
SCORE='protein search engine score',
SCORE.MSGF='MSGF+ score',
SCORE.MASCOT="Mascot search score",
SCORE.PHENYX="Phenyx search score",
SCORE.PHOSPHORS='PhosphoRS pepscore',
PROB.PHOSPHORS="PhosphoRS probability",
SCAFFOLD.PEPPROB="Scaffold: Peptide Probability",
SEQUEST.XCORR="sequest:xcorr",
SEQUEST.DELTACN="sequest:deltacn",
IS.DECOY="is decoy peptide-spectrum match?",
P.VALUE="p value",
MSGF.RAWSCORE="MSGF raw score",
MSGF.DENOVOSCORE="MSGF de novo score",
MSGF.SPECEVALUE="MSGF specturm EValue",
MSGF.EVALUE="MSGF EValue",
MSGF.QVALUE="MSGF QValue",
MSGF.PEPQVALUE="MSGF PepValue",
PHOSPHO.SITES="phosphorylation sites",
USEFORQUANT='use spectrum for quantification',
SEQPOS='PTM seqpos',
SITEPROBS='PhosphoRS site.probs',
PEP.SITEPROBS='PhosphoRS site.probs inside peptide',
FILE='file',SAMPLE='sample',NOTES='notes'
)
if (!all(names(.SPECTRUM.COLS) %in% names(label.desc))) {
sel.bad <- !names(.SPECTRUM.COLS) %in% names(label.desc)
warning("Not all SPECTRUM COLS have a label description:\n\t",
paste(names(.SPECTRUM.COLS)[sel.bad],collapse="\n\t"))
label.desc[names(.SPECTRUM.COLS)[sel.bad]] <- .SPECTRUM.COLS[sel.bad]
}
VARMETADATA=data.frame(labelDescription=label.desc[names(.SPECTRUM.COLS)],
row.names=.SPECTRUM.COLS)
return(VARMETADATA[nn,,drop=FALSE])
}
.remove.duplications <- function(identifications) {
identifications <- unique(identifications)
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
if (max(table(identifications[,SC['SPECTRUM']])) > 1) {
t <- table(identifications[,SC['SPECTRUM']])
warning(sum(t>1)," spectra have diverging identifications after the merging, removing them.")
print(head(identifications[identifications[,SC['SPECTRUM']] %in% names(t)[t>1],]))
identifications <- identifications[!identifications[,SC['SPECTRUM']] %in% names(t)[t>1],]
}
if ('SEARCHENGINE' %in% names(SC)) {
message(" Identification details:")
tt <- sort(table(identifications[,SC['SEARCHENGINE']]))
stats <- data.frame(perc=sprintf("%.2f %%",tt/sum(tt)*100),n=tt)
if ('SCORE' %in% names(SC)) {
scores <- identifications[,SC['SCORE']]
score.stats <- do.call(rbind,lapply(names(tt),function (se) {
my.scores <- scores[identifications[,SC['SEARCHENGINE']]==se]
summary(my.scores,na.rm=TRUE)}))
stats <- cbind(stats,score.stats)
}
print(stats)
}
identifications
}
.merge.identifications.full <- function(identifications, ...) {
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
## Substitute Isoleucins with Leucins (indistinguishable by Masspec)
if (!.PEPTIDE.COLS['REALPEPTIDE'] %in% colnames(identifications))
identifications <- .fix.il.peptide(identifications)
## Separate protein columns (focus on peptide-spectrum matches)
PC <- unique(c(.SPECTRUM.COLS['PEPTIDE'],.PROTEIN.COLS,.PEPTIDE.COLS))
protein.colnames <- colnames(identifications)[colnames(identifications) %in% c(PC)]
pept.n.prot <- unique(identifications[,protein.colnames])
identifications <- unique(identifications[,-which(colnames(identifications) %in% setdiff(PC,.SPECTRUM.COLS['PEPTIDE']))])
## Merge identifications
if (max(table(identifications[,SC['SPECTRUM']])) > 1) {
if (SC['DISSOCMETHOD'] %in% colnames(identifications) &&
length(unique(identifications[,SC['DISSOCMETHOD'] ]))) {
identifications <- dlply(identifications,'dissoc.method',.merge.identifications,...)
# rbind separately to assure equal column names
identifications <- do.call(rbind,identifications)
identifications <- .merge.quant.identifications(identifications)
} else {
identifications <- .merge.identifications(identifications, ...)
}
}
identifications <- .remove.duplications(identifications)
## Merge back the protein mapping
merge(pept.n.prot,identifications,by="peptide")
}
setMethod("initialize","IBSpectra",
function(.Object,identifications=NULL,data.ions=NULL,data.mass=NULL,
proteinGroupTemplate=NULL,fragment.precision=NULL,
assayDataElements=list(),allow.missing.columns=FALSE,
write.excluded.to=NULL,...) {
if (is.null(identifications))
return(callNextMethod(.Object,...))
reporterTagNames <- reporterTagNames(.Object)
identifications <- .factor.to.chr(identifications)
## Check that obligatory columns are present
identifications <- .check.columns(identifications, data.ions, data.mass, allow.missing.columns)
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
## Substitute Isoleucins with Leucins (indistinguishable by Masspec)
if (!.PEPTIDE.COLS['REALPEPTIDE'] %in% colnames(identifications))
identifications <- .fix.il.peptide(identifications)
## Separate protein columns (focus on peptide-spectrum matches)
PC <- unique(c(.SPECTRUM.COLS['PEPTIDE'],.PROTEIN.COLS,.PEPTIDE.COLS))
protein.colnames <- colnames(identifications)[colnames(identifications) %in% c(PC)]
pept.n.prot <- unique(identifications[,protein.colnames])
identifications <- unique(identifications[,-which(colnames(identifications) %in% setdiff(PC,.SPECTRUM.COLS['PEPTIDE']))])
## Merge identifications
if (max(table(identifications[,SC['SPECTRUM']])) > 1) {
message("merging identifications")
if (SC['DISSOCMETHOD'] %in% colnames(identifications) &&
length(unique(identifications[,SC['DISSOCMETHOD'] ]))) {
identifications <- dlply(identifications,'dissoc.method',.merge.identifications,...)
# rbind separately to assure equal column names
identifications <- do.call(rbind,identifications)
identifications <- .merge.quant.identifications(identifications)
} else {
identifications <- .merge.identifications(identifications, ...)
}
}
identifications <- .remove.duplications(identifications)
# Create ProteinGroup
proteinGroup <- ProteinGroup(merge(pept.n.prot,identifications,by="peptide"),template=proteinGroupTemplate)
## Get intensities and masses in assayDataElements
if (is.null(data.ions))
data.ions <- .get.quant(identifications,.PEAKS.COLS['IONSFIELD'],reporterTagNames)
if (is.null(data.mass))
data.mass <- .get.quant(identifications,.PEAKS.COLS['MASSFIELD'],reporterTagNames)
if (!all(rownames(data.ions)==rownames(data.mass),na.rm=TRUE))
stop(sum(rownames(data.ions)==rownames(data.mass))," rownames are not equal between ions and mass")
if (any(is.na(rownames(data.ions))))
stop(sum(is.na(rownames(data.ions)))," rownames in data.ions are NA")
if (any(is.na(rownames(data.mass))))
stop(sum(is.na(rownames(data.ions)))," rownames in data.ions are NA")
assayDataElements <- .get.quant.elems(assayDataElements,data.ions,data.mass,
identifications[,SC['SPECTRUM']],fragment.precision,
reporterTagMasses(.Object))
if (!.SPECTRUM.COLS['USEFORQUANT'] %in% colnames(identifications)) {
# not perfect - better: set spectra of peptides shared between groups to FALSE
# and spectra with no values
identifications[,.SPECTRUM.COLS['USEFORQUANT']] <- TRUE
}
fdata <- identifications[,colnames(identifications) %in% .SPECTRUM.COLS]
rownames(fdata) <- fdata[,'spectrum']
featureData <- new("AnnotatedDataFrame",data=fdata,
varMetadata=.get.varMetaData(fdata))
assayData=do.call(assayDataNew, assayDataElements, envir=parent.frame())
## create ibspectra object
callNextMethod(.Object,
assayData=assayData,
featureData=featureData,
proteinGroup=proteinGroup,
reporterTagNames=reporterTagNames,...)
})
setGeneric("readIBSpectra", function(type,id.file,peaklist.file,...)
standardGeneric("readIBSpectra"))
setMethod("readIBSpectra",
signature(type="character",id.file="character",peaklist.file="missing"),
function(type,id.file,identifications.format=NULL,
sep="\t",decode.titles=FALSE,trim.titles=FALSE,...) {
new(type,
identifications=.read.idfile(id.file,sep=sep,
identifications.format=identifications.format,
decode.titles=decode.titles,trim.titles=trim.titles),...)
}
)
setMethod("readIBSpectra",
signature(type="character",id.file="data.frame",peaklist.file="missing"),
function(type,id.file,...) new(type,identifications=id.file,...)
)
setMethod("readIBSpectra",
signature(type="character",id.file="character",peaklist.file="character"),
function(type,id.file,peaklist.file,sep="\t",
mapping.file=NULL,mapping=c(quantification.spectrum = "hcd",identification.spectrum = "cid"),
id.file.domap=NULL,identifications.format=NULL,decode.titles=FALSE,...) {
id.data <- .read.identifications(id.file,sep=sep,
mapping=mapping.file,mapping.names=mapping,
identifications.quant=id.file.domap,
identifications.format=identifications.format,decode.titles=decode.titles)
readIBSpectra(type,id.data,peaklist.file,...)
})
##' readIBSpectra - read IBSpectra object from files
##'
##' <details>
##' @title
##' @param type [character] IBSpectra type
##' @param id.file [character] file of format ibspectra.csv or
##' mzid. If format is ibspectra.csv, it may also contain quantitative
##' information, and peaklist.file is not required.
##' @param peaklist.file [character] file in MGF format containing
##' quantitative information (m/z and intensity pairs). If NULL,
##' quantitative information is expected to reside in id.file.
##' @param proteinGroupTemplate [ProteinGroup object] uses certain
##' protein grouping as template. Useful when comparing multiple
##' experiments.
##' @param mapping.file [character] CSV file which maps spectrum
##' titles from peaklist file to those of id file. Usefule when
##' quantitative and identification information reside in different
##' but corresponding spectra. E.g. HCD-CID dissociation
##' @param mapping [named character] column number or name for
##' 'peaklist' and 'id' spectra.
##' @param mapping.file.readopts
##' @param peaklist.format
##' @param identifications.format
##' @return IBSpectra object of type
##' @author Florian P Breitwieser
setMethod("readIBSpectra",
signature(type="character",id.file="data.frame",peaklist.file="character"),
function(type,id.file,peaklist.file,
annotate.spectra.f=NULL,
peaklist.format=NULL,scan.lines=0,
fragment.precision=NULL,fragment.outlier.prob=NULL,...) {
if (is.function(annotate.spectra.f)) {
id.file <- annotate.spectra.f(id.file,peaklist.file)
}
.Object <- new(type)
quant <- .read.peaklist(peaklist.file,peaklist.format,
.Object@reporterTagMasses,.Object@reporterTagNames,
scan.lines,fragment.precision,fragment.outlier.prob,
id.data=id.file)
new(type,identifications=id.file,quant[[2]],data.ions=quant[[1]],...)
}
)
# returns a list with intensities and mass matrices
.read.peaklist <- function(peaklist.file,peaklist.format,
reporterMasses,reporterTagNames,
scan.lines,fragment.precision,fragment.outlier.prob,
id.data=NULL) {
data.ions=c()
data.mass=c()
data.titles=c()
for (peaklist.f in peaklist.file) {
peaklist.format.f <- peaklist.format
if (is.null(peaklist.format.f)) {
if (grepl(".mgf$",peaklist.f,ignore.case=TRUE))peaklist.format.f <- "mgf"
else if (grepl(".mcn$",peaklist.f,ignore.case=TRUE))peaklist.format.f <- "mcn"
else if (grepl(".intensities.csv$",peaklist.f,ignore.case=TRUE))peaklist.format.f <- "csv"
else
stop(paste0("cannot parse file ",peaklist.f," - cannot deduce format (mgf or mcn)"))
}
if (tolower(peaklist.format.f) == "mgf") {
intensities.f <- .read.mgf(peaklist.f,reporterMasses,reporterTagNames,
fragment.precision=fragment.precision,
prob=fragment.outlier.prob,scan.lines=scan.lines)
if (nrow(intensities.f$ions) == 0) { stop("only NA data in ions/mass") }
data.titles <- c(data.titles,intensities.f$spectrumtitles)
data.ions <- rbind(data.ions,intensities.f$ions)
data.mass <- rbind(data.mass,intensities.f$mass)
} else if (tolower(peaklist.format.f) == "mcn") {
if (type != "iTRAQ4plexSpectra")
stop("mcn format (iTracker) only supports iTRAQ 4plex spectra!")
mcn.i <- read.table("itraqdta/i-Tracker.mcn",sep=",",skip=2,
colClasses=c("character","character",
"numeric","numeric","numeric","numeric",rep("NULL",36)),
col.names=c("spectrum","spectrum.t","114","115","116","117",rep("",36)),
check.names=FALSE)
data.titles <- c(data.titles,mcn.i$spectrum)
data.ions <- c(data.ions,as.matrix(mcn.i[,3:6]))
data.mass <- c(data.mass,
matrix(rep(114:117,nrow(mcn.i)),nrow=nrow(mcn.i),byrow=TRUE))
} else if (tolower(peaklist.format.f) == "csv") {
message(" reading peaklist ",peaklist.f," ...",appendLF=FALSE)
res <- read.delim(peaklist.f,stringsAsFactors=FALSE,quote="")
message(" done")
if (!"spectrum" %in% colnames(res)) stop("'spectrum' column is missing from peaklist CSV")
if (sum(.grep_columns(res,"ions$")) < length(reporterTagNames))
stop("Not all neccessary intensity columns [",paste0(reporterTagNames,"_ions"),"] in peaklist CSV")
if (sum(.grep_columns(res,"mass$")) < length(reporterTagNames))
stop("Not all neccessary reporter mass columns [",paste0(reporterTagNames,"_mass"),"] in peaklist CSV")
data.titles <- c(data.titles,res[,"spectrum"])
data.ions <- rbind(data.ions,as.matrix(res[,.grep_columns(res,"ions$")]))
data.mass <- rbind(data.mass,as.matrix(res[,.grep_columns(res,"mass$")]))
}
}
if (!is.null(id.data) && .SPECTRUM.COLS['SPECTRUM.QUANT'] %in% colnames(id.data)) {
data.titles.orig <- data.titles
data.titles <- .do.map(data.titles,unique(id.data[,.SPECTRUM.COLS[c('SPECTRUM','SPECTRUM.QUANT')]]))
sel.na <- is.na(data.titles)
if (any(is.na(data.titles))) {
message(" for ",sum(sel.na)," of ",length(data.titles)," spectra,",
" quantitative information is available,\n",
" but no peptide-spectrum match. Spectrum titles: \n\t",
paste(data.titles.orig[sel.na][1:2],collapse=",\n\t"),", ...")
data.ions <- data.ions[!sel.na,]
data.mass <- data.mass[!sel.na,]
data.titles <- data.titles[!sel.na]
}
}
rownames(data.ions) <- data.titles
rownames(data.mass) <- data.titles
## TODO: check that all identified spectra are present in intensities
colnames(data.ions) <- reporterTagNames
colnames(data.mass) <- reporterTagNames
if (any(is.na(rownames(data.ions))))
stop(sum(is.na(rownames(data.ions)))," rownames in data.ions are NA")
if (any(is.na(rownames(data.mass))))
stop(sum(is.na(rownames(data.ions)))," rownames in data.ions are NA")
return(list(data.ions,data.mass))
}
.do.map <- function(spectrumtitles,mapping.quant2id) {
#mapped.spectra.pl <-
# mapping.quant2id[,2][ mapping.quant2id[,1] %in% id.spectra]
#write(mapped.spectra.pl,file='id_spectra.csv')
#.stopiflengthnotequal(id.spectra,mapped.spectra.pl,
# "not all identified spectra could be matched!\n",
# "\tx ... identified spectra\n",
# "\ty ... mapped spectra\n")
spectra.map <- .as.vect(mapping.quant2id,1,2)
.stopiflengthnotequal(spectrumtitles,
spectra.map[spectrumtitles],
"not all spectra could be matched!\n",
"\tx ... spectra with quant info\n",
"\ty ... identified spectra\n")
spectra.map[spectrumtitles]
}
.get.dupl.n.warn <- function(df,col,msg="ibspectra",write.to=NULL,f=warning) {
dupl <- .all.duplicate.rows(df,col)
if (!is.null(write.to))
write.table(dupl,file=write.to,row.names=FALSE,sep="\t")
dupl.msg <- paste(apply(dupl,1,paste,collapse="; "),collapse="\n\t")
f(sprintf("%s> divergent identifications in %s spectra [%s ids]:\n\t%s",
msg,length(unique(dupl[,col])),nrow(dupl),dupl.msg))
return(unique(dupl[,col]))
}
### READ MzID
read.mzid <- function(filename) {
requireNamespace("XML")
doc <- XML::xmlInternalTreeParse(filename)
ns <- c(x=XML::xmlNamespace(XML::xmlRoot(doc))[[1]])
root <- ifelse (isTRUE(ns == "http://psidev.info/psi/pi/mzIdentML/1.0"), "/x:mzIdentML", "/x:MzIdentML")
peptidesequence.name <- ifelse (isTRUE(ns == "http://psidev.info/psi/pi/mzIdentML/1.0"), "peptideSequence", "PeptideSequence")
dbsequenceref.attrname <- ifelse (isTRUE(ns == "http://psidev.info/psi/pi/mzIdentML/1.0"), "DBSequence_ref", "dBSequence_ref")
peptideevref.attrname <- ifelse (isTRUE(ns == "http://psidev.info/psi/pi/mzIdentML/1.0"), "PeptideEvidence_ref", "peptideEvidence_ref")
peptideref.attrname <- ifelse (isTRUE(ns == "http://psidev.info/psi/pi/mzIdentML/1.0"), "Peptide_ref", "peptide_ref")
searchdatabase.mapping <- data.frame(
ref=XML::xpathSApply(doc,paste0(root,"/x:DataCollection/x:Inputs/x:SearchDatabase"),
XML::xmlGetAttr,name="id",namespaces=ns),
name=XML::xpathSApply(doc,paste0(root,"/x:DataCollection/x:Inputs/x:SearchDatabase"),
XML::xmlGetAttr,name="name",namespaces=ns),stringsAsFactors=FALSE
)
modification.mapping.df <- t(do.call(cbind,
XML::xpathApply(doc,paste0(root,"/x:AnalysisProtocolCollection/x:SpectrumIdentificationProtocol/x:ModificationParams/x:SearchModification"),
function(modif) {
res <- XML::xmlAttrs(modif)
modparams <- XML::getNodeSet(modif,"x:ModParam",namespaces=ns)
return (switch(as.character(length(modparams)),
"0" = c(res,XML::xmlAttrs(modif[['cvParam']])),
"1" = c(res,XML::xmlAttrs(modparams[[1]]),XML::xmlAttrs(modparams[[1]][['cvParam']])),
stop("Expecting zero or one ModParam Node in ",
"/mzIdentML/AnalysisProtocolCollection/SpectrumIdentificationProtocol/ModificationParams/SearchModification")))
},namespaces=ns)))
unknown.modif <- modification.mapping.df[,'name']=='unknown modification' |
modification.mapping.df[,'accession']=='MS:1001460'
if (any(unknown.modif)) {
if (! 'value' %in% colnames(modification.mapping.df)) {
stop('unknown modifications [as defined by nameor accession], and no column value to override.')
}
modification.mapping.df[unknown.modif,'name'] <- modification.mapping.df[unknown.modif,'value']
}
modif.map <- .as.vect(unique(modification.mapping.df[,c('massDelta','name')]))
message("peptide and protein mapping")
peptide.mapping <- do.call(rbind,XML::xpathApply(doc,paste0(root,"/x:SequenceCollection/x:Peptide"),namespaces=ns,
function(pep) {
peptide.ref <- XML::xmlGetAttr(pep,name='id')
peptide.seq <- XML::xmlValue(pep[[peptidesequence.name]])
loc.n.delta <- XML::xpathSApply(pep,"x:Modification",function(m) c(location=XML::xmlGetAttr(m,name="location"),
massDelta=XML::xmlGetAttr(m,name="monoisotopicMassDelta")),
namespaces=ns)
modifstring <- character(nchar(peptide.seq)+2)
if (length(loc.n.delta) > 0) {
modif.names <- modif.map[loc.n.delta[2,]]
if (any(is.na(modif.names)))
modif.names <- XML::xpathSApply(pep,"x:Modification/x:cvParam[@cvRef='UNIMOD']",XML::xmlGetAttr,name='name',namespaces=ns)
if (length(modif.names) < ncol(loc.n.delta))
stop("modif names and delta have different size")
modifstring[as.numeric(loc.n.delta[1,])+1] <- modif.names
}
c(peptide.ref=peptide.ref,peptide=peptide.seq,modif=paste(modifstring,collapse=":"))
}))
protein.mapping <- do.call(rbind,XML::xpathApply(doc,paste0(root,"/x:SequenceCollection/x:DBSequence"),function(dbs) {
c(dbseq.ref = XML::xmlGetAttr(dbs,name='id'),
accession = XML::xmlGetAttr(dbs,name='accession'),
length = XML::xmlGetAttr(dbs,name='length'),
sdb.ref = XML::xmlGetAttr(dbs,name='SearchDatabase_ref'))
#sequence = XML::xmlValue(dbs[['seq']]))
},namespaces=ns))
records.proteinDetections <- do.call(rbind,XML::xpathApply(doc,
paste0(root,"/x:DataCollection/x:AnalysisData/x:ProteinDetectionList/x:ProteinAmbiguityGroup"),
namespaces=ns,
fun=function(pag) {
## apply on ProteinAmbiguityGroup
do.call(rbind,XML::xmlApply(pag,function(pdh) {
## apply on ProteinDetectionHypothesis
cbind(dbseq.ref=XML::xmlGetAttr(pdh,dbsequenceref.attrname),
peptide.ev.ref=XML::xpathSApply(pdh,"x:PeptideHypothesis",XML::xmlGetAttr,name=peptideevref.attrname,namespaces=ns)
)}))
}))
# map PeptideEvidence attribute names to isobar columns
pe.attr.names <- c(id="peptide.ev.ref",start="start.pos",end="end.pos",pre="aa.before",post="aa.after",
missedCleavages="nmc",isDecoy="is.decoy",DBSequence_Ref="dbseq.ref",dBSequence_Ref="dbseq.ref")
message("spectrum mapping")
spectrumIdentifications <-
XML::xpathApply(doc,paste0(root,"/x:DataCollection/x:AnalysisData/x:SpectrumIdentificationList/x:SpectrumIdentificationResult"),
namespaces=ns,
fun=function(sir) {
## _SpectrumIdentificationResult_ #
## All identification from searching one spectrum
spectrum.id <- XML::xmlGetAttr(sir,"spectrumID")
spectrum.title.nodes <- XML::getNodeSet(sir,"x:cvParam[@name='spectrum title']",namespaces=ns)
if (length(spectrum.title.nodes) == 1)
spectrum.id <- XML::xmlGetAttr(spectrum.title.nodes[[1]],name='value')
## get spectrum identifications which pass threshold
## _SpectrumIdentificationItem_ #
## An identification of a single peptide of a specturm.
## Only take the one which passes the threshold.
sii <- XML::getNodeSet(sir,"x:SpectrumIdentificationItem[@passThreshold='true' and @rank='1']",namespaces=ns)
if (length(sii) == 0 || length(sii) > 1) return (NULL)
#"more than one match for [ x:SpectrumIdentificationItem[@passThreshold='true' and @rank='1'] ]")
# TODO: There will be always two times the same score with peptides with just an I/L difference
sii <- sii[[1]]
scores <- unlist(XML::xpathApply(sii,"x:cvParam",function(cvp) {
switch(XML::xmlGetAttr(cvp,name='name'),
"Scaffold: Peptide Probability"=c(scaffold.pepprob=XML::xmlGetAttr(cvp,name="value")),
"mascot:score"=c(score.mascot=XML::xmlGetAttr(cvp,name="value")),
"mascot:expectation value"=c(mascot.evalue=XML::xmlGetAttr(cvp,name="value")),
"sequest:xcorr"=c(sequest.xcorr=XML::xmlGetAttr(cvp,name="value")),
"sequest:deltacn"=c(sequest.deltacn=XML::xmlGetAttr(cvp,name="value")),
NULL
)
},namespaces=ns))
if ("PeptideEvidence" %in% names(sii)) {
pe.attr <- XML::xmlAttrs(sii[["PeptideEvidence"]])
pe.res <- pe.attr.names[names(pe.attr)[names(pe.attr) %in% names(pe.attr.names)]]
} else {
pe.res <- c(peptide.ev.ref=XML::xmlGetAttr(sii[["PeptideEvidenceRef"]],name="peptideEvidence_ref"))
}
c(spectrum=spectrum.id,
peptide.ref = XML::xmlGetAttr(sii,name=peptideref.attrname),
theo.mass = XML::xmlGetAttr(sii,name='calculatedMassToCharge'),
exp.mass = XML::xmlGetAttr(sii,name='experimentalMassToCharge'),
scores,pe.res)
})
si.names <- unique(unlist(sapply(spectrumIdentifications,names)))
records.spectrumIdentifications <- do.call(rbind,lapply(spectrumIdentifications,function(x) x[si.names]))
records.spectrumIdentifications <- as.data.frame(records.spectrumIdentifications,stringsAsFactors=FALSE)
peptide.ev.mapping <- XML::xpathApply(doc,paste0(root,"/x:SequenceCollection/x:PeptideEvidence"),namespaces=ns,XML::xmlAttrs)
if (length(peptide.ev.mapping) > 0) {
records.spectrumIdentifications <- merge(records.spectrumIdentifications,
as.data.frame(do.call(rbind,peptide.ev.mapping),stringsAsFactors=FALSE))
}
spectra.n.peptides <- merge(as.data.frame(records.spectrumIdentifications,stringsAsFactors=FALSE),
as.data.frame(peptide.mapping,stringsAsFactors=FALSE),
by='peptide.ref')
XML::free(doc)
message("merging results")
protein.mapping1 <- merge(as.data.frame(records.proteinDetections,stringsAsFactors=FALSE),
as.data.frame(protein.mapping,stringsAsFactors=FALSE),by="dbseq.ref")
merge(spectra.n.peptides,
protein.mapping1,by="peptide.ev.ref")
}
##' .read.mgf: read isobaric reporter tag masses and intensities from MGFs
##'
##' MGF files list m/z and intensities for each spectrum. .read.mgf
##' extracts m/z and intensity pairs of masses corresponding to isobaric
##' tag masses (within a fragment precision).
##' @title
##' @param filename [character] file name of one or multiple files in MGF format
##' @param type [character] denoting IBSpectra class - used to get
##' reporter masses and reporter names.
##' @param spectra [character vector] if defined, only export spectra whose TITLE
##' is listed. Speeds up the function when only identified spectra
##' are of interest.
##' @param fragment.precision [numeric] take m/z-intensity pairs whose m/z value are
##' in a range of +/- fragment.precision/2 of 'true' mass.
##' @param prob [numeric] Filter out m/z-intensitiy values with the prob/2 most
##' unprecise m/z values on both sides.
##' @param substitute.dta [boolean] internal. replace TITLEs: s/.dta.[0-9]*$/.dta/
##' @return list(ions, mass, spectrumtitles)
##' @author Florian P Breitwieser
.read.mgf <- function(filename,reporterMasses,reporterNames,spectra=NULL,fragment.precision=0.05,
prob=NULL,substitute.dta=FALSE,check.id.ok=FALSE,
scan.lines=0) {
if (is.null(fragment.precision)) { fragment.precision=0.05 }
message(" reading mgf file ",filename,
" [fragment precision: ",fragment.precision,"]")
## get reporter masses and names from type
nReporter <- length(reporterMasses)
min.mass <- min(reporterMasses)-fragment.precision/2
max.mass <- max(reporterMasses)+fragment.precision/2
## read (and concatenate if multiple) mgf files
input <- c()
for (f in filename) {
con <- file(f,'r')
if (scan.lines > 0) {
while(length(f.input <- readLines(con, n=scan.lines)) > 0){
input <- c(input,grep("^[A-Z]|^1[12][0-9]\\.",f.input,value=T))
}
} else {
input <- c(input,readLines(con))
}
close(con)
}
if (substitute.dta)
input <- sub(".dta.*$",".dta",input)
## index mgf file content: positions of BEGIN and END IONS
begin_ions <- which(input=="BEGIN IONS")
end_ions <- which(input=="END IONS")
if (length(begin_ions) != length(end_ions))
stop("mgf file is errorneous, non-matching number",
" of BEGIN IONS and END IONS tags");
bnd <- data.frame(begin=begin_ions,
end=end_ions)
if (!is.null(spectra)) {
## filtering to take only spectra defined in spectra
## all.titles <- .trim(sapply(strsplit(grep("TITLE",input,value=T),"="),function(x) x[2] )) # not efficient
all.titles <- .trim(substring(grep("^TITLE=",input,value=TRUE),7))
if (length(all.titles) != nrow(bnd)) {
# sanity check that each spectrum has a title
stop("title not specified for all spectra!");
}
spectra.to.take <- all.titles %in% spectra
if (sum(spectra.to.take) ==0) {
stop("No identified spectrum is found in MGF file.\n",
" TITLEs in MGF [1:",length(all.titles),"]: \n\t",
paste(all.titles[1:2],collapse=", "),", ...\n",
" identified spectra [1:",length(spectra),"]: \n\t",
paste(spectra[1:2],collapse=", "),", ...\n")
}
bnd <- bnd[spectra.to.take,]
}
bnd$recordNo = seq_len(nrow(bnd))
nSpectra <- nrow(bnd)
message(" ",nSpectra," spectra in MGF file.")
## create list with all spectra (header+mass list) as entries
all.spectra <- apply(bnd,1,function(x) input[x[1]:x[2]])
rm(input)
## if all spectra are of equal length, apply returns a matrix
## convert to list
if (is.matrix(all.spectra))
all.spectra <- split(all.spectra,col(all.spectra))
## extract information from each spectrum
result <- llply(all.spectra,function(x) {
header <- .strsplit_vector(x[grep("^[A-Z]",x)],"=")
numbers <- do.call(rbind,strsplit(x[grep("^1..\\.",x)],"\\s"))
mzi.mass <- as.numeric(numbers[,1])
rr <- c(title=header["TITLE"])
sel <- mzi.mass > min.mass & mzi.mass < max.mass
if (any(sel)) {
mzi.mass <- mzi.mass[sel]
mzi.ions <- as.numeric(numbers[sel,2])
rr <- c(rr,do.call(c,lapply(reporterMasses,function(y) {
m <- abs(y-mzi.mass)
pos <- which(m == min(m))
if (length(pos) > 1){
# (Jacques) added to address cases where 2 peaks (within the required precision) are at the same distance of the reporter theoretical mass
# Pick most intense
max.intens <- max(mzi.ions[pos])
pos <- pos[which(mzi.ions[pos]==max.intens)]
pos <- pos[1] # in case same intensity as well!
}
if (length(pos) > 0 & m[pos] < fragment.precision/2)
return(c(mzi.mass[pos],mzi.ions[pos]))
else
return(c(NA,NA))
})))
} else {
rr <- c(rr,rep(NA,nReporter*2))
}
return(rr)
},.parallel=isTRUE(getOption('isobar.parallel')))
result <- do.call(rbind,result)
rm(all.spectra)
if (length(result) == 0 || nrow(result) == 0) {
stop("error reading MGF file - could not parse masses and intensities.\n",
"Check the mapping id <-> peaklist.")
}
mass <- apply(result[,seq(from=2,to=ncol(result),by=2)],2,as.numeric)
ions <- apply(result[,seq(from=3,to=ncol(result),by=2)],2,as.numeric)
## only select spectra with itraq masses detected
sel = apply(mass,1,function(x) any(!is.na(x)))
if (!any(sel)) {
stop("No values could be extracted from MGF ",filename,"\n",
"with fragment precision ",fragment.precision,".\n")
}
if (!is.null(prob) && prob > 0) {
## boundaries: remove extreme outliers of mass spectrum
## to test: another quantile.type might be more appropriate for small datasets
bnd <- apply(mass[sel,],2,quantile,probs=c(prob*0.5,1-prob*0.5),na.rm=TRUE)
sel.prob <- !is.na(mass) & (mass < matrix(bnd[1,],byrow=T,nrow=nrow(mass),ncol=ncol(mass)) |
mass > matrix(bnd[2,],byrow=T,nrow=nrow(mass),ncol=ncol(mass)))
ions[sel.prob] <- NA
mass[sel.prob] <- NA
message("\tmass boundaries:\n\t",
paste(colnames(mass),sprintf("%.5f : %.5f",bnd[1,],bnd[2,]),sep="\t",collapse="\n\t"))
}
ions <- ions[sel,,drop=FALSE]
mass <- mass[sel,,drop=FALSE]
spectrumtitles <- .trim(result[sel,1])
if (ncol(ions) != length(reporterNames) || ncol(mass) != length(reporterNames)) {
stop("ions or mass matrix have wrong dimension.")
}
dimnames(ions) <- list(spectrumtitles,reporterNames)
dimnames(mass) <- list(spectrumtitles,reporterNames)
rm(result)
return(list(ions=ions, mass=mass,
spectrumtitles=spectrumtitles))
}
.read.idfile.df <- function(filename,identifications.format,sep,...) {
if (is.data.frame(filename))
return(filename)
if (!is.character(filename))
stop("filename should be a data.frame or character")
if (!file.exists(filename))
stop("idfile ",filename," defined, but does not exist")
is.format <- function(y,ext)
identical(identifications.format,y) ||
any(sapply(ext,function(iext) grepl(paste0(iext,"$"),filename)))
ext.def <-
list(mzid = list(ext=c("mzid"),f=read.mzid),
rockerbox = list(ext=c("peptides.[ct]sv"),f=.read.rockerbox),
msgf = list(ext=c("msgfp.csv","tsv"),f=.read.msgfp.tsv),
ibspectra = list(ext=c("csv"),
f=function(x) read.table(x,header=TRUE,sep=sep,
stringsAsFactors=FALSE,quote="",...)))
for (etype in names(ext.def)) {
if (is.format(etype,ext.def[[etype]][["ext"]])) {
message(" reading id file ",filename," [type: ",etype,"] ...",appendLF=FALSE)
res <- ext.def[[etype]][["f"]](filename)
message(" done")
return(res)
}
}
stop(paste("cannot parse file ",filename," - cannot deduce format based on extension ",
"(it is not ibspectra.csv, id.csv, peptides.txt or mzid). ",
"Please provide id.format to readIBSpectra",sep=""))
}
## TODO: log is not returned
.read.idfile <- function(id.file,identifications.format=NULL,sep="\t",
decode.titles=FALSE,trim.titles=FALSE,log=NULL,all.cols=FALSE,...) {
if (!is.data.frame(id.file)) {
if (!(is.list(id.file) || is.character(id.file)))
stop("id.file argument of .read.idfile should be a data frame, character or list")
if (all(sapply(id.file,is.character)))
id.data <- lapply(id.file,.read.idfile.df,sep=sep,
identifications.format=identifications.format,...)
else if (all(sapply(id.file,is.data.frame)))
id.data <- id.file
else
stop()
rm(id.file)
id.colnames <- lapply(seq_along(id.data),function(s.i) colnames(id.data[[s.i]]))
colnames.equal <- all(sapply(id.colnames,
function(cn) identical(cn,id.colnames[[1]])))
if (!colnames.equal) {
message(" id file colnames are not equal:")
message(paste(sapply(seq_along(id.data),
function(s.i) paste0(" ",s.i,": [",
paste(id.colnames[[s.i]],collapse=","),
"]")),collapse="\n"))
all.id.colnames <- unique(unlist(id.colnames))
if (isTRUE(all.cols)) {
id.data <- do.call(rbind,lapply(id.data,function(i.d) {
id.d[,all.id.colnames[!all.id.colnames %in% colnames(id.d)]] <- NA
id.d[,all.id.colnames]
}))
} else {
intersect.colnames <- id.colnames[[1]]
for (s.i in seq(from=2,to=length(id.colnames))) {
intersect.colnames <- intersect(intersect.colnames,id.colnames[[s.i]])
}
message(" taking intersection: ",paste(intersect.colnames,collapse="; "))
id.data <- do.call(rbind,lapply(id.data,function(i.d) i.d[,intersect.colnames]))
}
} else {
id.data <- do.call(rbind,id.data)
}
}
#.check.columns(id.data)
if (!is.character(id.data[,.SPECTRUM.COLS['SPECTRUM']]))
id.data[,.SPECTRUM.COLS['SPECTRUM']] <- as.character(id.data[,.SPECTRUM.COLS['SPECTRUM']])
if ('accession' %in% colnames(id.data)) {
if (all(grepl(sprintf("%s\\|%s",.uniprot.pattern.ac,.uniprot.pattern.id),id.data[,.PROTEIN.COLS['PROTEINAC']]))) {
split.ac <- strsplit(id.data[,.PROTEIN.COLS['PROTEINAC']],"|",fixed=TRUE)
id.data[,.PROTEIN.COLS['PROTEINAC']] <- sapply(split.ac,function(x) x[1])
id.data[,.PROTEIN.COLS['NAME']] <- sapply(split.ac,function(x) ifelse(length(x) == 2, x[2], NA))
}
}
if (decode.titles)
id.data[,.SPECTRUM.COLS['SPECTRUM']] <- sapply(id.data[,.SPECTRUM.COLS['SPECTRUM']],URLdecode)
if (trim.titles)
id.data[,.SPECTRUM.COLS['SPECTRUM']] <- .trim(id.data[,.SPECTRUM.COLS['SPECTRUM']])
return(id.data)
}
.read.rockerbox <- function(filename) {
data.r <- read.table(filename,header=T,stringsAsFactors=F,sep="\t")
if (!"scan.title" %in% colnames(data.r))
stop("no scan.title column in ",filename,"; please use a Rockerbox version >= 2.0.6")
## transform modification (TODO)
data.r$modif <- data.r$modifications
## end transform
## transform 'all.peptide.matches' to ac and start.pos
split.acs <- strsplit(data.r$all.protein.matches,"; ")
names(split.acs) <- data.r$all.protein.matches
ac.n.startpos <- ldply(split.acs,function(x) {
y <- do.call(rbind,strsplit(x,split="\\[|\\]"))
start.pos <- sapply(strsplit(y[,2],"-",fixed=TRUE),
function(z) if(all(!is.na(as.numeric(z))) && length(z) == 2) as.numeric(z[1])
else stop("not numeric"))
data.frame(accession=y[,1],start.pos=start.pos)
})
data.r <- merge(data.r,ac.n.startpos,by.x="all.protein.matches",by.y=".id")
data.r$all.protein.matches <- NULL
## end transform
sel <- names(.ROCKERBOX.MAPPING.COLS) %in% names(c(.SPECTRUM.COLS,.PEPTIDE.COLS))
data.r <- data.r[,.ROCKERBOX.MAPPING.COLS[sel]]
colnames(data.r) <- c(.SPECTRUM.COLS,.PEPTIDE.COLS)[names(.ROCKERBOX.MAPPING.COLS)[sel]]
return(data.r)
}
###############################################################################
## MERGE IDENTIFCATIONS
.dissect.search.engines <- function(identifications) {
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
## scores are merged together
if (.SPECTRUM.COLS['SCORE'] %in% colnames(identifications)) {
engines <- strsplit(identifications[,SC['SEARCHENGINE']],"|",fixed=TRUE)
scores <- strsplit(identifications[,SC['SCORE']],"|",fixed=TRUE)
} else {
engine.n.score <- strsplit(identifications[,SC['SEARCHENGINE']],"[ \\|]")
engines <- lapply(engine.n.score,function(x) x[seq(from=1,to=length(x),by=2)])
if (length(unique(unlist(engines))>10)) {
engines <- lapply(engine.n.score,function(x) x[seq(from=1,to=length(x)/2)])
scores <- lapply(engine.n.score,function(x) as.numeric(x[seq(from=length(x)/2+1,to=length(x))]))
} else {
scores <- lapply(engine.n.score,function(x) as.numeric(x[seq(from=2,to=length(x),by=2)]))
}
}
score.columns <- paste0('score.',tolower(unique(unlist(engines))))
for (engine in unique(unlist(engines))) {
name <- paste0('score.',tolower(engine))
e.scores <- mapply(function(e,s) if(any(e==engine)) s[e==engine] else NA,engines,scores)
identifications[,name] <- as.numeric(e.scores)
}
identifications[,SC['SEARCHENGINE']] <- sapply(engines,paste,collapse="&")
tt <- table(identifications[,SC['SPECTRUM']])
if (max(tt) > 1) {
message(" resolving duplicated ids ...")
id.good <- identifications[identifications$spectrum %in% names(tt)[tt==1],]
id.bad <- identifications[identifications$spectrum %in% names(tt)[tt>1],]
id.bad$n.se <- rowSums(!is.na(id.bad[,score.columns]))
id.bad <- ddply(id.bad,'spectrum',function(x) {
x[which.max(x$n.se),]
})
id.bad$n.se <- NULL
identifications <- rbind(id.good,id.bad)
}
if ('SCORE' %in% names(SC))
identifications[,SC['SCORE']] <- NULL
return(identifications)
}
.merge.search.engine.identifications <- function(identifications,...) {
## Columns to consolidate after merging
## functions are used to resolve or remove the psm state
COLS.TO.CONSOLIDATE <- list('peptide'=.consolidate.peptide.ids,
'modif'=.consolidate.modification.pos,
'charge'=.consolidate.charge, ## sometimes, charge state 0 is reported when too high
'theo.mass'=.mean.na.rm, ## can differ esp. between Mascot and Phenyx
'retention.time'=.mean.na.rm,
'parent.intens'=.mean.na.rm)
COLS.TO.CONSOLIDATE <- COLS.TO.CONSOLIDATE[names(COLS.TO.CONSOLIDATE) %in% colnames(identifications)]
## Columns used for merging. Typically 'spectrum' and columns which are the same regardless of search engine
COLS.FOR.MERGING <- setdiff(.SPECTRUM.COLS,c(.ID.COLS,names(COLS.TO.CONSOLIDATE)))
COLS.FOR.MERGING <- c(COLS.FOR.MERGING,'is.decoy')
COLS.FOR.MERGING <- intersect(colnames(identifications),COLS.FOR.MERGING)
#'delta.score','delta.score.pep','delta.score.notpep','n.pep','n.loc'
## Split identifications by search engine
cn.search.engine <- which(colnames(identifications) == .SPECTRUM.COLS['SEARCHENGINE'])
ids.split <- lapply(split(identifications,identifications[,cn.search.engine]),
function(x) x[,-cn.search.engine])
if (length(ids.split) < 2) stop("Expecting more than two different search engines.")
if (length(ids.split) > 10) stop("Split data.frame into more than 10 search.engines - seems unrealistic. ",
"search.engine values: ",paste(names(ids.split),collapse=", "))
clean.names <- gsub("[^[:alnum:]]","",tolower(names(ids.split))) ## lower case alpha-numeric names
score.cols <- paste0("score.",clean.names)
for (ii in seq_along(ids.split)) { # fix colnames which are duplicate
cn <- colnames(ids.split[[ii]])
cn[!cn %in% COLS.FOR.MERGING] <- paste(cn[!cn %in% COLS.FOR.MERGING],clean.names[ii],sep=".")
colnames(ids.split[[ii]]) <- cn
}
## Merge identification data frames
ids.merged <- merge(ids.split[[1]],ids.split[[2]],by=COLS.FOR.MERGING,all=TRUE)
if (length(ids.split) > 2) {
for (ii in seq(from=3,to=length(ids.split))) {
ids.merged <- merge(ids.merged,ids.split[[ii]],by=COLS.FOR.MERGING,all=TRUE)
}
}
if (!.SPECTRUM.COLS['NOTES'] %in% colnames(ids.merged))
ids.merged[,.SPECTRUM.COLS['NOTES']] <- ''
## Consolidate spectra with different identifications with different search engines
for (colname in names(COLS.TO.CONSOLIDATE)) {
message("Consolidating ",colname, " [",date(),"]")
ids.merged <- .resolve.conflicts(ids=ids.merged,resolve.f=COLS.TO.CONSOLIDATE[[colname]],
colname=colname,resolve.colnames=paste0(colname,".",clean.names))
}
ids.merged <- unique(ids.merged)
tt <- table(ids.merged[,.SPECTRUM.COLS['SPECTRUM']])
#spectra.ok <- ids.merged[,'spectrum'] %in% names(tt)[tt==1]
#resolved.ids <- .resolve.differing.identifications(ids.merged[!spectra.ok,],score.cols)
#identifications <- rbind(ids.merged[spectra.ok,],resolved.ids)
if (any(tt>1)) {
warning("Merging not successful, ",sum(tt>1)," duplicated psms, removing them!")
ids.merged <- ids.merged[!ids.merged[,.SPECTRUM.COLS['SPECTRUM']] %in% names(tt)[tt>1],]
}
ids.merged[,.SPECTRUM.COLS['SEARCHENGINE']] <-
apply(ids.merged[,score.cols],1,function(x) { paste(names(ids.split)[!is.na(x)],collapse="&") })
return(ids.merged)
}
.na.rm <- function(x) x[!is.na(x)]
.mean.na.rm <- function(x) mean(x,na.rm=TRUE)
# Remove differing peptide ids
.consolidate.peptide.ids <- function(x) NA
# Take a non-zero charge
.consolidate.charge <- function(x) {
x <- !is.na(x)
ifelse(any(x > 0),x[x>0][1],0)
}
# Take 'first' modification
.consolidate.modification.pos <- function(x) .na.rm(x)[1]
.searchengine.cols <- function(ids,colname) {
}
.resolve.conflicts <- function(ids, resolve.f, colname, resolve.colnames, keep.cols = FALSE) {
if (is.character(resolve.colnames))
resolve.colnames <- which(colnames(ids) %in% resolve.colnames)
# set result column
ids[,colname] <- .return.equal.or.na(ids[,resolve.colnames])
# return resolved conflicting and non-conflicting data
good.spectra <- !is.na(ids[,colname])
if (!any(good.spectra))
stop("No good spectra which don't have to be resolved - something went wrong")
if (!all(good.spectra)) {
print(isobar:::.sum.bool(good.spectra))
ids <- rbind(ids[good.spectra,],
.do.resolve.conflicts(ids[!good.spectra,],colname,resolve.colnames, resolve.f))
}
if (keep.cols)
ids
else
ids[-resolve.colnames,]
}
.do.resolve.conflicts <- function(ids, colname, resolve.colnames, resolve.f) {
if (length(ids) == 0)
return(NULL)
if (length(ids) == 1)
return(ids)
ids[,colname] <- apply(ids[,resolve.colnames],1,resolve.f)
if (any(is.na(ids[,colname])))
message("removing ",sum(is.na(ids[,colname])))
ids <- ids[!is.na(ids[,colname]),]
if (nrow(ids) == 0)
return(NULL)
ids[,.SPECTRUM.COLS['NOTES']] <- ifelse(ids[,.SPECTRUM.COLS['NOTES']] == '','',
paste0(ids[,.SPECTRUM.COLS['NOTES']],"\n"))
ids[,.SPECTRUM.COLS['NOTES']] <- paste0(ids[,.SPECTRUM.COLS['NOTES']],
apply(ids[,resolve.colnames],1,function(x) {
x <- x[!is.na(x)]
paste(colname,"differing:\n\t",
paste(names(x),x,sep=": ",collapse="\n\t"))}))
return(ids)
}
.resolve.modifications <- function(df,colname, modif.cols, standard.modif) {
message("Resolving modifications")
if (is.character(modif.cols))
modif.cols <- which(colnames(df) %in% modif.cols)
adply(df, 1, function(x) {
x <- as.data.frame(x)
modifs <- unlist(x[,modif.cols])
modifs <- modifs[!is.na(modifs)]
if (length(modifs) == 0) {
print(x)
stop ("all modifs are NA!")
}
## coherent modifications
x$note <- ""
if (length(modifs) == 1 || all(modifs == modifs[1])) {
stop("Should be handled before")
x[,colname] <- modifs[1]
return(x[,-modif.cols])
}
## resolve incoherent modifs
note <- paste0("differing modification positions: \n\t",
paste(names(modifs),modifs,sep=": ",collapse="\n\t"))
## any modification position is more often present?
tt <- table(modifs)
if (.max.uniq(tt)) {
note <- paste0(note,". Taking ", .take.max(tt) )
stop("TODO: Implement")
}
## take 'first' modification position
x[,colname] <- modifs[1]
x[,'note'] <- note
return(x)
})
}
.max.uniq <- function(tt) sum(tt==max(tt)) == 1
.take.max <- function(tt) names(tt)[which.max(tt)]
.resolve.differing.identifications <- function(identifications,score.cols) {
allequal <- function(x) all(x == x[1])
by.y <- function(x,ind,fun=sum) sapply(by(x,ind,fun),function(x) return(x) )
skipna <- function(x) unlist(x[!is.na(x)])
n.skipped <- 0
n.max.ids <- 0
n.modif.pos.dif <- 0
resolved.identifications <- ddply(identifications,'spectrum', function(x) {
n.ids <- rowSums(!is.na(x[,score.cols]),na.rm=TRUE) ## number of identifications for each psm
if (.max.uniq(n.ids)) {
n.max.ids <<- n.max.ids + 1
return(x[which.max(n.ids),,drop=FALSE])
}
## resolve peptide differnces
if (!allequal(x[,'peptide'])) { ## different peptides identified
pep.ids <- by.y(n.ids,x[,'peptide'],sum)
if (.max.uniq(pep.ids)) { ## any peptide has been seen more often than others?
x <- x[x[,'peptide'] == .take.max(pep.ids),,drop=FALSE]
} else {
n.skipped <<- n.skipped + 1
return(NULL)
}
}
## all equal peptides, different modification positions
if (!allequal(x[,'peptide'])) {
message("ERROR while merging: psm peptides should be equal, but they aren't.")
print(x)
stop()
}
## resolve modification differences
if (!allequal(x[,'modif'])) { ## different modifs identified
modif.ids <- by.y(n.ids,x[,'modif'],sum)
modifs <- gsub("Oxidation_M","")
if (!.max.uniq(modif.ids))
n.modif.pos.dif <<- n.modif.pos.dif + 1
#print(x)
modif <- paste(x[,.SPECTRUM.COLS['SEARCHENGINE']],x[,'modif'],sep=": ",collapse=' & ')
x <- x[x[,'modif'] == .take.max(modif.ids)[1],,drop=FALSE]
x[,'modif'] <- modif
} else if (!allequal(x[,'charge'])) { ## different charge
x[1,score.cols] <- as.numeric(sapply(x[,score.cols],skipna))
x[1,'charge'] <- max(x[,'charge'])
x <- x[1,,drop=FALSE]
} else {
message("Why is the modif equal? ")
print(rbind(x,is.equal=apply(x,2,allequal)))
stop()
}
return(x)
})
message(" resolving differing ",length(unique(identifications[,'spectrum']))," identifications: " )
message(" ",n.max.ids," resolved because more evidence was available for one alternative")
message(" ",n.skipped," removed because of differing peptide ids")
message(" ",n.modif.pos.dif," kept, as only the modification position was different")
return(resolved.identifications)
}
.merge.quant.identifications <- function(identifications) {
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
tt <- table(identifications[,SC['SPECTRUM.QUANT']])
spectra.ok <- identifications[,SC['SPECTRUM.QUANT']] %in% names(tt)[tt==1]
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
score.colname <- .SPECTRUM.COLS[c('SCORE.MASCOT','SCORE.PHENYX','SCORE.MSGF','PROB.PHOSPHORS')]
score.colname <- score.colname[score.colname %in% colnames(identifications)]
message("Merging ",sum(!spectra.ok)," identifications from different dissociation methods.")
ids.quant.merged <- ddply(identifications[!spectra.ok,],.SPECTRUM.COLS['SPECTRUM.QUANT'],function(x) {
if (nrow(x) == 1) return(x)
my.args <- as.list(x[,score.colname,drop=FALSE])
my.args <- lapply(my.args,round,digits=2) ## take two significant digits before taking next score into account as top hitter
my.args$decreasing=TRUE
max.hit <- do.call(order,my.args)[1]
# dismiss peptide-spectrum matches which are different between search engines
if (!all(x[,SC['PEPTIDE']] == x[1,SC['PEPTIDE']]))
return(NULL)
x[max.hit,SC['DISSOCMETHOD']] = paste0("[",x[max.hit,SC['DISSOCMETHOD']],"]")
if (all(x[,SC['MODIFSTRING']] == x[max.hit,SC['MODIFSTRING']]) && 'DISSOCMETHOD' %in% names(SC)) {
if ("cid" %in% x[,SC['DISSOCMETHOD']])
x[max.hit,SC['SPECTRUM']] <- x[x[,SC['DISSOCMETHOD']]=="cid",SC['SPECTRUM']][1]
x[max.hit,SC['DISSOCMETHOD']] <- paste(x[,SC['DISSOCMETHOD']],collapse="&")
for (sc in score.colname[score.colname != 'pepprob'])
x[max.hit,sc] <- gsub("NA","",paste(x[,sc],collapse="&"))
}
return(x[max.hit,,drop=FALSE])
})
colnames(ids.quant.merged)[colnames(ids.quant.merged)=='SPECTRUM.QUANT'] <- 'spectrum.quant'
ids.quant.merged <- ids.quant.merged[,colnames(identifications)]
rbind(identifications[spectra.ok,],ids.quant.merged)
}
.merge.identifications <- function(identifications,...) {
SC <- .SPECTRUM.COLS[.SPECTRUM.COLS %in% colnames(identifications)]
## Merge results of different search engines / on different spectra
if ('SEARCHENGINE' %in% names(SC) && ## search engine column is present
length(unique(identifications[,SC['SEARCHENGINE']])) > 1 && ## there are multiple search engines defines
!any(grepl('^score\\.',colnames(identifications)))) ## the individual score.ENGINENAME are not present
{
if (any(grepl("|",identifications[,SC['SEARCHENGINE']],fixed=TRUE))) {
identifications <- .dissect.search.engines(identifications) ## dissect merged id columns
} else {
identifications <- .merge.search.engine.identifications(identifications,...) ## merge identifications
}
}
return(identifications)
}
## end MERGE IDENTIFICATIONS
###############################################################################
.read.identifications <- function(identifications,...,
mapping=NULL,mapping.names=c(quantification.spectrum="hcd",identification.spectrum="cid"),
identifications.quant=NULL) {
## load identifications (is either character or data.frame
if (is.character(identifications) && all(sapply(identifications,file.exists)))
identifications <- .read.idfile(identifications,...)
## load mapping (either character or data.frame)
if (!is.null(mapping)) {
if (is.character(mapping) && file.exists(mapping))
mapping <- do.call(rbind,lapply(mapping,read.table,sep=",",header=TRUE,stringsAsFactors=FALSE,quote=""))
if (!is.data.frame(mapping)) stop("mapping must be a data.frame or valid file name")
if (ncol(mapping) > 2) stop("only one column in mapping")
if (is.null(mapping.names)) {
if (ncol(mapping) > 2) stop("more than one column in mapping data - mapping.names are therefore required")
message("trying to assess right mapping")
cn <- apply(mapping,2,function(x) sum(x %in% identifications[,.SPECTRUM.COLS['SPECTRUM']]))
mapping.names <- c(quantification.spectrum=names(cn)[which.min(cn)],
identification.spectrum=names(cn)[which.max(cn)])
}
if (!all(c('identification.spectrum','quantification.spectrum') %in% names(mapping.names)))
stop("mapping.names must be given alongside with mapping. names(mapping.names) must be 'identification.spectrum' and 'quantification.spectrum'")
if (!all(mapping.names %in% colnames(mapping))) stop("mapping.names must be column names of mapping")
quant2id <- .as.vect(mapping,mapping.names['identification.spectrum'],mapping.names['quantification.spectrum'])
id2quant <- .as.vect(mapping,mapping.names['quantification.spectrum'],mapping.names['identification.spectrum'])
identifications[,.SPECTRUM.COLS['SPECTRUM.QUANT']] <- id2quant[identifications[,.SPECTRUM.COLS['SPECTRUM']]]
identifications[,.SPECTRUM.COLS['DISSOCMETHOD']] <- ifelse(is.null(identifications.quant),"hcd","cid") # temp fix
#identifications[,.SPECTRUM.COLS['DISSOCMETHOD']] <- names(mapping.names)[mapping.names=='identification.spectrum']
if (!is.null(identifications.quant)) {
## load identifications.quant (either character or data.frame)
if (is.character(identifications.quant) && all(sapply(identifications.quant,file.exists)))
identifications.quant <- .read.idfile(identifications.quant,...)
if (!is.data.frame(identifications.quant)) stop("identifications.quant must be a data.frame or valid file name")
identifications.quant[,.SPECTRUM.COLS['SPECTRUM.QUANT']] <- identifications.quant[,.SPECTRUM.COLS['SPECTRUM']]
identifications.quant[,.SPECTRUM.COLS['SPECTRUM']] <- quant2id[ identifications.quant[,.SPECTRUM.COLS['SPECTRUM.QUANT']] ]
identifications.quant[,.SPECTRUM.COLS['DISSOCMETHOD']] <- "hcd"
#identifications.quant[,.SPECTRUM.COLS['DISSOCMETHOD']] <- names(mapping.names)[mapping.names=='quantification.spectrum']
#if (.SPECTRUM.COLS['DATABASE'] %in% colnames(identifications) &&
# !.SPECTRUM.COLS['DATABASE'] %in% colnames(identifications.quant))
# identifications[,.SPECTRUM.COLS['DATABASE']] <- NULL
intersect.ids <- intersect(colnames(identifications),colnames(identifications.quant))
cn.i.intersect <- colnames(identifications)[!colnames(identifications) %in% intersect.ids]
cn.iq.intersect <- colnames(identifications.quant)[!colnames(identifications.quant) %in% intersect.ids]
if (length(cn.i.intersect) > 0 || length(cn.iq.intersect)>0)
stop("identifications and identifications.quant dataframes are different:",
ifelse(length(cn.i.intersect),paste0("\n\t[",paste0(cn.i.intersect,collapse=";"),
"] only appear in identifications"),""),
ifelse(length(cn.iq.intersect),paste0("\n\t[",paste0(cn.iq.intersect,collapse=";"),
"] only appear in identifications.quant"),""))
identifications <- rbind(identifications[,intersect.ids],identifications.quant[,intersect.ids])
}
}
return(identifications)
}
## read MSGF+ tab-separated identification files
.read.msgfp.tsv <- function(filename,filter.rev.hits=FALSE) {
if (is.data.frame(filename))
ib.data <- filename
else
id.data <- read.delim(filename, sep="\t", stringsAsFactors=FALSE,quote="")
ib.df <- data.frame(Protein=id.data[,'Protein'],
spectrum=id.data[,'Title'],.convert.msgfp.pepmodif(id.data[,'Peptide']),
scan.from=id.data[,'ScanNum'],dissoc.method=tolower(id.data[,'FragMethod']),
precursor.error=id.data[,'PrecursorError.ppm.'],charge=id.data[,'Charge'],
search.engine="MSGF+",score=-log10(id.data[,'SpecEValue']),
stringsAsFactors=FALSE)
sel <- names(.MSGF.MAPPINGCOLS) %in% names(c(.SPECTRUM.COLS,.PEPTIDE.COLS))
data.r <- id.data[,.MSGF.MAPPINGCOLS[sel]]
colnames(data.r) <- c(.SPECTRUM.COLS,.PEPTIDE.COLS)[names(.MSGF.MAPPINGCOLS)[sel]]
ib.df <- cbind(ib.df,data.r)
ib.protnpep <- .convert.msgfp.protein(unique(ib.df[,c('Protein','peptide')]),filter.rev.hits=filter.rev.hits)
id.data$Protein <- NULL
merge(unique(ib.protnpep),ib.df,by='peptide',all=TRUE)
}
.convert.msgfp.pepmodif <- function(peptide,modif.masses=
c(iTRAQ4plex=144.102,Cys_CAM=57.021,Oxidation=15.995,
iTRAQ4_ACET="144.102+42.011",ACET=42.011,
METH=14.016,BIMETH=28.031,TRIMETH=42.047
)) {
modif.masses.r <- setNames(c(names(modif.masses)),c(paste0("+",modif.masses)))
modifs <- strsplit(paste0(peptide,"+"),"[A-Z]")
#sort(table(unlist(modifs)))
modif.p <- sapply(modifs,function(x) {
x[length(x)] <- sub(".$","",x[length(x)]);
x[x!=""] <- modif.masses.r[x[x!=""]];
x
})
if (any(is.na(unlist(modif.p))))
stop("NA in modifications - did not map")
#sel <- sapply(modif.p,function(x) any(is.na(x)))
#modifs[sel]
pep.seq <- gsub("[+0-9\\.]","",peptide)
if (any(nchar(pep.seq)+1 != sapply(modif.p,length)))
stop("some modif sequences are not of correct length!")
#cbind(peptide,pep.seq,nchar(pep.seq),sapply(modif.p,length))
modif.i <- mapply(function(pep,modif) {
pep <- c("Nterm",pep)
sel.m <- modif!="" & !grepl("_",modif)
modif[sel.m] <- paste0(modif[sel.m],"_",pep[sel.m])
paste(modif,collapse=":")
} ,strsplit(pep.seq,""),modif.p)
return(data.frame(peptide=pep.seq,modif=paste0(modif.i,":"),stringsAsFactors=FALSE))
}
.convert.msgfp.protein <- function(protein.n.peptide,filter.rev.hits=FALSE) {
# layout: sp|Q60848-1|HELLS_MOUSE(pre=R,post=K);sp|Q60848-2|HELLS_MOUSE(pre=R,post=K)
# reverse hits: XXX_sp|...
# TODO: extract pre and post AAs
split.protein <- strsplit(protein.n.peptide[,1],"[;|]")
res <- lapply(seq_along(split.protein), function(i) {
y <- split.protein[[i]]
cbind(peptide=protein.n.peptide[i,2],
database=y[seq(from=1,to=length(y),by=3)],
accession=y[seq(from=2,to=length(y),by=3)])
})
res <- as.data.frame(do.call(rbind,res),stringsAsFactors=FALSE)
res$is.decoy <- grepl("^XXX_",res[,'database'])
print(c(is.decoy=.sum.bool(res$is.decoy)))
if (isTRUE(filter.rev.hits))
res <- res[!res$is.decoy,]
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.