knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library(plyr)
library(FinnProt)

Overview MS

Overview R

Peptide annotation

Generated using R package prozor

data("peptide_proteinAnnotation")
head(peptide_proteinAnnotation)
prots <- peptide_proteinAnnotation

Number of Proteins per Protein

#prots <- read.table(file="../data/peptide_proteinAnnotation.txt")
tabProtID <- table(prots$proteinID)
plot(table(prots$proteinID), ylab="# of Peptides/protein", xlab="protein", axes=F)
axis(1)
axis(2)
names(tabProtID)[1:10]

Number of proteins matched by one peptide

tabPeptides <- table(prots$peptideSequence)
plot(sort(tabPeptides), axes=F, ylab="# of protein per peptide", xlab="unique peptide")
axis(1)
axis(2)

Length of peptide vs number of proteins matched

pepLength<-nchar(names(tabPeptides))
plot(pepLength,tabPeptides, xlab="length of peptide sequence", ylab="nr of proteins assigned",axes=F)
axis(1)
axis(2)

whichPeptide <- which(pepLength >30 & tabPeptides >50)
points(pepLength[whichPeptide], tabPeptides[whichPeptide],col=2,pch="x",cex=3)
as.character(prots[ prots$peptideSequence == names(whichPeptide),"proteinID"])

Transposon

Looking at the peptide spectrum matches for the yeast dataset

Data generated from Mascot search result .dat files using R package bibliospec.

# specMeta <- read.table(file = "../data/psmMatchesAnnotated.txt",stringsAsFactors = F)
data("psmMatchesAnnotated")
lapply(psmMatchesAnnotated,class)
for(i in 1:ncol(psmMatchesAnnotated)){
  if(class(psmMatchesAnnotated[[i]])=="factor"){
    psmMatchesAnnotated[[i]]<-as.character(psmMatchesAnnotated[[i]])
  }
}
lapply(psmMatchesAnnotated,class)

specMeta <- psmMatchesAnnotated
table(specMeta$fileName)
tmp <- specMeta$fileName%in%unique(specMeta$fileName)[1]
Glucdata <- specMeta[tmp,]
tmp <- specMeta$fileName%in%unique(specMeta$fileName)[2]
Ethanodata <- specMeta[tmp,]

Compute function of FDR given score

computeFDRFunction <- function(score, proteinID, decoy = "REV_" ){
  idx <- order(score, decreasing = TRUE)
  score <- score[idx]
  decoy_hit <- grepl(decoy, proteinID[idx])

  FP <- cumsum(decoy_hit)
  TP <- 1:length(idx) - FP

  FDR1 <- (2 * FP) / (TP + FP)
  FDR2 <- FP / TP
  return(list(decoy_hit = decoy_hit, score = score , FDR1 = FDR1 ,FDR2 = FDR2))
}

Helper function to make histogram of the decoy (FP) and non decoy (TP + FP) scores.

histWithFDR <- function(data){
  tx <- with(data,hist(score2,plot = FALSE, breaks=100))
  t1<-with(data,hist(score2[!grepl("REV_",proteinID)],breaks=tx$breaks,main="",xlab="score"))
  with(data,hist(score2[grepl("REV_",proteinID)],add=T,breaks = t1$breaks, col=2))
  cff <- with(data,computeFDRFunction(score2 , proteinID))
  par(new=T)
  with(cff,plot(score,FDR1*100, type="l",col=4,lwd=2,xlab=NA,ylab=NA,axes=FALSE))
  axis(side = 4)
  mtext(side = 4, line = 3, 'Number genes selected')
  return(cff)
}

PSM level FDR vs Score

cff <- histWithFDR(Glucdata)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

FDR is a function of the score $FDR = f(score)$.

fdr = 0.01
scthresh <-min(cff$score[cff$FDR1 < fdr])
scthresh

Filter dataset given score

Glucdata1 <- subset(Glucdata, score2 > scthresh)

What is the FDR before and after filtering?

getFDR <- function(data){
  (2 * sum(grepl("REV_",data$proteinID ))) / nrow(data)
}
getFDR(Glucdata)*100
getFDR(Glucdata1)*100
cff <- histWithFDR(Glucdata1)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Precursor level FDR

GlucdataPrec <- ddply(Glucdata, .(peptideModSeq, precursorCharge),function(x){x[which.max(x$score2),]})
getFDR(GlucdataPrec) * 100
cff <- histWithFDR(GlucdataPrec)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Exercise : Control the Precursor level FDR on 1% level.

PTM sequence level FDR

GlucdataModPep <- ddply(Glucdata, .(peptideModSeq),function(x){x[which.max(x$score2),]})
getFDR(GlucdataModPep) * 100
cff <- histWithFDR(GlucdataModPep)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Peptide Level FDR

GlucdataPep <- ddply(Glucdata, .(peptideSeq),function(x){x[which.max(x$score2),]})
getFDR(GlucdataPep) * 100
cff <- histWithFDR(GlucdataPep)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Protein Level FDR

GlucdataProt <- ddply(Glucdata, .(proteinID), function(x){data.frame(x[which.max(x$score2),],nrPSM=nrow(x), newscore = sum(x$score2))  })
getFDR(GlucdataProt) * 100
nrow(GlucdataProt)
par(mfrow=c(2,1))
cffP1 <- histWithFDR(GlucdataProt)
GlucdataProt2 <- GlucdataProt
GlucdataProt2$score2 <- GlucdataProt2$newscore
getFDR(GlucdataProt2) * 100
cffP2 <- histWithFDR(GlucdataProt2)

Control FDR on protein level using score

fdr = 0.01
scthresh1 <-min(cffP1$score[cffP1$FDR1 < 0.01])
scthresh1

scthresh2 <-min(cffP2$score[cffP2$FDR1 < 0.01])
scthresh2
GlucdataProt0.01 <- subset(GlucdataProt, score2 > scthresh1)
nrow(GlucdataProt0.01)
getFDR(GlucdataProt0.01) * 100

GlucdataProt0.01 <- subset(GlucdataProt2, score2 > scthresh2)
nrow(GlucdataProt0.01)
getFDR(GlucdataProt0.01) * 100

Removing Proteins identified by a single PSM

Improve FDR by removing "single hit wonders".

GlucdataProt2 <- (subset(GlucdataProt, nrPSM > 1))
nrow(GlucdataProt2)
getFDR(GlucdataProt2) * 100

cff <- histWithFDR(GlucdataProt2)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Compare results

Compare portein lists obtained by removing single hit wonders with those by controlling fdr using score.

length(intersect(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID))
length(setdiff(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID))
length(setdiff(GlucdataProt0.01$proteinID,GlucdataProt2$proteinID ))

Questions?



wolski/FinnProt documentation built on May 4, 2019, 9:47 a.m.