In wolski/FinnProt: Data and vignette for FinnProt Workshop 2017

knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library(plyr)
library(FinnProt)

Overview MS

proteotypic peptides
FDR as function of score
finding score given FDR
various FDR levels
protein inference problem

Overview R

data frames (factor and character) stringAsFactors = FALSE
tabulating data table
subsetting data frames using subset and []
aggregating data using plyr (ddply)
histograms hist
comparing lists using intersect and setdiff
the with function

Peptide annotation

Generated using R package prozor

data("peptide_proteinAnnotation")
head(peptide_proteinAnnotation)
prots <- peptide_proteinAnnotation

Number of Proteins per Protein

#prots <- read.table(file="../data/peptide_proteinAnnotation.txt")
tabProtID <- table(prots$proteinID)
plot(table(prots$proteinID), ylab="# of Peptides/protein", xlab="protein", axes=F)
axis(1)
axis(2)
names(tabProtID)[1:10]

Number of proteins matched by one peptide

tabPeptides <- table(prots$peptideSequence)
plot(sort(tabPeptides), axes=F, ylab="# of protein per peptide", xlab="unique peptide")
axis(1)
axis(2)

Length of peptide vs number of proteins matched

pepLength<-nchar(names(tabPeptides))
plot(pepLength,tabPeptides, xlab="length of peptide sequence", ylab="nr of proteins assigned",axes=F)
axis(1)
axis(2)

whichPeptide <- which(pepLength >30 & tabPeptides >50)
points(pepLength[whichPeptide], tabPeptides[whichPeptide],col=2,pch="x",cex=3)
as.character(prots[ prots$peptideSequence == names(whichPeptide),"proteinID"])

Transposon

Looking at the peptide spectrum matches for the yeast dataset

Data generated from Mascot search result .dat files using R package bibliospec.

# specMeta <- read.table(file = "../data/psmMatchesAnnotated.txt",stringsAsFactors = F)
data("psmMatchesAnnotated")
lapply(psmMatchesAnnotated,class)
for(i in 1:ncol(psmMatchesAnnotated)){
  if(class(psmMatchesAnnotated[[i]])=="factor"){
    psmMatchesAnnotated[[i]]<-as.character(psmMatchesAnnotated[[i]])
  }
}
lapply(psmMatchesAnnotated,class)

specMeta <- psmMatchesAnnotated
table(specMeta$fileName)

tmp <- specMeta$fileName%in%unique(specMeta$fileName)[1]
Glucdata <- specMeta[tmp,]
tmp <- specMeta$fileName%in%unique(specMeta$fileName)[2]
Ethanodata <- specMeta[tmp,]

Compute function of FDR given score

computeFDRFunction <- function(score, proteinID, decoy = "REV_" ){
  idx <- order(score, decreasing = TRUE)
  score <- score[idx]
  decoy_hit <- grepl(decoy, proteinID[idx])

  FP <- cumsum(decoy_hit)
  TP <- 1:length(idx) - FP

  FDR1 <- (2 * FP) / (TP + FP)
  FDR2 <- FP / TP
  return(list(decoy_hit = decoy_hit, score = score , FDR1 = FDR1 ,FDR2 = FDR2))
}

Helper function to make histogram of the decoy (FP) and non decoy (TP + FP) scores.

histWithFDR <- function(data){
  tx <- with(data,hist(score2,plot = FALSE, breaks=100))
  t1<-with(data,hist(score2[!grepl("REV_",proteinID)],breaks=tx$breaks,main="",xlab="score"))
  with(data,hist(score2[grepl("REV_",proteinID)],add=T,breaks = t1$breaks, col=2))
  cff <- with(data,computeFDRFunction(score2 , proteinID))
  par(new=T)
  with(cff,plot(score,FDR1*100, type="l",col=4,lwd=2,xlab=NA,ylab=NA,axes=FALSE))
  axis(side = 4)
  mtext(side = 4, line = 3, 'Number genes selected')
  return(cff)
}

PSM level FDR vs Score

cff <- histWithFDR(Glucdata)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

FDR is a function of the score $FDR = f(score)$.

Controlling FDR means finding the inverse.
Find score given an FDR $score = i(FDR)$ and remove all PSM's with lower score.

fdr = 0.01
scthresh <-min(cff$score[cff$FDR1 < fdr])
scthresh

Filter dataset given score

Glucdata1 <- subset(Glucdata, score2 > scthresh)

What is the FDR before and after filtering?

getFDR <- function(data){
  (2 * sum(grepl("REV_",data$proteinID ))) / nrow(data)
}

getFDR(Glucdata)*100
getFDR(Glucdata1)*100
cff <- histWithFDR(Glucdata1)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Precursor level FDR

GlucdataPrec <- ddply(Glucdata, .(peptideModSeq, precursorCharge),function(x){x[which.max(x$score2),]})
getFDR(GlucdataPrec) * 100

cff <- histWithFDR(GlucdataPrec)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Exercise : Control the Precursor level FDR on 1% level.

PTM sequence level FDR

GlucdataModPep <- ddply(Glucdata, .(peptideModSeq),function(x){x[which.max(x$score2),]})
getFDR(GlucdataModPep) * 100

cff <- histWithFDR(GlucdataModPep)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Peptide Level FDR

GlucdataPep <- ddply(Glucdata, .(peptideSeq),function(x){x[which.max(x$score2),]})
getFDR(GlucdataPep) * 100

cff <- histWithFDR(GlucdataPep)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Protein Level FDR

GlucdataProt <- ddply(Glucdata, .(proteinID), function(x){data.frame(x[which.max(x$score2),],nrPSM=nrow(x), newscore = sum(x$score2))  })
getFDR(GlucdataProt) * 100
nrow(GlucdataProt)

par(mfrow=c(2,1))
cffP1 <- histWithFDR(GlucdataProt)
GlucdataProt2 <- GlucdataProt
GlucdataProt2$score2 <- GlucdataProt2$newscore
getFDR(GlucdataProt2) * 100
cffP2 <- histWithFDR(GlucdataProt2)

Control FDR on protein level using score

fdr = 0.01
scthresh1 <-min(cffP1$score[cffP1$FDR1 < 0.01])
scthresh1

scthresh2 <-min(cffP2$score[cffP2$FDR1 < 0.01])
scthresh2

GlucdataProt0.01 <- subset(GlucdataProt, score2 > scthresh1)
nrow(GlucdataProt0.01)
getFDR(GlucdataProt0.01) * 100

GlucdataProt0.01 <- subset(GlucdataProt2, score2 > scthresh2)
nrow(GlucdataProt0.01)
getFDR(GlucdataProt0.01) * 100

Removing Proteins identified by a single PSM

Improve FDR by removing "single hit wonders".

GlucdataProt2 <- (subset(GlucdataProt, nrPSM > 1))
nrow(GlucdataProt2)
getFDR(GlucdataProt2) * 100

cff <- histWithFDR(GlucdataProt2)
with(cff,summary(score[decoy_hit]))
with(cff,summary(score[!decoy_hit]))

Compare results

Compare portein lists obtained by removing single hit wonders with those by controlling fdr using score.

length(intersect(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID))
length(setdiff(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID))
length(setdiff(GlucdataProt0.01$proteinID,GlucdataProt2$proteinID ))

Questions?

wolski/FinnProt documentation built on May 4, 2019, 9:47 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

wolski/FinnProt
Data and vignette for FinnProt Workshop 2017

In wolski/FinnProt: Data and vignette for FinnProt Workshop 2017

Overview MS

Overview R

Peptide annotation

Number of Proteins per Protein

Number of proteins matched by one peptide

Length of peptide vs number of proteins matched

Looking at the peptide spectrum matches for the yeast dataset

Compute function of FDR given score

Helper function to make histogram of the decoy (FP) and non decoy (TP + FP) scores.

PSM level FDR vs Score

FDR is a function of the score $FDR = f(score)$.

Filter dataset given score

What is the FDR before and after filtering?

Precursor level FDR

Exercise : Control the Precursor level FDR on 1% level.

PTM sequence level FDR

Peptide Level FDR

Protein Level FDR

Control FDR on protein level using score

Removing Proteins identified by a single PSM

Compare results

Questions?

R Package Documentation

Browse R Packages

We want your feedback!

wolski/FinnProt Data and vignette for FinnProt Workshop 2017

In wolski/FinnProt: Data and vignette for FinnProt Workshop 2017

Overview MS

Overview R

Peptide annotation

Number of Proteins per Protein

Number of proteins matched by one peptide

Length of peptide vs number of proteins matched

Looking at the peptide spectrum matches for the yeast dataset

Compute function of FDR given score

Helper function to make histogram of the decoy (FP) and non decoy (TP + FP) scores.

PSM level FDR vs Score

FDR is a function of the score $FDR = f(score)$.

Filter dataset given score

What is the FDR before and after filtering?

Precursor level FDR

Exercise : Control the Precursor level FDR on 1% level.

PTM sequence level FDR

Peptide Level FDR

Protein Level FDR

Control FDR on protein level using score

Removing Proteins identified by a single PSM

Compare results

Questions?

R Package Documentation

Browse R Packages

We want your feedback!

wolski/FinnProt
Data and vignette for FinnProt Workshop 2017