knitr::opts_chunk$set(echo = TRUE) rm(list=ls()) library(plyr) library(FinnProt)
table
subset
and []
ddply
)hist
intersect
and setdiff
with
functionGenerated using R package prozor
data("peptide_proteinAnnotation") head(peptide_proteinAnnotation) prots <- peptide_proteinAnnotation
#prots <- read.table(file="../data/peptide_proteinAnnotation.txt") tabProtID <- table(prots$proteinID) plot(table(prots$proteinID), ylab="# of Peptides/protein", xlab="protein", axes=F) axis(1) axis(2) names(tabProtID)[1:10]
tabPeptides <- table(prots$peptideSequence) plot(sort(tabPeptides), axes=F, ylab="# of protein per peptide", xlab="unique peptide") axis(1) axis(2)
pepLength<-nchar(names(tabPeptides)) plot(pepLength,tabPeptides, xlab="length of peptide sequence", ylab="nr of proteins assigned",axes=F) axis(1) axis(2) whichPeptide <- which(pepLength >30 & tabPeptides >50) points(pepLength[whichPeptide], tabPeptides[whichPeptide],col=2,pch="x",cex=3) as.character(prots[ prots$peptideSequence == names(whichPeptide),"proteinID"])
Transposon
Data generated from Mascot search result .dat files using R package bibliospec.
# specMeta <- read.table(file = "../data/psmMatchesAnnotated.txt",stringsAsFactors = F) data("psmMatchesAnnotated") lapply(psmMatchesAnnotated,class) for(i in 1:ncol(psmMatchesAnnotated)){ if(class(psmMatchesAnnotated[[i]])=="factor"){ psmMatchesAnnotated[[i]]<-as.character(psmMatchesAnnotated[[i]]) } } lapply(psmMatchesAnnotated,class) specMeta <- psmMatchesAnnotated table(specMeta$fileName)
tmp <- specMeta$fileName%in%unique(specMeta$fileName)[1] Glucdata <- specMeta[tmp,] tmp <- specMeta$fileName%in%unique(specMeta$fileName)[2] Ethanodata <- specMeta[tmp,]
computeFDRFunction <- function(score, proteinID, decoy = "REV_" ){ idx <- order(score, decreasing = TRUE) score <- score[idx] decoy_hit <- grepl(decoy, proteinID[idx]) FP <- cumsum(decoy_hit) TP <- 1:length(idx) - FP FDR1 <- (2 * FP) / (TP + FP) FDR2 <- FP / TP return(list(decoy_hit = decoy_hit, score = score , FDR1 = FDR1 ,FDR2 = FDR2)) }
histWithFDR <- function(data){ tx <- with(data,hist(score2,plot = FALSE, breaks=100)) t1<-with(data,hist(score2[!grepl("REV_",proteinID)],breaks=tx$breaks,main="",xlab="score")) with(data,hist(score2[grepl("REV_",proteinID)],add=T,breaks = t1$breaks, col=2)) cff <- with(data,computeFDRFunction(score2 , proteinID)) par(new=T) with(cff,plot(score,FDR1*100, type="l",col=4,lwd=2,xlab=NA,ylab=NA,axes=FALSE)) axis(side = 4) mtext(side = 4, line = 3, 'Number genes selected') return(cff) }
cff <- histWithFDR(Glucdata) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
fdr = 0.01 scthresh <-min(cff$score[cff$FDR1 < fdr]) scthresh
Glucdata1 <- subset(Glucdata, score2 > scthresh)
getFDR <- function(data){ (2 * sum(grepl("REV_",data$proteinID ))) / nrow(data) }
getFDR(Glucdata)*100 getFDR(Glucdata1)*100 cff <- histWithFDR(Glucdata1) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
GlucdataPrec <- ddply(Glucdata, .(peptideModSeq, precursorCharge),function(x){x[which.max(x$score2),]}) getFDR(GlucdataPrec) * 100
cff <- histWithFDR(GlucdataPrec) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
GlucdataModPep <- ddply(Glucdata, .(peptideModSeq),function(x){x[which.max(x$score2),]}) getFDR(GlucdataModPep) * 100
cff <- histWithFDR(GlucdataModPep) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
GlucdataPep <- ddply(Glucdata, .(peptideSeq),function(x){x[which.max(x$score2),]}) getFDR(GlucdataPep) * 100
cff <- histWithFDR(GlucdataPep) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
GlucdataProt <- ddply(Glucdata, .(proteinID), function(x){data.frame(x[which.max(x$score2),],nrPSM=nrow(x), newscore = sum(x$score2)) }) getFDR(GlucdataProt) * 100 nrow(GlucdataProt)
par(mfrow=c(2,1)) cffP1 <- histWithFDR(GlucdataProt) GlucdataProt2 <- GlucdataProt GlucdataProt2$score2 <- GlucdataProt2$newscore getFDR(GlucdataProt2) * 100 cffP2 <- histWithFDR(GlucdataProt2)
fdr = 0.01 scthresh1 <-min(cffP1$score[cffP1$FDR1 < 0.01]) scthresh1 scthresh2 <-min(cffP2$score[cffP2$FDR1 < 0.01]) scthresh2
GlucdataProt0.01 <- subset(GlucdataProt, score2 > scthresh1) nrow(GlucdataProt0.01) getFDR(GlucdataProt0.01) * 100 GlucdataProt0.01 <- subset(GlucdataProt2, score2 > scthresh2) nrow(GlucdataProt0.01) getFDR(GlucdataProt0.01) * 100
Improve FDR by removing "single hit wonders".
GlucdataProt2 <- (subset(GlucdataProt, nrPSM > 1)) nrow(GlucdataProt2) getFDR(GlucdataProt2) * 100 cff <- histWithFDR(GlucdataProt2) with(cff,summary(score[decoy_hit])) with(cff,summary(score[!decoy_hit]))
Compare portein lists obtained by removing single hit wonders with those by controlling fdr using score.
length(intersect(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID)) length(setdiff(GlucdataProt2$proteinID, GlucdataProt0.01$proteinID)) length(setdiff(GlucdataProt0.01$proteinID,GlucdataProt2$proteinID ))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.