Nothing
# TODO: Add comment
#
# Author: erikahrne
###############################################################################
# set filter, filer = data.frame all TRUE/FALSE
#' @export
.setFilter <- function(eset=eset, filter=filter){
fData(eset)$isFiltered <- apply(filter,1,function(f){ return(sum(f,na.rm=T) > 0 ) })
return(eset)
}
### add ptm coord to eset fData (data frame listing phospho coordinates and motifX)
#' @export
#' @importFrom seqinr read.fasta
#' @importFrom utils setTxtProgressBar txtProgressBar
.addPTMCoord <- function(eset,proteinDB,motifLength=4, isProgressBar=F,format=1){
ptmCoordDf <- data.frame()
### progress bar
pbSum <- txtProgressBar(min = 0, max = nrow(eset), style = 3)
for(i in 1:nrow(eset)){
### increment progressbar
if(isProgressBar) setTxtProgressBar(pbSum, i)
modifAnnot <- as.character(fData(eset)$ptm[i])
### if peptide is modified
if((nchar(modifAnnot) > 0) & !is.na(modifAnnot) ){
peptide <- as.character(fData(eset)$peptide[i])
proteinAC <- as.character(fData(eset)$proteinName[i])
modifCoord <- NA
motifX <- NA
proteinSeq <- proteinDB[proteinAC]
if(proteinSeq == "NULL" ){
warning(proteinAC," NOT FOUND IN PROTEIN FASTA","\n")
# modifCoord <- "Err"
# motifX <- "Err"
}else{
modifCoord <- getModifProteinCoordinates(modifAnnot,peptide,proteinSeq, format=format)
motifX <- paste(getMotifX(modifCoord,peptide, proteinSeq,motifLength),collapse=",")
modifCoord <- paste(modifCoord,collapse=",")
}
}else{
modifCoord <- ""
motifX <- ""
}
ptmCoordDf <- rbind(ptmCoordDf,data.frame(modifCoord,motifX))
}
# close progress bar
setTxtProgressBar(pbSum, i)
close(pbSum)
fData(eset) <- cbind(fData(eset),ptmCoordDf)
return(eset)
}
### more to be added
#' @export
.getProteaseRegExp <- function(protease="trypsin"){
if(protease == "trypsin"){
return("[KR](?!P)")
}else if(protease == "lys-c"){
return("K(?!P)")
}else{
stop("Unknown Protease:", protease )
}
}
# @details returns a dataframe lising all unique motifs and the corresponding ptm. multiple entries will be created for multiply modified peptides
#' @export
.getUniquePtmMotifs <- function(eset, format=1){
eset <- eset[!fData(eset)$isFiltered & (nchar(as.character(fData(eset)$ptm)) > 0),]
### has to be done this way as sometimes the matching AC is not found in the database...
motifXTag <- c()
ptmTag <- c()
if(format ==1){
for(i in 1:nrow(eset)){
ptms <- gsub("\\[[0-9]*\\] {1,}","",unlist(strsplit(as.character(fData(eset)$ptm[i]),"\\|")))
mX <- unlist(strsplit(as.character(fData(eset)$motifX[i]),"\\,"))
ptmTag <- c(ptmTag,ptms)
#if(length(ptms) > 0){
if(length(ptms) == length(mX) ){
motifXTag <- c(motifXTag,mX)
}else{
motifXTag <- c(motifXTag,rep(NA,length(ptms)))
}
#}
}
}else{ # scaffold PTM
for(i in 1:nrow(eset)){
ptms <- gsub("[0-9]","",unlist(strsplit(as.character(fData(eset)$ptm[i]),"\\, ")))
mX <- unlist(strsplit(as.character(fData(eset)$motifX[i]),"\\,"))
ptmTag <- c(ptmTag,ptms)
#if(length(ptms) > 0){
if(length(ptms) == length(mX) ){
motifXTag <- c(motifXTag,mX)
}else{
motifXTag <- c(motifXTag,rep(NA,length(ptms)))
}
#}
}
}
sel <- !is.na(motifXTag) & grepl("\\*",motifXTag)
return(unique(data.frame(ptm = ptmTag[sel], motif = motifXTag[sel])))
}
### Detectable/Identifiable peptide lengths (600 - 4000 Da -> 600/110 - 4000/110 ->) 5 - 36 AA's
#' Get number peptides passing defined length criteria
#' @param peptides list of peptides
#' @param peptideLength vector of two integers defining peptide length range
#' @return integer corresponding to number of detectable peptides
#' @export
#' @note No note
#' @details No details
#' @examples print("No examples")
getNbDetectablePeptides <- function(peptides, peptideLength=c(5,36)){
okLength <- (nchar(peptides) >= peptideLength[1]) & (nchar(peptides) <= peptideLength[2])
return(sum(okLength))
}
#' Digest protein
#' @param proteinSeq protein sequence
#' @param proteaseRegExp protease Regular Expression
#' @param nbMiscleavages default 0
#' @return vector of peptides
#' @export
#' @note No note
#' @details No details
#' @examples print("No examples")
getPeptides <- function(proteinSeq,proteaseRegExp=.getProteaseRegExp("trypsin"),nbMiscleavages=0){
allAA <- as.vector(unlist(strsplit(proteinSeq,"")))
### get all full cleaved peptides without cleaved residue
fcPeptides <- strsplit(proteinSeq,proteaseRegExp,perl=TRUE)[[1]]
### add cleaved residue
matchPos <- gregexpr(proteaseRegExp,proteinSeq,perl=TRUE)[[1]]
separator <- allAA[ matchPos ]
###if c-term peptide isn't tryptic
if(length(separator) < length(fcPeptides)) separator <- c(separator,"")
fcPeptides <- paste(fcPeptides,separator,sep="")
fcPeptides <- fcPeptides[nchar(fcPeptides) > 0 ]
### if no mis-cleavages that's it
if(nbMiscleavages == 0){
return(fcPeptides)
}
allPeptides <- c()
### handle miscleavages
for(i in 1:length(fcPeptides)){
#cat(fcPeptides[i]," ",i," -------------\n")
for(j in i:(i+nbMiscleavages)){
if(j <= length(fcPeptides)){
pept <- paste(fcPeptides[i:j],collapse="")
#cat(j," ---",pept,"\n")
allPeptides <- c(allPeptides,pept)
}
}
}
return(allPeptides[nchar(allPeptides) > 0])
}
#' Add identification leve q-values to ExpressionSet (calculated based on target-decoy score distribution)
#' @param eset ExpressionSet
#' @return ExpressionSet object
#' @details if ptm column is part if the ExpressionSet q-values are calculated seperately for modified and non-modified features
#' @export
#' @note No note
#' @details No details
#' @seealso \code{\link{getIdLevelQvals}}
#' @examples print("No examples")
addIdQvalues <- function(eset=eset){
### only calculate q-values for non-filtered features
sel <- rep(T,nrow(eset))
if(!is.null(fData(eset)$isFiltered)){
sel <- !fData(eset)$isFiltered
}
# init vector to be added
idQValue <- rep(1,nrow(eset))
# if ptm column exist seperate calculate qvals sepearately for modif and nomd features
if(!is.null(fData(eset)$ptm)){
isMod <- nchar(as.character(fData(eset)$ptm)) > 0
# get qvalues for modified features
if(sum(isMod & sel ) > 0){
idQValue[isMod & sel ] <- getIdLevelQvals(fData(eset)$idScore[isMod & sel],isDecoy(fData(eset)$proteinName[isMod & sel]))
}
# get qvalues for non-modified features
if(sum(!isMod & sel ) > 0){
idQValue[!isMod & sel] <- getIdLevelQvals(fData(eset)$idScore[!isMod & sel],isDecoy(fData(eset)$proteinName[!isMod & sel]))
}
}else{# get qvalues if ptm column is missing
idQValue[sel] <- getIdLevelQvals(fData(eset)$idScore[sel] ,isDecoy(fData(eset)$proteinName[sel]))
}
# add q-value vector
fData(eset)$idQValue <- idQValue
return(eset)
}
### check if protein is a contaminant
#' Check if protein is a contaminant entry
#' @param ac vector of protein accession numbers
#' @return vector TRUE/FALSE
#' @export
#' @note No note
#' @details contanminants proteins are typically annotated as: CON_P0000
#' @references NA
#' @examples print("No examples")
isCon <- function(ac){
ac <- as.character(ac)
return(regexpr("\\Wcon_",ac,ignore.case=TRUE, perl=TRUE) > -1 | regexpr("^con_",ac,ignore.case=TRUE, perl=TRUE) > -1 | regexpr("_con",ac,ignore.case=TRUE, perl=TRUE) > -1 )
}
### check if protein is a Decoy
#' Check if protein is a decoy entry
#' @param ac vector of protein accession numbers
#' @return vector TRUE/FALSE
#' @export
#' @note No note
#' @details decoy proteins are typically annotated as: REV_P0000
#' @references NA
#' @examples print("No examples")
isDecoy <-function(ac){
return(((regexpr("^rev_",ac,ignore.case=TRUE) > -1)
| (regexpr("^decoy_",ac,ignore.case=TRUE) > -1)
| (regexpr("^revipi",ac,ignore.case=TRUE) > -1)
| (regexpr("^reverse_",ac,ignore.case=TRUE) > -1)))
}
### calculate id level qvals based on target-decoy score distributions
#' Calculates identification level q-values based on target-decoy score distributions
#' @param scores peptide/protein identficationscore
#' @param isDecoy vector of TRUE/FALSE
#' @return vector of q.values
#' @export
#' @note No note
#' @details q-value = (Nb. Decoy Entries at idScore Threshold S*) / (Nb. Target Entries at idScore Threshold S). (* idScore >= S)
#' @references NA
#' @examples print("No examples")
getIdLevelQvals <- function(scores, isDecoy){
if(sum(isDecoy) == 0){
return(rep(0,length(scores)))
}
targetScores = scores[!isDecoy]
decoyScores = scores[isDecoy]
qvals = c()
for(score in scores){
qval = sum(decoyScores >= score) / sum(targetScores >= score)
qvals = c(qvals,qval)
}
return(qvals)
}
### Get score cutoff for a given fdr cut-off
#' Get score cutoff for a given fdr cut-off
#' @param scores peptide/protein identficationscore
#' @param isDecoy vector of TRUE/FALSE
#' @param fdrCutOff [0,1]
#' @return scoreCutoff
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getScoreCutOff <- function(scores,isDecoy,fdrCutOff = 0.01){
return(min(scores[(getIdLevelQvals(scores ,isDecoy ) < fdrCutOff)]))
}
### create motif-x peptide annotation
#' Create motif-x peptide annotation
#' @param modifPos vector positions
#' @param peptide peptide sequence
#' @param proteinSeq protein sequence
#' @param motifLength motif flanking sequence
#' @return vector of motifs
#' @export
#' @note No note
#' @details motif-x example PGDYS*TTPG
#' @references NA
#' @examples print("No examples")
getMotifX <- function(modifPos,peptide,proteinSeq, motifLength=4){
#modifPos <- getModifProteinCoordinates(modifAnnot,peptide,proteinSeq)
if( modifPos[1] > -1){
motifX <- c()
for(mp in modifPos){
prefix <- substr(proteinSeq,mp-motifLength,mp-1)
suffix <- substr(proteinSeq,mp+1,mp+motifLength)
modifAA <- paste(substr(proteinSeq,mp,mp),"*",sep="")
mx <- paste(prefix,modifAA,suffix,sep="")
motifX <- c(motifX,mx)
}
return(motifX)
}else{
return(NA)
}
}
### Get modification coordinates on protein
#' Get modification coordinates on protein
#' @param modifAnnot modifcation as annotated by progenesis. E.g. '[15] Phospho (ST)|[30] Phospho (ST)'
#' @param peptideSeq peptide sequence
#' @param proteinSeq protein sequence
#' @param format c(1,2) 1. progenesis 2. scaffold
#' @return vector of protein coordinates (mmodification residue number)
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getModifProteinCoordinates <- function(modifAnnot,peptideSeq,proteinSeq, format=1){
peptideSeq <- as.character(peptideSeq)
modifPos <- c()
if(format == 1){
### parse modifAnnot -> modif position(s) on peptide positions
modifList <- strsplit(as.character(modifAnnot),"\\|")[[1]]
for(m in modifList){
if(grepl("N\\-term",m)){
m <- 1
}else if(grepl("C\\-term",m)){
m <- nchar(peptideSeq)
}else{
m <- gsub("\\[","",m)
m <- as.numeric(gsub("\\].*","",m))
}
modifPos <- c(modifPos,m)
}
}else if(format == 2){
modifList <- strsplit(as.character(modifAnnot),"\\, ")[[1]]
for(m in tolower(modifList)){
### @ TODO check how n-term and c-term modificaitons are annotated
if(grepl("n\\-term",m)){
m <- 1
}else if(grepl("c\\-term",m)){
m <- nchar(peptideSeq)
}else{
m <- gsub("[a-z A-Z]","",m)
}
modifPos <- c(modifPos,m)
}
modifPos <- as.numeric(modifPos)
}else{
stop("ERROR: getModifProteinCoordinates - Unknown PTM format", format)
}
### get peptide position on protein (first matching position if duplicates)
pepStartPos <- regexpr(peptideSeq,proteinSeq)[1]
if(pepStartPos < 0){
cat("\nERROR - getModifProteinCoordinates -:",peptideSeq," not matching current protein sequence (check provided protein sequence db)","\n" )
return(pepStartPos)
}
return((modifPos+pepStartPos-1))
}
#' Get number of mis-cleavages perp peptide
#' @param peptide character vector
#' @param protease regular expression
#' @return vector ofintegers
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getNbMisCleavages <-function(peptide, protease="trypsin"){
return(
unlist(lapply(peptide,function(t){
cTermRegExp <- paste(.getProteaseRegExp(protease),"$",sep="")
### discard 1 if cTerm matching regExp
# print(gregexpr(.getProteaseRegExp(protease),t,perl=T)[[1]])
return((sum(gregexpr(.getProteaseRegExp(protease),t,perl=T)[[1]] > -1 )) - grepl(cTermRegExp,t,perl=T))
}))
)
}
#' Get number of peptides per protein
#' @param eset ExpressionSet
#' @return table
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getNbPeptidesPerProtein <- function(eset){
sel <- !fData(eset)$isFiltered
return(sort(table(unique(data.frame(fData(eset[sel,])$proteinName,fData(eset[sel,])$peptide))[,1]),decreasing=T))
}
#' Set nbPeptides coulmn of featureData
#' @param eset ExpressionSet
#' @return eset
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
setNbPeptidesPerProtein <- function(eset){
fData(eset)$nbPeptides <- getNbPeptidesPerProtein(eset)[as.character(fData(eset)$proteinName)]
return(eset)
}
#' Get number of spectra per protein
#' @param eset ExpressionSet
#' @return table
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getNbSpectraPerProtein <- function(eset){
sel <- !fData(eset)$isFiltered
return(sort(table(fData(eset[sel,])$proteinName),decreasing=T))
}
#' Set nbPeptides coulmn of featureData
#' @param eset ExpressionSet
#' @return eset
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
setNbSpectraPerProtein <- function(eset){
fData(eset)$nbSpectra <- getNbSpectraPerProtein(eset)[as.character(fData(eset)$proteinName)]
return(eset)
}
#' Get modification coordinates on protein
#' @param d numeric vector
#' @param nbSd range spanning number of sd frmo mean
#' @return vector range boundries
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getMeanCenteredRange <- function(d,nbSd=4){
return(c(mean(d,na.rm=T) - nbSd*sd(d,na.rm=T),mean(d,na.rm=T) + nbSd*sd(d,na.rm=T)))
}
#' Check if ACs are in "non-stripped" uniprot format e.g. "sp|Q8CHJ2|AQP12_MOUSE"
#' @param acs accession numbers
#' @return boolean TRUE/FALSE
#' @export
#' @note No note
#' @details TRUE if less than 10% of ACs contain a "|" character
#' @references NA
#' @examples print("No examples")
isStrippedACs <-function(acs){
acs <- as.character(acs)
return( (sum(grepl("\\|",acs)) / length(acs)) < 0.9 )
}
#' strip uniprot format e.g. "sp|Q8CHJ2|AQP12_MOUSE" -> Q8CHJ2
#' @param acs accession numbers
#' @return vector character
#' @export
#' @note No note
#' @details TRUE if less than 10% of ACs contain a "|" character
#' @references NA
#' @examples print("No examples")
stripACs <- function(acs){
acs <- gsub("[a-z]{1,4}\\|","",acs)
acs <- gsub("\\|.*","",acs)
return(acs)
}
#' Get amino acid coordinates on protein
#' @param peptideSeq peptide sequence
#' @param proteinSeq protein sequence
#' @param aaRegExpr target AA reg exp
#' @return vector of protein coordinates (mmodification residue number)
#' @export
#' @note No note
#' @details NA
#' @references NA
#' @examples print("No examples")
getAAProteinCoordinates <- function(peptideSeq,proteinSeq, aaRegExpr="[STY]"){
peptideSeq <- as.character(peptideSeq)
### parse modifAnnot -> modif position(s) on peptide positions
#modifList <- strsplit(as.character(modifAnnot),"\\|")[[1]]
modifPos <- as.vector(unlist(gregexpr(aaRegExpr,peptideSeq)[[1]]))
### get peptide position on protein (first matching position if duplicates)
pepStartPos <- regexpr(peptideSeq,proteinSeq)[1]
if(pepStartPos < 0){
cat("\nERROR - getModifProteinCoordinates -:",peptideSeq," not matching current protein sequence (check provided protein sequence db)","\n" )
return(pepStartPos)
}
return((modifPos+pepStartPos-1))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.