R/SpectroX.R

Defines functions parseMaxQuantMSMS getSearchedModifications getFragmentSequence getPeptides digestProteome proteotypicPeptideExport createSpectralLibrary createComplementaryIsotopeLibrary spectroDiveExport spectronautExport skylineExport parseTargetsFile barplotPetideCountPerProtein barplotPeptidesPerProtein getPTMColors

Documented in barplotPeptidesPerProtein barplotPetideCountPerProtein createComplementaryIsotopeLibrary createSpectralLibrary digestProteome getFragmentSequence getPeptides getPTMColors getSearchedModifications parseMaxQuantMSMS parseTargetsFile proteotypicPeptideExport skylineExport spectroDiveExport spectronautExport

#
# erik.ahrne@unibas.ch
#

# IRTPEPTIDES = c("LGGNEQVTR"
#                 ,"GAGSSEPVTGLDAK"
#                 ,"VEATFGVDESNAK"
#                 ,"YILAGVENSK"
#                 ,"TPVISGGPYEYR"
#                 ,"TPVITGAPYEYR"
#                 ,"DGLDAASYYAPVR"
#                 ,"ADVTPADFSEWSK"
#                 ,"GTFIIDPGGVIR"
#                 ,"GTFIIDPAAVIR"
#                 ,"LFLQFGAQGSPFLK")

#' @export
IRTPEPTIDES <- data.frame(refIRT = sort(c(0.00,0.00
                                     ,54.62, 54.62
                                     ,100.00, 100.00
                                     ,42.26, 42.26
                                     ,87.23, 87.23
                                     ,-24.92, -24.92
                                     ,70.52, 70.52
                                     ,28.71,  28.71
                                     ,33.38, 33.38
                                     ,12.39, 12.39
                                     ,19.79,19.79))
                     ,row.names=c("LGGNEQVTR", "LGGNETQVR"
                                  ,"GAGSSEPVTGLDAK", "AGGSSEPVTGLADK"
                                  ,"VEATFGVDESNAK", "VEATFGVDESANK"
                                  ,"YILAGVENSK", "YILAGVESNK"
                                  ,"TPVISGGPYEYR","TPVISGGPYYER"
                                  ,"TPVITGAPYEYR", "TPVITGAPYYER"
                                  ,"DGLDAASYYAPVR", "GDLDAASYYAPVR"
                                  ,"ADVTPADFSEWSK", "DAVTPADFSEWSK"
                                  ,"GTFIIDPGGVIR", "TGFIIDPGGVIR"
                                  ,"GTFIIDPAAVIR", "GTFIIDPAAIVR"
                                  ,"LFLQFGAQGSPFLK", "FLLQFGAQGSPLFK"))


#' Parse MaxQuant msms.txt
#' @param file path
#' @param pepCutoff numeric default 0.05
#' @param targetPeptides character default NA
#' @param targetProteins character default NA
#' @param filterContaminants TRUE
#' @param contaminantRegExp '^CON_'
#' @param selectedPTMRegExp NA
#' @param filterNonExclusivePeptides default TRUE
#' @param pepLength default [1,Inf)
#' @param chargeState default [1,Inf)
#' @param label (Arg10 and Lys8 only filter) options NA (deafault,no filter),  'light','heavy'
#' @param keepBestSpectrumOnly  keep top-scoring spectrum only, per peptidoform, default TRUE
#' @param requiredSequenceRegExp dafualt '.' (no filter)
#' @return data.frame of maxQuant psm level results
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
parseMaxQuantMSMS = function(file
                             , pepCutoff=0.05
                             , targetPeptides=NA
                             , targetProteins=NA
                             , filterContaminants = T
                             , contaminantRegExp = '^CON_'
                             , selectedPTMRegExp = NA
                             , filterNonExclusivePeptides = T
                             , pepLength = c(1,Inf)
                             , chargeState = c(1,Inf)
                             , label = NA
                             , maxMissedCleavages = 0
                             , keepBestSpectrumOnly = T
                             , requiredPepSeqRegExp = "."
                             ,...

){

  tb = suppressMessages(read_tsv(file))

  # is irt peptide (unmodified)
  tb$isIRT = paste0("_",rownames(IRTPEPTIDES),"_" , collapse="|") %>% grepl(.,tb$`Modified sequence`)

  # rename irt protein entry
  #tb$Proteins[tb$isIRT] %>% print
  tb$Proteins[tb$isIRT] = "BiognosysIRT"

  # target peptide filter
  if(!is.na(targetPeptides[1])){
    # targetPeptides = c("AAAAAAAAALGVR", "AAAAADEDEEADEEDEAEAGGMGPQAR" )
    tb = subset(tb, (Sequence %in% targetPeptides) | isIRT  )
    # disable additional peptide filters
    filterNonExclusivePeptides = F
    pepLength = c(1,Inf)
    maxMissedCleavages = Inf
    requiredPepSeqRegExp = "."
  }
  #Filter out non exclusive peptides
  if(filterNonExclusivePeptides){
    tb = subset(tb, !grepl("\\;",Proteins))
  }else if(!is.na(targetPeptides[1])){ # warn if target peptides are not exclusive
    nonExclPeptides =  subset(tb, grepl("\\;",Proteins))
    if(nrow(nonExclPeptides) > 0 ){
      cat("Warning - Found Non-Exclusive Peptides:")
      nonExclPeptides[c("Sequence","Proteins")] %>% as.data.frame() %>% unique %>% print
    }
  }

  # non peptide filters
  keep = rep(T, nrow(tb))
  # target protein filter
  if(!is.na(targetProteins[1])){
    #targetProteins = c("P46099", "Q99J95" )
    targetRegex = paste(targetProteins,collapse = "|")
    keep = keep & grepl(targetRegex , tb$Proteins)
  }

  # CON filter
  if(filterContaminants) keep = keep & !grepl("CON\\_",tb$Proteins)
  # pep thrs
  keep = keep & (tb$PEP <  pepCutoff)
  # decoy filter
  keep = keep & (is.na(tb$Reverse) | tb$Reverse != "+" )
  # peptide length
  keep = keep &  ((tb$Length >= min(pepLength)) & (tb$Length <= max(pepLength)) )


  # keep charge state
  keep = keep &  ((tb$Charge >= min(chargeState)) & (tb$Charge <= max(chargeState)) )
  # keep modifs
  #selectedPTMRegExp = "Oxidation"
  # if(!is.na(selectedPTMRegExp)){
  #   keep =  keep & (paste(c("Unmodified","Arg10","Lys8",selectedPTMRegExp),collapse="|") %>% grepl(.,tb$Modifications))
  # }else{
  #   keep =  keep & (paste(c("Unmodified","Arg10","Lys8"),collapse="|") %>% grepl(.,tb$Modifications))
  # }

  # get all modifications seached for
  excludedPTM = getSearchedModifications(tb,...)

  cat("INFO: allowed PTMs:", ifelse(is.na(excludedPTM[grepl(selectedPTMRegExp,excludedPTM)][1]),"None", paste(excludedPTM[grepl(selectedPTMRegExp,excludedPTM)],collapse=":")),"\n")
  # e.g. "Oxidation (M)". Is "Oxidation (M)" allowed. Check if it matches regexp
  if(!is.na(selectedPTMRegExp)) excludedPTM = excludedPTM[!grepl(selectedPTMRegExp,excludedPTM)]

    # if excludedPTM vector contains any mods. only keep peptides not affected by these ptms
  if(length(excludedPTM) > 0) keep =  keep & rowSums(tb[excludedPTM]) == 0

  # light heavy filter
  if(!is.na(label)){
    isHeavyIsotope = grepl("Arg10|Lys8",tb$Modifications)
    if(label == "light"){ # keep light only
      keep =  keep & !isHeavyIsotope
    }else{ # keep heavy only
      keep =  keep & isHeavyIsotope
    }
  }

  # missed cleavages
  keep =  keep & tb$`Missed cleavages` <= maxMissedCleavages

  # DO NOT discard irts
  #keep = keep | ((paste(IRTPEPTIDES, collapse="|") %>% grepl(.,tb$`Modified sequence`)) &  (tb$PEP <  pepCutoff))
  keep = keep | (tb$isIRT & (tb$PEP <  pepCutoff))

  # apply filter
  tb = subset(tb, keep)

  # filterBestSpectrum
  # keep top-scoring spectrum only, per peptidoform
  if(keepBestSpectrumOnly) tb = group_by(tb, Sequence, Modifications) %>% filter(rank(Score,ties.method = "first") == length(Score))

  # casting
  suppressWarnings(tb$`Precursor Apex Fraction` %<>% as.numeric)
  suppressWarnings(tb$`Precursor Intensity` %<>% as.numeric)

  return(tb)

}

#' Get list of variable modifiections condsidered by MaxQuant
#' @param df tible or data.frame
#' @param ignoreArgLysIsoLabel default T Do not consider Arg Lys C,N heavy isotope label
#' @return character vector of modification names (does not return "Arg10","Lys8" labels )
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
getSearchedModifications = function(tb, ignoreArgLysIsoLabel=T){

  mods = names(tb)[grepl("Probabilities$",names(tb))] %>% gsub(" Probabilities$","",.)
  # add n-term mods
  mods = c(mods,names(tb)[grepl("N\\-term",names(tb))])
  # remove "Arg10"         "Lys8"
  if(ignoreArgLysIsoLabel) mods = mods[!(mods %in% c("Arg10","Lys8"))]
  return(mods)
}

#' Parse spectrum framgment match information listed in MaxQuant 'Identifications' results
#' @param psm a row of MaxQuant 'Identifications' data.frame
#' @return list with  attributes intensity, ionType, charge, mz, isNL
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
createLibrarySpectrum <- function(psm){

  ionAnnot <- unlist(strsplit(as.character(psm$Matches), ";"))
  ionType <- substr(ionAnnot,1,1)
  fragmentNb <- as.integer(gsub("[-(].*","",substr(ionAnnot,2,nchar(ionAnnot))))

  intensity <- as.numeric(unlist(strsplit(as.character(psm$Intensities), ";")))
  adjustedIntensity <-  intensity/psm$`Precursor Apex Fraction`
  mz <- as.numeric(unlist(strsplit(as.character(psm$Masses), ";")))
  mzDelta = as.numeric(unlist(strsplit(as.character(psm$'Mass Deviations [Da]'), ";")))

  charge <- rep(1,length(mz))
  isNL <- grepl("\\-",ionAnnot)
  isSelected <- rep(F,length(isNL))

  # heavy / light
  labelState <- grepl("Lys8|Arg10",psm$Modifications)

  ### non +1 fragment charge states
  higherCS <- grepl("\\+",ionAnnot)
  if(T %in% higherCS){
    chargeTmp <- gsub(".*\\(","",ionAnnot)
    chargeTmp <- gsub("\\+\\)","",chargeTmp)
    charge[higherCS] <- as.numeric(chargeTmp[higherCS])
  }

  # theoretical fragment mz
  mz = mz + mzDelta

  # ret <- list()
  # ret$spectrum <- data.frame(intensity,adjustedIntensity,ionAnnot,ionType,fragmentNb,charge,mz,isNL,isSelected)
  # ret$precMz <- (psm$Mass+psm$Charge*1.007316)/psm$Charge
  # ret$precIntensity <- psm$`Precursor Intensity`
  # ret$isHeavy <- labelState
  # ret$retentionTime <- psm$`Retention time`
  # ret$iRT <- suppressWarnings(ifelse(is.null(psm$iRT),NA,psm$iRT))
  # ret$charge <- as.numeric(psm$Charge)
  # ret$protein <- as.character(psm$Proteins)
  # ret$peptide <- as.character(psm$Sequence)
  # ret$mqResIdx <- rownames(psm)
  #
  # return(ret)

  spectrum = data.frame(intensity,adjustedIntensity,ionAnnot,ionType,fragmentNb,charge,mz,isNL,isSelected
                        , precMz = (psm$Mass+psm$Charge*1.007316)/psm$Charge
                        , precIntensity =  psm$`Precursor Intensity`
                        , isHeavy = labelState
                        , retentionTime = psm$`Retention time`
                        , iRT = suppressWarnings(ifelse(is.null(psm$iRT),NA,psm$iRT))
                        , precCharge = as.numeric(psm$Charge)
                        , precApexIntensity = psm$`Precursor Intensity` / psm$`Precursor Apex Fraction`
                        , psmScore = psm$Score
                        , protein = as.character(psm$Proteins)
                        , proteinDescription =psm$`Protein Names`
                        , peptide = as.character(psm$Sequence)
                        , ptm = psm$Modifications
                        , mqResIdx = psm$`Scan number`
                        , isIRT = psm$isIRT


                        )
  return(spectrum)
}

#' Get linear model predicting iRT as a function retention time (column name "rt")
#' @param tb tibble including column "Retention time" and "Sequence"
#' @return list including fit rlm object,  data.frame peptide,rt,irtRef
#' @export
#' @note  No note
#' @details Uses Robust Linear Regression. Retention times of at least 3 iRT unique peptides needs to be listed in the input tibble
#' @references Using iRT, a normalized retention time for more targeted measurement of peptides, Escher et al. (2012), \url{http://www.ncbi.nlm.nih.gov/pubmed/22577012}
#' @examples print("No examples")
getIRTModel <- function(tb){

  irtDF <- data.frame(peptide = tb$Sequence,rt=tb$`Retention time`,irtRef= IRTPEPTIDES[ match( tb$Sequence , rownames(IRTPEPTIDES)),] ) %>% na.omit

  ret = list()
  ret$fit = NULL
  ret$irtPeptides = irtDF

  ### require at least 3 identified irt peptides
  if(length(unique(irtDF$peptide)) < 3){
    cat("WARN: Not enough iRT peptides detected to estimate iRT times\n")
    return(ret)
  }

  ret = list()
  ret$fit = rlm(irtRef ~ rt ,data=irtDF)
  ret$irtPeptides = irtDF

  return(ret)

}

#' Add iRT metric to data.frame
#' @param tb tibble containing colum labelled "Retention time"
#' @param fit lm object
#' @return vector of normalised rt (empirical irt)
#' @export
#' @note  No note
#' @details No details
#' @references Using iRT, a normalized retention time for more targeted measurement of peptides, Escher et al. (2012), \url{http://www.ncbi.nlm.nih.gov/pubmed/22577012}
#' @examples print("No examples")
getEmpiricalIRT <- function(tb,fit){
    return(predict(fit, newdata=cbind(tb,rt = tb$`Retention time` )))
}

#' Plot IRT calibration curve
#' @param irtModel list
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
plotIRTCalibration<- function(irtModel
                              , cex=1.5
                              , col="blue"
                              , pch=19
                              ,...){

  fit = irtModel$fit
  irt = irtModel$irtPeptides
  irt <- irt[irt$peptide %in% rownames(IRTPEPTIDES),]

  plot(irt$rt , IRTPEPTIDES[match(as.character(irt$peptide),rownames(IRTPEPTIDES)),]
       , xlab="Retention Time"
       , ylab="iRT Reference"
       , cex.lab=cex
       , cex.axis=cex
       , cex=cex
       , col=col
       , pch=pch
       , ...)

  # missing irt
  abline(h=setdiff(IRTPEPTIDES$refIRT,IRTPEPTIDES[match(as.character(irt$peptide),rownames(IRTPEPTIDES)),] ), col ="red", lwd=1.5)

  rSquared <- round(cor(irt$rt , IRTPEPTIDES[as.character(irt$peptide),1],use="complete")^2,2)
  legend("topleft",as.expression(c(bquote(R^2*"" == .(rSquared)))), box.lwd=0,box.col=0,cex=cex)
  legend("bottomright", paste("iRT = ",round(coef(fit)[2],1)," x rt + ",round(coef(fit)[1],1),sep="") , box.lwd=0,box.col=0,cex=cex)
  abline(fit,lwd=1.5)
}

#' Get Fragment Sub Sequnece
#' @param peptide character string
#' @param ionType c("a","b","x","y")
#' @param fragmentNb  integer
#' @return character string
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
getFragmentSequence <- function(peptide=peptide,ionType=ionType,fragmentNb=fragmentNb){

  # cast
  peptide = peptide %>% as.character

  if(ionType %in% c("y","x")){
    substr(peptide,(nchar(peptide)-fragmentNb)+1,nchar(peptide))
  }else if((ionType %in% c("a","b"))){
    substr(peptide,1,fragmentNb)
  }else{
    warning("Unknown ionType ", ionType)
    return(NA)
  }
}


#' Get Mz Shift of complementary peptide/fragment ion
#' @param aaSeq character string
#' @param charge default 1
#' @param isHeavy c(T,F)
#' @return numeric
#' @export
#' @note  No note
#' @details Calculate mass shift of complimentary spectrum.
#'  \itemize{
#'  \item PETIDEK (light) -> 8.014199
#'  \item PETIDEK (heavy) -> -8.014199
#'  \item PETIDER (light) -> 10.008269
#'  \item PETIDER (heavy) -> -10.008269
#'  }
#' @references NA
#' @examples print("No examples")
# getLabelMzShift <- function(aaSeq, charge=1,isHeavy=T){
#
#   mzShift = 0
#   kShift <- 8.014199
#   rShift <- 10.008269
#
#   ### ONLY LABELLED AT THE C-TERMINI
#   mzShift <- 0
#   if(grepl("K$" ,aaSeq)){
#     mzShift <- kShift / charge
#   }else if(grepl("R$" ,aaSeq)){
#     mzShift <- rShift / charge
#   }
#
#   if(isHeavy){
#     return(-mzShift)
#   }
#   return(mzShift)
# }

#' Created complimentary Heavy/Light annotated spectrum with updates precursor and fragment m/z values.
#' @param annotSpectrum Annotated Spectrum object
#' @return annotSpectrum Annotated Spectrum object
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
# getComplementaryLabelSpectrum <- function(annotSpectrum){
#
#   # update precursor mz
#   annotSpectrum$precMz <-  annotSpectrum$precMz + getLabelMzShift(aaSeq = annotSpectrum$peptide[1], charge=annotSpectrum$charge[1], isHeavy=annotSpectrum$isHeavy[1])
#
#   # update fragment mz
#   for(i in 1:nrow(annotSpectrum)){
#     aaSeq <- getFragmentSequence(peptide=annotSpectrum$peptide[1]  ,ionType=as.character(annotSpectrum[i,]$ionType),fragmentNb=as.numeric(annotSpectrum[i,]$fragmentNb))
#     annotSpectrum[i,]$mz = annotSpectrum[i,]$mz + getLabelMzShift(aaSeq = aaSeq, charge=as.numeric(annotSpectrum[i,]$charge), isHeavy=annotSpectrum$isHeavy[1])
#
#   }
#   annotSpectrum$isHeavy <- !annotSpectrum$isHeavy
#
#   return(annotSpectrum)
# }

#' Digest protein
#' @param proteinSeq protein sequence
#' @param proteaseRegExp protease Regular Expression
#' @param nbMiscleavages default 0
#' @return vector of peptides
#' @export
#' @note  No note
#' @details No details
#' @examples print("No examples")
getPeptides <- function(proteinSeq,proteaseRegExp="[KR](?!P)",nbMiscleavages=0){

  allAA <- as.vector(unlist(strsplit(proteinSeq,"")))

  ### get all full cleaved peptides without cleaved residue
  fcPeptides <- strsplit(proteinSeq,proteaseRegExp,perl=TRUE)[[1]]

  ### add cleaved residue
  matchPos <- gregexpr(proteaseRegExp,proteinSeq,perl=TRUE)[[1]]
  separator <- allAA[ matchPos ]
  ###if c-term peptide isn't tryptic
  if(length(separator) < length(fcPeptides)) separator <- c(separator,"")
  fcPeptides <- paste(fcPeptides,separator,sep="")
  fcPeptides <- fcPeptides[nchar(fcPeptides) > 0 ]

  ### if no mis-cleavages that's it
  if(nbMiscleavages == 0){
    return(fcPeptides)
  }

  allPeptides <- c()
  ### handle miscleavages
  for(i in 1:length(fcPeptides)){
    #cat(fcPeptides[i]," ",i," -------------\n")
    for(j in i:(i+nbMiscleavages)){
      if(j <= length(fcPeptides)){
        pept <- paste(fcPeptides[i:j],collapse="")
        #cat(j," ---",pept,"\n")
        allPeptides <- c(allPeptides,pept)
      }
    }
  }

  return(allPeptides[nchar(allPeptides) > 0])

}

#' Get peptide candidates per protein
#' @param proteins list
#' @param dispProgressBar T/F
#' @param peptideLengthRange integer vector default c(6,21)
#' @param proteaseRegExp character '[KR](?!P)' - trypsin
#' @param nbMiscleavages 0
#' @param trimAC TRUE/FALSE default TRUE sp|P62258|1433E_HUMAN -> P62258
#' @param exclusivePeptides TRUE/FALSE get exclusive peptides only i.e. peptides mapping to a single protein. default T
#' @return data.frame peptide,protein
#' @export
#' @note  No note
#' @references NA
#' @examples print("No examples")
digestProteome = function(proteins, proteaseRegExp="[KR](?!P)",dispProgressBar=T, peptideLengthRange=c(6,21), nbMiscleavages=0, exclusivePeptides=T, trimAC=T  ){

  # progress bar
  if(dispProgressBar) cat("Digesting Proteome \n")
  pbSum <- txtProgressBar(min = 0, max = length(proteins), style = 3)

  nbProteins =length(proteins)
  # assume 100 peptides per protein
  allPeptides = rep(NA,nbProteins*100)
  allProteins =  rep(NA,nbProteins*100)

  j = 1
  for(i in 1:nbProteins){

    prot <- names(proteins)[i]
    ### increment progressbar
    if(dispProgressBar) setTxtProgressBar(pbSum, i)

    # digest
    peptides <- getPeptides(unlist(proteins[prot]), proteaseRegExp= proteaseRegExp, nbMiscleavages=nbMiscleavages)
    # keep only peptides in accetped length range

    # add peptides and protein to vector
    k = (j+length(peptides)-1)
    allPeptides[j:k] = peptides
    allProteins[j:k] = prot
    j = k+1

  }

  # create df
  peptideDf = na.omit(data.frame(allPeptides,allProteins))
  names(peptideDf) = c("peptides","proteins")

  # filter by peptide length
  pepLength = nchar(peptideDf$peptides %>% as.character)
  peptideDf = subset(peptideDf,  (pepLength >=  min(peptideLengthRange)) & (pepLength <= max(peptideLengthRange)))

  # close progress bar
  setTxtProgressBar(pbSum, i)
  close(pbSum)

  # rename columns
  if(nrow(peptideDf) > 0)  names(peptideDf) = c("peptide","protein")

  # keep exclusive peptides only (i.e. peptides matching only one protein)
  if(exclusivePeptides){peptideDf = group_by(peptideDf, peptide) %>% filter(length(protein) == 1) %>% data.frame }

  # sp|P62258|1433E_HUMAN -> P62258
  # sp|Q9JII5|DAZP1_MOUSE -> Q9JII5
  if(trimAC) peptideDf$protein = gsub("(^[a-z]{2}\\|)([A-Z0-9\\-]*)(\\|.*)","\\2",peptideDf$protein)

  peptideDf$length = peptideDf$peptide %>% as.character %>% nchar

  return(peptideDf)

}


#'# Export top X peptide per protein (ordered by 'adjustedIntensitySum') and add theoretical peptides ranked by length if fewer than X peptides were identified by MaxQuant
#' @param spectralLibrary tibble
#' @param theoPeptides data.frame 'peptide' 'protein' 'length'
#' @param targetProteins list of protein accesion numbers e.g. P62258. default all proteins in spectralLibrary
#' @param nbPeptidesPerProtein number of peptides per protein to be exported
#' @param outFile path to output file defult tmp.xls in tmp dir
#' @export
#' @note  No note
#' @references NA
#' @examples print("No examples")
proteotypicPeptideExport = function(spectralLibrary
                                    ,theoPeptides=NA
                                    ,targetProteins=unique(spectralLibrary$protein)
                                    , nbPeptidesPerProtein=5
                                    , outFile= paste0(tempdir(),"/tmp.xls")){

  # output protein, peptide, rankingMetric, rank (if multiple charge states pick most intense)
  xlsOut = spectralLibrary[c("protein","peptide","ptm","rankingMetric","isIRT","adjustedIntensitySum", "psmScore", "precApexIntensity")]%>%
    unique %>% group_by(protein,peptide,ptm) %>% top_n(1,rankingMetric) %>%
    split(.$protein) %>%
    map_df(~ data.frame(peptide=.x$peptide
                        , ptm = .x$ptm
                        , protein = .x$protein
                        , isIRT = .x$isIRT
                        , adjustedIntensitySum =  .x$adjustedIntensitySum
                        , psmScore =  .x$psmScore
                        , precApexIntensity =  .x$precApexIntensity
                        , rankingMetric= .x$rankingMetric
                        , rank = length(.x$rankingMetric) - rank(.x$rankingMetric)+1
    ))

  # get proteins without enough peptides
  # nbMissingPeptidesPerProt = subset(xlsOut,rank <= nbPeptidesPerProtein) %>%
  #   group_by(. ,protein) %>%

  nbMissingPeptidesPerProt = group_by(xlsOut ,protein) %>%
  summarise(nbMissingPep = nbPeptidesPerProtein - max(rank,na.rm=T)) %>%
  subset(nbMissingPep > 0) %>% as.data.frame
  rownames(nbMissingPeptidesPerProt) = nbMissingPeptidesPerProt$protein

  # add proteins with no identified peptides
  missingProteins =  setdiff(targetProteins, nbMissingPeptidesPerProt$protein)
  if(sum(!is.na(missingProteins)) > 0 ){
    nbMissingPeptidesPerProt =rbind(nbMissingPeptidesPerProt
                                    ,data.frame(protein=missingProteins,nbMissingPep =nbPeptidesPerProtein, row.names=missingProteins)
    )
  }

  # add theo peptides
  if(!is.null(theoPeptides %>% dim)){
    for(prot in nbMissingPeptidesPerProt$protein ){
      # if protein has theo peptides
      if(prot %in% theoPeptides$protein ){
        subTP = subset(theoPeptides, protein %in% prot )
        # order peptides by length
        subTP = subTP[ order(subTP$length),]
        # add nbMissingPep peptides
        sel = 1:min(nbMissingPeptidesPerProt[match(prot, rownames(nbMissingPeptidesPerProt)),]$nbMissingPep, nrow(subTP))
        xlsOut = rbind(xlsOut, cbind(subTP[sel,c("peptide", "protein")]
                                     , rankingMetric=NA
                                     , ptm =NA
                                     , isIRT = F
                                     , rank=nbPeptidesPerProtein
                                     , adjustedIntensitySum =  NA
                                     , psmScore =  NA
                                     , precApexIntensity =  NA

                                     ) )
      }
    }
  }


  #keep all irts
  write.table(file=outFile,  subset(xlsOut, (rank <= nbPeptidesPerProtein) | isIRT ) %>% arrange(.,protein,rank) ,row.names=F,sep = "\t")
  cat("CREATED proteotypicPeptideExport FILE: ", outFile,"\n")

}




#' Create Spectral Library
#' @param tb tibble maxQuant spectrum level search results
#' @param minFragNb min frag number default 3 (i.e. b3 and y3 will be kept)
#' @param minNbTransitions minimum number of fragments
#' @param minBasePeakFraction minimum intnsity fraction of base peak (most intense kept fragment)
#' @param ionTypeFilter selected ion type default a,b,x,y
#' @param includeNL include neutral loss peaks defualt TRUE
#' @param rankingMetric character. column name, used for ranking peptide, default "adjustedIntensitySum", c("adjustedIntensitySum","psmScore","precApexIntensity")
#' @return data.frame
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
createSpectralLibrary = function(tb, minFragNb = 3, minNbTransitions = 5,maxNbTransitions = 5, minBasePeakFraction = 0, ionTypeFilter = c("a","b","x","y"), includeNL = T, rankingMetric = "adjustedIntensitySum"  ){

  cat("Parsing Spectra\n")
  pbSum <- txtProgressBar(min = 0, max = nrow(tb), style = 3)
  spectralLibrary = data.frame()
  for(rownb in 1:nrow(tb) ){

    setTxtProgressBar(pbSum, rownb)
    annotSpec = createLibrarySpectrum(tb[rownb,])

    ### apply spectrum filters
    # fragment number filter
    keep = with(annotSpec,  (fragmentNb >= minFragNb) & grepl(paste(ionTypeFilter,collapse="|"), annotSpec$ionType) )
    # neutral loss peak filter
    if(!includeNL) keep = keep &  !annotSpec$isNL
    annotSpec =   subset(annotSpec,keep )

    #fragment intensity as fraction of base peak
    annotSpec$basePeakIntFrac = annotSpec$intensity / suppressWarnings(max(annotSpec$intensity,na.rm=T))
    # filter minimum base peak intensity fraction
    annotSpec = subset(annotSpec,basePeakIntFrac >= minBasePeakFraction )

    # make sure enough fragments
    if(nrow(annotSpec) >= minNbTransitions){

      #select maxNbTransitions fragments
      annotSpec = arrange(annotSpec, intensity)[1:min(nrow(annotSpec),maxNbTransitions),]

      # calculate relative intensity based on fragment seleciton
      annotSpec$relativeIntensity = (annotSpec$intensity / max(annotSpec$intensity))*100

      # sum of adjusted intensity (basis for ranking)
      annotSpec$adjustedIntensitySum =  annotSpec$adjustedIntensity %>% sum(.,na.rm=T)

      # default ranking metric - summed adjusted intensity
      if(rankingMetric %in% names(annotSpec) & is.numeric(annotSpec[,rankingMetric])){
        annotSpec$rankingMetric =  annotSpec[,rankingMetric][1]
      }else{
        stop("createSpectralLibrary: Invalid rankingMetric", rankingMetric)
      }

      #add to library, SLOW append
      spectralLibrary = rbind(spectralLibrary,annotSpec )
    }
  }
  # close progress bar
  setTxtProgressBar(pbSum, rownb)
  close(pbSum)

  return(spectralLibrary)

}

#' Create complementary isotope (Arg, Lys H/L) spectral library
#' @param sl tibble or data frame
#' @return data.frame
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
createComplementaryIsotopeLibrary = function(sl){

  kShift =8.014199
  rShift = 10.008269

  hasKCTerm =  grepl("K$" ,sl$peptide)
  hasRCTerm =  grepl("R$" ,sl$peptide)

  # correct precursor m/z
  # heavy Lys c-term peptide
  sl$precMz[sl$isHeavy] = sl$precMz[sl$isHeavy] - (ifelse(hasKCTerm,kShift,0)[sl$isHeavy] / sl$precCharge[sl$isHeavy] )
  # heavy Arg c-term peptide
  sl$precMz[sl$isHeavy] = sl$precMz[sl$isHeavy] - (ifelse(hasRCTerm,rShift,0)[sl$isHeavy] / sl$precCharge[sl$isHeavy] )
  # light Lys c-term peptide
  sl$precMz[!sl$isHeavy] = sl$precMz[!sl$isHeavy] + (ifelse(hasKCTerm,kShift,0)[!sl$isHeavy] / sl$precCharge[!sl$isHeavy] )
  # light Arg c-term peptide
  sl$precMz[!sl$isHeavy] = sl$precMz[!sl$isHeavy] + (ifelse(hasRCTerm,rShift,0)[!sl$isHeavy] / sl$precCharge[!sl$isHeavy] )

  # correct c-term fragment m/z
  isCTermFrag =  sl$ionType %in% c("x","y","z")
  # heavy Lys c-term fragment
  sl$mz[sl$isHeavy & isCTermFrag] = sl$mz[sl$isHeavy & isCTermFrag] - (ifelse(hasKCTerm,kShift,0)[sl$isHeavy & isCTermFrag] / sl$charge[sl$isHeavy & isCTermFrag] )
  # heavy Arg c-term fragment
  sl$mz[sl$isHeavy & isCTermFrag] = sl$mz[sl$isHeavy & isCTermFrag] - (ifelse(hasRCTerm,rShift,0)[sl$isHeavy & isCTermFrag] / sl$charge[sl$isHeavy & isCTermFrag] )
  # light Lys c-term fragment
  sl$mz[!sl$isHeavy & isCTermFrag] = sl$mz[!sl$isHeavy & isCTermFrag] + (ifelse(hasKCTerm,kShift,0)[!sl$isHeavy & isCTermFrag] / sl$charge[!sl$isHeavy & isCTermFrag] )
  # light Arg c-term fragment
  sl$mz[!sl$isHeavy & isCTermFrag] = sl$mz[!sl$isHeavy & isCTermFrag] + (ifelse(hasRCTerm,rShift,0)[!sl$isHeavy & isCTermFrag] / sl$charge[!sl$isHeavy & isCTermFrag] )



  sl$isHeavy = !sl$isHeavy

  return(sl)

}

#' Write SpectroDive compatible xls file
#' @param sl tibble or data frame
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
spectroDiveExport =  function(sl,file){
  out = data.frame(eg_proteinId = sl$protein
                   ,eg_proteinDescription = sl$proteinDescription
                   ,eg_assayCollectionType = "external"
                   ,eg_iRT = sl$iRT
                   ,t_relativeFragmentIonIntensity = sl$relativeIntensity
                   ,tg_aminoAcidSequenceWithASCIIMod = paste(sl$peptide,sl$ptm,sep="_")
                   ,eg_aminoAcidSequence = sl$peptide
                   ,tg_precursorCharge = sl$precCharge
                   ,t_fragmentIonCharge = sl$charge
                   ,t_fragmentIonSeries = sl$ionType
                   ,t_fragmentIonNumber = sl$fragmentNb
                   ,tg_Q1 = sl$precMz
                   ,t_Q3 = sl$mz

  )
  write.table(out,file=file,sep="\t",row.names=F)
  cat("CREATED spectroDiveExport  FILE: ", file,"\n")
}

#' Write Spectronaut compatible xls file
#' @param sl tibble or data frame
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
spectronautExport =  function(sl,file){
  out = data.frame(protein_id = sl$protein
                   ,Workflow = "SPIKE_IN"
                   ,iRT = sl$iRT
                   ,RelativeFragmentIntensity = sl$relativeIntensity
                   ,IntModifiedSequence = paste(sl$peptide,sl$ptm,sep="_")
                   ,StrippedSequence = sl$peptide
                   ,IsotopicLabel = ifelse(sl$isHeavy,"heavy","light")
                   ,PrecursorCharge = sl$precCharge
                   ,FragmentCharge = sl$charge
                   ,FragmentType = sl$ionType
                   ,FragmentNumber = sl$fragmentNb
                   ,PrecursorMz = sl$precMz
                   ,FragmentMz = sl$mz
                   ,ExcludeFromQuantification = ""

  )
  write.table(out,file=file,sep="\t",row.names=F)
  cat("CREATED spectronautExport  FILE: ", file,"\n")
}

#' Write skyline compatible xls file
#' @param sl tibble or data frame
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
skylineExport =  function(sl,file){
  out = data.frame(sl$precMz,
                   sl$mz,
                   25,
                   sl$peptide,
                   sl$protein,
                   paste0(sl$ionType,sl$fragmentNb),
                   "",
                   ifelse(sl$isHeavy,"heavy","light")
  )
  write.table(out,file=file,sep="\t",row.names=F, col.names = F)
  cat("CREATED skylineExport FILE: ", file,"\n")
}

#' Parse list of target peptides/proteins
#' @param file path
#' @return list() peptides, proteins
#' @export
#' @note  No note
#' @details No details
#' @references NA
#' @examples print("No examples")
parseTargetsFile = function(file){
  tb = suppressMessages(read_csv(file))
  names(tb) = names(tb) %>% tolower

  proteins = tb[which(grepl("^protein[s]{0,1}$",names(tb)))]
  peptides = tb[which(grepl("^peptide[s]{0,1}$",names(tb)))]

  ret = list()
  ret$proteins = NA
  ret$peptides = NA

  # give priority to peptides
  if(length(peptides) > 0 ){
    ret$peptides = peptides %>% unlist%>% as.vector() %>% unique()
  }else if(length(proteins) > 0){
    ret$proteins = proteins %>% unlist%>% as.vector()  %>% unique()
  }else{
    stop("parseTargetsFile Invalid File format:",  file)
  }
  return(ret)
}

#' plot peptide count per protein
#' @param spectralLibrary data.frame
#' @param acLenTrunc integer default 12 "SOMEVERYLONGAC" -> "SOMEVERY.."
#' @param protLabCex default 0.9
#' @export
#' @note  No note
#' @references NA
#' @examples print("No examples")
barplotPetideCountPerProtein = function(spectralLibrary, protLabCex=0.9, acLenTrunc=12, col = "blue" , ... ){
  pepPerProt = (spectralLibrary[c("protein","peptide","ptm") ] %>% unique)[,"protein"] %>%table %>% sort(.,decreasing = T)
  protLab =names(pepPerProt)

  if(length(protLab) < 30){
    bp =barplot(pepPerProt,las=2, xaxt="n", ylab = "Unique Peptide Count", col =col,...)

    axis(1, labels = FALSE,tick=F)
    # truncate labels "SOMEVERYLONGAC" -> "SOMEVERY.."
    sel = nchar( protLab) > acLenTrunc
    protLab[sel] =  paste0(substr( protLab,1,acLenTrunc),"..")[sel]
    ### 45 degree labels
    text(bp, par("usr")[3], srt=45, adj = 1.1,
         labels =protLab, xpd = TRUE ,cex=protLabCex)
  }else{
    #axis(1, labels = TRUE,tick=T, cex.axis=cex.axis)
    #mtext("Protein",side =1, line=3, cex=cex.lab)
    h = hist(pepPerProt,breaks =  1:max(pepPerProt)
         #,cex.axis=cex.axis
         #,cex.lab=cex.lab
         ,xlab = "Unique Peptide Count Per Protein"
         ,col=col
         ,main=""
         ,...
         )
    axis(1, labels = TRUE,tick=T,...)

  }
}

#' plot ranking metric vs peptide (per protein) barplot
#' @param df data.frame
#' @param pepLenTrunc integer AFADAMEVIPSTLAENAGLNPISTVTELR -> AFADAMEVIPSTLAE..
#' @param pepLabCex default 0.7
#' @param rankingMetric character
#' @param ptmCol data.frame row.names: ptm col:col
#' @export
#' @note  No note
#' @references NA
#' @examples print("No examples")
barplotPeptidesPerProtein = function(df, pepLenTrunc=12,pepLabCex=0.7,rankingMetric="rankingMetric",ptmCol=getPTMColors(levels(df$ptm)),  ...){

  # avoid log(0) issue
  df$rankingMetric = df$rankingMetric +1

  # order by intensity decreasing L-R
  df = arrange(df,-rankingMetric)
  bp =  barplot(df$rankingMetric %>% log10
                , main = df$protein[1]
                , xaxt = "n"
                #, ylab="Adj. Fragment Int. Sum\n log10"
                ,ylab = paste0(rankingMetric,"\n log10")
                ,col=ptmCol[match(df$ptm,rownames(ptmCol)) ,]
                ,...
  )

  # truncate labels AFADAMEVIPSTLAENAGLNPISTVTELR -> AFADAMEVIPSTLAE..
  df$peptide %<>% as.character
  sel = nchar( df$peptide) > pepLenTrunc
  df$peptide[sel] = paste0(substr( df$peptide,1,pepLenTrunc), sep="..")[sel ]
  df$peptide = paste0(df$peptide,"/",df$precCharge)
  ### 35 degree labels
  axis(1, labels = FALSE,tick=F)
  text(bp , par("usr")[3], srt=35, adj = 1.1,
       labels =df$peptide, xpd = TRUE ,cex=pepLabCex)

}

#' assign colors to ptms
#' @param ptms vector
#' @return data.frame
#' @export
#' @note  No note
#' @references NA
#' @examples print("No examples")
getPTMColors = function(ptms){

  #set ptm colors
  ptmCol = data.frame(row.names=ptms)
  ptmCol$col = rep(NA,nrow(ptmCol))
  if("Unmodified" %in% rownames(ptmCol) ) ptmCol["Unmodified",] = 4
  ptmCol["Unmodified" != rownames(ptmCol), ] = c(1:3,5:100)[1:sum("Unmodified" != rownames(ptmCol))]

  return(ptmCol)

}
eahrne/SpectroX documentation built on May 21, 2019, 3:09 a.m.