posprot: Positional Proteomics in R

# helper functions ---------------------------------------------------------------------------

## strip modified aa sequences
simple.mods <- function(x){
  y <- gsub(pattern = "\\.0+\\]", x = x, replacement = "]")
  return(y)
}

strip.ends <- function(x){
  x <- as.character(x)
  y <- gsub(pattern = "^_|_$", x = x, replacement = "")
  return(y)
}

strip.mods <- function(x){
  any.mod <- "\\[(\\+|-)\\d+\\.?\\d+\\]"
  y <- gsub(pattern = any.mod, x = x, replacement = "")
  return(y)
}

## list-of-list conversion to df
sec.split <- function(x){
  y <- plyr::ldply(.data = x, .fun = function(x){data.frame(prot = x$Text, offset=x$Offset)})
  return(y)
}

as.df <- function(x){
  y <- plyr::ldply(.data = x, .fun = sec.split, .id = "s.pep")
  return(y)
}

## translates a modified aa sequence into a case
translate <- function(x){
  dplyr::case_when(
    grepl("^.\\[\\+28\\.?0*\\]", x) == TRUE ~ "label",
    grepl("^.\\[\\+42\\.?0*\\]", x) == TRUE ~ "acetylated",
    TRUE ~ "free")
}


#' Add positional annotation to MSstats group comparison result
#'
#' @param name file name (character) of a fasta file in working directory
#' @param MSstats.GC MSstats group comparison result as generated by groupComparison()
#' @param label Label (character) of the comparision
#'
#' @usage annotate_gcr(name, MSstats.GC, label)
#'
#' @description This function takes a peptide-level group comparison result as output by
#' MSstats::groupComparison and adds positional annotation to each peptide feature.
#'
#' @details This function maps all peptide features to a reference proteome using their stripped aa sequence and perfect string matching.
#' In addition, it determines the N-terminal modification status of each feature.
#'
#' Important prerequisites:
#'
#' This functions expects the results of a MSstats peptide-level significance analysis as input,
#' since positional proteomics is a completely peptide centric approach.
#'
#' Peptide-level significance analysis can be enforced easily by copying the content of PeptideSequence
#' to ProteinName prior to calling MSstats::dataProcess().
#'
#' @return This function returns a table (data.frame) in long format. For none proteotypic peptides this function returns multiple rows, each listing the peptide in a
#' different positional context. This ensures that all possible peptide origins can be considered in
#' the downstream analysis.
#'
#' @export
#'
#' @examples annotate_gcr("uniprot-proteome_UP000000589.fasta", sample.data, "control vs. GluC")
annotate_gcr <- function(fasta.file, MSstats.GC, label){

  stopifnot(is.character(fasta.file), is.MSstats_gcr(MSstats.GC), is.character(label))

  if (fasta.file %in% list.files()) {
    message(paste("Reading protein sequences from:", fasta.file))
    try({
      proteome <- Biostrings::readAAStringSet(filepath = fasta.file)
      proteins <- BiocGenerics::sapply(proteome, toString)
      message(paste("Imported", length(proteins), "protein entries found in fasta file."))
    })
  } else {
    stop("Fasta file does not exist in current working directory.")
  }

  gCR <- get.MSstats_gcr(MSstats.GC, label)
  message(paste("Annotating", nrow(gCR), "peptide features found for comparison:", label))
  #message(paste("N-term label RegEx:", nterm.label))
  #message(paste("N-term acetylation RegEx", nterm.acetylated))
  message("step 1 - Stripping peptide sequences.")
  gCR <- plyr::mutate(gCR, s.pep = strip.mods(strip.ends(Protein)), m.pep = strip.ends(Protein))
  message("step 2 - Locating stripped peptides using AhoCorasickSearch.")
  ll <- AhoCorasickTrie::AhoCorasickSearch(keywords = gCR$s.pep, text = proteins, alphabet = "aminoacid", groupByKeyword = TRUE)
  message("step 3 - Converting search results to data frame.")
  pos.anno <- as.df(ll)
  message("step 4 - Adding sequence context to peptide features.")
  pos.anno <- plyr::mutate(pos.anno,
                     prot.seq = BiocGenerics::sapply(proteome[prot], toString),
                     first.aa = substr(prot.seq, start = offset, stop = offset),
                     last.aa = substr(prot.seq, start = offset+nchar(as.character(s.pep))-1, stop = offset+nchar(as.character(s.pep))-1),
                     up.aa = substr(prot.seq, start = offset-1, stop = offset-1),
                     down.aa = substr(prot.seq, start = offset+nchar(as.character(s.pep)), stop = offset+nchar(as.character(s.pep))))
  message("step 5 - Matching N-terminal peptide modification patterns")
  gCR <- dplyr::mutate(gCR, nterm.label = translate(m.pep))
  message("step 6 - Joining positional annotation with comparision result.")
  pos.anno$s.pep <- as.character(pos.anno$s.pep)
  annotated.results <- dplyr::left_join(x = gCR, y = pos.anno, by = "s.pep")
  message("Done!")
  message(paste("Returning", nrow(annotated.results), "positionally annotated peptides."))
  return(annotated.results)
}