R/make_pep_table.R
In parsemsf: Parse ThermoFisher MSF Files and Estimate Protein Abundances

Documented in make_pep_table

#' Make a data frame of peptides
#'
#' @description Extracts amino acid sequences (without post-translational modifications), assigned protein groups, and quality scores.
#' @param msf_file A file path to a ThermoFisher MSF file.
#' @param min_conf "High", "Medium", or "Low". The minimum peptide confidence level to retrieve from MSF file.
#' @param prot_regex Regular expression where the first group matches a protein name or ID from the protein description. Regex must contain ONE group. The protein description is typically generated from a fasta reference file that was used for the database search.
#' @param collapse If TRUE, peptides that match to multiple protein sequences are collapsed into a single row with multiple protein descriptions and protein IDs in the \code{Proteins} and \code{ProteinID} columns separated by semi-colons (";").
#'
#' @return A data frame of all peptides above the confidence cut-off from a ThermoFisher MSF file.
#'
#' \item{peptide_id}{a unique peptide ID}
#' \item{spectrum_id}{a unique spectrum ID}
#' \item{protein_id}{unique protein group ID to which this peptide maps}
#' \item{protein_desc}{protein description from reference database used to assign peptides to protein groups, parsed according to \code{prot_regex}}
#' \item{sequence}{amino acid sequence (does not show post-translational modifications)}
#' \item{pep_score}{PEP score}
#' \item{q_value}{Q-value score}
#'
#' @export
#'
#' @examples
#' # Read from a path
#'
#' make_pep_table(parsemsf_example("test_db.msf"))
#'
#' # Retrieve full protein description
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "")
#'
#' # ...which is also equivalent to...
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "^(.+)$")
#'
make_pep_table <- function(msf_file,
                           min_conf = "High",
                           prot_regex = "^>([a-zA-Z0-9._]+)\\b",
                           collapse = TRUE) {

  confidence <- switch(min_conf,
                       High = 3,
                       Medium = 2,
                       Low = 1,
                       3)

  if (prot_regex == "") {
    prot_regex = "^(.+)$"
  }

  # Access MSF database file
  # my_db <- dplyr::src_sqlite(msf_file)
  my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)

  # First check file version (only Proteome Discoverer 1.4.x is supported)
  schema <- tbl(my_db, "SchemaInfo") %>%
    select_(~SoftwareVersion) %>%
    collect()
  if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) {
    warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.")
  }

  # This table maps PeptideIDs to ProteinIDs
  PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>%
    select_(~PeptideID, ~ProteinID)

  # Here are the actual peptide sequences with corresponding SpectrumIDs
  Peptides <- tbl(my_db, "Peptides") %>%
    select_(~PeptideID, ~SpectrumID, ~ConfidenceLevel, ~Sequence) %>%
    filter_(~ ConfidenceLevel >= confidence)

  # Protein descriptions are contained in this table
  ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>%
    select_(~ProteinID, ~Description)

  # Lazy-eval formula for protein name matching
  prot_match <- lazyeval::interp(~str_match(Description, prot_regex)[,2], prot_regex = prot_regex)

  # Build a peptide table
  pep_table <- inner_join(PeptidesProteins, Peptides, by = c("PeptideID" = "PeptideID")) %>%
    inner_join(ProteinAnnotations, by = "ProteinID") %>%
    collect(n = Inf) %>%
    # Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space
    mutate_(.dots = setNames(list(prot_match), c("Proteins"))) %>%
    group_by_(~PeptideID, ~SpectrumID, ~Sequence)
  # Collapse peptides that map to multiple proteins into a single row
  if (collapse == TRUE) {
    pep_table <- pep_table %>%
      summarize_(.dots = setNames(list(~paste(Proteins, sep = "", collapse = "; "),
                                       ~paste(ProteinID, sep = "", collapse = "; ")),
                                  c("Proteins",
                                    "ProteinID")))
  }
  pep_table <- pep_table %>% select_(~PeptideID, ~SpectrumID, ~Sequence, ~Proteins, ~ProteinID)
  # Append custom fields
  CustomFields <- tbl(my_db, "CustomDataFields")

  # Grab custom peptide data using SQL because of automatic sqlite typing
  # This assumes that FieldValue will always be a number
  CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides"))

  # Spread custom fields as separate columns
  custom_data <- left_join(CustomPeptides, CustomFields, by = "FieldID") %>%
    select_(~PeptideID, ~DisplayName, ~FieldValue) %>%
    collect(n = Inf) %>%
    spread_("DisplayName", "FieldValue")

  # Join to peptide table
  pep_table <- left_join(pep_table, custom_data, by = "PeptideID")

  old_names <- list(~PeptideID,
                    ~SpectrumID,
                    ~ProteinID,
                    ~Proteins,
                    ~Sequence,
                    ~PEP,
                    ~`q-Value`)
  new_names <- c("peptide_id",
                 "spectrum_id",
                 "protein_id",
                 "protein_desc",
                 "sequence",
                 "pep_score",
                 "q_value")

  # Rename columns for a more consistent column name style
  pep_table <- select_(pep_table, .dots = setNames(old_names, new_names))

  DBI::dbDisconnect(my_db)

  return(pep_table)

}