R/get_peptide_coverage.R

Defines functions get_peptide_coverage

Documented in get_peptide_coverage

#' Creates the "Peptide Coverage" Object
#'
#' @description Returns an object with two data tables used to visualize where
#'    identified peptides map to a specific literature sequence. These data tables
#'    include one that contains sequences with start and stop positions, and the
#'    other contains counts of identified residues.
#'
#' @param ScanMetadata Object of the scan_metadata class from get_scan_metadata. Required.
#' @param ProteinTable A "protein_table" object generated by get_protein_table. Required.
#' @param ProteinID The ID of the protein to pull protein coverage information from. Required.
#'
#' @details
#' The two outputted data tables are used by the three coverage plots.
#'
#' The first is called "PeptidesByPosition" and is used by Coverage Plot and Coverage Lit Seq Plot.
#' \tabular{ll}{
#' Scan Number \tab MS Scan Number from ScanMetadata \cr
#' \tab \cr
#' Sequence \tab Peptide or protein sequence of the fragment or literature sequence. \cr
#' \tab \cr
#' Peptide Start Position \tab A single number to indicate where the fragment begins on the literature sequence (from N to C) \cr
#' \tab \cr
#' Q Value \tab The Q value (adjusted P value) of the peptide-spectral match provided by a database search tool. \cr
#' \tab \cr
#' Score \tab The score of the peptide-spectral match (typically a small value where smaller = better match) \cr
#' \tab \cr
#' }
#'
#' The second is called "ResidueCount" and is used by coverage bar plot.
#' \tabular{ll}{
#' Residue \tab The residue from the literature sequence written in letter annotation and position format. i.e. "M1" is the first methionine. \cr
#' \tab \cr
#' Count \tab The number of times that residue was identified across all fragments. \cr
#' \tab \cr
#' Position \tab The residue's position on the literature sequence \cr
#' \tab \cr
#' }
#'
#' @examples
#' \dontrun{
#'
#' # Make Peptide Coverage Object
#' PeptideCoverage <- get_peptide_coverage(ScanMetadata = BU_ScanMetadata, ProteinTable = ProteinTable1, ProteinID = "SO_0225")
#'
#' }
#' @export
get_peptide_coverage <- function(ScanMetadata,
                                 ProteinTable,
                                 ProteinID) {

  ##################
  ## CHECK INPUTS ##
  ##################

  # Assert that ScanMetadata is a ScanMetadata object.
  if ("scan_metadata" %in% class(ScanMetadata) == FALSE) {
    stop("ScanMetadata must be a scan_metadata object generated by get_scan_metadata.")
  }

  # Assert that ProteinTable is a "protein_table" object
  if ("protein_table" %in% class(ProteinTable) == FALSE) {
    stop("ProteinTable must be a protein_table object generated by get_protein_table.")
  }

  # Assert that Protein is a single string
  if (is.character(ProteinID) == FALSE || length(ProteinID) != 1) {
    stop("Protein must be a single string.")
  }

  # Assert that Protein is in the the ProteinTable
  if (ProteinID %in% ProteinTable$Protein == FALSE) {
    stop(paste0("Protein must be in the ProteinTable. ", ProteinID, " was not found."))
  }

  ##########################
  ## GENERATE DATA TABLES ##
  ##########################

  # Get the reference sequence
  RefSeq <- attr(ProteinTable, "pspecter")$LitSeq %>%
    dplyr::filter(Protein == ProteinID) %>%
    dplyr::select(`Literature Sequence`) %>%
    unlist()

  # Get the Q Value Maximum. If it's 1, set the filter to NULL.
  QValueMaximum <- if (attr(ProteinTable, "pspecter")$QValueMaximum == 1) {
    NULL
  } else {
    attr(ProteinTable, "pspecter")$QValueMaximum
  }

  # Get the Score Maximum. If it's 1, set the filter to NULL.
  ScoreMaximum <- if (attr(ProteinTable, "pspecter")$ScoreMaximum == 1) {
    NULL
  } else {
    attr(ProteinTable, "pspecter")$ScoreMaximum
  }

  ## Data Table 1: Peptides by Position-----------------------------------------

  # Pull scan number, sequence, and start/end position per Protein ID. Pull the
  # fragment table and order it so that subset works correctly
  FragmentTable <- ScanMetadata
  class(FragmentTable) <- c("data.table", "data.frame")
  FragmentTable <- FragmentTable[FragmentTable$`Protein ID` == ProteinID & is.na(FragmentTable$`Protein ID`) == FALSE,]

  # Get Peptides by Position
  PeptidesByPosition <- FragmentTable %>%
    dplyr::select(c(`Scan Number`, Sequence, `Peptide Start Position`, `Q Value`, Score)) %>%
    dplyr::mutate(
      "Peptide End Position" = `Peptide Start Position` + nchar(Sequence)
    )

  # Determine whether filters can be applied given a Q Value and a Score
  if (is.na(PeptidesByPosition$`Q Value`) %>% any()) {QValueMaximum <- NULL}
  if (is.na(PeptidesByPosition$Score) %>% any()) {ScoreMaximum <- NULL}

  # Apply filters that are not NULL
  if (is.null(QValueMaximum) == FALSE) {
    PeptidesByPosition <- PeptidesByPosition %>% dplyr::filter(`Q Value` <= QValueMaximum)
  }

  if (is.null(ScoreMaximum) == FALSE) {
    PeptidesByPosition <- PeptidesByPosition %>% dplyr::filter(Score <= ScoreMaximum)
  }

  # Add the reference sequence
  PeptidesByPosition <- rbind(PeptidesByPosition,
    data.table::data.table("Scan Number" = 0, "Sequence" = RefSeq,
                           "Peptide Start Position" = 0,  "Q Value" = NA,
                           "Score" = NA, "Peptide End Position" = nchar(RefSeq)))

  ## Data Table 2: Generate the counts of identified residues-------------------

  # Pull Sequence and Peptide Start Position information
  Residues <- PeptidesByPosition %>%
    dplyr::filter(`Scan Number` != 0) %>%
    dplyr::select(c(Sequence, `Peptide Start Position`))

  # Get positional number of each residue
  ResidueCount <- lapply(1:nrow(Residues), function(row) {

    # Split the sequence
    SplitSeq <- Residues[row, "Sequence"] %>% unlist() %>% strsplit("") %>% unlist()

    # Create positional vector
    Position <- (Residues[row, "Peptide Start Position"] %>% unlist()) + 0:(length(SplitSeq)-1)

    return(paste0(SplitSeq, Position))

  }) %>%
    unlist() %>%
    table(dnn = "Residue") %>%
    data.frame() %>%
    dplyr::mutate(
      "Position" = gsub("[[:alpha:]]", "", Residue) %>% as.numeric()
    ) %>%
    dplyr::arrange(Position) %>%
    data.table::data.table()

  # Rename the second column "count"
  colnames(ResidueCount)[2] <- "Count"

  #######################
  ## RETURN THE OBJECT ##
  #######################

  # Store results as two data tables
  res <- list(
    "PeptidesByPosition" = PeptidesByPosition,
    "ResidueCount" = ResidueCount
  )

  # Add attributes
  attr(res, "pspecter")$QValueMaximum <- attr(ProteinTable, "pspecter")$QValueMaximum
  attr(res, "pspecter")$ScoreMaximum <- attr(ProteinTable, "pspecter")$ScoreMaximum
  attr(res, "pspecter")$RemoveContaminants <- attr(ProteinTable, "pspecter")$RemoveContaminants
  attr(res, "pspecter")$ProteinID <- ProteinID

  # Store the class information of this object
  class(res) = "peptide_coverage"

  # Return results
  return(res)

}
EMSL-Computing/pspecterlib documentation built on Jan. 28, 2024, 8:13 p.m.