#' Parse Proteome Discoverer output
#'
#' @description This function parses the output .txt files (peptide groups or PSMs) from
#' Proteome Discoverer and then filters out features based on various criteria.
#'
#' The function performs the following steps:
#'
#' 1. Remove features without a master protein
#' 2. (Optional) Remove features without a unique master protein (i.e.
#' Number.of.Protein.Groups == 1)
#' 3. (Optional) Remove features matching a cRAP protein
#' 4. (Optional) Remove features matching any protein associated with
#' a cRAP protein (see below)
#' 5. Remove features without quantification values (only if TMT or SILAC
#' are `TRUE` and `level = "peptide"`.)
#'
#' @details **Associated cRAP proteins** are proteins which have at least
#' one feature shared with a cRAP protein. It has been observed that the cRAP
#' database does not contain all possible cRAP proteins e.g. some features
#' can be assigned to a keratin which is not in the provided cRAP database.
#'
#' Using `filter_associated_crap = TRUE` will filter out f2 and f3 in
#' addition to f1, in the example below; regardless of the value in the
#' Master.Protein.Accession column.
#'
#' ```
#' feature Protein.Accessions Master.Protein.Accessions
#' f1 protein1, protein2, cRAP, protein1,
#' f2 protein1, protein3 protein3,
#' f3 protein2 protein2
#' ```
#'
#' @param data `data.frame` generated from txt file output from Proteome
#' Discoverer.
#' @param master_protein_col `string`. Name of column containing master
#' proteins.
#' @param protein_col `string`. Name of column containing all protein
#' matches.
#' @param unique_master `logical`. Filter out features without a unique
#' master protein.
#' @param silac `logical`. Is the experiment a SILAC experiment?
#' @param TMT `logical`. Is the experiment a TMT experiment?
#' @param level `string`. Type of input file, must be one of either
#' `"peptide"` or `"PSM"`.
#' @param filter_crap `logical`. Filter out features which match a cRAP
#' protein.
#' @param crap_proteins `character vector`. Contains the cRAP accessions,
#' for example: `c("P02768")` which is serum albumin.
#' @param filter_associated_crap `logical`. Filter out features which
#' match a cRAP associated protein.
#' @return Returns a `data.frame` with the filtered Proteome Discoverer output.
#' @examples
#' \dontrun{
#'
#' #### PSMs.txt example ####
#' # load PD PSMs.txt output
#' psm <- read.delim("data-raw/PSMs.txt")
#'
#' # load the cRAP FASTA used for the PD search
#' crap_fasta <- Biostrings::fasta.index(
#' "2021-06_CCP_cRAP.fasta", seqtype = "AA"
#' )
#'
#' # extract the UniProt accessions from the cRAP FASTA headers
#' crap_accessions <- regmatches(
#' crap_fasta$desc,
#' gregexpr("(?<=\\|).*?(?=\\|)", crap_fasta$desc, perl = TRUE)
#' ) %>%
#' unlist()
#'
#' # parse peptides from an e.g. TMT experiment
#' psm2 <- parse_features(
#' data = psm,
#' master_protein_col = "Master.Protein.Accessions",
#' protein_col = "Protein.Accessions",
#' unique_master = TRUE,
#' TMT = TRUE,
#' level = "PSM",
#' filter_crap = TRUE,
#' crap_proteins = crap_accessions,
#' filter_associated_crap = TRUE
#' )
#'
#' #### peptideGroups.txt example ####
#' # load PD peptideGroups.txt output
#' pep_group <- read.delim("data-raw/peptideGroups.txt")
#'
#' # load the cRAP FASTA used for the PD search
#' crap_fasta <- Biostrings::fasta.index(
#' "2021-06_CCP_cRAP.fasta", seqtype = "AA"
#' )
#'
#' # extract the UniProt accessions from the cRAP FASTA headers
#' crap_accessions <- regmatches(
# crap_fasta$desc,
# gregexpr("(?<=\\|).*?(?=\\|)", crap_fasta$desc, perl = TRUE)
# ) %>%
# unlist()
#'
#' # parse peptides from an e.g. SILAC experiment
#' pep_group2 <- parse_features(
#' data = pep_group,
#' master_protein_col = "Master.Protein.Accessions",
#' protein_col = "Protein.Accessions",
#' unique_master = TRUE,
#' silac = TRUE,
#' level = "peptide",
#' filter_crap = TRUE,
#' crap_proteins = crap_accessions,
#' filter_associated_crap = TRUE
#' )
#'
#' }
#' @export
parse_features <- function(data,
master_protein_col = "Master.Protein.Accessions",
protein_col = "Protein.Accessions",
unique_master = TRUE,
silac = FALSE,
TMT = FALSE,
level = "peptide",
filter_crap = TRUE,
crap_proteins = NULL,
filter_associated_crap = TRUE) {
# check arguments
stopifnot(level %in% c("PSM", "peptide"))
if (filter_crap) {
if (is.null(crap_proteins)) {
stop("must supply the crap_proteins argument to filter cRAP proteins")
}
}
# print input summary
message("Parsing features...")
message_parse(data, master_protein_col, "Input")
# remove crap proteins
if (filter_crap) {
message(sprintf(
"%s cRAP proteins supplied",
length(crap_proteins)
))
# identify associated crap proteins first if necessary
if (filter_associated_crap) {
associated_crap <- data %>%
filter(!!sym(master_protein_col) %in% crap_proteins |
grepl("cRAP", data[[protein_col]], ignore.case = FALSE)) %>%
pull(protein_col) %>%
strsplit("; ") %>%
unlist()
associated_crap <- associated_crap[!grepl("cRAP", associated_crap)]
message(sprintf(
"%s proteins identified as 'cRAP associated'",
length(associated_crap)
))
}
# then remove normal crap proteins
data <- data %>%
filter(!(!!sym(master_protein_col)) %in% crap_proteins &
!grepl("cRAP", data[[protein_col]], ignore.case = FALSE))
message_parse(data, master_protein_col, "cRAP features removed")
# then remove associated crap proteins if necessary
if (filter_associated_crap) {
if (length(associated_crap) > 0) {
# remove isoforms
associated_crap_no_isoform <- unique(sapply(strsplit(associated_crap, "-"), "[[", 1))
associated_crap_regex <- paste(associated_crap_no_isoform, collapse = "|")
data <- data[!grepl(associated_crap_regex, data[[protein_col]]), ]
message_parse(data, master_protein_col, "associated cRAP features removed")
}
}
}
# remove features without a master protein
if (any(is.na(data[[master_protein_col]])) |
any(data[[master_protein_col]] == '')) {
data <- data[!is.na(data[[master_protein_col]]), ]
data <- data[data[[master_protein_col]] != '', ]
message_parse(data, master_protein_col, "features without a master protein removed")
}
# remove features with non-unique master proteins
if (unique_master) {
data <- data[data[["Number.of.Protein.Groups"]] == 1, ]
message_parse(data, master_protein_col, "features with non-unique master proteins removed")
}
# remove features with quantification warnings if necessary
if (silac | TMT & level == "peptide") {
data <- data[(is.na(data[["Quan.Info"]]) | data[["Quan.Info"]] == ''), ]
message_parse(data, master_protein_col, "features without quantification removed")
}
data
}
#' Filter a PSM-level MSnSet to remove low quality PSMs
#'
#' @description Filter PSMs from TMT quantification to remove the following
#'
#' 1. Missing values (NA) for all tags
#' 2. Interference/co-isolation above a set value (default=100, e.g no filtering)
#' 3. Signal:noise ratio below a set value (default=0, e.g no filtering)
#'
#' @param obj `MSnSet` PSMs
#' @param inter_thresh `numeric` Maximum allowed interference/co-isolation
#' @param sn_thresh `numeric` Minimum allowed signal:noise threshold
#' @param master_protein_col `string`. Name of column containing master
#' proteins.
#' @param inter_col `string` Name of column containing the interference value
#' @param sn_col `string` Name of column containing the signal:noise value
#'
#' @return `MSnSet` with the filtered PSMs.
#' @export
filter_TMT_PSMs <- function(obj,
inter_thresh = 100,
sn_thresh = 0,
master_protein_col = 'Master.Protein.Accessions',
inter_col = 'Isolation.Interference.in.Percent',
sn_col = 'Average.Reporter.SN',
verbose = TRUE) {
if (verbose) message("Filtering PSMs...")
obj <- obj[rowSums(is.finite(exprs(obj))) > 0, ]
if (verbose) message_parse(fData(obj), master_protein_col, "No quant filtering")
obj <- obj[fData(obj)[[inter_col]] <= inter_thresh, ]
if (verbose) message_parse(fData(obj), master_protein_col, "Co-isolation filtering")
obj <- obj[fData(obj)[[sn_col]] >= sn_thresh, ]
if (verbose) message_parse(fData(obj), master_protein_col, "S:N ratio filtering")
obj
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.