#' THIS FUNCTION SHOULD ONLY BE USED FOR LEGACY FRAGPIPE DATASETS THAT PRODUCED 'mbr_ion.tsv' OUTPUT FILES (e.g. FragPipe v15)
#'
#' To generate output files that we here require in MS-DAP, configure FragPipe as follows:
#' - assign Experiment IDs in the workflow tab (you may simply set these all to 1)
#' - enable IonQuant ("Quant (MS1)" tab, as of FragPipe v15)
#' - optionally, enable match-between-runs
#'
#' This function will merge data from various FragPipe output files to construct a MS-DAP dataset by
#' parsing PSM hits from "psm.tsv" files, collecting IonQuant peptide abundance values
#' from "MSstats.csv" and "mbr_ion.tsv", and finally fetching protein-group information from "combined_protein.tsv".
#'
#' @param path the full file path to the fragpipe output directory
#' @param acquisition_mode the type of experiment, should be a string. options: "dda" or "dia"
#' @param confidence_threshold confidence score threshold at which a peptide is considered 'identified' (target value must be lesser than or equals)
#' @param collapse_peptide_by if multiple data points are available for a peptide in a sample, at what level should these be combined? options: "sequence_modified" (recommended default), "sequence_plain", ""
#' @importFrom stringi stri_sub_replace
#' @export
import_dataset_fragpipe_ionquant__legacy = function(path, acquisition_mode, confidence_threshold = 0.01, collapse_peptide_by = "sequence_modified") {
reset_log()
append_log("using the legacy function for importing FragPipe data; import_dataset_fragpipe_ionquant__legacy(). If you have a legacy dataset from e.g. FragPipe v15, this is fine. But for more recent FragPipe datasets (e.g. that have 'combined_peptide.tsv' files in FragPipe output) you should use import_dataset_fragpipe_ionquant() instead", type = "warning")
stopifnot(acquisition_mode %in% c("dda","dia"))
# input validation; locate expected input files
file_msstats = path_exists(path, "MSstats.csv", try_compressed = TRUE)
file_mbr = path_exists(path, "mbr_ion.tsv", try_compressed = TRUE, silent = T) # optional
file_proteins = path_exists(path, "combined_protein.tsv", try_compressed = TRUE)
# check for nested files, regex analogous to path_exists()
files_psm = dir(path, pattern = "^psm.tsv$", recursive = T, ignore.case = T, full.names = T)
files_psm_compress1 = dir(path, pattern = "^psm.tsv.(zip|gz|bz2|xz|7z|zst|lz4)$", recursive = T, ignore.case = T, full.names = T)
files_psm_compress2 = dir(path, pattern = "^psm.(zip|gz|bz2|xz|7z|zst|lz4)$", recursive = T, ignore.case = T, full.names = T)
# de-duplicate using the path (so we don't have to deal with extensions in this de-dupe check)
files_psm_dir = dirname(files_psm)
files_psm_compress1_dir = dirname(files_psm_compress1)
files_psm_compress2_dir = dirname(files_psm_compress2)
files_psm = c(files_psm, files_psm_compress1[! files_psm_compress1_dir %in% files_psm_dir])
files_psm = c(files_psm, files_psm_compress2[! files_psm_compress2_dir %in% files_psm_dir])
# finally we can check if we found all required files
if (file_msstats == "") {
append_log(sprintf('Cannot find file "MSstats.csv" at provided path "%s".\nPerhaps you forgot to enter experiment info in the FragPipe "Workflow" tab? That would prevent generation of MSstats.csv (as tested in FragPipe v15)', path), type = "error")
}
if (file_proteins == "") {
append_log(sprintf('Cannot find file "combined_protein.tsv" at provided path "%s"', path), type = "error")
}
if (length(files_psm) == 0) {
append_log(sprintf('no "psm.tsv" files found at provided path "%s"', path), type = "error")
}
###### 1) process MSstats table into a MS-DAP formatted peptide table
DT = read_textfile_compressed(file_msstats, as_table = T)
colnames(DT) = tolower(colnames(DT))
# input validation
col_missing = setdiff(c("proteinname","peptidesequence","precursorcharge","run","intensity"), colnames(DT))
if (length(col_missing) > 0) {
append_log(paste('Expected column names in MSstats table (not case sensitive): ProteinName, PeptideSequence, PrecursorCharge, ProductCharge, Run, Intensity.\nmissing:', paste(col_missing, collapse = ", ")), type = "error")
}
# efficient computation of plain sequences using data.table
DT[ , sequence_plain := gsub("(\\[[^]]*\\])|(\\([^)]*\\))", "", peptidesequence), by=peptidesequence]
# transform MSstats table into our standard format peptide tibble
tib_result = as_tibble(DT) %>% select(sample_id = run, protein_id = proteinname, sequence_modified = peptidesequence, sequence_plain, charge = precursorcharge, intensity) %>%
filter(is.finite(intensity) & intensity > 1) %>%
mutate(intensity = log2(intensity),
peptide_id = paste(sequence_modified, charge, sep="_"),
id_sample_peptide = paste(sample_id, peptide_id),
isdecoy = FALSE, detect = FALSE, confidence = NA, rt = NA, mz=NA, is_mbr = FALSE)
rm(DT)
###### 2) MBR retention times from mbr_ion.tsv (if available)
if(file.exists(file_mbr)) {
append_log(paste("Extract peptide retention time and identification confidence from:", file_mbr), type = "info")
# input validation
headers = unlist(strsplit( read_textfile_compressed(file_mbr, as_table = F, nrow = 1), "[\t,]+"))
col_missing = setdiff(c("probability","intensity","apex_rt","modified_peptide","charge","acceptor_run_name"), headers)
if (length(col_missing) > 0) {
append_log(paste('Expected column names in mbr_ion.tsv table: probability, intensity, apex_rt, modified_peptide, charge, acceptor_run_name.\nmissing:', paste(col_missing, collapse = ", ")), type = "error")
}
# just to double-check and be robust to future changes, select per sample*peptide the unique entry with highest confidence (currently these entries are unique so the `distinct` call is not strictly needed, but safety first and cba about minor performance loss)
tib_mbr = as_tibble(read_textfile_compressed(file_mbr, as_table = T, data.table = F)) %>%
filter(is.finite(intensity) & intensity > 1) %>%
arrange(desc(probability), desc(intensity)) %>%
select(mz, rt = apex_rt, sequence_modified = modified_peptide, charge, sample_id = acceptor_run_name) %>%
distinct(sample_id, sequence_modified, charge, .keep_all = T) %>%
mutate(id_sample_peptide = paste(sample_id, paste(sequence_modified, charge, sep="_")),
rt = rt / 60) # convert RT to minutes
# map back to main peptide table
i = data.table::chmatch(tib_result$id_sample_peptide, tib_mbr$id_sample_peptide)
tib_result$rt = tib_mbr$rt[i]
tib_result$mz = tib_mbr$mz[i]
tib_result$is_mbr = !is.na(i)
} else {
append_log(sprintf('no match-between-runs data found. If you configured MBR in FragPipe, be aware that this data was NOT imported into MS-DAP because the "mbr_ion.tsv" file could not be found at: "%s"', path), type = "warning")
}
###### 3) retention time and identification confidence from psm.tsv files
for(filename in files_psm) {
append_log(paste("Extract peptide retention time and identification confidence from:", filename), type = "info")
# efficiently read peptide table, simplified from generic code @ import_dataset_in_long_format()
attributes_required = list(sample_id = "Spectrum File",
sequence_plain = "Peptide",
# sequence_modified = "", # by default, init this column as plain sequences because 'modified peptide' column is useless (see comments downstream),
mz = "Calibrated Observed M/Z",
modifications = "Assigned Modifications",
charge = "Charge",
rt = "Retention",
confidence = "PeptideProphet Probability")
# basically this reads the CSV/TSV table from file and maps column names to expected names.
# (complicated) downstream code handles compressed files, efficient parsing of only the requested columns, etc.
DT = read_table_by_header_spec(filename, attributes_required, as_tibble_type = F)
### specifically for fragpipe; strip full path + "interact-" + ".pep.xml" extension
# example entry in input table; C:\DATA\fragpipe_test\interact-a05191.pep.xml
DT[ , sample_id := gsub("(.*(\\\\|/)interact\\-)|(^interact\\-)|(\\.pep\\.xml$)", "", sample_id, ignore.case=F), by=sample_id]
# init modified sequence like plain sequence
DT[ , sequence_modified := sequence_plain]
# add key for plainseq+mods
DT[ , ismod := modifications != ""]
DT[ ismod==T , id_plainseq_modifications := .GRP, by=.(sequence_plain, modifications)]
# report if there are any sample mis-matches
sample_missing = setdiff(unique(DT$sample_id), tib_result$sample_id)
if (length(sample_missing) > 0) {
append_log(paste('These samples from PSM data table are not in main MSstats.csv table;', paste(sample_missing, collapse = ", ")), type = "warning")
}
### repair modified sequences !
# By default, the format is different between psm.tsv and both (MSstats.csv and mbr_ion.tsv)
# Example
# MSstats.tsv: HQGVM[15.9949]VGM[15.9949]GQK
# psm.tsv: "Modified Peptide" = "HQGVM[147]VGM[147]GQK" and in another column "Assigned Modifications" = "5M(15.9949), 8M(15.9949)"
#
# more examples from MSstats.csv (denoting n-terminal mods and fixed modifications)
# n[42.0106]AYHSFLVEPISC[57.0215]HAWNK psm.tsv = n[43]AYHSFLVEPISCHAWNK note; fixed modifications seem absent from psm.tsv but not MSstats.csv
# KLM[15.9949]DLEC[57.0215]SRDGLM[15.9949]YEQYR
#
# print( unique(unlist(strsplit(DT$modifications, "[, ]+"))) )
# 1) unique set of peptide sequence + modifications (the set of strings to update).
# modified peptides will have multiple entries in psm.tsv (e.g. same peptide identified in multiple samples), so taking unique set is important for efficiency
modseq_update = as_tibble(DT) %>%
filter(ismod) %>% # skip unmodified
select(id_plainseq_modifications, sequence_plain, sequence_modified, modifications) %>%
distinct(sequence_plain, modifications, .keep_all = T)
if(nrow(modseq_update) > 0) {
# 2) from a modification in FragPipe to a standardized instruction for updating the plain sequence such that it matches modified sequences in MSstats.csv
tib_modification_instructions = fragpipe_modification_to_instruction__legacy(unique(modseq_update$modifications))
# debug; tib_modification_instructions %>% filter(grepl("N-", input, fixed = T))
# 3) string manipulations; translate every sequence + set of N modifications to an ordered list of substrings, then join them all (e.g. collapse with paste)
for(i in 1:nrow(modseq_update)) { # i=grep(", N-", modseq_update$modifications, fixed=T)[1]
tib = tib_modification_instructions %>% filter(input == modseq_update$modifications[i])
stringi::stri_sub_replace(str=modseq_update$sequence_modified[i], from=tib$site+1, to=tib$site, value=tib$insert)
for(j in 1:nrow(tib)) {
if(is.na(tib$site[j])) {
modseq_update$sequence_modified[i] = paste0(modseq_update$sequence_modified[i], tib$insert[j])
} else {
modseq_update$sequence_modified[i] = stringi::stri_sub_replace(str=modseq_update$sequence_modified[i], from=tib$site[j]+1, to=tib$site[j], value=tib$insert[j])
}
}
}
# 4) join with input table by `id_plainseq_modifications` key
DT$sequence_modified = modseq_update$sequence_modified[match(DT$id_plainseq_modifications, modseq_update$id_plainseq_modifications)] # have to map from DT because of n:1 mapping to unique modseq
DT[ is.na(sequence_modified) , sequence_modified := sequence_plain] # set plain sequence for rows without a 'modifications' entry
}
# 5) map back to overall peptide tibble and transfer the observed retention time and identification confidence
DT[ , id_sample_peptide := paste(sample_id, paste(sequence_modified, charge, sep="_")) ]
i = data.table::chmatch(DT$id_sample_peptide, tib_result$id_sample_peptide)
# remove unmapped entries
DT = DT[!is.na(i)]
i = na.omit(i)
# transfer data
tib_result$mz[i] = DT$mz
tib_result$rt[i] = DT$rt / 60 # convert RT to minutes
# FragPipe output has "PeptideProphet Probability", which is 0~1 ranged score with 1 = 100% confident.
# In this R package we use confidence score = qvalue (or pvalue). So as a band-aid solution we use 1-probability to approximate
tib_result$confidence[i] = 1 - DT$confidence
}
# define 'detect'
tib_result$detect = is.finite(tib_result$confidence) & tib_result$confidence <= confidence_threshold
###### 4) update protein identifiers, from protein table include the "Indistinguishable Proteins"
tib_prot = parse_fragpipe_proteins__legacy(file_proteins)
# map data back to main peptide table
i = data.table::chmatch(tib_result$protein_id, tib_prot$protein_asis)
if(any(is.na(i))) {
append_log(sprintf('input data consistency problem; not all proteins in column "ProteinName" in %s are found in column "Protein" in %s', basename(file_msstats), basename(file_proteins)), type = "error")
}
tib_result$protein_id = tib_prot$protein_id[i]
###### 5) finally, construct MS-DAP dataset object
### collapse peptides by plain or modified sequence (eg; peptide can be observed in some sample with and without modifications, at multiple charges, etc)
if(collapse_peptide_by == "") {
# if 'no collapse' is set, at least merge by modseq and charge. eg; there may be multiple peaks for some peptide with the same charge throughout retention time
tib_result = peptides_collapse_by_sequence(tib_result, prop_peptide = "peptide_id")
append_log("NOT collapsing peptides by plain/modified-sequence, thus 2 observations of the same sequence with different charge are considered a distinct peptide_id. This is not recommended for DDA!", type = "warning")
} else {
tib_result = peptides_collapse_by_sequence(tib_result, prop_peptide = collapse_peptide_by) # alternative, collapse modified sequences; prop_peptide = "sequence_modified"
}
log_peptide_tibble_pep_prot_counts(tib_result)
return(list(peptides=tibble_peptides_reorder(tib_result), proteins=empty_protein_tibble(tib_result), plots=list(), acquisition_mode = acquisition_mode))
}
#' THIS FUNCTION SHOULD ONLY BE USED FOR LEGACY FRAGPIPE DATASETS; HELPER FOR EXTRACTING PROTEIN MODIFICATIONS
#'
#' @param modifications characer vector
fragpipe_modification_to_instruction__legacy = function(modifications) {
if(length(modifications) == 0) {
return(tibble())
}
tibble(input = modifications) %>%
mutate(mod = gsub("(", "[", gsub(")", "]", toupper(input), fixed = T), fixed = T), # replace brackets
modlist = stringr::str_split(mod, "[, ]+")) %>%
unnest(modlist) %>%
mutate(
# n-term and c-term modifications have a different notation, set their string position to begin/end of string
modlist = sub("N-TERM", "0n", modlist, fixed = T),
modlist = sub("C-TERM", "NAc", modlist, fixed = T),
# now we can split the modification into a string that should be inserted and some position
insert = sub("^(NA|\\d+)[A-Z]{0,1}([a-z]{0,1}\\[.*)$", "\\2", modlist),
site = sub("^(NA|\\d+)[A-Z]{0,1}([a-z]{0,1}\\[.*)$", "\\1", modlist),
site = suppressWarnings(as.numeric(site))
) %>%
# make sure the largest string index/position is modified first (thereby retaining relative string positions)
arrange(desc(site))
}
#' THIS FUNCTION SHOULD ONLY BE USED FOR LEGACY FRAGPIPE DATASETS; PARSER FOR PROTEIN TABLES
#'
#' parse a proteins.tsv or combined_protein.tsv from fragpipe output and return
#' a well formatted protein_id column that includes both "leading protein ID"
#' and "Indistinguishable Proteins"
#'
#' @param file_proteins full file path
parse_fragpipe_proteins__legacy = function(file_proteins) {
tib_prot = as_tibble(read_textfile_compressed(file_proteins, as_table = T))
# input validation
col_missing = setdiff(c("Protein","Protein ID","Indistinguishable Proteins"), colnames(tib_prot))
if (length(col_missing) > 0) {
append_log(sprintf('Expected column names in %s table: Protein, Protein ID, Indistinguishable Proteins. Missing: %s', file_proteins, paste(col_missing, collapse = ", ")), type = "error")
}
tib_prot %>%
# rename columns (lowercase and no spaces) and subset columns
select(protein_asis = Protein, protein_id_asis = `Protein ID`, protein_indistinguishable = `Indistinguishable Proteins`) %>%
# split 1:n protein identifiers in the `Protein ID` column
mutate(protein_list = strsplit(protein_id_asis, "[, ;]+")) %>%
# split 0:n protein identifiers in the `Indistinguishable Proteins` column
mutate(protein_indistinguishable_list = strsplit(protein_indistinguishable, "[, ;]+")) %>%
# unpack/unnest everything to long-format
unnest(cols = protein_list, keep_empty=T) %>%
unnest(cols = protein_indistinguishable_list, keep_empty=T) %>%
# now we can apply vectorized functions; enforce short protein IDs
mutate(protein_list = fasta_id_short(protein_list),
protein_indistinguishable_list = fasta_id_short(protein_indistinguishable_list)) %>%
# collapse everything again at the level of original keys
group_by(protein_asis, protein_id_asis) %>%
# note; relatively slow
summarise(protein_id = paste(na.omit(unique(c(protein_list, protein_indistinguishable_list))), sep=";", collapse=";")) %>%
ungroup()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.