Nothing
#'
#' Class to read an mzTab file and store the tables internally.
#'
#' The 'sections' field is initialized after $readMzTab was called.
#' The 'fn_map' fields should be initialized via ...$fn_map$readMappingFile(...) manually if user-defined filename mappings are desired
#' and is automatically updated/queried when $readMzTab is called.
#'
#' @field sections MzTab sections as list. Valid list entries are: "MTD", "PRT", "PEP", "PSM", "SML", "filename" and "comments"
#' @field fn_map FilenameMapper which can translate raw filenames into something shorter
#'
#'
#'
MzTabReader = setRefClass("MzTabReader",
fields = list(sections = "list",
fn_map = "FilenameMapper"
),
methods = list(
initialize=function() {
.self$sections = list();
.self$fn_map = FilenameMapper$new();
return(.self)
},
#'
readMzTab = function(.self, mztab_file) {
"Read an mzTab file into a list of 5 data.frames (one df per mzTab section).
Data.frames in the resulting list are named as follows:
'MTD', 'PRT', 'PEP', 'PSM', 'SML',.
Additionally, 'filename' and 'comments' are valid list elements.
"
cat("Reading mzTab '", mztab_file, "' ...", sep = "")
## this implementation is derived from with minor modifications
## https://github.com/lgatto/MSnbase/blob/master/R/MzTab.R
f_con = file(mztab_file, open = "r") ## for better error messages
lines = readLines(f_con)
close(f_con)
# remove empty lines
lines = lines[ nzchar(lines) ]
## Split on the first two characters (so headers stay in
## the same group as table content rows)
lineType = substring(lines, 1, 2)
## Could be stricter in the type checking to check that all
## three of the first characters match the 10 allowed types
## but since it doesn't affect parsing, I don't think it's
## worth bothering.
allowed_types = c("CO", "MT", "PR", "PE", "PS", "SM")
stopifnot(all(lineType %in% allowed_types))
linesByType = split(lines, lineType)
## Comments are easy: just strip the first four characters
## from each line.
comments = substring(linesByType[["CO"]], 5)
## Parse the other five blocks in a loop, then fix up
## metadata afterwards
res = setNames(
lapply(
linesByType[c("MT", "PR", "PE", "PS", "SM")],
function(x) {
if (length(x) == 0) return(data.frame())
## MTD section has no header...
if (startsWith(x[1], "MTD")) {
d = read.delim(text = x,
header = FALSE,
col.names = c("MTD", "key", "value"),
na.strings = c("", "null"),
stringsAsFactors = FALSE,
fill = FALSE)
} else {
d = read.delim(text = x,
header = TRUE,
na.strings = c("", "null", "not mapped"),
stringsAsFactors = FALSE,
fill = FALSE)
colnames(d) = make.names(colnames(d), allow_ = FALSE)
}
return(d[,-1])
}),
c("MTD", "PRT", "PEP", "PSM", "SML"))
## rewrite MetaData as named vector
#res[["MTD"]] = setNames(res[["MTD"]][,2], res[["MTD"]][, 1])
## create Raw filename mapping internally
mtd = res[["MTD"]]
## in addition to the correct 'ms_run[xx]-location' we also accept 'ms_run[xx]_location' (ProteomeDiscoverer2.2)
idx_run = grep("^ms_run\\[\\d*\\][_-]location", mtd$key, value = FALSE)
raw_filenames = mtd$value[idx_run]
ms_runs = gsub("([.]*)[_-]location", "\\1", mtd$key[idx_run])
.self$fn_map$getShortNames(raw_filenames, ms_runs = ms_runs)
res[["filename"]] = mztab_file
res[["comments"]] = comments
.self$sections = res
return (NULL)
},
getParameters = function()
{
"Converts internal mzTab metadata section to a two column key-value data.frame similar to MaxQuants parameters.txt."
# copy the whole MTD for now, since R likes shallow copies and we are about to rename columns by reference (see ?setnames)
res = data.table::copy(.self$sections[["MTD"]])
if (!is.na(unique(.self$sections$PSM$database))) {
res = rbind(res, data.frame(key= "fasta file", value = paste(basename(unique(.self$sections$PSM$database)), collapse=";")))
}
else {
res = rbind(res, data.frame(key= "fasta file", value = "NULL"))
}
res = res[grep("^custom", res$key, invert = TRUE),]
res[is.na(res)] = "NULL" # temp workaround
## todo: remove at some point, since it forces us to use `::copy`
renameColumns(res, list(key = "parameter"))
return (res)
},
getSummary = function()
{
"Converts internal mzTab metadata section to a two data.frame with columns 'fc.raw.file', 'ms.ms.identified....'
similar to MaxQuants summary.txt."
res = .self$fn_map$getRawfm()[ , c("from", "to")]
colnames(res) = c("raw.file", "fc.raw.file")
## read all custom entries
mtd_custom_df = .self$sections$MTD[grep("^custom", .self$sections$MTD$key), ]
if (nrow(mtd_custom_df) == 0) return(NULL)
## ... and subselect the ms2-ID-Rate
ms2_df = mtd_custom_df[grep("^\\[MS2 identification rate", mtd_custom_df$value), ]
res$ms.ms.identified.... = unlist(lapply(gsub(".*, ?(\\d*\\.\\d*)\\]", "\\1", ms2_df$value), as.numeric))
## read TIC
tic_df = mtd_custom_df[grep("total ion current", mtd_custom_df$value),]
if (nrow(tic_df) > 0) res$TIC = lapply(strsplit(sub(".* \\[(.*)\\]]", "\\1", tic_df$value), ","), as.numeric)
return (res)
},
## MaxQuant-like representation of PRT table, i.e. augmented this with more columns (or renamed) if a metric requires it
getProteins = function()
{
"Basically the PRT table ..."
res = .self$sections$PRT
return ( res )
},
## MaxQuant-like representation of PEP table, i.e. augmented this with more columns (or renamed) if a metric requires it
getEvidence = function()
{
"Basically the PSM table and additionally columns named 'raw.file' and 'fc.raw.file'."
res = data.table::as.data.table(.self$sections$PSM)
## remove empty PepIDs
## ... unidentfied MS2 scans (NA) or with no ConsensusFeature group (-1), i.e. unassigned PepIDs
if (all(c("opt.global.cf.id") %in% colnames(res))) {
res = res[!(is.na(res$opt.global.cf.id) | (res$opt.global.cf.id == -1)),]
stopifnot(min(res$opt.global.cf.id) >= 0) ## would stop on NA as well
}
## augment with fc.raw.file
## The 'spectra_ref' looks like 'ms_run[x]:index=y|ms_run'
res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref))
stopifnot(all(!is.na(res$fc.raw.file))) # Spectra-Ref in PSM table not set for all entries
res$retention.time.calibration = NA
if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res)))
{
renameColumns(res, list( retention.time = "retention.time.pep",
opt.global.rt.raw = "retention.time",
opt.global.rt.align = "calibrated.retention.time"
))
res$retention.time.calibration = res$calibrated.retention.time - res$retention.time
}
renameColumns(res, list( opt.global.calibrated.mz.error.ppm = "mass.error..ppm.",
opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.",
exp.mass.to.charge = "m.z",
opt.global.mass = "mass",
opt.global.identified = "identified",
opt.global.ScanEventNumber = "scan.event.number",
PSM.ID = "id",
opt.global.modified.sequence = "modified.sequence",
opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence",
opt.global.is.contaminant = "contaminant",
opt.global.fragment.mass.error.da = "mass.deviations..da.",
opt.global.cv.MS.1002217.decoy.peptide = "reverse"
))
if (!"modified.sequence" %in% colnames(res)){
res$modified.sequence = res$sequence
warning("modified.sequence is not present in input data, metrics use sequence instead", immediate. = TRUE)
}
if(!"contaminant" %in% colnames(res)){
res$contaminant = 0
}
# set contaminant to TRUE/FALSE
res$contaminant = (res$contaminant > 0)
## optional in MzTab (depending on which FeatureFinder was used)
if ("opt.global.FWHM" %in% colnames(res)){
renameColumns(res, list(opt.global.FWHM = "retention.length"))
}
## de-duplicate protein-accession entries
accessions = res[, .(l_accession = list(accession)), by = id]$l_accession
res = unique(res, by = "id")
res$proteins = unlist(lapply(accessions, paste, collapse = ";"))
## todo: these are protein names not IDs, but the code does not care (its not very clean though)
res$protein.group.ids = res$proteins
## annotate ms.ms.count for identical sequences per rawfile, but only the first member of the group;
## all others get NA to prevent double counting
res[, ms.ms.count := c(.N, rep(NA, .N-1)), by = list(raw.file, modified.sequence, charge)]
## convert values from seconds to minutes for all RT columns
RTUnitCorrection(res)
##
## intensity from PEP to PSM: only labelfree ('opt.global.cf.id' links all PSMs belonging to a ConsensusFeature)
##
df_pep = data.frame()
if ("opt.global.cf.id" %in% colnames(res)){
df_pep = data.table::as.data.table(.self$sections$PEP)[!is.na(sequence), ]
renameColumns(df_pep, list(opt.global.modified.sequence = "modified.sequence",
opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence"))
## add raw.file...
df_pep = cbind(df_pep, .self$fn_map$specrefToRawfile(df_pep$spectra.ref))
## .. a unique index
df_pep$idx = 1:nrow(df_pep)
## map from PSM -> PEP row
## ... do NOT use spectra.ref since this is ambiguous (IDMapper duplicates MS2 PepIDs to multiple features)
if (!("opt.global.cf.id" %in% colnames(df_pep)))
{
stop("Please re-run the TOPP QualityControl tool to generate an updated version of OpenMS mzTab. Your mzTab is missing some information.")
}
#old: res$pep_idx = match(res$opt.global.feature.id, df_pep$opt.global.feature.id, nomatch = NA_integer_)
res$pep_idx = match(res$opt.global.cf.id, df_pep$opt.global.cf.id, nomatch = NA_integer_)
res$ms_run_number = as.numeric(gsub("^ms_run\\[(\\d*)\\].*", "\\1", res$ms_run))
col_abd_df_pep = grepv( "^peptide.abundance.study.variable.", names(df_pep))
col_RT_df_pep = grepv( "^opt.global.retention.time.study.variable", names(df_pep))
## transposed matrix for all abundances (rows = study; cols = pep_idx)
## .. this is a significant speedup compared to indexing into .SD[,] in subqueries, since that requires unlist()
m_pep_abd = t(df_pep[, ..col_abd_df_pep])
m_pep_rt = t(df_pep[, ..col_RT_df_pep])
N.studies = length(col_RT_df_pep)
stopifnot(N.studies == length(col_abd_df_pep))
}
NA_duplicates = function(vec_abd, idx) {
## replaces duplicate indices into the same ms_run intensities with NA
## (to avoid counting a feature more than once due to oversampled PSMs from one run assigned to a CF)
r = vec_abd[idx]
r[duplicated(idx)] = NA
return(r)
}
if (all(c("opt.global.cf.id") %in% colnames(res))) {
## assign intensity to genuine PSMs
res$intensity = NA_real_ ## unassigned PSMs have no MS1 intensity
res[,
intensity := NA_duplicates(m_pep_abd[, .SD$pep_idx[1]],
.SD$ms_run_number),
by = "opt.global.cf.id"]
summary(res$intensity)
}
res$is.transferred = FALSE
res$type = "MULTI-MSMS"
#set reverse to needed values
if ("reverse" %in% colnames(res)){
res$reverse=(res$reverse=="decoy")
}
## remove the data.table info, since metrics will break due to different syntax
class(res) = "data.frame"
##
## Infer MBR
## --> find all subfeatures in a CF with abundance but missing PSM --> create as MBR-dummy-PSMs
##
## : df_pep: the PEP table (with some extras)
## : res: the PSM table (with some extras)
## res$pep_idx: row in df_pep (consensusFeature) to which this PSM was assigned
res_tf = res[NULL,]
stopifnot(ncol(res_tf) > 0)
## skip if MS2 isotope labeling detected
quant_methods = grep("quantification_method", .self$sections$MTD$key)
if (!any(grepl("PRIDE_0000317", .self$sections$MTD$value[quant_methods])) & (!plyr::empty(df_pep)))
{
## iterate though all consensusFeatures .. find subfeatures with intensity but missing PSM
res_tf_tmp = df_pep[, {#print(idx)
#idx = 44
idx_PSM = which(res$pep_idx == idx)
runs_with_MS2 = unique(res$ms_run_number[idx_PSM]) ## all existing PSMs for this PEP (=consensusFeature)
runs_wo_MS2 = (1:N.studies)[-runs_with_MS2]
df = .SD[rep(1, length(runs_wo_MS2)), c("charge", "modified.sequence", "sequence")]
df$pep_idx = idx
df$ms_run_number = runs_wo_MS2 ## vector
df$calibrated.retention.time = m_pep_rt[runs_wo_MS2, idx]
df$intensity = m_pep_abd[runs_wo_MS2, idx]
df$protein.group.ids = res$protein.group.ids[idx_PSM[1]] ## use PGI from genuine PSMs
## remove all inferred PSMs which do not have an intensity(= MS1 feature)
df = df[!is.na(df$intensity),]
df ## return
}, by = "idx"] ## one PEP row at a time
if(nrow(res_tf_tmp) > 0){
res_tf = res_tf_tmp
## convert from "ms_run_number" to fc.raw.file
res_tf = cbind(res_tf, .self$fn_map$msrunToRawfile(paste0("ms_run[", res_tf$ms_run_number, "]")))
res_tf$is.transferred = TRUE
res_tf$type = "MULTI-MATCH"
## check: summed intensities should be equal
if("intensity" %in% colnames(res) && "intensity" %in% colnames(res_tf)){
stopifnot(sum(df_pep[, ..col_abd_df_pep], na.rm = TRUE) == sum(res$intensity, na.rm = TRUE) + sum(res_tf$intensity, na.rm = TRUE))
}
}
}
## remove the data.table info, since metrics will break due to different syntax
class(res_tf) = "data.frame"
message("Evidence table generated: ", nrow(res), "x", ncol(res), "(genuine); ", nrow(res_tf), "x", ncol(res_tf), "(transferred)")
## must at least have column names, but can have 0 rows
stopifnot(ncol(res_tf) > 0)
return (list("genuine" = res, "transferred" = res_tf))
},
getMSMSScans = function(identified_only = FALSE)
{
"Basically the PSM table (partially renamed columns) and additionally two columns 'raw.file' and 'fc.raw.file'.
If identified_only is TRUE, only MS2 scans which were identified (i.e. a PSM) are returned -- this is equivalent to msms.txt in MaxQuant."
res = data.table::as.data.table(.self$sections$PSM)
stopifnot(all((res$opt.global.identified == 1) == (!is.na(res$sequence))))
if (identified_only) {
res = res[!is.na(res$sequence), ] # == NA sequence
}
## de-duplicate PSM.ID column: take first row for each PSM.ID
res_temp = res[!duplicated(res$PSM.ID), ]
## ... and append accessions of the complete subset (.SD)
res_temp$accessions = res[, paste0(.SD$accession, collapse=";"), by = "PSM.ID"]$V1
res = res_temp
## Augment fc.raw.file column
## ... the `spectra_ref` looks like "ms_run[12]:controllerType=0 controllerNumber=1 scan=25337"
res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref))
## IDMapper might duplicate PepIDs if two or more features are a good match
## ... but we only want each MS2 scan represented once here
res = unique(res, by = c("fc.raw.file", "spectra.ref"))
if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res)))
{
renameColumns(res, list( retention.time = "retention.time.pep",
opt.global.rt.raw = "retention.time",
opt.global.rt.align = "calibrated.retention.time"))
res$retention.time.calibration = res$calibrated.retention.time - res$retention.time
}
else res$retention.time.calibration = NA
name = list( opt.global.calibrated.mz.error.ppm = "mass.error..ppm",
opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.",
exp.mass.to.charge = "m.z",
opt.global.mass = "mass",
opt.global.fragment.mass.error.da = "mass.deviations..da.",
opt.global.fragment.mass.error.ppm = "mass.deviations..ppm.",
opt.global.identified = "identified",
opt.global.ScanEventNumber = "scan.event.number",
PSM.ID = "id",
opt.global.modified.sequence = "modified.sequence",
opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence",
opt.global.is.contaminant = "contaminant",
opt.global.missed.cleavages = "missed.cleavages",
opt.global.cv.MS.1002217.decoy.peptide = "reverse",
opt.global.activation.method = "fragmentation",
opt.global.total.ion.count = "total.ion.current",
opt.global.base.peak.intensity = "base.peak.intensity",
opt.global.ion.injection.time = "ion.injection.time")
renameColumns(res, name)
if ("mass.deviations..ppm." %in% colnames(res)) {
res$mass.deviations..ppm. = substr(res$mass.deviations..ppm., 2, nchar(res$mass.deviations..ppm.) - 1)
res$mass.deviations..ppm. = gsub(",", ";", res$mass.deviations..ppm., fixed = TRUE)
}
if ("mass.deviations..da." %in% colnames(res)) {
res$mass.deviations..da. = substr(res$mass.deviations..da., 2, nchar(res$mass.deviations..da.) - 1)
res$mass.deviations..da. = gsub(",", ";", res$mass.deviations..da., fixed = TRUE)
}
#set reverse to needed values
if ("reverse" %in% colnames(res)){
res$reverse=(res$reverse=="decoy")
}
if (!"contaminant" %in% colnames(res)){
res$contaminant = 0
}
#set contaminant to TRUE/FALSE
res$contaminant = (res$contaminant > 0)
#set identified to needed values
if ("identified" %in% colnames(res)){
stopifnot(unique(res$identified) %in% c(0,1)) ## make sure the column has the expected values (0/1)
# set $identified to MaxQuant values (+/-)
res$identified = c("-", "+")[(res$identified==1) + 1]
}
RTUnitCorrection(res)
## remove the data.table info, since metrics will break due to different syntax
class(res) = "data.frame"
## order by file and specRef as RT proxy (do NOT use RT directly, since it might be NA or non-linearly transformed)
## e.g. spectra.ref might be 'ms_run[1]:controllerType=0 controllerNumber=1 scan=13999'
## or 'ms_run[2]:spectrum=33'
## --> extract scan as numeric, since string compare is insufficient for numbers ("13999" > "140")
res$scan = as.numeric(gsub(".*index=(\\d*)|.*scan=(\\d*)|.*spectrum=(\\d*)", "\\1\\2\\3", res$spectra.ref))
stopifnot(all(!is.na(res$scan)))
res = res[order(res$fc.raw.file, res$scan), ]
return ( res )
},
RTUnitCorrection = function(dt)
{
"Convert all RT columns from seconds (OpenMS default) to minutes (MaxQuant default)"
# heuristic to detect presence of unit:seconds; if retention.time has is, we assume that all rt-columns are in seconds
# retention.time is mandatory for mzTab
if (max(dt[, "retention.time"], na.rm = TRUE) > 300)
{
cn_rt = grepv("retention.time|retention.length", names(dt))
dt[, c(cn_rt) := lapply(.SD, function(x) x / 60 ), .SDcols = cn_rt]
}
#dt[, ..cn_rt]
return(NULL)
},
renameColumns = function(dt, namelist)
{
"Renames all columns and throws a warning if a column does not exist in the data"
from = names(namelist)
to = unlist(namelist)
data.table::setnames(dt, old = from, new = to, skip_absent = TRUE)
existName = to %in% colnames(dt)
if (!all(existName))
{
warning(paste0("Columns\n '",
paste(from[!existName], "' (mzTab name) --> '", to[!existName], collapse="' (internal name),\n '", sep=""),
"'\n are not present in input data!"),
immediate. = TRUE)
}
}
) # methods
) # class
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.