R/MzTabReader.R
In PTXQC: Quality Report Generation for MaxQuant and mzTab Results

#'
#' Class to read an mzTab file and store the tables internally.
#' 
#' The 'sections' field is initialized after $readMzTab was called.
#' The 'fn_map' fields should be initialized via ...$fn_map$readMappingFile(...) manually if user-defined filename mappings are desired
#' and is automatically updated/queried when $readMzTab is called.
#' 
#' @field sections MzTab sections as list. Valid list entries are: "MTD", "PRT", "PEP", "PSM", "SML", "filename" and "comments"
#' @field fn_map FilenameMapper which can translate raw filenames into something shorter
#'
#'
#'
MzTabReader = setRefClass("MzTabReader",
                       fields = list(sections = "list",
                                     fn_map = "FilenameMapper"
                       ),
                       methods = list(
                         initialize=function() {
                           .self$sections = list();
                           .self$fn_map = FilenameMapper$new();
                           
                           return(.self)
                         },
                         #'
readMzTab = function(.self, mztab_file) {
  "Read an mzTab file into a list of 5 data.frames (one df per mzTab section).
   Data.frames in the resulting list are named as follows:
     'MTD', 'PRT', 'PEP', 'PSM', 'SML',.
   Additionally, 'filename' and 'comments' are valid list elements.
  "

  cat("Reading mzTab '", mztab_file, "' ...", sep = "")
  
  ## this implementation is derived from with minor modifications
  ## https://github.com/lgatto/MSnbase/blob/master/R/MzTab.R
  
  f_con = file(mztab_file, open = "r") ## for better error messages
  lines = readLines(f_con)
  close(f_con)
  # remove empty lines
  lines = lines[ nzchar(lines) ]
  
  ## Split on the first two characters (so headers stay in
  ## the same group as table content rows)
  lineType = substring(lines, 1, 2)
  
  ## Could be stricter in the type checking to check that all
  ## three of the first characters match the 10 allowed types
  ## but since it doesn't affect parsing, I don't think it's
  ## worth bothering.
  allowed_types = c("CO", "MT", "PR", "PE", "PS", "SM")
  stopifnot(all(lineType %in% allowed_types))
  linesByType = split(lines, lineType)
  
  ## Comments are easy: just strip the first four characters
  ## from each line.
  comments = substring(linesByType[["CO"]], 5)
  
  ## Parse the other five blocks in a loop, then fix up
  ## metadata afterwards
  res = setNames(
    lapply(
      linesByType[c("MT", "PR", "PE", "PS", "SM")],
      function(x) {
        if (length(x) == 0) return(data.frame())
        ## MTD section has no header...
        if (startsWith(x[1], "MTD")) {
          d = read.delim(text = x,
                         header = FALSE,
                         col.names = c("MTD", "key", "value"),
                         na.strings = c("", "null"),
                         stringsAsFactors = FALSE,
                         fill = FALSE)
        } else {
          d = read.delim(text = x,
                         header = TRUE,
                         na.strings = c("", "null", "not mapped"),
                         stringsAsFactors = FALSE,
                         fill = FALSE)
          colnames(d) = make.names(colnames(d), allow_ = FALSE)
        }
        return(d[,-1])
      }),
    c("MTD", "PRT", "PEP", "PSM", "SML"))
  
  ## rewrite MetaData as named vector
  #res[["MTD"]] = setNames(res[["MTD"]][,2], res[["MTD"]][, 1])
  
  ## create Raw filename mapping internally
  mtd = res[["MTD"]]
  ## in addition to the correct 'ms_run[xx]-location' we also accept 'ms_run[xx]_location' (ProteomeDiscoverer2.2)
  idx_run = grep("^ms_run\\[\\d*\\][_-]location", mtd$key, value = FALSE)
  raw_filenames = mtd$value[idx_run]
  ms_runs = gsub("([.]*)[_-]location", "\\1", mtd$key[idx_run])
  .self$fn_map$getShortNames(raw_filenames, ms_runs = ms_runs)
  
  
  res[["filename"]] = mztab_file
  res[["comments"]] = comments
  .self$sections = res
  return (NULL)
},

getParameters = function()
{
  "Converts internal mzTab metadata section to a two column key-value data.frame similar to MaxQuants parameters.txt."
  
  # copy the whole MTD for now, since R likes shallow copies and we are about to rename columns by reference (see ?setnames)
  res = data.table::copy(.self$sections[["MTD"]])
  if (!is.na(unique(.self$sections$PSM$database))) {
    res = rbind(res, data.frame(key= "fasta file", value = paste(basename(unique(.self$sections$PSM$database)), collapse=";")))
  }
  else {
    res = rbind(res, data.frame(key= "fasta file", value = "NULL"))
  }
  
  res = res[grep("^custom", res$key, invert = TRUE),]
  
  res[is.na(res)] = "NULL" # temp workaround
  
  ## todo: remove at some point, since it forces us to use `::copy`
  renameColumns(res, list(key = "parameter"))

  return (res)
},

getSummary = function()
{
  "Converts internal mzTab metadata section to a two data.frame with columns 'fc.raw.file', 'ms.ms.identified....'
   similar to MaxQuants summary.txt."
  res = .self$fn_map$getRawfm()[ , c("from", "to")]
  colnames(res) = c("raw.file", "fc.raw.file")

  ## read all custom entries
  mtd_custom_df = .self$sections$MTD[grep("^custom", .self$sections$MTD$key), ]
  
  if (nrow(mtd_custom_df) == 0) return(NULL)

  ## ... and subselect the ms2-ID-Rate
  ms2_df = mtd_custom_df[grep("^\\[MS2 identification rate", mtd_custom_df$value), ] 
  res$ms.ms.identified.... = unlist(lapply(gsub(".*, ?(\\d*\\.\\d*)\\]", "\\1", ms2_df$value), as.numeric))

  ## read TIC
  tic_df = mtd_custom_df[grep("total ion current", mtd_custom_df$value),]
  if (nrow(tic_df) > 0) res$TIC = lapply(strsplit(sub(".* \\[(.*)\\]]", "\\1", tic_df$value), ","), as.numeric)

  return (res)
},

## MaxQuant-like representation of PRT table, i.e. augmented this with more columns (or renamed) if a metric requires it
getProteins = function()
{
  "Basically the PRT table ..."
  
  res = .self$sections$PRT

  return ( res )
},

## MaxQuant-like representation of PEP table, i.e. augmented this with more columns (or renamed) if a metric requires it
getEvidence = function()
{
  "Basically the PSM table and additionally columns named 'raw.file' and 'fc.raw.file'."
  
  res = data.table::as.data.table(.self$sections$PSM)

  ## remove empty PepIDs 
  ## ... unidentfied MS2 scans (NA)      or with no ConsensusFeature group (-1), i.e. unassigned PepIDs
  
  if (all(c("opt.global.cf.id") %in% colnames(res))) {
    res = res[!(is.na(res$opt.global.cf.id) | (res$opt.global.cf.id == -1)),]
    stopifnot(min(res$opt.global.cf.id) >= 0) ## would stop on NA as well
  }
  
  ## augment with fc.raw.file
  ## The 'spectra_ref' looks like 'ms_run[x]:index=y|ms_run'
  res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref))
  stopifnot(all(!is.na(res$fc.raw.file))) # Spectra-Ref in PSM table not set for all entries
  
  
  res$retention.time.calibration = NA
  if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res))) 
  {
    renameColumns(res, list(               retention.time = "retention.time.pep",
                                        opt.global.rt.raw = "retention.time",
                                      opt.global.rt.align = "calibrated.retention.time"
                            ))
    res$retention.time.calibration = res$calibrated.retention.time - res$retention.time 
  }

  renameColumns(res, list(         opt.global.calibrated.mz.error.ppm = "mass.error..ppm.",
                                 opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.", 
                                                   exp.mass.to.charge = "m.z", 
                                                      opt.global.mass = "mass", 
                                                opt.global.identified = "identified",
                                           opt.global.ScanEventNumber = "scan.event.number",
                                                               PSM.ID = "id", 
                                         opt.global.modified.sequence = "modified.sequence",
                        opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence",
                                            opt.global.is.contaminant = "contaminant",
                                    opt.global.fragment.mass.error.da = "mass.deviations..da.",
                               opt.global.cv.MS.1002217.decoy.peptide = "reverse"
          ))

  if (!"modified.sequence" %in% colnames(res)){
    res$modified.sequence = res$sequence
    warning("modified.sequence is not present in input data, metrics use sequence instead", immediate. = TRUE)
  }
  
  if(!"contaminant" %in% colnames(res)){
    res$contaminant = 0
  }
  # set contaminant to TRUE/FALSE
  res$contaminant = (res$contaminant > 0)

  ## optional in MzTab (depending on which FeatureFinder was used)
  if ("opt.global.FWHM" %in% colnames(res)){
    renameColumns(res, list(opt.global.FWHM = "retention.length"))
  }
  
  ## de-duplicate protein-accession entries
  accessions = res[, .(l_accession = list(accession)), by = id]$l_accession
  res = unique(res, by = "id")
  res$proteins = unlist(lapply(accessions, paste, collapse = ";"))
  
  ## todo: these are protein names not IDs, but the code does not care (its not very clean though)
  res$protein.group.ids = res$proteins 
  
  ## annotate ms.ms.count for identical sequences per rawfile, but only the first member of the group; 
  ## all others get NA to prevent double counting
  res[, ms.ms.count := c(.N, rep(NA, .N-1)), by = list(raw.file, modified.sequence, charge)]

  ## convert values from seconds to minutes for all RT columns
  RTUnitCorrection(res)

  ##
  ## intensity from PEP to PSM: only labelfree ('opt.global.cf.id' links all PSMs belonging to a ConsensusFeature)
  ##
  df_pep = data.frame()
  if ("opt.global.cf.id" %in% colnames(res)){
    df_pep = data.table::as.data.table(.self$sections$PEP)[!is.na(sequence), ]
    renameColumns(df_pep, list(opt.global.modified.sequence = "modified.sequence", 
                               opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence"))
    ## add raw.file...
    df_pep = cbind(df_pep, .self$fn_map$specrefToRawfile(df_pep$spectra.ref))
    ## .. a unique index
    df_pep$idx = 1:nrow(df_pep)
    ## map from PSM -> PEP row
    ## ... do NOT use spectra.ref since this is ambiguous (IDMapper duplicates MS2 PepIDs to multiple features)
    if (!("opt.global.cf.id" %in% colnames(df_pep)))
    {
      stop("Please re-run the TOPP QualityControl tool to generate an updated version of OpenMS mzTab. Your mzTab is missing some information.")
    } 
    #old: res$pep_idx = match(res$opt.global.feature.id, df_pep$opt.global.feature.id, nomatch = NA_integer_)
    res$pep_idx = match(res$opt.global.cf.id, df_pep$opt.global.cf.id, nomatch = NA_integer_)
    
    res$ms_run_number = as.numeric(gsub("^ms_run\\[(\\d*)\\].*", "\\1", res$ms_run))
    col_abd_df_pep = grepv( "^peptide.abundance.study.variable.", names(df_pep))
    col_RT_df_pep = grepv( "^opt.global.retention.time.study.variable", names(df_pep))
    ## transposed matrix for all abundances (rows = study; cols = pep_idx)
    ## .. this is a significant speedup compared to indexing into .SD[,] in subqueries, since that requires unlist()
    m_pep_abd = t(df_pep[, ..col_abd_df_pep])
    m_pep_rt = t(df_pep[, ..col_RT_df_pep])
    N.studies = length(col_RT_df_pep)
    stopifnot(N.studies == length(col_abd_df_pep))
  }
 

  NA_duplicates = function(vec_abd, idx) {
    ## replaces duplicate indices into the same ms_run intensities with NA
    ## (to avoid counting a feature more than once due to oversampled PSMs from one run assigned to a CF)
    r = vec_abd[idx]
    r[duplicated(idx)] = NA
    return(r)
  }
  
  if (all(c("opt.global.cf.id") %in% colnames(res))) {
    ## assign intensity to genuine PSMs
    res$intensity = NA_real_ ## unassigned PSMs have no MS1 intensity
    res[,
      intensity := NA_duplicates(m_pep_abd[, .SD$pep_idx[1]],
                                 .SD$ms_run_number),
      by = "opt.global.cf.id"]
    summary(res$intensity)
  }
  

  res$is.transferred = FALSE
  res$type = "MULTI-MSMS"

  #set reverse to needed values
  if ("reverse" %in% colnames(res)){
    res$reverse=(res$reverse=="decoy")
  }
  
  ## remove the data.table info, since metrics will break due to different syntax
  class(res) = "data.frame"


  ##
  ## Infer MBR 
  ## --> find all subfeatures in a CF with abundance but missing PSM --> create as MBR-dummy-PSMs
  ##
  ## : df_pep: the PEP table (with some extras)
  ## : res:    the PSM table (with some extras)
  ##     res$pep_idx: row in df_pep (consensusFeature) to which this PSM was assigned
  res_tf = res[NULL,]
  stopifnot(ncol(res_tf) > 0)
  ## skip if MS2 isotope labeling detected
  quant_methods = grep("quantification_method", .self$sections$MTD$key)
  if (!any(grepl("PRIDE_0000317", .self$sections$MTD$value[quant_methods])) & (!plyr::empty(df_pep)))
  {
    ## iterate though all consensusFeatures .. find subfeatures with intensity but missing PSM
    res_tf_tmp = df_pep[, {#print(idx)
      #idx = 44
      idx_PSM = which(res$pep_idx == idx)
      runs_with_MS2 = unique(res$ms_run_number[idx_PSM]) ## all existing PSMs for this PEP (=consensusFeature)
      runs_wo_MS2 = (1:N.studies)[-runs_with_MS2]
      df = .SD[rep(1, length(runs_wo_MS2)), c("charge", "modified.sequence", "sequence")]
      df$pep_idx = idx
      df$ms_run_number = runs_wo_MS2 ## vector
      df$calibrated.retention.time = m_pep_rt[runs_wo_MS2, idx]
      df$intensity = m_pep_abd[runs_wo_MS2, idx]
      df$protein.group.ids = res$protein.group.ids[idx_PSM[1]] ## use PGI from genuine PSMs
      ## remove all inferred PSMs which do not have an intensity(= MS1 feature)
      df = df[!is.na(df$intensity),]
      df ## return
    }, by = "idx"] ## one PEP row at a time
   
    if(nrow(res_tf_tmp) > 0){
      res_tf = res_tf_tmp
      ## convert from "ms_run_number" to fc.raw.file
      res_tf = cbind(res_tf, .self$fn_map$msrunToRawfile(paste0("ms_run[", res_tf$ms_run_number, "]")))
      res_tf$is.transferred = TRUE
      res_tf$type = "MULTI-MATCH"
      
      ## check: summed intensities should be equal
      if("intensity" %in% colnames(res) && "intensity" %in% colnames(res_tf)){
        stopifnot(sum(df_pep[, ..col_abd_df_pep], na.rm = TRUE) == sum(res$intensity, na.rm = TRUE) + sum(res_tf$intensity, na.rm = TRUE))
      }
    }
  }
  
  ## remove the data.table info, since metrics will break due to different syntax
  class(res_tf) = "data.frame"

  message("Evidence table generated: ", nrow(res), "x", ncol(res), "(genuine); ", nrow(res_tf), "x", ncol(res_tf), "(transferred)")
  
  ## must at least have column names, but can have 0 rows
  stopifnot(ncol(res_tf) > 0)
  
  return (list("genuine" = res, "transferred" = res_tf))
},


getMSMSScans = function(identified_only = FALSE)
{
  "Basically the PSM table (partially renamed columns) and additionally two columns 'raw.file' and 'fc.raw.file'. 
   If identified_only is TRUE, only MS2 scans which were identified (i.e. a PSM) are returned -- this is equivalent to msms.txt in MaxQuant."
  
  res = data.table::as.data.table(.self$sections$PSM)

  stopifnot(all((res$opt.global.identified == 1) == (!is.na(res$sequence))))
  if (identified_only) {
    res = res[!is.na(res$sequence), ] # == NA sequence
  }
  
  ## de-duplicate PSM.ID column: take first row for each PSM.ID 
  res_temp = res[!duplicated(res$PSM.ID), ]
  ## ... and append accessions of the complete subset (.SD)
  res_temp$accessions = res[, paste0(.SD$accession, collapse=";"), by = "PSM.ID"]$V1
  res = res_temp
  
  ## Augment fc.raw.file column
  ## ... the `spectra_ref` looks like "ms_run[12]:controllerType=0 controllerNumber=1 scan=25337"
  res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref))

  ## IDMapper might duplicate PepIDs if two or more features are a good match
  ## ... but we only want each MS2 scan represented once here
  res = unique(res, by = c("fc.raw.file", "spectra.ref"))
  
  if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res))) 
  {
    renameColumns(res, list(     retention.time = "retention.time.pep",
                              opt.global.rt.raw = "retention.time",
                            opt.global.rt.align = "calibrated.retention.time"))
    res$retention.time.calibration = res$calibrated.retention.time - res$retention.time 
  }
  else res$retention.time.calibration = NA
  
  
  name = list(         opt.global.calibrated.mz.error.ppm = "mass.error..ppm",
                     opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.", 
                                       exp.mass.to.charge = "m.z", 
                                          opt.global.mass = "mass", 
                        opt.global.fragment.mass.error.da = "mass.deviations..da.", 
                       opt.global.fragment.mass.error.ppm = "mass.deviations..ppm.",
                                    opt.global.identified = "identified",
                               opt.global.ScanEventNumber = "scan.event.number",
                                                   PSM.ID = "id", 
                             opt.global.modified.sequence = "modified.sequence",
            opt.global.cv.MS.1000889.peptidoform.sequence = "modified.sequence",
                                opt.global.is.contaminant = "contaminant",
                              opt.global.missed.cleavages = "missed.cleavages",
                   opt.global.cv.MS.1002217.decoy.peptide = "reverse",
                             opt.global.activation.method = "fragmentation",
                               opt.global.total.ion.count = "total.ion.current",
                           opt.global.base.peak.intensity = "base.peak.intensity",
                            opt.global.ion.injection.time = "ion.injection.time")
 
  renameColumns(res, name)
 
  if ("mass.deviations..ppm." %in% colnames(res)) {
    res$mass.deviations..ppm. = substr(res$mass.deviations..ppm., 2, nchar(res$mass.deviations..ppm.) - 1)
    res$mass.deviations..ppm. = gsub(",", ";", res$mass.deviations..ppm., fixed = TRUE)
  }
  if ("mass.deviations..da." %in% colnames(res)) {   
    res$mass.deviations..da. = substr(res$mass.deviations..da., 2, nchar(res$mass.deviations..da.) - 1)
    res$mass.deviations..da. =  gsub(",", ";", res$mass.deviations..da., fixed = TRUE)
  }
  
  #set reverse to needed values
  if ("reverse" %in% colnames(res)){
    res$reverse=(res$reverse=="decoy")
  }
  
  if (!"contaminant" %in% colnames(res)){
    res$contaminant = 0
  }
  #set contaminant to TRUE/FALSE
  res$contaminant = (res$contaminant > 0)
  
  #set identified to needed values
  if ("identified" %in% colnames(res)){
    stopifnot(unique(res$identified) %in% c(0,1)) ## make sure the column has the expected values (0/1)
    # set $identified to MaxQuant values (+/-)
    res$identified = c("-", "+")[(res$identified==1) + 1]
  }

  RTUnitCorrection(res)


  ## remove the data.table info, since metrics will break due to different syntax
  class(res) = "data.frame"
  
  ## order by file and specRef as RT proxy (do NOT use RT directly, since it might be NA or non-linearly transformed)
  ## e.g. spectra.ref might be 'ms_run[1]:controllerType=0 controllerNumber=1 scan=13999'
  ##                        or 'ms_run[2]:spectrum=33'
  ##      --> extract scan as numeric, since string compare is insufficient for numbers ("13999" > "140")
  res$scan = as.numeric(gsub(".*index=(\\d*)|.*scan=(\\d*)|.*spectrum=(\\d*)", "\\1\\2\\3", res$spectra.ref))
  stopifnot(all(!is.na(res$scan))) 
  res = res[order(res$fc.raw.file, res$scan), ]
  
  return ( res )
},

RTUnitCorrection = function(dt)
{
  "Convert all RT columns from seconds (OpenMS default) to minutes (MaxQuant default)"
  
  # heuristic to detect presence of unit:seconds; if retention.time has is, we assume that all rt-columns are in seconds
  # retention.time is mandatory for mzTab
  if (max(dt[, "retention.time"], na.rm = TRUE) > 300)
  {
    cn_rt = grepv("retention.time|retention.length", names(dt))
    dt[, c(cn_rt) := lapply(.SD, function(x) x / 60 ), .SDcols = cn_rt]
  }
  
  #dt[, ..cn_rt]
  return(NULL)
},

renameColumns = function(dt, namelist)
{
  "Renames all columns and throws a warning if a column does not exist in the data"
  
  from = names(namelist)
  to = unlist(namelist)
  data.table::setnames(dt, old = from, new = to, skip_absent = TRUE)
  
  existName = to %in% colnames(dt)
  if (!all(existName))
  {
    warning(paste0("Columns\n '", 
                   paste(from[!existName], "' (mzTab name) --> '", to[!existName], collapse="' (internal name),\n '", sep=""),
                   "'\n are not present in input data!"),
            immediate. = TRUE)
  }
}

) # methods
) # class
Any scripts or data that you put into this service are public.
PTXQC documentation built on April 3, 2025, 9:28 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PTXQC
Quality Report Generation for MaxQuant and mzTab Results

R/MzTabReader.R
In PTXQC: Quality Report Generation for MaxQuant and mzTab Results

Try the PTXQC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PTXQC Quality Report Generation for MaxQuant and mzTab Results

R/MzTabReader.R In PTXQC: Quality Report Generation for MaxQuant and mzTab Results

Try the PTXQC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PTXQC
Quality Report Generation for MaxQuant and mzTab Results

R/MzTabReader.R
In PTXQC: Quality Report Generation for MaxQuant and mzTab Results