# R/MSstatsConvert_core_functions.R
#
# Defines functions: MSstatsMakeAnnotation, MSstatsBalancedDesign, MSstatsPreprocess, MSstatsImport
#
# Documented in: MSstatsBalancedDesign, MSstatsImport, MSstatsMakeAnnotation, MSstatsPreprocess

#' Container class for the files that together describe a single MS dataset.
#' 
#' @slot files named list of files generated by a signal processing tool. 
#' Most tools produce a single file, stored under the name `input`. 
#' Some tools produce several files, for example `MaxQuant` outputs 
#' `evidence` and `proteinGroups` files.
#' @slot type character: "MSstats" or "MSstatsTMT".
#' @slot tool character: name of the signal processing tool that generated the
#' output. Possible values are: DIAUmpire, MaxQuant, OpenMS, OpenSWATH, 
#' Progenesis, ProteomeDiscoverer, Skyline, SpectroMine, Spectronaut.
#' @slot version description of the software version of the signal processing
#' tool. Not implemented yet.
#' @rdname MSstatsInputFiles
setClass(
    "MSstatsInputFiles", 
    slots = c(
        files = "list", 
        type = "character", 
        tool = "character", 
        version = "ANY"
    )
)

# Tool-specific subclasses of MSstatsInputFiles. They add no slots; they exist
# so that S4 methods (e.g. MSstatsClean) can dispatch on the tool that
# produced the files. MSstatsImport builds the class name as
# paste0("MSstats", tool, "Files").
#' MSstatsDIAUmpireFiles: class for DIAUmpire files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsDIAUmpireFiles", contains = "MSstatsInputFiles")
#' MSstatsMaxQuantFiles: class for MaxQuant files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsMaxQuantFiles", contains = "MSstatsInputFiles")
#' MSstatsOpenMSFiles: class for OpenMS files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsOpenMSFiles", contains = "MSstatsInputFiles")
#' MSstatsOpenSWATHFiles: class for OpenSWATH files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsOpenSWATHFiles", contains = "MSstatsInputFiles")
#' MSstatsProgenesisFiles: class for Progenesis files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsProgenesisFiles", contains = "MSstatsInputFiles")
#' MSstatsProteomeDiscovererFiles: class for ProteomeDiscoverer files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsProteomeDiscovererFiles", contains = "MSstatsInputFiles")
#' MSstatsSkylineFiles: class for Skyline files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsSkylineFiles", contains = "MSstatsInputFiles")
#' MSstatsSpectroMineFiles: class for SpectroMine files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsSpectroMineFiles", contains = "MSstatsInputFiles")
#' MSstatsSpectronautFiles: class for Spectronaut files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass("MSstatsSpectronautFiles", contains = "MSstatsInputFiles")


#' Retrieve one of the files stored in an instance of `MSstatsInputFiles` class.
#' @rdname getInputFile
#' @return data.table
#' @export
#' @examples
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' class(imported)
#' head(getInputFile(imported, "evidence"))
setGeneric(
    name = "getInputFile", 
    # Body is the bare standardGeneric() call; dispatch uses only the first
    # argument, as declared by `signature`.
    def = function(msstats_object, file_type) standardGeneric("getInputFile"), 
    signature = "msstats_object"
)
#' @param msstats_object object that inherits from `MSstatsInputFiles` class.
#' @param file_type character, name of the file to retrieve (a name of the 
#' `files` slot). Usually equal to "input".
#' @return data.table
#' @export 
#' @rdname getInputFile
setMethod(
    f = "getInputFile", 
    signature = "MSstatsInputFiles", 
    definition = function(msstats_object, file_type = "input") {
        # Plain list lookup by name in the `files` slot.
        msstats_object@files[[file_type]]
    }
)

#' Retrieve the dataset type stored in an MSstatsInputFiles object.
#' @rdname getDataType
#' @keywords internal
#' @export
#' @return character - label of a data type. Currently, "MSstats" or "MSstatsTMT"
#' @examples
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' class(imported)
#' getDataType(imported) # "MSstats"
#' 
setGeneric(
    name = "getDataType", 
    # Body is the bare standardGeneric() call.
    def = function(msstats_object) standardGeneric("getDataType")
)
#' @param msstats_object object that inherits from `MSstatsInputFiles` class.
#' @return character "MSstats" or "MSstatsTMT".
#' @export
#' @rdname getDataType
setMethod(
    f = "getDataType", 
    signature = "MSstatsInputFiles",
    definition = function(msstats_object) {
        # The type label is stored directly in the `type` slot.
        msstats_object@type
    }
)


#' Import files generated by a signal processing tool.
#' 
#' @param input_files named list of paths to input files or `data.frame` objects.
#' Interpretation of this parameter depends on values of parameters `type` and `tool`.
#' @param type chr, "MSstats" or "MSstatsTMT".
#' @param tool chr, name of a signal processing tool that generated input files.
#' @param tool_version not implemented yet. In the future, this parameter will allow
#' handling different versions of each signal processing tools.
#' @param ... optional additional parameters to `data.table::fread`.
#' 
#' @return an object of a tool-specific class (`MSstats<tool>Files`, for example
#' `MSstatsMaxQuantFiles`) that inherits from `MSstatsInputFiles`.
#' @export
#' 
#' @examples 
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' class(imported)
#' head(getInputFile(imported, "evidence"))
#' 
MSstatsImport = function(input_files, type, tool, tool_version = NULL, ...) {
    supported_tools = c("DIAUmpire", "MaxQuant", "OpenMS", "OpenSWATH",
                        "Progenesis", "ProteomeDiscoverer", "Skyline",
                        "SpectroMine", "Spectronaut")
    supported_types = c("MSstats", "MSstatsTMT")
    
    # Validate arguments before any file is touched.
    checkmate::assertChoice(tool, supported_tools)
    checkmate::assertChoice(type, supported_types)
    checkmate::assertTRUE(!is.null(names(input_files)))
    
    # Every element goes through `.getDataTable`; `...` is forwarded to it.
    loaded_files = lapply(as.list(input_files), .getDataTable, ...)
    
    # Build the generic container first, then promote it to the subclass that
    # matches `tool`, so tool-specific S4 methods can dispatch on it.
    base_object = methods::new("MSstatsInputFiles", 
                               files = loaded_files,
                               type = type, 
                               tool = tool, 
                               version = tool_version)
    tool_class = paste0("MSstats", tool, "Files")
    .logSuccess(tool, "import")
    methods::new(tool_class, base_object)
}

#' Clean files generated by a signal processing tool.
#' @param msstats_object object that inherits from `MSstatsInputFiles` class.
#' @param ... additional parameters passed to the tool-specific cleaning 
#' function selected by the class of `msstats_object`.
#' @rdname MSstatsClean
#' @export
#' @return data.table
#' 
#' @examples 
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' cleaned_data = MSstatsClean(imported, protein_id_col = "Proteins")
#' head(cleaned_data)
#' 
# The body is the bare standardGeneric() call (no surrounding braces), matching
# the other generics in this file; wrapping it in `{}` makes setGeneric() take
# the more expensive non-standard generic path.
setGeneric("MSstatsClean", 
           function(msstats_object, ...) standardGeneric("MSstatsClean"))
# MSstatsClean methods: one registration per tool-specific input class. Each
# registers an internal cleaning function as the method; the `@include` tag
# names the file that has to be collated before this one (presumably where
# that function is defined).
#' Clean DIAUmpire files
#' @include clean_DIAUmpire.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawDIAUmpire
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsDIAUmpireFiles", 
          .cleanRawDIAUmpire)
#' Clean MaxQuant files
#' @include clean_MaxQuant.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawMaxQuant
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsMaxQuantFiles", 
          .cleanRawMaxQuant)
#' Clean OpenMS files
#' @include clean_OpenMS.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawOpenMS
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsOpenMSFiles", 
          .cleanRawOpenMS)
#' Clean OpenSWATH files
#' @include clean_OpenSWATH.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawOpenSWATH
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsOpenSWATHFiles", 
          .cleanRawOpenSWATH)
#' Clean Progenesis files
#' @include clean_Progenesis.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawProgenesis
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsProgenesisFiles", 
          .cleanRawProgenesis)
#' Clean ProteomeDiscoverer files
#' @include clean_ProteomeDiscoverer.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawPD
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsProteomeDiscovererFiles", 
          .cleanRawPD)
#' Clean Skyline files
#' @include clean_Skyline.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawSkyline
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsSkylineFiles", 
          .cleanRawSkyline)
#' Clean SpectroMine files
#' @include clean_SpectroMine.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawSpectroMineTMT
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsSpectroMineFiles", 
          .cleanRawSpectroMineTMT)
#' Clean Spectronaut files
#' @include clean_Spectronaut.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawSpectronaut
#' @return data.table
setMethod("MSstatsClean", signature = "MSstatsSpectronautFiles", 
          .cleanRawSpectronaut)


#' Preprocess outputs from MS signal processing tools for analysis with MSstats
#' 
#' @param input data.table processed by the MSstatsClean function.
#' @param annotation annotation file generated by a signal processing tool.
#' @param feature_columns character vector of names of columns that 
#' define spectral features.
#' @param remove_shared_peptides logical, if TRUE shared peptides will be removed.
#' @param remove_single_feature_proteins logical, if TRUE, proteins that only have
#' one feature will be removed.
#' @param feature_cleaning named list with maximum two (for `MSstats` converters)
#' or three (for `MSstatsTMT` converter) elements. If 
#' `remove_features_with_few_measurements` is set to `TRUE` (as in the default 
#' value), features with less than three measurements will be removed. 
#' `summarize_multiple_psms` is a function
#' that will be used to aggregate multiple feature measurements in a run. It should
#' return a scalar and accept an `na.rm` parameter. For `MSstatsTMT` converters,
#' setting `remove_psms_with_any_missing` will remove features which have missing
#' values in a run from that run. 
#' @param score_filtering a list of named lists that specify filtering options.
#' Details are provided in the vignette.
#' @param exact_filtering a list of named lists that specify filtering options.
#' Details are provided in the vignette.
#' @param pattern_filtering a list of named lists that specify filtering options.
#' Details are provided in the vignette.
#' @param columns_to_fill a named list of scalars. If provided, columns with
#' names defined by the names of this list and values corresponding to its elements
#' will be added to the output `data.frame`.
#' @param aggregate_isotopic logical. If `TRUE`, isotopic peaks will be summed.
#' @param ... currently not used: the function body does not reference these 
#' arguments.
#' 
#' @return data.table
#' @export
#' 
#' @examples 
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' cleaned_data = MSstatsClean(imported, protein_id_col = "Proteins")
#' annot_path = system.file("tinytest/raw_data/MaxQuant/annotation.csv", 
#'                          package = "MSstatsConvert")
#' mq_annot = MSstatsMakeAnnotation(cleaned_data, read.csv(annot_path),
#'                                  Run = "Rawfile")
#'                                
#' # To filter M-peptides and oxidation peptides 
#' m_filter = list(col_name = "PeptideSequence", pattern = "M", 
#'                 filter = TRUE, drop_column = FALSE)
#' oxidation_filter = list(col_name = "Modifications", pattern = "Oxidation", 
#'                         filter = TRUE, drop_column = TRUE)
#' msstats_format = MSstatsPreprocess(
#' cleaned_data, mq_annot, 
#' feature_columns = c("PeptideSequence", "PrecursorCharge"),
#' columns_to_fill = list(FragmentIon = NA, ProductCharge = NA),
#' pattern_filtering = list(oxidation = oxidation_filter, m = m_filter)
#' )
#' # Output in the standard MSstats format
#' head(msstats_format)
#' 
MSstatsPreprocess = function(
    input, annotation, feature_columns, remove_shared_peptides = TRUE,
    remove_single_feature_proteins = TRUE,
    feature_cleaning = list(remove_features_with_few_measurements = TRUE,
                            summarize_multiple_psms = max),
    score_filtering = list(), exact_filtering = list(), 
    pattern_filtering = list(), columns_to_fill = list(), 
    aggregate_isotopic = FALSE, ...
) {
    # NOTE(review): `Intensity` is not referenced anywhere else in this body;
    # presumably a leftover guard against R CMD check "no visible binding"
    # notes - confirm before removing.
    Intensity = NULL
    
    # Validate the arguments, then record the chosen options in the log. The
    # last argument flags whether `input` has a "Channel" column.
    .checkMSstatsParams(input, annotation, feature_columns,
                        remove_shared_peptides,
                        remove_single_feature_proteins,
                        feature_cleaning)
    .logConverterOptions(
        feature_columns, remove_shared_peptides, remove_single_feature_proteins,
        feature_cleaning, is.element("Channel", colnames(input))
    )
    # The steps below run in a fixed order; each helper receives the output of
    # the previous one.
    input = .handleFiltering(input, score_filtering, 
                             exact_filtering, pattern_filtering)
    input = .handleIsotopicPeaks(input, aggregate_isotopic)
    input = .filterFewMeasurements(input, 1, FALSE)
    input = .handleSharedPeptides(input, remove_shared_peptides)
    # NOTE(review): the roxygen text for `feature_cleaning` follows the element
    # names used in the default argument; how those elements are interpreted
    # is decided inside `.cleanByFeature` - confirm against that helper.
    input = .cleanByFeature(input, feature_columns, feature_cleaning)
    input = .handleSingleFeaturePerProtein(input, remove_single_feature_proteins)
    input = .mergeAnnotation(input, annotation)
    # The return values of the next two helpers are discarded, so they must
    # update `input` in place (presumably via data.table reference semantics -
    # TODO confirm).
    .fillValues(input, columns_to_fill)
    .adjustIntensities(input)
    input
}


#' Creates balanced design by removing overlapping fractions and filling incomplete rows
#' 
#' @param input `data.table` processed by the `MSstatsPreprocess` function.
#' A `feature` column is added to this table by reference before further 
#' processing.
#' @param feature_columns str, names of columns that define spectral features
#' @param fill_incomplete if TRUE (default), Intensity values for missing runs
#' will be added as NA
#' @param handle_fractions if TRUE (default), overlapping fractions will be resolved
#' @param fix_missing str, optional. Defaults to NULL, which means no action.
#' If not NULL, must be one of the options: "zero_to_na" or "na_to_zero".
#' If "zero_to_na", Intensity values equal exactly to 0 will be converted to NA.
#' If "na_to_zero", missing values will be replaced by zeros.
#' 
#' @export
#' @return data.frame of class `MSstatsValidated`
#' 
#' @examples
#' unbalanced_data = system.file("tinytest/raw_data/unbalanced_data.csv", 
#'                               package = "MSstatsConvert")
#' unbalanced_data = data.table::as.data.table(read.csv(unbalanced_data))
#' balanced = MSstatsBalancedDesign(unbalanced_data, 
#'                                  c("PeptideSequence", "PrecursorCharge",
#'                                    "FragmentIon", "ProductCharge"))
#' dim(balanced) # Now balanced has additional rows (with Intensity = NA)
#' # for runs that were not included in the unbalanced_data table
#' 
MSstatsBalancedDesign = function(input, feature_columns, fill_incomplete = TRUE,
                                 handle_fractions = TRUE, fix_missing = NULL) {
    feature = NULL  # binding for the data.table column created below
    
    # Send one INFO message to both the log handler and the message handler.
    report = function(text) {
        getOption("MSstatsLog")("INFO", text)
        getOption("MSstatsMsg")("INFO", text)
    }
    
    # Combine the feature-defining columns into a single `feature` column.
    input[, feature := do.call(".combine", .SD), .SDcols = feature_columns]
    
    if (handle_fractions) {
        input = .handleFractions(input)
        input = .filterFewMeasurements(input, 1, TRUE, feature_columns)
        report("** Fractionation handled.")
    } 
    
    input = .makeBalancedDesign(input, fill_incomplete)
    report(paste("** Updated quantification data to make balanced design.",
                 "Missing values are marked by NA"))
    
    input = .fixMissingValues(input, fix_missing)
    # Drop the helper columns so they do not appear in the output.
    input = input[, !(colnames(input) %in% c("feature", "isZero")), 
                  with = FALSE]
    
    # This blank line goes to the log only, not to the message handler.
    getOption("MSstatsLog")("INFO", "\n")
    .MSstatsFormat(input)
}


#' Create annotation
#' 
#' @param input data.table preprocessed by the MSstatsClean function
#' @param annotation annotation data, or `NULL`. If `NULL`, the annotation is 
#' extracted from the columns of `input`.
#' @param ... key-value pairs of the form `NewName = "CurrentName"`: a column of 
#' the annotation currently called `CurrentName` is renamed to `NewName` 
#' (for example `Run = "Rawfile"`). Names that are absent are skipped.
#' 
#' @return data.table
#' @export
#' 
#' @examples 
#' evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
#'                             package = "MSstatsConvert")
#' pg_path = system.file("tinytest/raw_data/MaxQuant/mq_pg.csv", 
#'                       package = "MSstatsConvert")
#' evidence = read.csv(evidence_path)
#' pg = read.csv(pg_path)
#' imported = MSstatsImport(list(evidence = evidence, protein_groups = pg),
#'                          "MSstats", "MaxQuant")
#' cleaned_data = MSstatsClean(imported, protein_id_col = "Proteins")
#' annot_path = system.file("tinytest/raw_data/MaxQuant/annotation.csv", 
#'                          package = "MSstatsConvert")
#' mq_annot = MSstatsMakeAnnotation(cleaned_data, read.csv(annot_path),
#'                                  Run = "Rawfile")
#' head(mq_annot)
#' 
MSstatsMakeAnnotation = function(input, annotation, ...) {
    # Send one INFO message to both the log handler and the message handler.
    report = function(text) {
        getOption("MSstatsLog")("INFO", text)
        getOption("MSstatsMsg")("INFO", text)
    }
    
    # Named vector: names are the new column names, values the current ones.
    all_columns = unlist(list(...))
    
    if (is.null(annotation)) {
        # No annotation supplied: take the design columns (plus any columns
        # named in `...`) that are present in `input`.
        standard_cols = c("Run", "Channel", "Condition", "BioReplicate", 
                          "TechReplicate", "Mixture", "TechRepMixture", 
                          "Fraction")
        cols = intersect(c(standard_cols, unname(all_columns)), 
                         colnames(input))
        annotation = unique(input[, cols, with = FALSE])
        report("** Using annotation extracted from quantification data.")
    } else {
        annotation = .getDataTable(annotation)
        report("** Using provided annotation.")
    }
    
    if (length(all_columns) > 0) {
        data.table::setnames(annotation, 
                             old = unname(all_columns),
                             new = names(all_columns),
                             skip_absent = TRUE)
    }
    # Renaming can produce duplicated column names; keep the first of each.
    annotation = annotation[, !duplicated(colnames(annotation)), 
                            with = FALSE]
    .checkAnnotation(input, annotation)
    
    has_channel = is.element("Channel", colnames(annotation))
    if (has_channel) {
        annotation$Channel = .standardizeColnames(annotation$Channel)
    }
    annotation$Run = .standardizeColnames(annotation$Run)
    
    labels_msg = if (has_channel) "Run and Channel" else "Run"
    report(paste("**", labels_msg, "labels were standardized to remove",
                 "symbols such as '.' or '%'."))
    annotation
}

# Try the MSstatsConvert package in your browser
#
# Any scripts or data that you put into this service are public.
#
# MSstatsConvert documentation built on Nov. 8, 2020, 5:49 p.m.