R/formulas-sirius.R

Defines functions processSIRIUSFormulas

#' @include main.R
#' @include mspeaklists.R
#' @include formulas.R
#' @include utils-sirius.R
NULL

#' Formulas class for SIRIUS results.
#'
#' This class is derived from \code{\link{formulas}} and contains additional specific \command{SIRIUS} data.
#'
#' Objects from this class are generated by \code{\link{generateFormulasSIRIUS}}
#'
#' @slot fingerprints A \code{list} with for each feature group result a \code{data.table} containing fingerprints
#'   obtained with \command{CSI:FingerID}. Will be empty unless the \code{getFingerprints} argument to
#'   \code{\link{generateFormulasSIRIUS}} was set to \code{TRUE}.
#' @slot MS2QuantMeta Metadata from \pkg{MS2Quant} filled in by \code{predictRespFactors}.
#'
#' @templateVar class formulasSIRIUS
#' @template class-hierarchy
#'
#' @seealso \code{\link{formulas}} and \code{\link{generateFormulasSIRIUS}}
#'
#' @inherit generateFormulasSIRIUS references
#'
#' @export
formulasSIRIUS <- setClass("formulasSIRIUS", slots = c(fingerprints = "list", MS2QuantMeta = "list"),
                           contains = "formulas")

# callback for executeMultiProcess()
processSIRIUSFormulas <- function(msFName, outPath, adduct, ...)
{
    noResult <- forms <- data.table::data.table(neutral_formula = character(), ion_formula = character(),
                                                neutralMass = numeric(), ion_formula_mz = numeric(),
                                                adduct = character(), score = numeric(), MSMSScore = numeric(),
                                                isoScore = numeric(), explainedPeaks = integer(),
                                                explainedIntensity = numeric(), fragInfo = list())
    fingerprints <- data.table()
    
    resultPath <- patRoon:::getSiriusResultPath(outPath, msFName)
    summary <- file.path(resultPath, "formula_candidates.tsv")
    if (length(summary) == 0 || !file.exists(summary))
        forms <- noResult
    else
    {
        forms <- fread(summary)
        fragFiles <- patRoon:::getAndPrepareSIRIUSResFiles(resultPath, "spectra", "tsv")
        
        if (nrow(forms) == 0 || length(fragFiles) == 0)
            forms <- noResult
        else
        {
            data.table::setnames(forms,
                     c("molecularFormula", "precursorFormula", "SiriusScore",
                       "TreeScore", "IsotopeScore", "numExplainedPeaks", "medianMassErrorFragmentPeaks(ppm)",
                       "medianAbsoluteMassErrorFragmentPeaks(ppm)", "massErrorPrecursor(ppm)"),
                     c("neutral_formula", "neutral_adduct_formula", "score", "MSMSScore", "isoScore",
                       "explainedPeaks", "error_frag_median", "error_frag_median_abs", "error"))
            
            ionImpAdductsCached <- patRoon:::makeEmptyListNamed(list())
            fragInfoList <- lapply(fragFiles, function(ff)
            {
                fragInfo <- data.table::fread(ff)
                data.table::setnames(fragInfo,
                         c("exactmass", "formula"),
                         c("ion_formula_mz", "ion_formula_SIR"))
                fragInfo[, rel.intensity := NULL]
                fragInfo[, ionization := gsub(" ", "", ionization, fixed = TRUE)]

                # sirius neutralizes fragments, make them ion again
                if (!is.null(fragInfo[["implicitAdduct"]]))
                    ionImpAdducts <- ifelse(nzchar(fragInfo$implicitAdduct),
                                            mapply(paste0("+", fragInfo$implicitAdduct, "]"), fragInfo$ionization,
                                                   FUN = sub, MoreArgs = list(pattern = "\\]")),
                                            fragInfo$ionization)
                else
                    ionImpAdducts <- fragInfo$ionization
                
                notCached <- setdiff(ionImpAdducts, names(ionImpAdductsCached))
                if (length(notCached) > 0)
                    ionImpAdductsCached <<- c(ionImpAdductsCached, sapply(notCached, patRoon:::as.adduct,
                                                                          format = "sirius", simplify = FALSE))
                ionImpAdducts <- ionImpAdductsCached[ionImpAdducts]

                fragInfo[, ion_formula := mapply(ion_formula_SIR, ionImpAdducts, FUN = patRoon:::calculateIonFormula)]
                if (!is.null(fragInfo[["implicitAdduct"]]))
                {
                    # 'correct' formula masses: SIRIUS subtract implicit adduct from it
                    fragInfo[nzchar(implicitAdduct), ion_formula_mz := ion_formula_mz +
                                 sapply(implicitAdduct, patRoon:::getFormulaMass)]
                }
                return(fragInfo)
            })
            names(fragInfoList) <- sapply(fragFiles, patRoon:::getFormulaFromSIRIUSResFile, "tsv")
            
            # initialize all with empty fragInfos
            if (length(fragInfoList) > 0)
                forms[match(names(fragInfoList), neutral_adduct_formula), fragInfo := fragInfoList]
            else
                forms[, fragInfo := list()]

            forms <- patRoon:::addMiscFormulaInfo(forms, adduct)
            
            forms[, rank := NULL]
            
            # Precursor is always present in MS/MS spectrum: it's added by SIRIUS if necessarily (with zero intensity).
            # Remove it and use its mz to get ion_formula_mz
            forms[, ion_formula_mz := mapply(ion_formula, fragInfo,
                                             FUN = function(form, fi) fi$ion_formula_mz[form == fi$ion_formula])]
            forms[, fragInfo := Map(ion_formula, fragInfo, f = function(form, fi)
            {
                fi <- fi[intensity != 0 | ion_formula != form]
                fi[, intensity := NULL]
                return(fi)
            })]
            forms[, explainedPeaks := sapply(fragInfo, nrow)] # update
            
            # set nice column order
            data.table::setcolorder(forms, c("neutral_formula", "ion_formula", "neutral_adduct_formula", "neutralMass",
                                             "ion_formula_mz", "error", "error_frag_median", "error_frag_median_abs",
                                             "adduct", "score", "MSMSScore", "isoScore", "explainedPeaks",
                                             "explainedIntensity"))
            
            forms <- patRoon:::rankFormulaTable(forms)
            
            if (nrow(forms) > 0)
                fingerprints <- loadSIRIUSFingerprints(resultPath, forms$neutral_formula, adduct)
        }
    }
    return(list(formtab = forms, fingerprints = fingerprints))
}


#' @rdname formulas-class
#' @export
setMethod("delete", "formulasSIRIUS", function(obj, i = NULL, j = NULL, ...)
{
    obj <- callNextMethod()
    return(syncSIRFPs(obj))
})

#' @rdname pred-quant
#' @export
setMethod("predictRespFactors", "formulasSIRIUS", function(obj, fGroups, calibrants, eluent, organicModifier, pHAq,
                                                           concUnit = "ugL", calibConcUnit = concUnit)
{
    checkPackage("MS2Quant", "kruvelab/MS2Quant")
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", add = ac)
    assertQuantEluent(eluent, fGroups, add = ac)
    checkmate::assertChoice(organicModifier, c("MeOH", "MeCN"), add = ac)
    checkmate::assertNumber(pHAq, finite = TRUE, add = ac)
    aapply(assertConcUnit, . ~ concUnit + calibConcUnit, fixed = list(add = ac))
    checkmate::reportAssertions(ac)
    
    calibrants <- assertAndPrepareQuantCalib(calibrants, calibConcUnit)
    
    if (length(obj) == 0)
        return(obj)
    
    printf("Predicting response factors from fingerprints with MS2Quant for %d candidates...\n", length(obj))
    res <- predictRespFactorsSIRFPs(obj, groupInfo(fGroups), calibrants, eluent, organicModifier, pHAq, concUnit)
    obj@MS2QuantMeta <- res$MD
    
    obj@groupAnnotations <- Map(groupNames(obj), annotations(obj), f = function(grp, ann)
    {
        ann <- copy(ann)
        if (nrow(res$RFs) == 0)
        {
            ann[, RF_SIRFP := NA_real_]
            return(ann)
        }
     
        if (!is.null(ann[["RF_SIRFP"]]))
            ann[, RF_SIRFP := NULL] # clearout for merge below    
        return(merge(ann, res$RFs[group == grp, c("neutral_formula", "RF_SIRFP"), with = FALSE], by = "neutral_formula",
                     sort = FALSE, all.x = TRUE))
    })

    # UNDONE: do something equivalent for formula rankings?    
    # obj <- addCompoundScore(obj, "RF_SIRFP", FALSE, 1)
    
    return(obj)
})

#' @rdname pred-tox
#' @export
setMethod("predictTox", "formulasSIRIUS", function(obj, LC50Mode = "static", concUnit = "ugL")
{
    checkPackage("MS2Tox", "kruvelab/MS2Tox")
    
    checkmate::assertChoice(LC50Mode, c("static", "flow"))
    assertConcUnit(concUnit)

    if (length(obj) == 0)
        return(obj)
    
    printf("Predicting LC50 values from fingerprints with MS2Tox for %d candidates...\n", length(obj))
    LC50Tab <- predictLC50SIRFPs(obj, LC50Mode, concUnit)
    
    obj@groupAnnotations <- Map(groupNames(obj), annotations(obj), f = function(grp, ann)
    {
        ann <- copy(ann)
        if (nrow(LC50Tab) == 0)
        {
            ann[, LC50_SIRFP := NA_real_]
            return(ann)
        }

        if (!is.null(ann[["LC50_SIRFP"]]))
            ann[, LC50_SIRFP := NULL] # clearout for merge below    
        return(merge(ann, LC50Tab[group == grp, c("neutral_formula", "LC50_SIRFP"), with = FALSE],
                     by = "neutral_formula", sort = FALSE, all.x = TRUE))
    })
    
    # UNDONE: do something equivalent for formula rankings?
    # obj <- addCompoundScore(obj, "LC50_SIRFP", FALSE, 1)
    
    return(obj)
})


#' Generate formula with SIRIUS
#'
#' Uses \href{https://bio.informatik.uni-jena.de/software/sirius/}{SIRIUS} to generate chemical formulae candidates.
#'
#' @templateVar algo SIRIUS
#' @templateVar do generate formula candidates
#' @templateVar generic generateFormulas
#' @templateVar algoParam sirius
#' @template algo_generator
#'
#' @details Similarity of measured and theoretical isotopic patterns will be used for scoring candidates. Note that
#'   \command{SIRIUS} requires availability of MS/MS data.
#'
#' @param getFingerprints Set to \code{TRUE} to load \command{SIRIUS-CSI:FingerID} MS/MS fingerprints for the formula
#'   candidates. This is currently only supported with \code{calculateFeatures=FALSE} to avoid heavy server traffic. The
#'   fingerprints are stored in the \code{fingerprints} slot of the returned \code{\link{formulasSIRIUS}} object, and
#'   are used by the \code{\link{predictTox}} and \code{\link{predictRespFactors}} methods.
#' @param verbose If \code{TRUE} then more output is shown in the terminal.
#'
#' @template sirius_form-args
#' @templateVar ident FALSE
#' @template sirius-args
#' @template adduct-arg
#' @templateVar algo sirius
#' @template form_algo-args
#'
#' @inheritParams generateFormulas
#'
#' @return A \code{\link{formulasSIRIUS}} object.
#'
#' @templateVar what \code{generateFormulasSIRIUS}
#' @template uses-multiProc
#'
#' @templateVar what generateFormulasSIRIUS
#' @template main-rd-method
#' @export
setMethod("generateFormulasSIRIUS", "featureGroups", function(fGroups, MSPeakLists, relMzDev = 5,
                                                              adduct = NULL, projectPath = NULL, elements = "CHNOP",
                                                              profile = "qtof", database = NULL, noise = NULL,
                                                              cores = NULL, getFingerprints = FALSE, topMost = 100,
                                                              token = NULL, extraOptsGeneral = NULL,
                                                              extraOptsFormula = NULL, calculateFeatures = TRUE,
                                                              featThreshold = 0, featThresholdAnn = 0.75,
                                                              absAlignMzDev = 0.002, verbose = TRUE,
                                                              splitBatches = FALSE, dryRun = FALSE)
{
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(MSPeakLists, "MSPeakLists", add = ac)
    checkmate::assertNumber(relMzDev, lower = 0, finite = TRUE, add = ac)
    checkmate::assertString(projectPath, null.ok = TRUE, add = ac)
    aapply(checkmate::assertString, . ~ elements + profile, fixed = list(add = ac))
    checkmate::assertString(database, null.ok = TRUE, add = ac)
    checkmate::assertNumber(noise, lower = 0, finite = TRUE, null.ok = TRUE, add = ac)
    checkmate::assertCount(cores, positive = TRUE, null.ok = TRUE, add = ac)
    checkmate::assertCount(topMost, positive = TRUE, add = ac)
    checkmate::assertString(token, null.ok = TRUE, add = ac)
    aapply(checkmate::assertCharacter, . ~ extraOptsGeneral + extraOptsFormula, null.ok = TRUE, fixed = list(add = ac))
    checkmate::assertFlag(getFingerprints, add = ac)
    checkmate::assertFlag(calculateFeatures, add = ac)
    aapply(checkmate::assertNumber, . ~ featThreshold + featThresholdAnn + absAlignMzDev, lower = 0, upper = 1,
           fixed = list(add = ac))
    checkmate::assertFlag(verbose, add = ac)
    checkmate::assertFlag(splitBatches, add = ac)
    checkmate::assertFlag(dryRun, add = ac)
    checkmate::reportAssertions(ac)
    
    if (getFingerprints && calculateFeatures)
        stop("Fingerprints can currently only be loaded if calculateFeatures is set to FALSE", call. = FALSE)
    
    if (length(fGroups) == 0)
        return(formulas(algorithm = "sirius"))
    
    if (!is.null(projectPath))
        projectPath <- normalizePath(projectPath, mustWork = FALSE)
    
    adduct <- checkAndToAdduct(adduct, fGroups)
    gNames <- names(fGroups)
    gCount <- length(fGroups)
    
    printf("Processing %d feature groups with SIRIUS...\n---\n", gCount)
    formTable <- doSIRIUS(fGroups, MSPeakLists, calculateFeatures, profile, adduct, relMzDev, elements,
                          database, noise, cores, if (getFingerprints) "fingerprint" else "none", NULL, topMost,
                          projectPath, token, extraOptsGeneral, extraOptsFormula, verbose, "formulasSIRIUS",
                          patRoon:::processSIRIUSFormulas, NULL, splitBatches, dryRun)
    
    groupFormulas <- fingerprints <- list()
    
    if (length(formTable) > 0)
    {
        # omit empty result tables and extract formulae/fingerprints
        
        hasResults <- function(r) !is.null(r[["formtab"]]) && nrow(r$formtab) > 0
        if (calculateFeatures)
        {
            formTable <- lapply(formTable, function(fta)
            {
                fta <- fta[sapply(fta, hasResults)]
                return(lapply(fta, "[[", "formtab"))
            })
            groupFormulas <- generateGroupFormulasByConsensus(formTable,
                                                              lapply(groupFeatIndex(fGroups), function(x) sum(x > 0)),
                                                              featThreshold, featThresholdAnn, gNames)
        }
        else
        {
            formTable <- formTable[sapply(formTable, hasResults)]
            fingerprints <- pruneList(lapply(formTable, "[[", "fingerprints"), checkZeroRows = TRUE)
            groupFormulas <- lapply(formTable, "[[", "formtab")
            formTable <- list()
        }
    }
    
    groupFormulas <- setFormulaPLID(groupFormulas, MSPeakLists, absAlignMzDev)
    
    if (verbose)
    {
        printf("-------\n")
        if (calculateFeatures && length(formTable) > 0)
        {
            fCounts <- sapply(formTable, countUniqueFormulas)
            fTotCount <- sum(fCounts)
            printf("Formula statistics:\n")
            printf("%s: %d (%.1f%%)\n", names(formTable), fCounts, if (fTotCount == 0) 0 else fCounts * 100 / fTotCount)
            printf("Total: %d\n", fTotCount)
        }
        ngrp <- length(groupFormulas)
        printf("Assigned %d unique formulas to %d feature groups (%.2f%% coverage).\n", countUniqueFormulas(groupFormulas),
               ngrp, if (gCount == 0) 0 else ngrp * 100 / gCount)
    }
    
    return(formulasSIRIUS(groupAnnotations = groupFormulas, featureFormulas = formTable, fingerprints = fingerprints,
                          algorithm = "sirius"))
})

#' @template featAnnSets-gen_args
#' @rdname generateFormulasSIRIUS
#' @export
setMethod("generateFormulasSIRIUS", "featureGroupsSet", function(fGroups, MSPeakLists, relMzDev = 5, adduct = NULL,
                                                                 projectPath = NULL, ..., setThreshold = 0,
                                                                 setThresholdAnn = 0, setAvgSpecificScores = FALSE)
{
    checkmate::assertCharacter(projectPath, len = length(sets(fGroups)), null.ok = TRUE)
    sa <- if (!is.null(projectPath)) lapply(projectPath, function(p) list(projectPath = p)) else list()
    generateFormulasSet(fGroups, MSPeakLists, adduct, generateFormulasSIRIUS, relMzDev = relMzDev, ...,
                        setThreshold = setThreshold, setThresholdAnn = setThresholdAnn,
                        setAvgSpecificScores = setAvgSpecificScores, setArgs = sa)
})
rickhelmus/patRoon documentation built on April 25, 2024, 8:15 a.m.