R/compounds-sirius.R

Defines functions processSIRIUSCompounds

#' @include main.R
#' @include compounds.R
#' @include utils-sirius.R
#' @include feature_groups-set.R
NULL

#' Compounds class for SIRIUS results.
#'
#' This class is derived from \code{\link{compounds}} and contains additional specific \command{SIRIUS} data.
#'
#' Objects from this class are generated by \code{\link{generateCompoundsSIRIUS}}
#'
#' @slot fingerprints A \code{list} with for each feature group result a \code{data.table} containing fingerprints
#'   obtained with \command{CSI:FingerID}.
#'
#' @templateVar class compoundsSIRIUS
#' @template class-hierarchy
#'
#' @seealso \code{\link{compounds}} and \code{\link{generateCompoundsSIRIUS}}
#'
#' @inherit generateCompoundsSIRIUS references
#'
#' @export
compoundsSIRIUS <- setClass("compoundsSIRIUS", slots = c(fingerprints = "list"),
                            contains = "compounds")

processSIRIUSCompounds <- function(msFName, outPath, MSMS, database, adduct, topMost)
{
    resultPath <- getSiriusResultPath(outPath, msFName)
    results <- scRanges <- fingerprints <- data.table()
    
    if (length(resultPath) != 0) # zero for no results
    {
        summary <- file.path(resultPath, "structure_candidates.tsv")
        if (file.exists(summary))
        {
            results <- fread(summary)
            results <- unifySirNames(results)
            
            # NOTE: so far SIRIUS only has one score
            if (nrow(results) > 0)
                scRanges <- list(score = range(results$score))
            
            # manually sort results: SIRIUS seems to sometimes randomly order results with equal scores
            # NOTE: do this before topMost filter step below to ensure proper filtering
            setorderv(results, c("score", "identifier"), order = c(-1, 1))
            
            if (!is.null(topMost))
            {
                if (nrow(results) > topMost)
                    results <- results[seq_len(topMost)] # results should already be sorted on score
            }
            
            # identifiers to characters, as they sometimes are and sometimes aren't depending on if multiple are present
            results[, identifier := as.character(identifier)]
            
            # NOTE: fragment info is based on SIRIUS results, ie from formula
            # prediction and not by compounds! Hence, results are the same for
            # candidates with the same formula.
            fragFiles <- patRoon:::getAndPrepareSIRIUSResFiles(resultPath, "spectra", "tsv")
            for (ff in fragFiles)
            {
                fragInfo <- fread(ff)
                fragInfo[, ionization := gsub(" ", "", ionization)]
                fragInfo[, PLID := sapply(mz, function(omz) MSMS[which.min(abs(omz - mz))]$ID)]
                
                # each frag file always contains the precursor (even in input doesn't) --> use this to figure out which
                # candidate(s) it belongs to
                wh <- which(results$neutral_formula %in% fragInfo$formula) # UNDONE: check if it's really the precursor?
                
                # Remove zero intensity precursor peak since it wasn't actually present in the peak list
                fragInfo <- fragInfo[intensity != 0 | formula != results$neutral_formula[wh[1]]]
                
                fragInfo[, c("rel.intensity", "exactmass", "intensity") := NULL]
                
                if (length(wh) > 0)
                {
                    # sirius neutralizes fragments, make them ion again
                    if (!is.null(fragInfo[["implicitAdduct"]]))
                        ionImpAdducts <- ifelse(nzchar(fragInfo$implicitAdduct),
                                                mapply(paste0("+", fragInfo$implicitAdduct, "]"), fragInfo$ionization,
                                                       FUN = sub, MoreArgs = list(pattern = "\\]")),
                                                fragInfo$ionization)
                    else
                        ionImpAdducts <- fragInfo$ionization
                    setnames(fragInfo, "formula", "formula_SIR")
                    fragInfo[, ion_formula := mapply(formula_SIR, ionImpAdducts, FUN = calculateIonFormula)]
                    
                    ionform <- calculateIonFormula(results$neutral_formula[wh[1]], adduct)
                    fragInfo[, neutral_loss := sapply(ion_formula, subtractFormula, formula1 = ionform)]
                    
                    set(results, wh, "fragInfo", list(list(fragInfo)))
                    set(results, wh, "explainedPeaks", nrow(fragInfo))
                }
            }
            
            if (is.null(results[["fragInfo"]]))
            {
                # warning(sprintf("no fragment info for %s", cmd$gName))
                results[, fragInfo := list(rep(list(data.table(mz = numeric(0), ion_formula = character(0),
                                                               neutral_loss = character(0),  score = numeric(0),
                                                               PLID = numeric(0))),
                                               nrow(results)))]
                results[, explainedPeaks := 0]
            }
            results[, database := database][]
        }
        
        if (nrow(results) > 0)
            fingerprints <- loadSIRIUSFingerprints(resultPath, results$neutral_formula, adduct)
    }
    
    ret <- list(comptab = results, scRanges = scRanges, fingerprints = fingerprints)
    return(ret)
}


#' @rdname compounds-class
#' @export
setMethod("delete", "compoundsSIRIUS", function(obj, i = NULL, j = NULL, ...)
{
    obj <- callNextMethod()
    return(syncSIRFPs(obj))
})

#' @rdname pred-quant
#' @export
setMethod("predictRespFactors", "compoundsSIRIUS", function(obj, fGroups, calibrants, eluent, organicModifier, pHAq,
                                                            concUnit = "ugL", calibConcUnit = concUnit, type = "FP")
{
    checkPackage("MS2Quant", "kruvelab/MS2Quant")
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", add = ac)
    assertQuantEluent(eluent, fGroups, add = ac)
    checkmate::assertChoice(organicModifier, c("MeOH", "MeCN"), add = ac)
    checkmate::assertNumber(pHAq, finite = TRUE, add = ac)
    aapply(assertConcUnit, . ~ concUnit + calibConcUnit, fixed = list(add = ac))
    checkmate::assertChoice(type, c("FP", "SMILES", "both"), add = ac)
    checkmate::reportAssertions(ac)
    
    if (type == "SMILES" || type == "both")
        obj <- callNextMethod(obj, fGroups = fGroups, calibrants = calibrants, eluent = eluent,
                              organicModifier = organicModifier, pHAq = pHAq, concUnit = concUnit,
                              calibConcUnit = calibConcUnit, updateScore = FALSE, scoreWeight = 1)

    calibrants <- assertAndPrepareQuantCalib(calibrants, calibConcUnit)
    
    if (length(obj) == 0)
        return(obj)
    
    if (type == "FP" || type == "both")
    {
        printf("Predicting response factors from fingerprints with MS2Quant for %d candidates...\n", length(obj))
        res <- predictRespFactorsSIRFPs(obj, groupInfo(fGroups), calibrants, eluent, organicModifier, pHAq, concUnit)
        obj@MS2QuantMeta <- res$MD
        
        obj@groupAnnotations <- Map(groupNames(obj), annotations(obj), f = function(grp, ann)
        {
            ann <- copy(ann)
            if (nrow(res$RFs) == 0)
            {
                ann[, RF_SIRFP := NA_real_]
                return(ann)
            }
        
            if (!is.null(ann[["RF_SIRFP"]]))
                ann[, RF_SIRFP := NULL] # clearout for merge below    
            return(merge(ann, res$RFs[group == grp, c("neutral_formula", "RF_SIRFP"), with = FALSE],
                         by = "neutral_formula", sort = FALSE, all.x = TRUE))
        })
        
        obj <- addCompoundScore(obj, "RF_SIRFP", FALSE, 1)
    }
    
    return(obj)
})

#' @rdname pred-tox
#' @export
setMethod("predictTox", "compoundsSIRIUS", function(obj, type = "FP", LC50Mode = "static", concUnit = "ugL")
{
    checkPackage("MS2Tox", "kruvelab/MS2Tox")
    
    ac <- checkmate::makeAssertCollection()
    checkmate::assertChoice(type, c("FP", "SMILES", "both"), add = ac)
    checkmate::assertChoice(LC50Mode, c("static", "flow"), add = ac)
    assertConcUnit(concUnit, add = ac)
    checkmate::reportAssertions(ac)
    
    if (type == "SMILES" || type == "both")
        obj <- callNextMethod(obj, LC50Mode = LC50Mode, concUnit = concUnit, updateScore = FALSE, scoreWeight = 1)
    
    if (length(obj) == 0)
        return(obj)
    
    if (type == "FP" || type == "both")
    {
        printf("Predicting LC50 values from fingerprints with MS2Tox for %d candidates...\n", length(obj))
        LC50Tab <- predictLC50SIRFPs(obj, LC50Mode, concUnit = concUnit)
        
        obj@groupAnnotations <- Map(groupNames(obj), annotations(obj), f = function(grp, ann)
        {
            ann <- copy(ann)
            if (nrow(LC50Tab) == 0)
            {
                ann[, LC50_SIRFP := NA_real_]
                return(ann)
            }

            if (!is.null(ann[["LC50_SIRFP"]]))
                ann[, LC50_SIRFP := NULL] # clearout for merge below    
            return(merge(ann, LC50Tab[group == grp, c("neutral_formula", "LC50_SIRFP"), with = FALSE],
                         by = "neutral_formula", sort = FALSE, all.x = TRUE))
        })
        
        obj <- addCompoundScore(obj, "LC50_SIRFP", FALSE, 1)
    }
    
    return(obj)
})


#' Compound annotation with SIRIUS
#'
#' Uses \href{https://bio.informatik.uni-jena.de/software/sirius/}{SIRIUS} in combination with
#' \href{https://www.csi-fingerid.uni-jena.de/}{CSI:FingerID} for compound annotation.
#'
#' @templateVar algo SIRIUS
#' @templateVar do generate compound candidates
#' @templateVar generic generateCompounds
#' @templateVar algoParam sirius
#' @template algo_generator
#'
#' @details Similar to \code{\link{generateFormulasSIRIUS}}, candidate formulae are generated with SIRIUS. These results
#'   are then feed to CSI:FingerID to acquire candidate structures. This method requires the availability of MS/MS data,
#'   and feature groups without it will be ignored.
#'
#' @param fingerIDDatabase Database specifically used for \command{CSI:FingerID}. If \code{NULL}, the value of the
#'   \code{formulaDatabase} parameter will be used or \code{"pubchem"} when that is also \code{NULL}. Sets the
#'   \option{--fingerid-db} option.
#' @param topMostFormulas Do not return more than this number of candidate formulae. Note that only compounds for these
#'   formulae will be searched. Sets the \option{--candidates} commandline option.
#' @param verbose If \code{TRUE} then more output is shown in the terminal.
#'
#' @templateVar ident TRUE
#' @template sirius-args
#' @template sirius_form-args
#' @template adduct-arg
#' @template comp_algo-args
#'
#' @inheritParams generateCompounds
#'
#' @return A \code{\link{compoundsSIRIUS}} object.
#'
#' @templateVar what \code{generateCompoundsSIRIUS}
#' @template uses-multiProc
#'
#' @templateVar what generateCompoundsSIRIUS
#' @template main-rd-method
#' @export
setMethod("generateCompoundsSIRIUS", "featureGroups", function(fGroups, MSPeakLists, relMzDev = 5, adduct = NULL,
                                                               projectPath = NULL, elements = "CHNOP",
                                                               profile = "qtof", formulaDatabase = NULL,
                                                               fingerIDDatabase = "pubchem", noise = NULL,
                                                               cores = NULL, topMost = 100, topMostFormulas = 5,
                                                               token = NULL, extraOptsGeneral = NULL,
                                                               extraOptsFormula = NULL, verbose = TRUE,
                                                               splitBatches = FALSE, dryRun = FALSE)
{
    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", add = ac)
    checkmate::assertClass(MSPeakLists, "MSPeakLists", add = ac)
    checkmate::assertNumber(relMzDev, lower = 0, finite = TRUE, add = ac)
    checkmate::assertString(projectPath, null.ok = TRUE, add = ac)
    aapply(checkmate::assertString, . ~ elements + profile + fingerIDDatabase, fixed = list(add = ac))
    aapply(checkmate::assertString, . ~ formulaDatabase + fingerIDDatabase, null.ok = TRUE, fixed = list(add = ac))
    checkmate::assertNumber(noise, lower = 0, finite = TRUE, null.ok = TRUE, add = ac)
    checkmate::assertCount(cores, positive = TRUE, null.ok = TRUE, add = ac)
    aapply(checkmate::assertCount, . ~ topMost + topMostFormulas, positive = TRUE, fixed = list(add = ac))
    checkmate::assertString(token, null.ok = TRUE, add = ac)
    aapply(checkmate::assertCharacter, . ~ extraOptsGeneral + extraOptsFormula, null.ok = TRUE, fixed = list(add = ac))
    checkmate::assertFlag(verbose, add = ac)
    checkmate::assertFlag(splitBatches, add = ac)
    checkmate::assertFlag(dryRun, add = ac)
    checkmate::reportAssertions(ac)

    if (length(fGroups) == 0)
        return(compoundsSIRIUS(algorithm = "sirius"))
    
    if (!is.null(projectPath))
        projectPath <- normalizePath(projectPath, mustWork = FALSE)
    
    adduct <- checkAndToAdduct(adduct, fGroups)
    if (is.null(fingerIDDatabase))
        fingerIDDatabase <- if (!is.null(formulaDatabase)) formulaDatabase else "pubchem"

    gCount <- length(fGroups)
    printf("Processing %d feature groups with SIRIUS+CSI:FingerID...\n", gCount)
    
    results <- doSIRIUS(fGroups, MSPeakLists, FALSE, profile, adduct, relMzDev, elements,
                        formulaDatabase, noise, cores, "structure", fingerIDDatabase, topMostFormulas, projectPath,
                        token, extraOptsGeneral, extraOptsFormula, verbose, "compoundsSIRIUS",
                        patRoon:::processSIRIUSCompounds, list(database = fingerIDDatabase, topMost = topMost),
                        splitBatches, dryRun)
    
    # prune empty/NULL results
    if (length(results) > 0)
        results <- results[sapply(results, function(r) !is.null(r$comptab) && nrow(r$comptab) > 0, USE.NAMES = FALSE)]
    
    if (verbose)
    {
        printf("-------\n")
        ngrp <- length(results)
        printf("Assigned %d compounds to %d feature groups (%.2f%%).\n", sum(unlist(lapply(results, function(r) nrow(r$comptab)))),
               ngrp, if (gCount == 0) 0 else ngrp * 100 / gCount)
    }

    return(compoundsSIRIUS(groupAnnotations = lapply(results, "[[", "comptab"), scoreTypes = "score",
                           scoreRanges = lapply(results, "[[", "scRanges"),
                           fingerprints = pruneList(lapply(results, "[[", "fingerprints"), checkZeroRows = TRUE),
                           algorithm = "sirius"))
})

#' @template featAnnSets-gen_args
#' @rdname generateCompoundsSIRIUS
#' @export
setMethod("generateCompoundsSIRIUS", "featureGroupsSet", function(fGroups, MSPeakLists, relMzDev = 5, adduct = NULL,
                                                                  projectPath = NULL, ..., setThreshold = 0,
                                                                  setThresholdAnn = 0, setAvgSpecificScores = FALSE)
{
    checkmate::assertCharacter(projectPath, len = length(sets(fGroups)), null.ok = TRUE)
    sa <- if (!is.null(projectPath)) lapply(projectPath, function(p) list(projectPath = p)) else list()
    generateCompoundsSet(fGroups, MSPeakLists, adduct, generateCompoundsSIRIUS, relMzDev = relMzDev, ...,
                         setThreshold = setThreshold, setThresholdAnn = setThresholdAnn,
                         setAvgSpecificScores = setAvgSpecificScores, setArgs = sa)
})
rickhelmus/patRoon documentation built on April 3, 2024, 6:56 p.m.