R/compounds-metfrag.R

Defines functions MFMPErrorHandler MFMPTimeoutHandler MFMPPrepareHandler MFMPFinishHandler processMFResults generateMetFragRunData initMetFragCLCommand getMetFragExtDB isLocalMetFragDB getMFFragmentInfo cleanFragFormulas unifyMFNames

#' @include main.R
#' @include compounds.R
#' @include mspeaklists.R
#' @include formulas.R
#' @include feature_groups-set.R
NULL

#' Compounds list class for MetFrag results.
#'
#' This class is derived from \code{\link{compounds}} and contains additional
#' specific MetFrag data.
#'
#' Objects from this class are generated by
#' \code{\link{generateCompoundsMetFrag}}
#'
#' @slot settings A list with all general configuration settings passed to
#'   MetFrag. Feature specific items (\emph{e.g.} spectra and precursor masses)
#'   are not contained in this list.
#'
#' @param compoundsMF A \code{compoundsMF} object.
#'
#' @templateVar class compoundsMF
#' @template class-hierarchy
#'
#' @seealso \code{\link{compounds}} and \code{\link{generateCompoundsMetFrag}}
#'
#' @references \insertRef{Ruttkies2016}{patRoon}
#'
#' @export
compoundsMF <- setClass("compoundsMF", slots = c(settings = "list"),
                        contains = "compounds")

setMethod("initialize", "compoundsMF",
          function(.Object, ...) callNextMethod(.Object, algorithm = "metfrag", ...))


#' @describeIn compoundsMF Accessor method for the \code{settings} slot.
#' @aliases settings
#' @export
setMethod("settings", "compoundsMF", function(compoundsMF) compoundsMF@settings)


unifyMFNames <- function(mfr, scoreTypesMF)
{
    unNames <- c(NoExplPeaks = "explainedPeaks",
                 Score = "score",
                 MonoisotopicMass = "neutralMass",
                 SMILES = "SMILES",
                 InChIKey = "InChIKey",
                 InChIKey2 = "InChIKey2",
                 InChIKey1 = "InChIKey1",
                 InChI = "InChI",
                 Identifier = "identifier",
                 PubChemNumberPatents = "numberPatents",
                 fragInfo = "fragInfo", # keep it
                 FragmenterScore = "fragScore",
                 MolecularFormula = "neutral_formula",
                 TrivialName = "compoundName",
                 CompoundName = "compoundName",
                 IUPACName = "compoundName",

                 # PubChem
                 XlogP3 = "XlogP",
                 PubChemNumberPubMedReferences = "pubMedReferences",
                 ChemSpiderNumberPubMedReferences = "pubMedReferences",
                 
                 # PubChemLite
                 FP = "FP",
                 PubMed_Count = "pubMedReferences",
                 Patent_Count = "numberPatents",
                 Related_CIDs = "relatedCIDs",
                 Name2 = "compoundName2", # Nov2019 version
                 Synonym = "compoundName2", # Jan2020 version
                 AgroChemInfo = "agroChemInfo",
                 BioPathway = "bioPathway",
                 DrugMedicInfo = "drugMedicInfo",
                 FoodRelated = "foodRelated",
                 PharmacoInfo = "pharmacoInfo",
                 SafetyInfo = "safetyInfo",
                 ToxicityInfo = "toxicityInfo",
                 DisorderDisease = "disorderDisease",
                 Identification = "identification",
                 KnownUse = "knownUse",
                 FPSum = "annoTypeCount", # Nov2019 version (rename to Jan2020 version)
                 AnnoTypeCount = "annoTypeCount", # Jan2020 version
                 
                 # PubChem OECD PFAS
                 Name = "compoundName",
                 IUPAC_Name = "compoundName2",
                 AnnotHitCount = "annotHitCount",

                 # ChemSpider
                 CHEMSPIDER_XLOGP = "XlogP",
                 CHEMSPIDER_ALOGP = "AlogP",
                 ChemSpiderNumberExternalReferences = "extReferenceCount",
                 ChemSpiderDataSourceCount = "dataSourceCount",
                 ChemSpiderReferenceCount = "referenceCount",
                 ChemSpiderRSCCount = "RSCCount",

                 OfflineMetFusionScore = "metFusionScore",
                 OfflineIndividualMoNAScore = "individualMoNAScore",
                 SmartsSubstructureInclusionScore = "smartsInclusionScore",
                 SmartsSubstructureExclusionScore = "smartsExclusionScore",
                 SuspectListScore = "suspectListScore",
                 RetentionTimeScore = "retentionTimeScore",
                 AutomatedPeakFingerprintAnnotationScore = "peakFingerprintScore",
                 AutomatedLossFingerprintAnnotationScore = "lossFingerprintScore",

                 # CompTox variables
                 CASRN_DTXSID = "CASRN",
                 CPDAT_COUNT = "CPDATCount",
                 ECOTOX = "ECOTOX",
                 NORMANSUSDAT = "NORMANSUSDAT",
                 TOXCAST_PERCENT_ACTIVE = "TOXCASTActive",
                 NUMBER_OF_PUBMED_ARTICLES = "pubMedReferences",
                 MASSBANKEU = "MASSBANKEU",
                 #MONOISOTOPIC_MASS_DTXCID = "neutralMass",
                 QC_LEVEL = "QCLevel",
                 DATA_SOURCES = "dataSources",
                 PUBCHEM_DATA_SOURCES = "pubChemDataSources",
                 TOX21SL = "TOX21SL",
                 "EXPOCAST_MEDIAN_EXPOSURE_PREDICTION_MG/KG-BW/DAY" = "EXPOCASTPredExpo",
                 TOXCAST = "TOXCAST",
                 KEMIMARKET = "KEMIMARKET",
                 MZCLOUD = "MZCLOUD",
                 
                 # CompTox - smoke metadata
                 PubMedNeuro = "pubMedNeuro",
                 CIGARETTES = "CIGARETTES",
                 INDOORCT16 = "INDOORCT16",
                 SRM2585DUST = "SRM2585DUST",
                 SLTCHEMDB = "SLTCHEMDB",
                 THSMOKE = "THSMOKE",
                 
                 # CompTox - wastewater metadata
                 ITNANTIBIOTIC = "ITNANTIBIOTIC",
                 STOFFIDENT = "STOFFIDENT",
                 KEMIMARKET_EXPO = "KEMIMARKET_EXPO",
                 KEMIMARKET_HAZ = "KEMIMARKET_HAZ",
                 REACH2017 = "REACH2017",
                 KEMIWW_WDUIndex = "KEMIWW_WDUIndex",
                 KEMIWW_StpSE = "KEMIWW_StpSE",
                 KEMIWW_SEHitsOverDL = "KEMIWW_SEHitsOverDL",
                 ZINC15PHARMA = "ZINC15PHARMA",
                 PFASMASTER = "PFASMASTER",

                 # FOR-IDENT
                 ForIdentTonnage = "tonnage",
                 ForIdentCategories = "categories",

                 # database generated from TP prediction
                 molNeutralized = "molNeutralized",
                 parent = "parent",
                 ALogP = "ALogP",
                 LogP = "LogP",
                 XLogP = "XLogP",
                 transformation = "transformation",
                 enzyme = "enzyme",
                 evidencedoi = "evidencedoi"

                 )

    unNames <- unNames[names(unNames) %in% names(mfr)] # filter out missing
    setnames(mfr, names(unNames), unNames)
    
    scoreTypesMF <- intersect(scoreTypesMF, names(mfr))

    return(mfr[, union(unNames, scoreTypesMF), with = FALSE]) # filter out any other columns
}

# MetFragCL gives bracketed fragment formulas including charge and weird (de)protonation adducts
# For comparison with other results these should be converted to a simple formula format
cleanFragFormulas <- function(forms)
{
    # get ionization adduct (if any)
    plusminpat <- "[\\-\\+]"
    addpat <- sprintf(".*\\]%s([[:alnum:]]+)%s", plusminpat, plusminpat)
    adducts <- ifelse(grepl(addpat, forms), gsub(addpat, "\\1", forms), "")
    
    forms <- gsub(sprintf("%s([[:alnum:]]+)%s$", plusminpat, plusminpat), "", forms) # remove adducts and charge
    forms <- gsub("\\[|\\]", "", forms) # remove brackets

    # add single counts to hydrogen adducts without count (e.g. -H becomes -1H)
    forms <- gsub("([-\\+])H", "\\11H", forms)

    protAdducts <- regmatches(forms, gregexpr("([-\\+][0-9]+)H", forms)) # get "-1H", "+2H" etc

    baseForms <- gsub("^([[:alnum:]]+)[-|\\+].*", "\\1", forms) # Get 'regular' part of formula
    baseForms <- paste0(baseForms, adducts) # re-add adducts
    
    # handle protonation adducts
    return(sapply(seq_along(forms), function(fi)
    {
        addHCount <- sum(as.integer(gsub("H", "", protAdducts[[fi]])))
        flist <- splitFormulaToList(baseForms[[fi]])

        if (addHCount != 0)
        {
            if (!"H" %in% names(flist))
            {
                flist <- c(flist, addHCount)
                names(flist)[length(flist)] <- "H"
            }
            else
                flist[["H"]] <- flist[["H"]] + addHCount
        }
        return(simplifyFormula(formulaListToString(flist)))
    }))
}

getMFFragmentInfo <- function(spec, mfResult, adduct)
{
    if (mfResult$NoExplPeaks == 0 || mfResult$FormulasOfExplPeaks == "NA")
        return(data.table(mz = numeric(0), ion_formula = character(0), neutral_loss = character(0),
                          score = numeric(0), PLID = numeric(0)))

    # format of FormulasOfExplPeaks: list of strings with mz1:formula1;mz2:formula2;...
    fi <- unlist(strsplit(mfResult$FormulasOfExplPeaks, "[;:]")) # split into list with subsequent m/z / formula pairs

    ret <- data.table(mz = as.numeric(fi[c(TRUE, FALSE)]),
                      ion_formula = cleanFragFormulas(fi[c(FALSE, TRUE)]),
                      ion_formula_MF = fi[c(FALSE, TRUE)])
    if (!is.null(mfResult[["FragmenterScore_Values"]]))
        ret[, score := as.numeric(unlist(strsplit(mfResult$FragmenterScore_Values, ";")))]
    ionform <- calculateIonFormula(mfResult$MolecularFormula, adduct)
    ret[, neutral_loss := sapply(ion_formula, subtractFormula, formula1 = ionform)]
    ret[, PLID := sapply(mz, function(omz) spec[which.min(abs(omz - mz))]$ID)]
    setcolorder(ret, c("mz", "ion_formula", "ion_formula_MF", "neutral_loss"))

    return(ret)
}

isLocalMetFragDB <- function(database) database %in% c("sdf", "psv", "csv", "comptox", "pubchemlite")

getMetFragExtDB <- function(localDB, database)
{
    extDB <- if (!is.null(localDB)) localDB else NULL
    if ((database == "comptox" || database == "pubchemlite") && is.null(extDB))
        extDB <- getExtDepPath(if (database == "comptox") "metfragct" else "metfragpcl", NULL)
    
    if (!is.null(extDB))
        extDB <- normalizePath(extDB)
    
    if (is.null(extDB) || !file.exists(extDB))
    {
        stop("No external database file set. This should be set as part of the extraOpts argument ",
             "e.g. extraOpts = list(LocalDatabasePath = \"C:/CompTox_17March2019_SelectMetaData.csv\")",
             call. = FALSE)
    }
    
    return(extDB)
}

initMetFragCLCommand <- function(mfSettings, spec, mfBin)
{
    paramFile <- tempfile("parameters", fileext = ".txt")
    paramCon <- file(paramFile, "w")

    writeParam <- function(param, val) cat(sprintf("%s = %s\n", param, paste0(as.character(val), collapse = ",")), file = paramCon)

    for (param in names(mfSettings))
        writeParam(param, mfSettings[[param]])

    outFile <- tempfile("results", fileext = ".csv")
    writeParam("MetFragCandidateWriter", "CSV")
    writeParam("SampleName", basename(tools::file_path_sans_ext(outFile)))
    writeParam("ResultsPath", dirname(outFile))

    specFile <- tempfile("spectrum", fileext = ".txt")
    write.table(spec[, c("mz", "intensity")], specFile, sep = "\t", row.names = FALSE, col.names = FALSE)
    writeParam("PeakListPath", specFile)

    close(paramCon)

    return(list(command = "java", args = c("-jar", mfBin, paramFile), outFile = outFile))
}

generateMetFragRunData <- function(fGroups, MSPeakLists, mfSettings, extDB, topMost, identifiers, method,
                                   adduct, database, errorRetries, timeoutRetries)
{
    gNames <- names(fGroups)
    gTable <- groupTable(fGroups)
    gInfo <- groupInfo(fGroups)
    anaInfo <- analysisInfo(fGroups)
    
    baseHash <- makeHash(method, topMost)
    if (!is.null(extDB))
        baseHash <- makeHash(baseHash, makeFileHash(extDB))

    fgAdd <- getFGroupAdducts(names(fGroups), annotations(fGroups), adduct, "metfrag")

    ret <- Map(gNames, fgAdd$grpAdducts, fgAdd$grpAdductsChr, f = function(grp, add, addChr)
    {
        if (!is.null(identifiers) && is.null(identifiers[[grp]]))
            return(NULL)
        if (is.null(MSPeakLists[[grp]][["MS"]]))
            return(NULL)

        spec <- MSPeakLists[[grp]][["MSMS"]]
        if (is.null(spec))
            return(NULL)

        mfSettings$IonizedPrecursorMass <- MSPeakLists[[grp]]$MS[precursor == TRUE]$mz
        mfSettings$PrecursorIonType <- addChr
        mfSettings$ExperimentalRetentionTimeValue <- gInfo[grp, "rts"] / 60

        if (!is.null(identifiers))
            mfSettings$PrecursorCompoundIDs <- identifiers[[grp]]
        
        hash <- makeHash(baseHash, mfSettings, spec)

        logf <- paste0("mfcl-", grp, ".txt")
        
        return(list(hash = hash, gName = grp, spec = spec, mfSettings = mfSettings,
                    adduct = add, database = database, topMost = topMost,
                    errorRetries = errorRetries, timeoutRetries = timeoutRetries,
                    logFile = logf))
    })

    return(ret[!sapply(ret, is.null)])
}

processMFResults <- function(comptab, runData, lfile = "")
{
    scRanges <- list()

    if (nrow(comptab) > 0)
    {
        compsc <- compoundScorings("metfrag")
        compsc <- compsc[compsc$metfrag %in% names(comptab), ]
        allScoreCols <- union(compsc$metfrag, runData$mfSettings$MetFragScoreTypes)

        if (length(allScoreCols) > 0)
        {
            # fix up score and suspect list columns: dash --> 0
            # NOTE: do as.numeric as values with '-' will cause the column to be a character
            comptab[, (allScoreCols) := lapply(.SD, function(x) as.numeric(ifelse(x == "-", 0, x))),
                    .SDcols = allScoreCols]

            scRanges <- sapply(allScoreCols, function(sc) range(comptab[[sc]]), simplify = FALSE)
            namesInGenSC <- match(names(scRanges), compsc$metfrag)
            notNA <- !is.na(namesInGenSC)
            if (any(notNA))
                names(scRanges)[notNA] <- compsc$name[notNA]
        }

        if (!is.null(runData$topMost) && nrow(comptab) > runData$topMost)
            comptab <- comptab[seq_len(runData$topMost)]

        # make character: needed for getMFFragmentInfo()
        # note: this column is not present in empty tables so can't do this with colClasses
        if (!is.null(comptab[["FragmenterScore_Values"]]))
            comptab[, FragmenterScore_Values := as.character(FragmenterScore_Values)]

        # fill in fragment info
        # NOTE: double wrap in list to nest table
        if (!is.null(lfile))
            cat(sprintf("\n%s - Done! Processing frags...\n", date()), file = lfile, append = TRUE)
        for (r in seq_len(nrow(comptab)))
            set(comptab, r, "fragInfo", list(list(getMFFragmentInfo(runData$spec, comptab[r], runData$adduct))))
        if (!is.null(lfile))
            cat(sprintf("\n%s - Done!\n", date()), file = lfile, append = TRUE)

        # unify column names & filter unnecessary columns
        comptab <- unifyMFNames(comptab, runData$mfSettings$MetFragScoreTypes)

        if (!is.null(comptab[["CASRN"]]))
            comptab[, CASRN := sub("CASRN:", "", CASRN, fixed = TRUE)] # remove "CASRN" prefix
        
        # sometimes these can be interpreted as dates!
        # also convert identifiers to characters, which comes in handy for set consensus merging
        for (col in c("compoundName", "compoundName2", "CASRN", "identifier", "relatedCIDs"))
        {
            if (!is.null(comptab[[col]]))
                comptab[, (col) := as.character(get(col))]
        }

        comptab[, database := runData$database]
    }

    return(list(comptab = comptab, scRanges = scRanges))
}

MFMPFinishHandler <- function(cmd)
{
    comptab <- data.table::fread(cmd$outFile, colClasses = c(Identifier = "character"))
    procres <- patRoon:::processMFResults(comptab, cmd, cmd$stderrFile)
    return(procres)
}

MFMPPrepareHandler <- function(cmd)
{
    mfBin <- getExtDepPath("metfragcl")
    
    if (!nzchar(Sys.which("java")))
        stop("Please make sure that java is installed and its location is correctly set in PATH.")
    
    if (isLocalMetFragDB(cmd$database))
    {
        extDB <- patRoon:::getMetFragExtDB(cmd$mfSettings[["LocalDatabasePath"]], cmd$database)
        # NOTE: this will set LocalDatabasePath in case only options() were set or update it with normalizePath()
        cmd$mfSettings$LocalDatabasePath <- normalizePath(extDB)
    }
    
    return(c(cmd, patRoon:::initMetFragCLCommand(cmd$mfSettings, cmd$spec, mfBin)))
}

MFMPTimeoutHandler <- function(cmd, retries)
{
    if (retries >= cmd$timeoutRetries)
    {
        warning(sprintf("Could not run MetFrag for %s: timeout. Log: %s", cmd$gName, cmd$logFile))
        return(FALSE)
    }
    warning(sprintf("Restarting timed out MetFrag command for %s (retry %d/%d)",
                    cmd$gName, retries+1, cmd$timeoutRetries))
    return(TRUE)
}

MFMPErrorHandler <- function(cmd, exitStatus, retries)
{
    if (!is.na(exitStatus) && exitStatus <= 6) # some error thrown by MF
    {
        if (retries >= cmd$errorRetries)
        {
            warning(sprintf("Could not run MetFrag for %s - exit code: %d - Log: %s",
                            cmd$gName, exitStatus, cmd$logFile))
            return(FALSE)
        }
        warning(sprintf("Restarting failed MetFrag command for %s - exit: %d (retry %d/%d)",
                        cmd$gName, exitStatus, retries+1, cmd$errorRetries))
        return(TRUE)
    }
    
    # some other error (e.g. java not present)
    return(sprintf("Fatal: Failed to execute MetFragCL for %s - exit code: %d - Log: %s",
                   cmd$gName, exitStatus, cmd$logFile))
}

#' Compound annotation with MetFrag
#'
#' Uses the \pkg{metfRag} package or \command{MetFrag CL} for compound identification (see
#' \url{http://ipb-halle.github.io/MetFrag/}).
#'
#' @templateVar algo MetFrag
#' @templateVar do generate compound candidates
#' @templateVar generic generateCompounds
#' @templateVar algoParam metfrag
#' @template algo_generator
#'
#' @details Several online compound databases such as \href{https://pubchem.ncbi.nlm.nih.gov/}{PubChem} and
#'   \href{http://www.chemspider.com/}{ChemSpider} may be chosen for retrieval of candidate structures. This method
#'   requires the availability of MS/MS data, and feature groups without it will be ignored. Many options exist to score
#'   and filter resulting data, and it is highly suggested to optimize these to improve results. The \command{MetFrag}
#'   options \code{PeakList}, \code{IonizedPrecursorMass} and \code{ExperimentalRetentionTimeValue} (in minutes) fields
#'   are automatically set from feature data.
#'
#' @param method Which method should be used for MetFrag execution: \code{"CL"} for \command{MetFragCL} and \code{"R"}
#'   for \command{MetFragR}. The former is usually much faster and recommended.
#' @param timeout Maximum time (in seconds) before a metFrag query for a feature group is stopped. Also see
#'   \code{timeoutRetries} argument.
#' @param timeoutRetries Maximum number of retries after reaching a timeout before completely skipping the metFrag query
#'   for a feature group. Also see \code{timeout} argument.
#' @param errorRetries Maximum number of retries after an error occurred. This may be useful to handle e.g. connection
#'   errors.
#' @param dbRelMzDev Relative mass deviation (in ppm) for database search. Sets the
#'   \option{DatabaseSearchRelativeMassDeviation} option.
#' @param fragRelMzDev Relative mass deviation (in ppm) for fragment matching. Sets the
#'   \option{FragmentPeakMatchRelativeMassDeviation} option.
#' @param fragAbsMzDev Absolute mass deviation (in Da) for fragment matching. Sets the
#'   \option{FragmentPeakMatchAbsoluteMassDeviation} option.
#' @param database Compound database to use. Valid values are: \code{"pubchem"}, \code{"chemspider"},
#'   \code{"for-ident"}, \code{"comptox"}, \code{"pubchemlite"}, \code{"kegg"}, \code{"sdf"}, \code{"psv"} and
#'   \code{"csv"}. See section below for more information. Sets the \code{MetFragDatabaseType} option.
#' @param extendedPubChem If \code{database="pubchem"}: whether to use the \emph{extended} database that includes
#'   information for compound scoring (\emph{i.e.} number of patents/PubMed references). Note that downloading
#'   candidates from this database might take extra time. Valid values are: \code{FALSE} (never use it), \code{TRUE}
#'   (always use it) or \code{"auto"} (default, use if specified scorings demand it).
#' @param chemSpiderToken A character string with the \href{http://www.chemspider.com/AboutServices.aspx}{ChemSpider
#'   security token} that should be set when the ChemSpider database is used. Sets the \option{ChemSpiderToken} option.
#' @param scoreTypes A character vector defining the scoring types. See the \verb{Scorings} section below for more
#'   information. Note that both generic and \command{MetFrag} specific names are accepted (\emph{i.e.} \code{name} and
#'   \code{metfrag} columns returned by \code{\link{compoundScorings}}). When a local database is used, the name should
#'   match what is given there (\code{e.g} column names when \code{database=csv}). Note that MetFrag may still report
#'   other scoring data, however, these are not used for ranking. Sets the \option{MetFragScoreTypes} option.
#' @param scoreWeights Numeric vector containing weights of the used scoring types. Order is the same as set in
#'   \code{scoreTypes}. Values are recycled if necessary. Sets the \option{MetFragScoreWeights} option.
#' @param preProcessingFilters,postProcessingFilters A character vector defining pre/post filters applied before/after
#'   fragmentation and scoring (\emph{e.g.} \code{"UnconnectedCompoundFilter"}, \code{"IsotopeFilter"},
#'   \code{"ElementExclusionFilter"}). Some methods require further options to be set. For all filters and more
#'   information refer to the \verb{Candidate Filters} section on the
#'   \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR homepage}. Sets the
#'   \option{MetFragPreProcessingCandidateFilter} and \code{MetFragPostProcessingCandidateFilter} options.
#' @param maxCandidatesToStop If more than this number of candidate structures are found then processing will be aborted
#'   and no results this feature group will be reported. Low values increase the chance of missing data, whereas too
#'   high values will use too much computer resources and signficantly slowdown the process. Sets the
#'   \option{MaxCandidateLimitToStop} option.
#' @param identifiers A \code{list} containing for each feature group a character vector with database identifiers that
#'   should be used to find candidates for a feature group (the list should be named by feature group names). If
#'   \code{NULL} all relevant candidates will be retrieved from the specified database. An example usage scenario is to
#'   obtain the list of candidate identifiers from a \code{\link{compounds}} object obtained with
#'   \code{\link{generateCompoundsSIRIUS}} using the \code{\link{identifiers}} method. This way, only those candidates
#'   will be searched by MetFrag that were generated by SIRIUS+CSI:FingerID. Sets the \option{PrecursorCompoundIDs}
#'   option.
#' @param extraOpts A named \code{list} containing further settings \command{MetFrag}. See the
#'   \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR} and
#'   \href{http://ipb-halle.github.io/MetFrag/projects/metfragcl/}{MetFrag CL} homepages for all available options. Set
#'   to \code{NULL} to ignore.
#'
#' @template adduct-arg
#' @template comp_algo-args
#'
#' @inheritParams generateCompounds
#'
#' @return \code{generateCompoundsMetFrag} returns a \code{\link{compoundsMF}} object.
#'
#' @section Scorings: \command{MetFrag} supports \emph{many} different scorings to rank candidates. The
#'   \code{\link{compoundScorings}} function can be used to get an overview: (some columns are omitted)
#'
#' @eval paste("@@section Scorings:", patRoon:::tabularRD(patRoon::compoundScorings("metfrag")[, 1:3]))
#'
#' @section Scorings: In addition, the \code{\link{compoundScorings}} function is also useful to programmatically
#'   generate a set of scorings to be used for ranking with \command{MetFrag}. For instance, the following can be given
#'   to the \code{scoreTypes} argument to use all default scorings for PubChem: \code{compoundScorings("metfrag",
#'   "pubchem", onlyDefault=TRUE)$name}.
#'
#'   For all \command{MetFrag} scoring types refer to the \verb{Candidate Scores} section on the
#'   \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR homepage}.
#'
#' @section Usage of MetFrag databases: When \code{database="chemspider"} setting the \code{chemSpiderToken} argument is
#'   mandatory.
#'
#'   If a local database is chosen via \code{sdf}, \code{psv}, or \code{csv} then its file location should be set with
#'   the \code{LocalDatabasePath} value via the \code{extraOpts} argument. For example: \code{extraOpts =
#'   list(LocalDatabasePath = "C:/myDB.csv")}.
#'
#'   If \code{database="pubchemlite"} or \code{database="comptox"} and \pkg{patRoonExt} is \emph{not} installed then the
#'   file location must be specified as above or by setting the
#'   \code{patRoon.path.MetFragPubChemLite}/\code{patRoon.path.MetFragCompTox} option. See the installation section in
#'   the handbook for more details.
#'
#' @templateVar what \code{generateCompoundsMetFrag}
#' @template uses-multiProc
#'
#' @section Parallelization: When local database files are used with \code{generateCompoundsMetFrag} (\emph{e.g.} when
#'   \code{database} is set to \code{"pubchemlite"}, \code{"csv"} etc.) and \option{patRoon.MP.method="future"}, then
#'   the database file must be present on all the nodes. When \code{pubchemlite} or \code{comptox} is used, the location
#'   for these databases can be configured on the host with the respective package options
#'   (\option{patRoon.path.MetFragPubChemLite} and \option{patRoon.path.MetFragCompTox}) or made available by installing
#'   the \pkg{patRoonExt} package. Note that these files must \emph{also} be present on the local host computer, even if
#'   it is not participating in computations.
#'
#' @references \insertRef{Ruttkies2016}{patRoon}
#'
#' @templateVar what generateCompoundsMetFrag
#' @template main-rd-method
#' @export
setMethod("generateCompoundsMetFrag", "featureGroups", function(fGroups, MSPeakLists, method = "CL",
                                                                timeout = 300, timeoutRetries = 2, errorRetries = 2, topMost = 100,
                                                                dbRelMzDev = 5, fragRelMzDev = 5, fragAbsMzDev = 0.002, adduct = NULL,
                                                                database = "pubchem", extendedPubChem = "auto", chemSpiderToken = "",
                                                                scoreTypes = compoundScorings("metfrag", database, onlyDefault = TRUE)$name,
                                                                scoreWeights = 1.0,
                                                                preProcessingFilters = c("UnconnectedCompoundFilter", "IsotopeFilter"),
                                                                postProcessingFilters = c("InChIKeyFilter"),
                                                                maxCandidatesToStop = 2500, identifiers = NULL, extraOpts = NULL)
{
    if (method == "R")
        checkPackage("metfRag", "c-ruttkies/MetFragR")

    ac <- checkmate::makeAssertCollection()
    checkmate::assertClass(fGroups, "featureGroups", add = ac)
    checkmate::assertClass(MSPeakLists, "MSPeakLists", add = ac)
    checkmate::assertChoice(method, c("CL", "R"), add = ac)
    aapply(checkmate::assertNumber, . ~ timeout + dbRelMzDev + fragRelMzDev + fragAbsMzDev,
           lower = 0, finite = TRUE, fixed = list(add = ac))
    aapply(checkmate::assertCount, . ~ timeoutRetries + errorRetries, fixed = list(add = ac))
    aapply(checkmate::assertCount, . ~ topMost + maxCandidatesToStop, positive = TRUE, fixed = list(add = ac))
    checkmate::assertString(chemSpiderToken, add = ac)
    checkmate::assertChoice(database, c("pubchem", "chemspider", "kegg", "for-ident", "sdf", "psv", "csv",
                                        "comptox", "pubchemlite"), add = ac)
    checkmate::assert(checkmate::checkFlag(extendedPubChem),
                      checkmate::checkChoice(extendedPubChem, "auto"),
                      .var.name = "extendedPubChem")
    aapply(checkmate::assertCharacter, . ~ scoreTypes + preProcessingFilters + postProcessingFilters,
           any.missing = FALSE, fixed = list(add = ac))
    checkmate::assertNumeric(scoreWeights, lower = 0, finite = TRUE, any.missing = FALSE, min.len = 1, add = ac)
    aapply(checkmate::assertList, . ~ identifiers + extraOpts, any.missing = FALSE,
           names = "unique", null.ok = TRUE, fixed = list(add = ac))

    compsScores <- compoundScorings("metfrag", database)
    isLocalDB <- isLocalMetFragDB(database)
    allScoringNames <- union(compsScores$name, compsScores$metfrag)
    # allow freely definable scorings from local databases
    if (!isLocalDB)
        checkmate::assertSubset(scoreTypes, allScoringNames, add = ac)

    checkmate::reportAssertions(ac)

    if (length(fGroups) == 0)
        return(compoundsMF())
    
    adduct <- checkAndToAdduct(adduct, fGroups)

    anaInfo <- analysisInfo(fGroups)
    ftind <- groupFeatIndex(fGroups)
    gTable <- groupTable(fGroups)
    gNames <- names(fGroups)
    gCount <- length(fGroups)
    gInfo <- groupInfo(fGroups)
    pLists <- peakLists(MSPeakLists)

    if (!is.logical(extendedPubChem) && database == "pubchem")
    {
        PCScorings <- compoundScorings("metfrag", "pubchem", includeNoDB = FALSE)
        extendedPubChem <- any(scoreTypes %in% union(PCScorings$name, PCScorings$metfrag))
    }

    # convert to metfrag naming scheme
    databaseMF <- switch(database,
                         pubchem = "PubChem",
                         chemspider = "ChemSpider",
                         kegg = "KEGG",
                         "for-ident" = "FOR-IDENT",
                         sdf = "LocalSDF",
                         psv = "LocalPSV",
                         csv = "LocalCSV",
                         comptox = "LocalCSV",
                         pubchemlite = "LocalCSV")
    if (databaseMF == "PubChem" && extendedPubChem) # can't seem to combine this conditional in above switch...
        databaseMF <- "ExtendedPubChem"

    scoreTypesMF <- setdiff(scoreTypes, c("score", "Score")) # final score is to be determined
    isSimplScore <- scoreTypesMF %in% compsScores$name
    if (any(isSimplScore))
        scoreTypesMF[isSimplScore] <- compsScores[match(scoreTypesMF[isSimplScore], compsScores$name), "metfrag"]

    scoreWeights <- rep(scoreWeights, length.out = length(scoreTypesMF))
    
    extDB <- NULL
    if (isLocalDB)
    {
        extDB <- getMetFragExtDB(extraOpts[["LocalDatabasePath"]], database)
        
        # only keep scorings that are present in the DB
        # BUG: nrows=0 gives internal data.table error, so read 1 line
        scInDB <- names(fread(extDB, nrows = 1))
        
        isDBType <- scoreTypesMF %in% compsScores[nzchar(compsScores$database), "metfrag"]
        keep <- !isDBType | scoreTypesMF %in% scInDB

        scoreTypesMF <- scoreTypesMF[keep]; scoreWeights <- scoreWeights[keep]
    }
    
    mfSettings <- list(DatabaseSearchRelativeMassDeviation = dbRelMzDev,
                       FragmentPeakMatchRelativeMassDeviation = fragRelMzDev,
                       FragmentPeakMatchAbsoluteMassDeviation = fragAbsMzDev,
                       MetFragDatabaseType = databaseMF, MetFragScoreTypes = scoreTypesMF,
                       MetFragScoreWeights = scoreWeights,
                       MetFragPreProcessingCandidateFilter = preProcessingFilters,
                       MetFragPostProcessingCandidateFilter = postProcessingFilters,
                       MaxCandidateLimitToStop = maxCandidatesToStop,
                       UseSmiles = "true")

    if (!is.null(chemSpiderToken) && nzchar(chemSpiderToken))
        mfSettings$ChemSpiderToken <- chemSpiderToken

    if (!is.null(extraOpts))
    {
        # add (or replace) any extra options
        mfSettings <- modifyList(mfSettings, extraOpts)
    }

    setHash <- makeHash(fGroups, pLists, method, mfSettings, topMost, identifiers)
    if (!is.null(extDB))
        setHash <- makeHash(setHash, makeFileHash(extDB))
    
    printf("Identifying %d feature groups with MetFrag...\n", gCount)

    runData <- generateMetFragRunData(fGroups, MSPeakLists, mfSettings, extDB, topMost, identifiers, method,
                                      adduct, database, errorRetries, timeoutRetries)

    if (method == "CL")
    {
        results <- executeMultiProcess(runData, finishHandler = patRoon:::MFMPFinishHandler,
                                       timeoutHandler = patRoon:::MFMPTimeoutHandler,
                                       errorHandler = patRoon:::MFMPErrorHandler,
                                       prepareHandler = patRoon:::MFMPPrepareHandler,
                                       cacheName = "compoundsMetFrag", setHash = setHash,
                                       procTimeout = timeout, delayBetweenProc = 1000,
                                       logSubDir = "metfrag")
    }
    else
    {
        cacheDB <- openCacheDBScope()
        cachedSet <- loadCacheSet("compoundsMetFrag", setHash, cacheDB)
        resultHashes <- vector("character", length(gNames))
        names(resultHashes) <- gNames
        
        cachedResults <- sapply(runData, function(rd)
        {
            resultHashes[rd$gName] <<- rd$hash
            comptab <- NULL
            if (!is.null(cachedSet))
                comptab <- cachedSet[[rd$hash]]
            if (is.null(comptab))
                comptab <- loadCacheData("compoundsMetFrag", rd$hash, cacheDB)
            return(comptab)
        }, simplify = FALSE)
        cachedResults <- cachedResults[!sapply(cachedResults, is.null)]
        
        runData <- runData[setdiff(names(runData), names(cachedResults))] # remove cached results
        
        if (length(runData) > 0)
        {
            prog <- openProgBar(0, gCount)
            
            results <- lapply(runData, function(rd)
            {
                rd$mfSettings$PeakList <- as.matrix(rd$spec[, c("mz", "intensity")])
                comptab <- metfRag::run.metfrag(rd$mfSettings)
                jgc() # hopefully reduce some memory usage
                
                if (nrow(comptab) > 0)
                    comptab <- unFactorDF(comptab)
                
                comptab <- as.data.table(comptab)
                
                procres <- processMFResults(comptab, rd)
                
                if (nrow(procres$comptab) > 0)
                {
                    # BUG: metfRag seems to give second duplicate results where only NoExplPeaks may differ and have an incorrect value.
                    # for now, just remove all duplicates and re-assign NoExplPeaks
                    procres$comptab <- procres$comptab[!duplicated(identifier)]
                    procres$comptab[, explainedPeaks := sapply(fragInfo, nrow)]
                }
                
                saveCacheData("compoundsMetFrag", procres, rd$hash, cacheDB)
                
                setTxtProgressBar(prog, match(rd$gName, gNames))
                
                return(procres)
            })
            
            setTxtProgressBar(prog, gCount)
            close(prog)
        }
        else
            results <- list()
        
        if (length(cachedResults) > 0)
        {
            results <- c(results, cachedResults)
            results <- results[intersect(gNames, names(results))] # re-order
        }
        
        if (is.null(cachedSet))
            saveCacheSet("compoundsMetFrag", resultHashes[nzchar(resultHashes)], setHash, cacheDB)
    }
    
    # prune empty/NULL results
    if (length(results) > 0)
        results <- results[sapply(results, function(r) !is.null(r$comptab) && nrow(r$comptab) > 0, USE.NAMES = FALSE)]

    ngrp <- length(results)
    printf("Loaded %d compounds from %d features (%.2f%%).\n", sum(unlist(lapply(results, function(r) nrow(r$comptab)))),
           ngrp, if (gCount == 0) 0 else ngrp * 100 / gCount)

    # convert scoreTypes to generic names
    scoreTypes <- union(scoreTypes, "score") # ensure final score is in and remove duplicates
    isMFScore <- scoreTypes %in% compsScores$metfrag
    if (any(isMFScore))
        scoreTypes[isMFScore] <- compsScores[match(scoreTypes[isMFScore], compsScores$metfrag), "name"]
    
    scoreRanges <- lapply(results, "[[", "scRanges")
    
    # Ensure that scoreTypes doesn't contain anything 'extra', e.g. this can happen when the user specified scores not
    # present in the database.
    scoreTypes <- if (length(scoreRanges) > 0) intersect(scoreTypes, unlist(lapply(scoreRanges, names))) else character()

    return(compoundsMF(groupAnnotations = lapply(results, "[[", "comptab"), scoreTypes = scoreTypes,
                       scoreRanges = scoreRanges, settings = mfSettings))
})

#' @template featAnnSets-gen_args
#' @rdname generateCompoundsMetFrag
#' @export
setMethod("generateCompoundsMetFrag", "featureGroupsSet", function(fGroups, MSPeakLists, method = "CL", timeout = 300,
                                                                   timeoutRetries = 2, errorRetries = 2, topMost = 100,
                                                                   dbRelMzDev = 5, fragRelMzDev = 5, fragAbsMzDev = 0.002,
                                                                   adduct = NULL, ..., setThreshold = 0,
                                                                   setThresholdAnn = 0, setAvgSpecificScores = FALSE)
{
    generateCompoundsSet(fGroups, MSPeakLists, adduct, generateCompoundsMetFrag, method = method, timeout = timeout,
                         timeoutRetries = timeoutRetries, errorRetries = errorRetries, topMost = topMost,
                         dbRelMzDev = dbRelMzDev, fragRelMzDev = fragRelMzDev, fragAbsMzDev = fragAbsMzDev, ...,
                         setThreshold = setThreshold, setThresholdAnn = setThresholdAnn,
                         setAvgSpecificScores = setAvgSpecificScores)
})
rickhelmus/patRoon documentation built on Sept. 17, 2024, 4:32 p.m.