#' @include main.R
#' @include compounds.R
#' @include mspeaklists.R
#' @include formulas.R
#' @include feature_groups-set.R
NULL
#' Compounds list class for MetFrag results.
#'
#' This class is derived from \code{\link{compounds}} and contains additional
#' specific MetFrag data.
#'
#' Objects from this class are generated by
#' \code{\link{generateCompoundsMetFrag}}
#'
#' @slot settings A list with all general configuration settings passed to
#' MetFrag. Feature specific items (\emph{e.g.} spectra and precursor masses)
#' are not contained in this list.
#'
#' @param compoundsMF A \code{compoundsMF} object.
#'
#' @templateVar class compoundsMF
#' @template class-hierarchy
#'
#' @seealso \code{\link{compounds}} and \code{\link{generateCompoundsMetFrag}}
#'
#' @references \insertRef{Ruttkies2016}{patRoon}
#'
#' @export
compoundsMF <- setClass("compoundsMF", slots = c(settings = "list"),
contains = "compounds")
setMethod("initialize", "compoundsMF",
function(.Object, ...) callNextMethod(.Object, algorithm = "metfrag", ...))
#' @describeIn compoundsMF Accessor method for the \code{settings} slot.
#' @aliases settings
#' @export
setMethod("settings", "compoundsMF", function(compoundsMF) compoundsMF@settings)
unifyMFNames <- function(mfr, scoreTypesMF)
{
unNames <- c(NoExplPeaks = "explainedPeaks",
Score = "score",
MonoisotopicMass = "neutralMass",
SMILES = "SMILES",
InChIKey = "InChIKey",
InChIKey2 = "InChIKey2",
InChIKey1 = "InChIKey1",
InChI = "InChI",
Identifier = "identifier",
PubChemNumberPatents = "numberPatents",
fragInfo = "fragInfo", # keep it
FragmenterScore = "fragScore",
MolecularFormula = "neutral_formula",
TrivialName = "compoundName",
CompoundName = "compoundName",
IUPACName = "compoundName",
# PubChem
XlogP3 = "XlogP",
PubChemNumberPubMedReferences = "pubMedReferences",
ChemSpiderNumberPubMedReferences = "pubMedReferences",
# PubChemLite
FP = "FP",
PubMed_Count = "pubMedReferences",
Patent_Count = "numberPatents",
Related_CIDs = "relatedCIDs",
Name2 = "compoundName2", # Nov2019 version
Synonym = "compoundName2", # Jan2020 version
AgroChemInfo = "agroChemInfo",
BioPathway = "bioPathway",
DrugMedicInfo = "drugMedicInfo",
FoodRelated = "foodRelated",
PharmacoInfo = "pharmacoInfo",
SafetyInfo = "safetyInfo",
ToxicityInfo = "toxicityInfo",
DisorderDisease = "disorderDisease",
Identification = "identification",
KnownUse = "knownUse",
FPSum = "annoTypeCount", # Nov2019 version (rename to Jan2020 version)
AnnoTypeCount = "annoTypeCount", # Jan2020 version
# PubChem OECD PFAS
Name = "compoundName",
IUPAC_Name = "compoundName2",
AnnotHitCount = "annotHitCount",
# ChemSpider
CHEMSPIDER_XLOGP = "XlogP",
CHEMSPIDER_ALOGP = "AlogP",
ChemSpiderNumberExternalReferences = "extReferenceCount",
ChemSpiderDataSourceCount = "dataSourceCount",
ChemSpiderReferenceCount = "referenceCount",
ChemSpiderRSCCount = "RSCCount",
OfflineMetFusionScore = "metFusionScore",
OfflineIndividualMoNAScore = "individualMoNAScore",
SmartsSubstructureInclusionScore = "smartsInclusionScore",
SmartsSubstructureExclusionScore = "smartsExclusionScore",
SuspectListScore = "suspectListScore",
RetentionTimeScore = "retentionTimeScore",
AutomatedPeakFingerprintAnnotationScore = "peakFingerprintScore",
AutomatedLossFingerprintAnnotationScore = "lossFingerprintScore",
# CompTox variables
CASRN_DTXSID = "CASRN",
CPDAT_COUNT = "CPDATCount",
ECOTOX = "ECOTOX",
NORMANSUSDAT = "NORMANSUSDAT",
TOXCAST_PERCENT_ACTIVE = "TOXCASTActive",
NUMBER_OF_PUBMED_ARTICLES = "pubMedReferences",
MASSBANKEU = "MASSBANKEU",
#MONOISOTOPIC_MASS_DTXCID = "neutralMass",
QC_LEVEL = "QCLevel",
DATA_SOURCES = "dataSources",
PUBCHEM_DATA_SOURCES = "pubChemDataSources",
TOX21SL = "TOX21SL",
"EXPOCAST_MEDIAN_EXPOSURE_PREDICTION_MG/KG-BW/DAY" = "EXPOCASTPredExpo",
TOXCAST = "TOXCAST",
KEMIMARKET = "KEMIMARKET",
MZCLOUD = "MZCLOUD",
# CompTox - smoke metadata
PubMedNeuro = "pubMedNeuro",
CIGARETTES = "CIGARETTES",
INDOORCT16 = "INDOORCT16",
SRM2585DUST = "SRM2585DUST",
SLTCHEMDB = "SLTCHEMDB",
THSMOKE = "THSMOKE",
# CompTox - wastewater metadata
ITNANTIBIOTIC = "ITNANTIBIOTIC",
STOFFIDENT = "STOFFIDENT",
KEMIMARKET_EXPO = "KEMIMARKET_EXPO",
KEMIMARKET_HAZ = "KEMIMARKET_HAZ",
REACH2017 = "REACH2017",
KEMIWW_WDUIndex = "KEMIWW_WDUIndex",
KEMIWW_StpSE = "KEMIWW_StpSE",
KEMIWW_SEHitsOverDL = "KEMIWW_SEHitsOverDL",
ZINC15PHARMA = "ZINC15PHARMA",
PFASMASTER = "PFASMASTER",
# FOR-IDENT
ForIdentTonnage = "tonnage",
ForIdentCategories = "categories",
# database generated from TP prediction
molNeutralized = "molNeutralized",
parent = "parent",
ALogP = "ALogP",
LogP = "LogP",
XLogP = "XLogP",
transformation = "transformation",
enzyme = "enzyme",
evidencedoi = "evidencedoi"
)
unNames <- unNames[names(unNames) %in% names(mfr)] # filter out missing
setnames(mfr, names(unNames), unNames)
scoreTypesMF <- intersect(scoreTypesMF, names(mfr))
return(mfr[, union(unNames, scoreTypesMF), with = FALSE]) # filter out any other columns
}
# MetFragCL gives bracketed fragment formulas including charge and weird (de)protonation adducts
# For comparison with other results these should be converted to a simple formula format
cleanFragFormulas <- function(forms)
{
# get ionization adduct (if any)
plusminpat <- "[\\-\\+]"
addpat <- sprintf(".*\\]%s([[:alnum:]]+)%s", plusminpat, plusminpat)
adducts <- ifelse(grepl(addpat, forms), gsub(addpat, "\\1", forms), "")
forms <- gsub(sprintf("%s([[:alnum:]]+)%s$", plusminpat, plusminpat), "", forms) # remove adducts and charge
forms <- gsub("\\[|\\]", "", forms) # remove brackets
# add single counts to hydrogen adducts without count (e.g. -H becomes -1H)
forms <- gsub("([-\\+])H", "\\11H", forms)
protAdducts <- regmatches(forms, gregexpr("([-\\+][0-9]+)H", forms)) # get "-1H", "+2H" etc
baseForms <- gsub("^([[:alnum:]]+)[-|\\+].*", "\\1", forms) # Get 'regular' part of formula
baseForms <- paste0(baseForms, adducts) # re-add adducts
# handle protonation adducts
return(sapply(seq_along(forms), function(fi)
{
addHCount <- sum(as.integer(gsub("H", "", protAdducts[[fi]])))
flist <- splitFormulaToList(baseForms[[fi]])
if (addHCount != 0)
{
if (!"H" %in% names(flist))
{
flist <- c(flist, addHCount)
names(flist)[length(flist)] <- "H"
}
else
flist[["H"]] <- flist[["H"]] + addHCount
}
return(simplifyFormula(formulaListToString(flist)))
}))
}
getMFFragmentInfo <- function(spec, mfResult, adduct)
{
if (mfResult$NoExplPeaks == 0 || mfResult$FormulasOfExplPeaks == "NA")
return(data.table(mz = numeric(0), ion_formula = character(0), neutral_loss = character(0),
score = numeric(0), PLID = numeric(0)))
# format of FormulasOfExplPeaks: list of strings with mz1:formula1;mz2:formula2;...
fi <- unlist(strsplit(mfResult$FormulasOfExplPeaks, "[;:]")) # split into list with subsequent m/z / formula pairs
ret <- data.table(mz = as.numeric(fi[c(TRUE, FALSE)]),
ion_formula = cleanFragFormulas(fi[c(FALSE, TRUE)]),
ion_formula_MF = fi[c(FALSE, TRUE)])
if (!is.null(mfResult[["FragmenterScore_Values"]]))
ret[, score := as.numeric(unlist(strsplit(mfResult$FragmenterScore_Values, ";")))]
ionform <- calculateIonFormula(mfResult$MolecularFormula, adduct)
ret[, neutral_loss := sapply(ion_formula, subtractFormula, formula1 = ionform)]
ret[, PLID := sapply(mz, function(omz) spec[which.min(abs(omz - mz))]$ID)]
setcolorder(ret, c("mz", "ion_formula", "ion_formula_MF", "neutral_loss"))
return(ret)
}
isLocalMetFragDB <- function(database) database %in% c("sdf", "psv", "csv", "comptox", "pubchemlite")
getMetFragExtDB <- function(localDB, database)
{
extDB <- if (!is.null(localDB)) localDB else NULL
if ((database == "comptox" || database == "pubchemlite") && is.null(extDB))
extDB <- getExtDepPath(if (database == "comptox") "metfragct" else "metfragpcl", NULL)
if (!is.null(extDB))
extDB <- normalizePath(extDB)
if (is.null(extDB) || !file.exists(extDB))
{
stop("No external database file set. This should be set as part of the extraOpts argument ",
"e.g. extraOpts = list(LocalDatabasePath = \"C:/CompTox_17March2019_SelectMetaData.csv\")",
call. = FALSE)
}
return(extDB)
}
initMetFragCLCommand <- function(mfSettings, spec, mfBin)
{
paramFile <- tempfile("parameters", fileext = ".txt")
paramCon <- file(paramFile, "w")
writeParam <- function(param, val) cat(sprintf("%s = %s\n", param, paste0(as.character(val), collapse = ",")), file = paramCon)
for (param in names(mfSettings))
writeParam(param, mfSettings[[param]])
outFile <- tempfile("results", fileext = ".csv")
writeParam("MetFragCandidateWriter", "CSV")
writeParam("SampleName", basename(tools::file_path_sans_ext(outFile)))
writeParam("ResultsPath", dirname(outFile))
specFile <- tempfile("spectrum", fileext = ".txt")
write.table(spec[, c("mz", "intensity")], specFile, sep = "\t", row.names = FALSE, col.names = FALSE)
writeParam("PeakListPath", specFile)
close(paramCon)
return(list(command = "java", args = c("-jar", mfBin, paramFile), outFile = outFile))
}
generateMetFragRunData <- function(fGroups, MSPeakLists, mfSettings, extDB, topMost, identifiers, method,
adduct, database, errorRetries, timeoutRetries)
{
gNames <- names(fGroups)
gTable <- groupTable(fGroups)
gInfo <- groupInfo(fGroups)
anaInfo <- analysisInfo(fGroups)
baseHash <- makeHash(method, topMost)
if (!is.null(extDB))
baseHash <- makeHash(baseHash, makeFileHash(extDB))
fgAdd <- getFGroupAdducts(names(fGroups), annotations(fGroups), adduct, "metfrag")
ret <- Map(gNames, fgAdd$grpAdducts, fgAdd$grpAdductsChr, f = function(grp, add, addChr)
{
if (!is.null(identifiers) && is.null(identifiers[[grp]]))
return(NULL)
if (is.null(MSPeakLists[[grp]][["MS"]]))
return(NULL)
spec <- MSPeakLists[[grp]][["MSMS"]]
if (is.null(spec))
return(NULL)
mfSettings$IonizedPrecursorMass <- MSPeakLists[[grp]]$MS[precursor == TRUE]$mz
mfSettings$PrecursorIonType <- addChr
mfSettings$ExperimentalRetentionTimeValue <- gInfo[grp, "rts"] / 60
if (!is.null(identifiers))
mfSettings$PrecursorCompoundIDs <- identifiers[[grp]]
hash <- makeHash(baseHash, mfSettings, spec)
logf <- paste0("mfcl-", grp, ".txt")
return(list(hash = hash, gName = grp, spec = spec, mfSettings = mfSettings,
adduct = add, database = database, topMost = topMost,
errorRetries = errorRetries, timeoutRetries = timeoutRetries,
logFile = logf))
})
return(ret[!sapply(ret, is.null)])
}
processMFResults <- function(comptab, runData, lfile = "")
{
scRanges <- list()
if (nrow(comptab) > 0)
{
compsc <- compoundScorings("metfrag")
compsc <- compsc[compsc$metfrag %in% names(comptab), ]
allScoreCols <- union(compsc$metfrag, runData$mfSettings$MetFragScoreTypes)
if (length(allScoreCols) > 0)
{
# fix up score and suspect list columns: dash --> 0
# NOTE: do as.numeric as values with '-' will cause the column to be a character
comptab[, (allScoreCols) := lapply(.SD, function(x) as.numeric(ifelse(x == "-", 0, x))),
.SDcols = allScoreCols]
scRanges <- sapply(allScoreCols, function(sc) range(comptab[[sc]]), simplify = FALSE)
namesInGenSC <- match(names(scRanges), compsc$metfrag)
notNA <- !is.na(namesInGenSC)
if (any(notNA))
names(scRanges)[notNA] <- compsc$name[notNA]
}
if (!is.null(runData$topMost) && nrow(comptab) > runData$topMost)
comptab <- comptab[seq_len(runData$topMost)]
# make character: needed for getMFFragmentInfo()
# note: this column is not present in empty tables so can't do this with colClasses
if (!is.null(comptab[["FragmenterScore_Values"]]))
comptab[, FragmenterScore_Values := as.character(FragmenterScore_Values)]
# fill in fragment info
# NOTE: double wrap in list to nest table
if (!is.null(lfile))
cat(sprintf("\n%s - Done! Processing frags...\n", date()), file = lfile, append = TRUE)
for (r in seq_len(nrow(comptab)))
set(comptab, r, "fragInfo", list(list(getMFFragmentInfo(runData$spec, comptab[r], runData$adduct))))
if (!is.null(lfile))
cat(sprintf("\n%s - Done!\n", date()), file = lfile, append = TRUE)
# unify column names & filter unnecessary columns
comptab <- unifyMFNames(comptab, runData$mfSettings$MetFragScoreTypes)
if (!is.null(comptab[["CASRN"]]))
comptab[, CASRN := sub("CASRN:", "", CASRN, fixed = TRUE)] # remove "CASRN" prefix
# sometimes these can be interpreted as dates!
# also convert identifiers to characters, which comes in handy for set consensus merging
for (col in c("compoundName", "compoundName2", "CASRN", "identifier", "relatedCIDs"))
{
if (!is.null(comptab[[col]]))
comptab[, (col) := as.character(get(col))]
}
comptab[, database := runData$database]
}
return(list(comptab = comptab, scRanges = scRanges))
}
MFMPFinishHandler <- function(cmd)
{
comptab <- data.table::fread(cmd$outFile, colClasses = c(Identifier = "character"))
procres <- patRoon:::processMFResults(comptab, cmd, cmd$stderrFile)
return(procres)
}
MFMPPrepareHandler <- function(cmd)
{
mfBin <- getExtDepPath("metfragcl")
if (!nzchar(Sys.which("java")))
stop("Please make sure that java is installed and its location is correctly set in PATH.")
if (isLocalMetFragDB(cmd$database))
{
extDB <- patRoon:::getMetFragExtDB(cmd$mfSettings[["LocalDatabasePath"]], cmd$database)
# NOTE: this will set LocalDatabasePath in case only options() were set or update it with normalizePath()
cmd$mfSettings$LocalDatabasePath <- normalizePath(extDB)
}
return(c(cmd, patRoon:::initMetFragCLCommand(cmd$mfSettings, cmd$spec, mfBin)))
}
MFMPTimeoutHandler <- function(cmd, retries)
{
if (retries >= cmd$timeoutRetries)
{
warning(sprintf("Could not run MetFrag for %s: timeout. Log: %s", cmd$gName, cmd$logFile))
return(FALSE)
}
warning(sprintf("Restarting timed out MetFrag command for %s (retry %d/%d)",
cmd$gName, retries+1, cmd$timeoutRetries))
return(TRUE)
}
MFMPErrorHandler <- function(cmd, exitStatus, retries)
{
if (!is.na(exitStatus) && exitStatus <= 6) # some error thrown by MF
{
if (retries >= cmd$errorRetries)
{
warning(sprintf("Could not run MetFrag for %s - exit code: %d - Log: %s",
cmd$gName, exitStatus, cmd$logFile))
return(FALSE)
}
warning(sprintf("Restarting failed MetFrag command for %s - exit: %d (retry %d/%d)",
cmd$gName, exitStatus, retries+1, cmd$errorRetries))
return(TRUE)
}
# some other error (e.g. java not present)
return(sprintf("Fatal: Failed to execute MetFragCL for %s - exit code: %d - Log: %s",
cmd$gName, exitStatus, cmd$logFile))
}
#' Compound annotation with MetFrag
#'
#' Uses the \pkg{metfRag} package or \command{MetFrag CL} for compound identification (see
#' \url{http://ipb-halle.github.io/MetFrag/}).
#'
#' @templateVar algo MetFrag
#' @templateVar do generate compound candidates
#' @templateVar generic generateCompounds
#' @templateVar algoParam metfrag
#' @template algo_generator
#'
#' @details Several online compound databases such as \href{https://pubchem.ncbi.nlm.nih.gov/}{PubChem} and
#' \href{http://www.chemspider.com/}{ChemSpider} may be chosen for retrieval of candidate structures. This method
#' requires the availability of MS/MS data, and feature groups without it will be ignored. Many options exist to score
#' and filter resulting data, and it is highly suggested to optimize these to improve results. The \command{MetFrag}
#' options \code{PeakList}, \code{IonizedPrecursorMass} and \code{ExperimentalRetentionTimeValue} (in minutes) fields
#' are automatically set from feature data.
#'
#' @param method Which method should be used for MetFrag execution: \code{"CL"} for \command{MetFragCL} and \code{"R"}
#' for \command{MetFragR}. The former is usually much faster and recommended.
#' @param timeout Maximum time (in seconds) before a metFrag query for a feature group is stopped. Also see
#' \code{timeoutRetries} argument.
#' @param timeoutRetries Maximum number of retries after reaching a timeout before completely skipping the metFrag query
#' for a feature group. Also see \code{timeout} argument.
#' @param errorRetries Maximum number of retries after an error occurred. This may be useful to handle e.g. connection
#' errors.
#' @param dbRelMzDev Relative mass deviation (in ppm) for database search. Sets the
#' \option{DatabaseSearchRelativeMassDeviation} option.
#' @param fragRelMzDev Relative mass deviation (in ppm) for fragment matching. Sets the
#' \option{FragmentPeakMatchRelativeMassDeviation} option.
#' @param fragAbsMzDev Absolute mass deviation (in Da) for fragment matching. Sets the
#' \option{FragmentPeakMatchAbsoluteMassDeviation} option.
#' @param database Compound database to use. Valid values are: \code{"pubchem"}, \code{"chemspider"},
#' \code{"for-ident"}, \code{"comptox"}, \code{"pubchemlite"}, \code{"kegg"}, \code{"sdf"}, \code{"psv"} and
#' \code{"csv"}. See section below for more information. Sets the \code{MetFragDatabaseType} option.
#' @param extendedPubChem If \code{database="pubchem"}: whether to use the \emph{extended} database that includes
#' information for compound scoring (\emph{i.e.} number of patents/PubMed references). Note that downloading
#' candidates from this database might take extra time. Valid values are: \code{FALSE} (never use it), \code{TRUE}
#' (always use it) or \code{"auto"} (default, use if specified scorings demand it).
#' @param chemSpiderToken A character string with the \href{http://www.chemspider.com/AboutServices.aspx}{ChemSpider
#' security token} that should be set when the ChemSpider database is used. Sets the \option{ChemSpiderToken} option.
#' @param scoreTypes A character vector defining the scoring types. See the \verb{Scorings} section below for more
#' information. Note that both generic and \command{MetFrag} specific names are accepted (\emph{i.e.} \code{name} and
#' \code{metfrag} columns returned by \code{\link{compoundScorings}}). When a local database is used, the name should
#' match what is given there (\code{e.g} column names when \code{database=csv}). Note that MetFrag may still report
#' other scoring data, however, these are not used for ranking. Sets the \option{MetFragScoreTypes} option.
#' @param scoreWeights Numeric vector containing weights of the used scoring types. Order is the same as set in
#' \code{scoreTypes}. Values are recycled if necessary. Sets the \option{MetFragScoreWeights} option.
#' @param preProcessingFilters,postProcessingFilters A character vector defining pre/post filters applied before/after
#' fragmentation and scoring (\emph{e.g.} \code{"UnconnectedCompoundFilter"}, \code{"IsotopeFilter"},
#' \code{"ElementExclusionFilter"}). Some methods require further options to be set. For all filters and more
#' information refer to the \verb{Candidate Filters} section on the
#' \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR homepage}. Sets the
#' \option{MetFragPreProcessingCandidateFilter} and \code{MetFragPostProcessingCandidateFilter} options.
#' @param maxCandidatesToStop If more than this number of candidate structures are found then processing will be aborted
#' and no results this feature group will be reported. Low values increase the chance of missing data, whereas too
#' high values will use too much computer resources and signficantly slowdown the process. Sets the
#' \option{MaxCandidateLimitToStop} option.
#' @param identifiers A \code{list} containing for each feature group a character vector with database identifiers that
#' should be used to find candidates for a feature group (the list should be named by feature group names). If
#' \code{NULL} all relevant candidates will be retrieved from the specified database. An example usage scenario is to
#' obtain the list of candidate identifiers from a \code{\link{compounds}} object obtained with
#' \code{\link{generateCompoundsSIRIUS}} using the \code{\link{identifiers}} method. This way, only those candidates
#' will be searched by MetFrag that were generated by SIRIUS+CSI:FingerID. Sets the \option{PrecursorCompoundIDs}
#' option.
#' @param extraOpts A named \code{list} containing further settings \command{MetFrag}. See the
#' \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR} and
#' \href{http://ipb-halle.github.io/MetFrag/projects/metfragcl/}{MetFrag CL} homepages for all available options. Set
#' to \code{NULL} to ignore.
#'
#' @template adduct-arg
#' @template comp_algo-args
#'
#' @inheritParams generateCompounds
#'
#' @return \code{generateCompoundsMetFrag} returns a \code{\link{compoundsMF}} object.
#'
#' @section Scorings: \command{MetFrag} supports \emph{many} different scorings to rank candidates. The
#' \code{\link{compoundScorings}} function can be used to get an overview: (some columns are omitted)
#'
#' @eval paste("@@section Scorings:", patRoon:::tabularRD(patRoon::compoundScorings("metfrag")[, 1:3]))
#'
#' @section Scorings: In addition, the \code{\link{compoundScorings}} function is also useful to programmatically
#' generate a set of scorings to be used for ranking with \command{MetFrag}. For instance, the following can be given
#' to the \code{scoreTypes} argument to use all default scorings for PubChem: \code{compoundScorings("metfrag",
#' "pubchem", onlyDefault=TRUE)$name}.
#'
#' For all \command{MetFrag} scoring types refer to the \verb{Candidate Scores} section on the
#' \href{http://ipb-halle.github.io/MetFrag/projects/metfragr/}{MetFragR homepage}.
#'
#' @section Usage of MetFrag databases: When \code{database="chemspider"} setting the \code{chemSpiderToken} argument is
#' mandatory.
#'
#' If a local database is chosen via \code{sdf}, \code{psv}, or \code{csv} then its file location should be set with
#' the \code{LocalDatabasePath} value via the \code{extraOpts} argument. For example: \code{extraOpts =
#' list(LocalDatabasePath = "C:/myDB.csv")}.
#'
#' If \code{database="pubchemlite"} or \code{database="comptox"} and \pkg{patRoonExt} is \emph{not} installed then the
#' file location must be specified as above or by setting the
#' \code{patRoon.path.MetFragPubChemLite}/\code{patRoon.path.MetFragCompTox} option. See the installation section in
#' the handbook for more details.
#'
#' @templateVar what \code{generateCompoundsMetFrag}
#' @template uses-multiProc
#'
#' @section Parallelization: When local database files are used with \code{generateCompoundsMetFrag} (\emph{e.g.} when
#' \code{database} is set to \code{"pubchemlite"}, \code{"csv"} etc.) and \option{patRoon.MP.method="future"}, then
#' the database file must be present on all the nodes. When \code{pubchemlite} or \code{comptox} is used, the location
#' for these databases can be configured on the host with the respective package options
#' (\option{patRoon.path.MetFragPubChemLite} and \option{patRoon.path.MetFragCompTox}) or made available by installing
#' the \pkg{patRoonExt} package. Note that these files must \emph{also} be present on the local host computer, even if
#' it is not participating in computations.
#'
#' @references \insertRef{Ruttkies2016}{patRoon}
#'
#' @templateVar what generateCompoundsMetFrag
#' @template main-rd-method
#' @export
setMethod("generateCompoundsMetFrag", "featureGroups", function(fGroups, MSPeakLists, method = "CL",
timeout = 300, timeoutRetries = 2, errorRetries = 2, topMost = 100,
dbRelMzDev = 5, fragRelMzDev = 5, fragAbsMzDev = 0.002, adduct = NULL,
database = "pubchem", extendedPubChem = "auto", chemSpiderToken = "",
scoreTypes = compoundScorings("metfrag", database, onlyDefault = TRUE)$name,
scoreWeights = 1.0,
preProcessingFilters = c("UnconnectedCompoundFilter", "IsotopeFilter"),
postProcessingFilters = c("InChIKeyFilter"),
maxCandidatesToStop = 2500, identifiers = NULL, extraOpts = NULL)
{
if (method == "R")
checkPackage("metfRag", "c-ruttkies/MetFragR")
ac <- checkmate::makeAssertCollection()
checkmate::assertClass(fGroups, "featureGroups", add = ac)
checkmate::assertClass(MSPeakLists, "MSPeakLists", add = ac)
checkmate::assertChoice(method, c("CL", "R"), add = ac)
aapply(checkmate::assertNumber, . ~ timeout + dbRelMzDev + fragRelMzDev + fragAbsMzDev,
lower = 0, finite = TRUE, fixed = list(add = ac))
aapply(checkmate::assertCount, . ~ timeoutRetries + errorRetries, fixed = list(add = ac))
aapply(checkmate::assertCount, . ~ topMost + maxCandidatesToStop, positive = TRUE, fixed = list(add = ac))
checkmate::assertString(chemSpiderToken, add = ac)
checkmate::assertChoice(database, c("pubchem", "chemspider", "kegg", "for-ident", "sdf", "psv", "csv",
"comptox", "pubchemlite"), add = ac)
checkmate::assert(checkmate::checkFlag(extendedPubChem),
checkmate::checkChoice(extendedPubChem, "auto"),
.var.name = "extendedPubChem")
aapply(checkmate::assertCharacter, . ~ scoreTypes + preProcessingFilters + postProcessingFilters,
any.missing = FALSE, fixed = list(add = ac))
checkmate::assertNumeric(scoreWeights, lower = 0, finite = TRUE, any.missing = FALSE, min.len = 1, add = ac)
aapply(checkmate::assertList, . ~ identifiers + extraOpts, any.missing = FALSE,
names = "unique", null.ok = TRUE, fixed = list(add = ac))
compsScores <- compoundScorings("metfrag", database)
isLocalDB <- isLocalMetFragDB(database)
allScoringNames <- union(compsScores$name, compsScores$metfrag)
# allow freely definable scorings from local databases
if (!isLocalDB)
checkmate::assertSubset(scoreTypes, allScoringNames, add = ac)
checkmate::reportAssertions(ac)
if (length(fGroups) == 0)
return(compoundsMF())
adduct <- checkAndToAdduct(adduct, fGroups)
anaInfo <- analysisInfo(fGroups)
ftind <- groupFeatIndex(fGroups)
gTable <- groupTable(fGroups)
gNames <- names(fGroups)
gCount <- length(fGroups)
gInfo <- groupInfo(fGroups)
pLists <- peakLists(MSPeakLists)
if (!is.logical(extendedPubChem) && database == "pubchem")
{
PCScorings <- compoundScorings("metfrag", "pubchem", includeNoDB = FALSE)
extendedPubChem <- any(scoreTypes %in% union(PCScorings$name, PCScorings$metfrag))
}
# convert to metfrag naming scheme
databaseMF <- switch(database,
pubchem = "PubChem",
chemspider = "ChemSpider",
kegg = "KEGG",
"for-ident" = "FOR-IDENT",
sdf = "LocalSDF",
psv = "LocalPSV",
csv = "LocalCSV",
comptox = "LocalCSV",
pubchemlite = "LocalCSV")
if (databaseMF == "PubChem" && extendedPubChem) # can't seem to combine this conditional in above switch...
databaseMF <- "ExtendedPubChem"
scoreTypesMF <- setdiff(scoreTypes, c("score", "Score")) # final score is to be determined
isSimplScore <- scoreTypesMF %in% compsScores$name
if (any(isSimplScore))
scoreTypesMF[isSimplScore] <- compsScores[match(scoreTypesMF[isSimplScore], compsScores$name), "metfrag"]
scoreWeights <- rep(scoreWeights, length.out = length(scoreTypesMF))
extDB <- NULL
if (isLocalDB)
{
extDB <- getMetFragExtDB(extraOpts[["LocalDatabasePath"]], database)
# only keep scorings that are present in the DB
# BUG: nrows=0 gives internal data.table error, so read 1 line
scInDB <- names(fread(extDB, nrows = 1))
isDBType <- scoreTypesMF %in% compsScores[nzchar(compsScores$database), "metfrag"]
keep <- !isDBType | scoreTypesMF %in% scInDB
scoreTypesMF <- scoreTypesMF[keep]; scoreWeights <- scoreWeights[keep]
}
mfSettings <- list(DatabaseSearchRelativeMassDeviation = dbRelMzDev,
FragmentPeakMatchRelativeMassDeviation = fragRelMzDev,
FragmentPeakMatchAbsoluteMassDeviation = fragAbsMzDev,
MetFragDatabaseType = databaseMF, MetFragScoreTypes = scoreTypesMF,
MetFragScoreWeights = scoreWeights,
MetFragPreProcessingCandidateFilter = preProcessingFilters,
MetFragPostProcessingCandidateFilter = postProcessingFilters,
MaxCandidateLimitToStop = maxCandidatesToStop,
UseSmiles = "true")
if (!is.null(chemSpiderToken) && nzchar(chemSpiderToken))
mfSettings$ChemSpiderToken <- chemSpiderToken
if (!is.null(extraOpts))
{
# add (or replace) any extra options
mfSettings <- modifyList(mfSettings, extraOpts)
}
setHash <- makeHash(fGroups, pLists, method, mfSettings, topMost, identifiers)
if (!is.null(extDB))
setHash <- makeHash(setHash, makeFileHash(extDB))
printf("Identifying %d feature groups with MetFrag...\n", gCount)
runData <- generateMetFragRunData(fGroups, MSPeakLists, mfSettings, extDB, topMost, identifiers, method,
adduct, database, errorRetries, timeoutRetries)
if (method == "CL")
{
results <- executeMultiProcess(runData, finishHandler = patRoon:::MFMPFinishHandler,
timeoutHandler = patRoon:::MFMPTimeoutHandler,
errorHandler = patRoon:::MFMPErrorHandler,
prepareHandler = patRoon:::MFMPPrepareHandler,
cacheName = "compoundsMetFrag", setHash = setHash,
procTimeout = timeout, delayBetweenProc = 1000,
logSubDir = "metfrag")
}
else
{
cacheDB <- openCacheDBScope()
cachedSet <- loadCacheSet("compoundsMetFrag", setHash, cacheDB)
resultHashes <- vector("character", length(gNames))
names(resultHashes) <- gNames
cachedResults <- sapply(runData, function(rd)
{
resultHashes[rd$gName] <<- rd$hash
comptab <- NULL
if (!is.null(cachedSet))
comptab <- cachedSet[[rd$hash]]
if (is.null(comptab))
comptab <- loadCacheData("compoundsMetFrag", rd$hash, cacheDB)
return(comptab)
}, simplify = FALSE)
cachedResults <- cachedResults[!sapply(cachedResults, is.null)]
runData <- runData[setdiff(names(runData), names(cachedResults))] # remove cached results
if (length(runData) > 0)
{
prog <- openProgBar(0, gCount)
results <- lapply(runData, function(rd)
{
rd$mfSettings$PeakList <- as.matrix(rd$spec[, c("mz", "intensity")])
comptab <- metfRag::run.metfrag(rd$mfSettings)
jgc() # hopefully reduce some memory usage
if (nrow(comptab) > 0)
comptab <- unFactorDF(comptab)
comptab <- as.data.table(comptab)
procres <- processMFResults(comptab, rd)
if (nrow(procres$comptab) > 0)
{
# BUG: metfRag seems to give second duplicate results where only NoExplPeaks may differ and have an incorrect value.
# for now, just remove all duplicates and re-assign NoExplPeaks
procres$comptab <- procres$comptab[!duplicated(identifier)]
procres$comptab[, explainedPeaks := sapply(fragInfo, nrow)]
}
saveCacheData("compoundsMetFrag", procres, rd$hash, cacheDB)
setTxtProgressBar(prog, match(rd$gName, gNames))
return(procres)
})
setTxtProgressBar(prog, gCount)
close(prog)
}
else
results <- list()
if (length(cachedResults) > 0)
{
results <- c(results, cachedResults)
results <- results[intersect(gNames, names(results))] # re-order
}
if (is.null(cachedSet))
saveCacheSet("compoundsMetFrag", resultHashes[nzchar(resultHashes)], setHash, cacheDB)
}
# prune empty/NULL results
if (length(results) > 0)
results <- results[sapply(results, function(r) !is.null(r$comptab) && nrow(r$comptab) > 0, USE.NAMES = FALSE)]
ngrp <- length(results)
printf("Loaded %d compounds from %d features (%.2f%%).\n", sum(unlist(lapply(results, function(r) nrow(r$comptab)))),
ngrp, if (gCount == 0) 0 else ngrp * 100 / gCount)
# convert scoreTypes to generic names
scoreTypes <- union(scoreTypes, "score") # ensure final score is in and remove duplicates
isMFScore <- scoreTypes %in% compsScores$metfrag
if (any(isMFScore))
scoreTypes[isMFScore] <- compsScores[match(scoreTypes[isMFScore], compsScores$metfrag), "name"]
scoreRanges <- lapply(results, "[[", "scRanges")
# Ensure that scoreTypes doesn't contain anything 'extra', e.g. this can happen when the user specified scores not
# present in the database.
scoreTypes <- if (length(scoreRanges) > 0) intersect(scoreTypes, unlist(lapply(scoreRanges, names))) else character()
return(compoundsMF(groupAnnotations = lapply(results, "[[", "comptab"), scoreTypes = scoreTypes,
scoreRanges = scoreRanges, settings = mfSettings))
})
#' @template featAnnSets-gen_args
#' @rdname generateCompoundsMetFrag
#' @export
setMethod("generateCompoundsMetFrag", "featureGroupsSet", function(fGroups, MSPeakLists, method = "CL", timeout = 300,
timeoutRetries = 2, errorRetries = 2, topMost = 100,
dbRelMzDev = 5, fragRelMzDev = 5, fragAbsMzDev = 0.002,
adduct = NULL, ..., setThreshold = 0,
setThresholdAnn = 0, setAvgSpecificScores = FALSE)
{
generateCompoundsSet(fGroups, MSPeakLists, adduct, generateCompoundsMetFrag, method = method, timeout = timeout,
timeoutRetries = timeoutRetries, errorRetries = errorRetries, topMost = topMost,
dbRelMzDev = dbRelMzDev, fragRelMzDev = fragRelMzDev, fragAbsMzDev = fragAbsMzDev, ...,
setThreshold = setThreshold, setThresholdAnn = setThresholdAnn,
setAvgSpecificScores = setAvgSpecificScores)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.