#' Create a MS2ID backend
#'
#' \code{createMS2ID} processes the data contained in a
#' \code{\link[CompDb-class]{CompDb}} object \insertCite{CompoundDB}{MS2ID} and
#' creates a MS2ID backend by saving data as SQL and bigMemory files in a
#' directory. Later on, that directory will be load as an \linkS4class{MS2ID}
#' object in order to be used repeatedly as reference library with the
#' \code{\link{annotate}} function.
#'
#' @param name character(1) name of the directory where the files will be saved.
#' @param path character(1) with the path where to create the MS2ID backend.
#' @param cmpdb \code{\link[CompDb-class]{CompDb}}(1) object (with MS/MS
#' spectra).
#' @param noiseThresh A numeric defining the threshold used in the noise
#' filtering of the MS/MS spectra. e.g. noiseThresh = 0.01 removes peaks with
#' an intensity less than 1\% of the base peak.
#' @param calcSplash boolean(1) indicating if SPLASH values (when missing) must
#' be calculated using the \code{\link[splashR]{getSplash}} function.
#' @param calcMmi boolean(1) indicating if compound monoisotopic mass values
#' (when missing) must be calculated using the
#' \code{\link[enviPat]{check_chemform}} function.
#' @param overwrite boolean(1) indicating if the function can overwrite
#' results.
#' @param removeRedundant character(1) eliminate redundant compounds and/or
#' spectra. Expected options: "compounds", "spectra", "both" or "none".
#' Redundant compounds are removed by comparing their InChIKeys. Redundant
#' spectra are removed by comparing the spectrum and specific metadata
#' variables, which include: "collisionEnergy", "polarity", "predicted",
#' "instrumentType", and "precursorMz". If the metadata is incomplete or lacks
#' the necessary fields, redundant spectra will not be removed.
#' @return \code{createMS2ID} returns a character(1) with the MS2ID backend
#' location. This value must be used as \code{ms2idFolder} parameter in the
#' \code{MS2ID} constructor.
#' @author Josep M. Badia \email{josepmaria.badia@@urv.cat}
#' @seealso \linkS4class{MS2ID} class.
#' @rdname MS2ID
#' @export
createMS2ID <- function(name = "MS2ID", path = ".", cmpdb, noiseThresh = 0.01,
calcSplash = TRUE, calcMmi = TRUE, overwrite = FALSE,
removeRedundant = "compounds"){
if(missing(cmpdb))
stop("Argument 'cmpdb' required")
validArgs <- c("compounds", "spectra", "both", "none")
if(!removeRedundant %in% validArgs)
paste("Invalid argument for removeRedundant: %s.\n",
"Expected one of: %s") |>
sprintf(removeRedundant, paste(validArgs, collapse = ", ")) |>
stop()
if(!is(cmpdb, "CompDb"))
stop("Argument 'cmpdb' must be a CompDb object (CompoundDb package)")
MS2IDdir <- file.path(path, name)
if(dir.exists(MS2IDdir)){
if(overwrite){
unlink(MS2IDdir, recursive = TRUE)
}else{
stop(glue::glue("'MS2IDdir' directory already exists. Use \\
overwrite argument if need it"))
}
}
#.COLLECT() INFO FUNCTION-----------------
message("\n-------- 1/n. RE-STRUCTURING THE DATA --------")
src <- src_compdb(cmpdb)
## Get a tbl for all tables
cmp_tbl <- tbl(src, "ms_compound")
mtdt_tbl <- tbl(src, "metadata")
msms_tbl <- tbl(src, "msms_spectrum")
pks_tbl <- tbl(src, "msms_spectrum_peak")
#syns_tbl <- tbl(src, "synonym")# don't use it now
##convert compound & spectrum metadata to real tbl (no sql connection)
cmp_tbl <- cmp_tbl %>% collect()
numCmp <- nrow(cmp_tbl)
msms_tbl <- msms_tbl %>% collect()
numMs <- nrow(msms_tbl)
#Merge compounds and spectra into a table
mrg <- merge(cmp_tbl, msms_tbl, by="compound_id",
all.x = FALSE, all.y = FALSE)
cmpNoSpectra <- sum(!cmp_tbl$compound_id %in% mrg$compound_id)
if(cmpNoSpectra > 0){
message(glue::glue("
{cmpNoSpectra} out of {numCmp} compounds does not have any spectrum \\
associated and will be eliminated"))
numCmp <- numCmp - cmpNoSpectra
}
#sync order of metadata rows & spectra rows
#use spectrum_id to sort properly metadata & spectra and remove it
message("loading spectral data...")
pks_tbl <- pks_tbl %>%
select(!peak_id) %>%
rename("mass-charge" = "mz") %>%
collect()
#convert df into a list of peak spectra
pks_tbl <- split(pks_tbl, f = pks_tbl$spectrum_id)
message("parsing spectral data")
fragm_CompDB <- pbapply::pblapply(pks_tbl, function(spctr){
t(spctr[, c("mass-charge", "intensity") ])
})
fragm_spctrID <- vapply(pks_tbl, function(spctr){
unique(spctr$spectrum_id)
}, FUN.VALUE = 1)
varsToParse <- VARS2PARSE
varsToParse$originalNames <- varsToParse$CompoundDBname
#order metadata according spectra order
mrg <- mrg[match(fragm_spctrID, mrg$spectrum_id),]
if("originalSpectrumDb" %in% names(mrg)){
nameDB = mrg$originalSpectrumDb
mrg$originalSpectrumDb <- NULL
}else{
nameDB="unknown"
}
DB <- .basicMS2IDstrct(metadata = mrg, fragm = fragm_CompDB,
varsToParse = varsToParse, nameDB = nameDB,
removeRedundant = removeRedundant,
numCmp = numCmp, numMs = numMs)
###TEMPORARILY. we do not comtemplate DB union (check 10 to implement HERE)
# ALSO elimina metabolits (amb funcio .removeRedundantCompounds()) i spectra duplicats. Només necesari quan juntem diferents
message("\n-------- 3/n. CALCULATING MISSING VARIABLES --------")
if(!calcSplash & !calcMmi)
message("No missing variables to calculate (step avoided by arguments)")
#Obtain Splash where is not present
if(calcSplash){
message("Obtaining spectra's SPLASH values using splashR")
if("splashR" %in% installed.packages()[, "Package"]){
notSplash <- is.na(DB$spectra$splash) | DB$spectra$splash %in% c("")
if(any(notSplash)){
fragmNoSplash <- match(DB$spectra$ID_spectra[notSplash],
DB$fragments$ID_spectra)
splashos <- pbapply::pbsapply(
DB$fragments$spectra[fragmNoSplash],
function(x) splashR::getSplash(t(x))
)
DB$spectra$splash[notSplash] <- splashos
}
}else{
warning(glue::glue(
"SPLASH values of spectra could not be calculated. The \\
required package 'splashR' is not installed in the system."))
}
}
#Obtain compound monoisotopicMW using envipat. Only on rows with formula
if(calcMmi){
message("Obtaining compound monoisotopic mass using envipat")
if("enviPat" %in% installed.packages()[, "Package"]){
noFormula <- is.na(DB$compounds$formula) |
DB$compounds$formula == ""
if(!all(noFormula)){
utils::data(isotopes, package = "enviPat")
ept <- enviPat::check_chemform(isotopes,
DB$compounds$formula[!noFormula])
DB$compounds$exactmass[!noFormula][!ept$warning] <-
ept$monoisotopic_mass[!ept$warning]
}
}else{
warning(glue::glue(
"The molecular mass of the compounds could not be calculated. The \\
required package 'enviPat' is not installed in the system."))
}
}
#spectra treatment
DB$fragments$spectra <- lapply(DB$fragments$spectra, function(x) {
# Sort spectra by mass (necessary in order to apply properly COS SIM)
x[, order(x[1,], decreasing = F), drop = FALSE]
# Convert any spectrum vector to matrix. Spectrum with only one
# fragment can be saved as 1x2 vector instead of 2x1 matrix, inducing
# mistakes.
if(is(x, "numeric") & length(x) == 2)
return(as.matrix(x))
else if(!is(x, "matrix"))
stop("Spectral data type unknown")
#Filter noise
x[, x["intensity", ] > noiseThresh * max(x["intensity",]), drop=FALSE]
})
#order compounds by neutral mass. Need it to identify() (in order to find faster the REF spectra with the neutral mass we want)
DB$compounds <- DB$compounds[order(DB$compounds$exactmass,
decreasing = FALSE ), ]
#order by precursor mass the spectra. Need it to identify() (more precisely, in order to find faster the REF spectra with the precurso mass we want)
DB$spectra <- DB$spectra[order(DB$spectra$precursorMz,
decreasing = FALSE ), ]
#create mz index ------------------------------
message("\n-------- 4/n. OBTAINING THE MZ INDEX --------")
DB$mzIdx <- .mzIdx(DB)
#saveRDS(DB, "esborraDB20210819.rds")
#create MYSQL + bigmemory files-----------------
message("\n-------- 5/n. SAVING THE MS2ID BACKEND FILES --------")
.setMS2IDbackend(MS2IDdir, DB, overwrite = NA)
invisible(MS2IDdir)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.