#' @title metIdentify
#' @description Identify metabolites based on MS/MS database.
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param ms1.data The name of ms1 peak table (csv format). Column 1 is "name", Column 2 is
#' "mz" and column is "rt" (second).
#' @param ms2.data MS2 data, must be mgf, msp or mzXML format. For example, ms2.data = c("test.mgf", "test2.msp").
#' @param ms1.ms2.match.mz.tol MS1 peak and MS2 spectrum matching m/z tolerance. Default is 25 pm.
#' @param ms1.ms2.match.rt.tol MS1 peak and MS2 spectrum matching RT tolerance. Default is 10 s.
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param ms2.match.ppm Fragment ion match ppm tolerance.
#' @param mz.ppm.thr Accurate mass tolerance for m/z error calculation.
#' @param ms2.match.tol MS2 match (MS2 similarity) tolerance.
#' @param fraction.weight The weight for matched fragments.
#' @param dp.forward.weight Forward dot product weight.
#' @param dp.reverse.weight Reverse dot product weight.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param ce Collision energy. Please confirm the CE values in your database. Default is "all".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param ms1.match.weight The weight of MS1 match for total score calculation.
#' @param rt.match.weight The weight of RT match for total score calculation.
#' @param ms2.match.weight The weight of MS2 match for total score calculation.
#' @param path Work directory.
#' @param total.score.tol Total score tolerance. The total score are refering to MS-DIAL.
#' @param candidate.num The number of candidate.
#' @param database MS2 database name.
#' @param threads Number of threads
#' @return A metIdentifyClass object.
#' @export
#' @seealso The example and demo data of this function can be found
#' https://jaspershen.github.io/metIdentify/articles/metIdentify.html
setGeneric(name = "metIdentify",
def = function(ms1.data, ##csv format
ms2.data,##only msp and mgf and mz(X)ML are supported
ms1.ms2.match.mz.tol = 25,
ms1.ms2.match.rt.tol = 10,
ms1.match.ppm = 25,
ms2.match.ppm = 30,
mz.ppm.thr = 400,
ms2.match.tol = 0.5,
fraction.weight = 0.3,
dp.forward.weight = 0.6,
dp.reverse.weight = 0.1,
rt.match.tol = 30,
polarity = c("positive", "negative"),
ce = "30",
column = c("hilic", "rp"),
ms1.match.weight = 0.25,
rt.match.weight = 0.25,
ms2.match.weight = 0.5,
path = ".",
total.score.tol = 0.5,
candidate.num = 3,
database,
threads = 3){
###Check data
if(missing(database)){
stop("No database is provided.\n")
}
##parameter specification
polarity <- match.arg(polarity)
column <- match.arg(column)
##check ms1.file and ms2.file
file <- dir(path)
if(!all(ms1.data %in% file)) {
stop("MS1 data is not in the directory, please check it.\n")
}
if(!all(ms2.data %in% file)) {
stop("Some MS2 data are not in the directory, please check it.\n")
}
if(!all(database %in% file)) {
stop("Database is not in this directory, please check it.\n")
}
#load MS2 database
database.name <- database
load(file.path(path, database.name))
database <- get(database.name)
if(class(database) != "databaseClass"){
stop("database must be databaseClass object\n")
}
ce.list.pos <- unique(unlist(lapply(database@spectra.data$Spectra.positive, names)))
ce.list.neg <- unique(unlist(lapply(database@spectra.data$Spectra.negative, names)))
ce.list <- ifelse(polarity == "positive", ce.list.pos, ce.list.neg)
if(all(ce %in% ce.list) & ce != "all"){
stop("All ce values you set are not in database. Please check it.\n")
ce <- ce[ce %in% ce.list]
}
rm(list = c("ce.list.pos", "ce.list.neg", "ce.list"))
##ce values
if(all(ce != "all")){
if(polarity == "positive"){
ce.list <- unique(unlist(lapply(database@spectra.data$Spectra.positive, function(x){
names(x)
})))
if(length(grep("Unknown", ce.list)) > 0){
ce <- unique(c(ce, grep(pattern = "Unknown", ce.list, value = TRUE)))
}
}else{
ce.list <- unique(unlist(lapply(database@spectra.data$Spectra.negative, function(x){
names(x)
})))
if(length(grep("Unknown", ce.list)) > 0){
ce <- unique(c(ce, grep(pattern = "Unknown", ce.list, value = TRUE)))
}
}
}
##RT in database or not
if(!database@database.info$RT){
cat("No RT information in database. The weight of RT have been set as 0.\n")
}
#------------------------------------------------------------------
##load adduct table
if(polarity == "positive" & column == "hilic"){
data("hilic.pos", envir = environment())
adduct.table <- hilic.pos
}
if(polarity == "positive" & column == "rp"){
data("rp.pos", envir = environment())
adduct.table <- rp.pos
}
if(polarity == "negative" & column == "hilic"){
data("hilic.neg", envir = environment())
adduct.table <- hilic.neg
}
if(polarity == "negative" & column == "rp"){
data("rp.neg", envir = environment())
adduct.table <- rp.neg
}
if(all(c("ms1.info", "ms2.info") %in% file)){
cat("Use old data\n")
load(file.path(path, "ms1.info"))
load(file.path(path, "ms2.info"))
}else{
##read MS2 data
cat("Reading MS2 data...\n")
ms2.data.name <- ms2.data
temp.ms2.type <- stringr::str_split(string = ms2.data.name,
pattern = "\\.")[[1]]
temp.ms2.type <- temp.ms2.type[length(temp.ms2.type)]
if(temp.ms2.type %in% c("mzXML", "mzML")){
ms2.data <- readMZXML(file = file.path(path, ms2.data.name), threads = threads)
}else{
ms2.data <- lapply(ms2.data.name, function(temp.ms2.data){
temp.ms2.type <- stringr::str_split(string = temp.ms2.data,
pattern = "\\.")[[1]]
temp.ms2.type <- temp.ms2.type[length(temp.ms2.type)]
if(!temp.ms2.type %in% c("mgf", "msp")) stop("We only support mgf or msp.\n")
if(temp.ms2.type == "msp"){
temp.ms2.data <- readMSP(file = file.path(path, temp.ms2.data))
}else{
temp.ms2.data <- readMGF(file = file.path(path, temp.ms2.data))
}
temp.ms2.data
})
names(ms2.data) <- ms2.data.name
###prepare data for metIdentification function
cat("Preparing MS2 data for identification...\n")
ms2.data <- mapply(FUN = function(temp.ms2.data, temp.ms2.data.name){
temp.ms2.data <- lapply(temp.ms2.data, function(x){
info <- x$info
info <- data.frame(name = paste("mz", info[1],"rt", info[2], sep = ""),
"mz" = info[1], "rt" = info[2],
"file" = temp.ms2.data.name,
stringsAsFactors = FALSE)
rownames(info) <- NULL
x$info <- info
x
})
temp.ms2.data
},
temp.ms2.data = ms2.data,
temp.ms2.data.name = ms2.data.name)
if(class(ms2.data) == "matrix"){
ms2.data <- ms2.data[,1]
}else{
ms2.data <- do.call(what = c, args = ms2.data)
}
}
ms1.info <- lapply(ms2.data, function(x){
x[[1]]
})
ms2.info <- lapply(ms2.data, function(x){
x[[2]]
})
ms1.info <- do.call(what = rbind, args = ms1.info)
ms1.info <- as.data.frame(ms1.info)
rownames(ms1.info) <- NULL
duplicated.name <- unique(ms1.info$name[duplicated(ms1.info$name)])
if(length(duplicated.name) > 0){
lapply(duplicated.name, function(x){
ms1.info$name[which(ms1.info$name == x)] <- paste(x, c(1:sum(ms1.info$name == x)), sep = "_")
})
}
names(ms2.info) <- ms1.info$name
##save intermediate data
save(ms1.info, file = file.path(path, "ms1.info"), compress = "xz")
save(ms2.info, file = file.path(path, "ms2.info"), compress = "xz")
}
if(!missing(ms1.data)){
cat("Matching peak table with MS2 spectrum...\n")
ms1.data <- readr::read_csv(file = file.path(path, ms1.data),
col_types = readr::cols())
colnames(ms1.data)[1:3] <- c("name", "mz", "rt")
match.result <- SXTMTmatch(data1 = ms1.data[,c(2,3)],
data2 = ms1.info[,c(2,3)],
mz.tol = ms1.ms2.match.mz.tol,
rt.tol = ms1.ms2.match.rt.tol,
rt.error.type = "abs")
if(is.null(match.result)) return("No peaks are matched with MS2 spectra.\n")
if(nrow(match.result) == 0) return("No peaks are matched with MS2 spectra.\n")
cat(length(unique(match.result[,1])),"out of", nrow(ms1.data), "peaks have MS2 spectra.\n")
###if one peak matches multiple peaks, select the more relibale MS2 spectrum
cat("Selecting the most intense MS2 spectrum for each peak...\n")
temp.idx <- unique(match.result[,1])
match.result <- lapply(temp.idx, function(idx){
idx2 <- match.result[which(match.result[,1] == idx), 2]
if(length(idx2) == 1){
return(c(idx, idx2))
}else{
temp.ms2.info <- ms2.info[idx2]
return(c(idx, idx2[which.max(unlist(lapply(temp.ms2.info, function(y){
y <- y[order(y[,2], decreasing = TRUE),,drop = FALSE]
if(nrow(y) > 5) y <- y[1:5,]
sum(y[,2])
})))]
)
)
}
})
match.result <- do.call(rbind, match.result)
match.result <- as.data.frame(match.result)
match.result <- data.frame(match.result,
ms1.data$name[match.result$V1],
ms1.info$name[match.result$Index2], stringsAsFactors = FALSE)
colnames(match.result) <- c("Index1.ms1.data", "Index.ms2.spectra",
"MS1.peak.name", "MS2.spectra.name")
ms1.info <- ms1.info[unique(match.result[,2]), , drop = FALSE]
ms2.info <- ms2.info[unique(match.result[,2])]
match.result$Index.ms2.spectra <- match(match.result$MS2.spectra.name, ms1.info$name)
save(match.result, file = file.path(path, "match.result"), compress = "xz")
}else{
stop("Please provide MS1 data name.\n")
}
ms2Matchresult <- metIdentification(ms1.info = ms1.info,
ms2.info = ms2.info,
polarity = polarity,
ce = ce,
database = database,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
mz.ppm.thr = mz.ppm.thr,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
adduct.table = adduct.table,
threads = threads,
fraction.weight = fraction.weight,
dp.forward.weight = dp.forward.weight,
dp.reverse.weight = dp.reverse.weight
)
return.result <- new(Class = "metIdentifyClass",
ms1.data = ms1.data,
ms1.info = ms1.info,
ms2.info = ms2.info,
identification.result = ms2Matchresult,
match.result = match.result,
adduct.table = adduct.table,
ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
polarity = polarity,
ce = paste(ce, collapse = ";"),
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
path = path,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database.name,
threads = threads,
version = "0.1.6")
cat("All is done.\n")
return(return.result)
})
.onAttach <- function(libname, pkgname){
packageStartupMessage("metIdentify,
More information can be found at https://jaspershen.github.io/metIdentify/
Authors: Xiaotao Shen (shenxt1990@163.com), Si Wu
Maintainer: Xiaotao Shen.
Version 0.1.6 (20190724)
--------------
o Add some new functions and fix some bugs.")
}
packageStartupMessage("metIdentify,
More information can be found at https://jaspershen.github.io/metIdentify/
Authors: Xiaotao Shen (shenxt1990@163.com), Si Wu
Maintainer: Xiaotao Shen.
Version 0.1.6 (20190724)
--------------
o Add some new functions and fix some bugs.")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.