# metID: Metabolite identification based on MS1 and MS2 spectra

# Documented in databaseConstruction

##------------------------------------------------------------------------------
#' @title Construct in-house or public MS2 database for metID.
#' @description Construct MS2 spectra database according to mzXML data and compound information table (csv format).
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param path Work directory.
#' @param version The version of your database. Default is 0.0.1.
#' @param metabolite.info.name The metabolite information table name, it must be csv format.
#' The demo data can be obtained from the `demoData` package.
#' Please see https://jaspershen.github.io/metID/articles/metID.html
#' @param source The source of your database.
#' @param link Website link of the source.
#' @param creater Creator name. For example, Xiaotao Shen.
#' @param email email address.
#' @param rt Do the metabolites have RT information or not? If not, set it as FALSE.
#' @param mz.tol m/z tolerance for the match between metabolites and precursor m/z of MS2 spectra.
#' @param rt.tol RT tolerance for the match between metabolites and the retention time of MS2 spectra.
#' @param threads The number of threads used to read the MS2 data files.
#' @importFrom crayon red yellow green bgRed
#' @importFrom stringr str_detect str_extract
#' @importFrom readr cols
#' @importFrom pbapply pblapply
#' @return A databaseClass object.
#' @seealso The example and demo data of this function can be found at
#' https://jaspershen.github.io/metID/articles/metID.html
#' @export

# sxtTools::setwd_project()
# setwd("test_data/database/")


# path = "."
# version = "0.0.1"
# metabolite.info.name = "metabolite.info_RPLC.csv"
# source = "Michael Snyder Lab"
# link = "http://snyderlab.stanford.edu/"
# creater = "Xiaotao Shen"
# email = "shenxt1990@163.com"
# rt = TRUE
# mz.tol = 15
# rt.tol = 30
# threads = 3

# database <-
#   databaseConstruction(metabolite.info.name = "metabolite.info_RPLC.csv")

databaseConstruction <- function(path = ".",
                                 version = "0.0.1",
                                 metabolite.info.name = "metabolite.info.csv",
                                 source = "Michael Snyder Lab",
                                 link = "http://snyderlab.stanford.edu/",
                                 creater = "Xiaotao Shen",
                                 email = "shenxt1990@163.com",
                                 rt = TRUE,
                                 mz.tol = 15,
                                 rt.tol = 30,
                                 threads = 3) {
  # Build a `databaseClass` object from the metabolite information table
  # `metabolite.info.name` and the MS2 files found in the `POS` and `NEG`
  # sub-folders of `path`.
  #
  # Returns NULL (after printing a message) when the metabolite information
  # table is not present in `path`. A polarity whose folder is missing or
  # empty is reported and skipped; its spectra list is then empty.
  #
  # Note: `rt` is stored in the database information but is then overwritten
  # by whether the `RT` column of the metabolite table holds any non-NA value.
  cat(
    crayon::yellow(
      "`databaseConstruction()` is deprecated, use `construct_database()`.\n"
    )
  )

  ## ---- local helpers --------------------------------------------------------

  # Report (without aborting) a polarity folder that is missing from `path`
  # or that holds no file whose name contains "mzXML".
  check_polarity_folder <- function(polarity, path_content) {
    if (!polarity %in% path_content) {
      cat(crayon::red(paste0("No ", polarity, " file in your"), path, "\n"))
      return(invisible(NULL))
    }
    polarity_files <- dir(file.path(path, polarity))
    if (!any(stringr::str_detect(polarity_files, "mzXML"))) {
      cat(crayon::red(paste0("No mzXML files in ", polarity, " folder\n")))
    }
    invisible(NULL)
  }

  # Read every file of one polarity folder. Returns a list holding the
  # row-bound first elements (`ms1`) and the second elements (`ms2`) of the
  # `read_mzxml()` result, or NULL when the folder has no files.
  read_polarity <- function(polarity, label) {
    cat(crayon::green(paste0("Reading ", label, " MS2 data...\n")))
    ms2_files <- dir(file.path(path, polarity), full.names = TRUE)

    if (length(ms2_files) == 0) {
      # Nothing to read: skip this polarity instead of failing further down.
      cat(crayon::yellow(paste0("No ", label, " MS2 files found, skipped.\n")))
      return(NULL)
    }

    ms2_data <- read_mzxml(file = ms2_files, threads = threads)

    ms1_info <- do.call(rbind, lapply(ms2_data, function(x) {
      x[[1]]
    }))
    ms1_info$file <- basename(ms1_info$file)

    ms2_info <- lapply(ms2_data, function(x) {
      x[[2]]
    })

    cat(crayon::red("OK\n"))
    list(ms1 = ms1_info, ms2 = ms2_info)
  }

  # Match the metabolites against the spectra of one polarity and return a
  # list, named by `Lab.ID`, of per-metabolite spectra lists named by the
  # "NCE<digits>" tag extracted from the file names.
  match_polarity <- function(ms_data, metabolite_info, mz_column, label) {
    cat(crayon::green(
      paste0("Matching metabolites with MS2 spectra (", label, ")...\n")
    ))

    if (is.null(ms_data)) {
      # This polarity was skipped while reading, so there is nothing to match.
      cat(crayon::red("OK\n"))
      return(list())
    }

    ms1_info <- ms_data$ms1
    ms2_info <- ms_data$ms2

    match_result <-
      SXTMTmatch(
        data1 = as.data.frame(metabolite_info[, c(mz_column, "RT")]),
        data2 = ms1_info[, c(2, 3)],
        mz.tol = mz.tol,
        rt.tol = rt.tol,
        rt.error.type = "abs"
      )

    # Column 1 indexes rows of the metabolite table, column 2 indexes the
    # spectra; attach the source file name of every matched spectrum.
    match_result <- data.frame(match_result,
                               "file" = ms1_info$file[match_result[, 2]],
                               stringsAsFactors = FALSE)

    matched_idx <- unique(match_result[, 1])

    spectra <-
      pbapply::pblapply(matched_idx, function(idx) {
        # Compare the metabolite index column only. Comparing the whole data
        # frame would also hit cells in other columns and select NA rows.
        hits <-
          match_result[which(match_result[, 1] == idx), , drop = FALSE]
        if (nrow(hits) == 0) {
          return(NULL)
        }

        # Keep only spectra whose file name matches the metabolite submitter.
        submitter <- metabolite_info$Submitter[idx]
        hits <- hits[grep(submitter, hits$file), , drop = FALSE]
        if (nrow(hits) == 0) {
          return(NULL)
        }

        if (nrow(hits) == 1) {
          single_spectrum <- ms2_info[hits[1, 2]]
          names(single_spectrum) <-
            stringr::str_extract(string = hits$file[1],
                                 pattern = "NCE[0-9]{1,3}")
          return(single_spectrum)
        }

        file_names <- unique(hits$file)

        # Per file, keep the spectrum with the largest sum over its second
        # column.
        best_spectra <-
          lapply(file_names, function(file_name) {
            in_file <- hits[which(hits$file == file_name), , drop = FALSE]
            column_sums <-
              unlist(lapply(ms2_info[in_file[, 2]], function(spectrum) {
                sum(spectrum[, 2])
              }))
            ms2_info[[in_file[which.max(column_sums), 2]]]
          })

        names(best_spectra) <-
          stringr::str_extract(string = file_names,
                               pattern = "NCE[0-9]{1,3}")
        best_spectra
      })

    names(spectra) <- metabolite_info$Lab.ID[matched_idx]

    # Drop metabolites for which no spectrum survived the filters.
    spectra <- spectra[!vapply(spectra, is.null, logical(1))]

    cat(crayon::red("OK\n"))
    spectra
  }

  ## ---- check data first -----------------------------------------------------
  path_content <- dir(path)
  if (!metabolite.info.name %in% path_content) {
    cat(crayon::red("No", metabolite.info.name, "in your", path, "\n"))
    return(NULL)
  }

  check_polarity_folder("POS", path_content)
  check_polarity_folder("NEG", path_content)

  ## ---- read metabolite information and MS2 data -----------------------------
  cat(crayon::green("Reading metabolite information...\n"))
  metabolite.info <-
    readTable(file = file.path(path, metabolite.info.name))

  ms.data.pos <- read_polarity(polarity = "POS", label = "positive")
  ms.data.neg <- read_polarity(polarity = "NEG", label = "negative")

  ## ---- match metabolites with MS2 spectra -----------------------------------
  spectra.pos <- match_polarity(
    ms_data = ms.data.pos,
    metabolite_info = metabolite.info,
    mz_column = "mz.pos",
    label = "positive"
  )

  spectra.neg <- match_polarity(
    ms_data = ms.data.neg,
    metabolite_info = metabolite.info,
    mz_column = "mz.neg",
    label = "negative"
  )

  ## ---- assemble the database object -----------------------------------------
  Spectra <- list("Spectra.positive" = spectra.pos,
                  "Spectra.negative" = spectra.neg)

  database.info <- list(
    "Version" = version,
    "Source" = source,
    "Link" = link,
    "Creater" = creater,
    "Email" = email,
    "RT" = rt
  )

  database <- new(
    Class = "databaseClass",
    database.info = database.info,
    spectra.info = as.data.frame(metabolite.info),
    spectra.data = Spectra
  )

  # The RT flag reflects the data: TRUE when any metabolite has an RT value.
  database@database.info$RT <- !all(is.na(database@spectra.info$RT))

  cat(crayon::bgRed("All done!\n"))
  database
}