R/import.msfinder.mssearch.R

#' import.MSFinder.mssearch
#'
#' After running MSFinder on .mat or .msp files, import the spectral search results
#' @param ramclustObj R object - the ramclustR object which was used to write the .mat or .msp files
#' @param mat.dir optional path to .mat directory
#' @param msp.dor optional path to .msp directory
#' @details this function imports the output from the MSFinder program to annotate the ramclustR object
#' @return an updated ramclustR object, with new slots at $msfinder.mssearch.details and $msfinder.mssearch.scores
#' @references Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014 Jul 15;86(14):6812-7. doi: 10.1021/ac501530d.  Epub 2014 Jun 26. PubMed PMID: 24927477.
#' @references Broeckling CD, Ganna A, Layer M, Brown K, Sutton B, Ingelsson E, Peers G, Prenni JE. Enabling Efficient and Confident Annotation of LC-MS Metabolomics Data through MS1 Spectrum and Time Prediction. Anal Chem. 2016 Sep 20;88(18):9226-34. doi: 10.1021/acs.analchem.6b02479. Epub 2016 Sep 8. PubMed PMID: 7560453.
#' @references Tsugawa H, Kind T, Nakabayashi R, Yukihira D, Tanaka W, Cajka T, Saito K, Fiehn O, Arita M. Hydrogen Rearrangement Rules: Computational MS/MS Fragmentation and Structure Elucidation Using MS-FINDER Software. Anal Chem. 2016 Aug 16;88(16):7946-58. doi: 10.1021/acs.analchem.6b00770. Epub 2016 Aug 4. PubMed PMID: 27419259.
#' @keywords 'ramclustR' 'RAMClustR', 'ramclustR', 'metabolomics', 'mass spectrometry', 'clustering', 'feature', 'xcms', 'MSFinder'
#' @author Corey Broeckling
#' @export 

import.msfinder.mssearch <- function (
  ramclustObj = RC,
  standardize.names = TRUE,
  mat.dir = NULL,
  msp.dir = NULL
) {
  
  home.dir <-getwd()
  
  if(is.null(mat.dir)) {
    mat.dir = paste0(getwd(), "/spectra/mat")
  }
  
  if(is.null(msp.dir)) {
    msp.dir = paste0(getwd(), "/spectra/msp")
  }
  
  usemat = TRUE
  usemsp = TRUE
  
  if(!dir.exists(mat.dir)) {
    usemat = FALSE
  }
  
  if(!dir.exists(msp.dir)) {
    usemsp = FALSE
  }
  
  if(!usemsp & !usemat) {
    stop("neither of these two directories exist: ", '\n',
         "  ", mat.dir, '\n',
         "  ", msp.dir, '\n')
  }
  
  if(usemsp & usemat) {
    msps<-list.files(msp.dir, recursive  = TRUE)
    mats<-list.files(mat.dir, recursive = TRUE)
    if(length(mats) > length(msps)) {usemsp <- FALSE}
    if(length(msps) > length(mats)) {usemat <- FALSE}
    if(length(msps) == length(mats)) {
      feedback<-readline(prompt="Press 1 for .mat or 2 for .msp to continue")
      if(feedback == 1) {usemsp <- FALSE}
      if(feedback == 2) {usemat <- FALSE}
    }
  }
  
  mat.dir <- c(mat.dir, msp.dir)[c(usemat, usemsp)]
  
  dirs<-list.dirs(mat.dir, recursive = FALSE, full.names = FALSE) 
  
  do<-list.files(mat.dir, pattern = "Spectral DB search.sfd", full.names = TRUE, recursive = TRUE)
  
  
  if(!identical(sort(ramclustObj$cmpd), sort(dirs))) {
    stop("spectral match directory names do not match compound names: please ensure you have the correct pair of ramclust object and directory")
  } else {
    cmpd<-dirs
  }
  
  tags<-c(
    "NAME: ",
    "ID: ",
    "IsSpectrumSearch: ",
    "INCHIKEY: ",
    "SMILES: ",
    "RESOURCES: ",
    "SubstructureInChIKeys: ",
    "RETENTIONTIME: ",
    "RETENTIONINDEX: ",
    "TotalBondEnergy: ",
    "TotalScore: ",
    "TotalHrRulesScore: ",
    "TotalBondCleavageScore: ",
    "TotalMassAccuracyScore: ",
    "TotalFragmentLinkageScore: ",
    "TotalBondDissociationEnergyScore: ",
    "DatabaseScore: ",
    "SubstructureAssignmentScore: ",
    "RtSimilarityScore: ",
    "RiSimilarityScore: ",
    "Num Peaks: "
  )
  
  names(tags)<-tolower(gsub(": ", "", tags))
  
  msfinder.mssearch<-as.list(rep(NA, length(ramclustObj$cmpd)))
  names(msfinder.mssearch)<-ramclustObj$cmpd
  msfinder.mssearch.details<-as.list(rep(NA, length(ramclustObj$cmpd)))
  names(msfinder.mssearch.details)<-ramclustObj$cmpd
  template<-as.list(c("summary" = NA, "spectra" = NA))
  for(i in 1:length(msfinder.mssearch)) {
    msfinder.mssearch[[i]]<-template
    msfinder.mssearch.details[[i]]<-template
  }
  
  for(i in 1:length(cmpd)) {
    setwd(paste0(mat.dir, "/", cmpd[i]))
    if(!file.exists("Spectral DB search.sfd")) {next}
    tmp<-readLines("Spectral DB search.sfd")
    if(length(tmp) == 0) {next}
    starts <- grep("NAME: ", tmp)
    if(length(starts) > 1) {
      stops <- c(((starts[2:length(starts)])-1), (length(tmp)-1))
    } else {
      stops <- length(tmp)-1
    }
    
    out<-matrix(nrow = length(starts), ncol = length(tags))
    dimnames(out)[[2]]<-names(tags)
    spectra<-as.list(rep(NA, length(starts)))
    for(j in 1:length(starts)) {
      d<-tmp[starts[j]:stops[j]]
      vals<-sapply(1:length(tags), FUN = function(x) {
        m<-grep(tags[x], d, fixed = TRUE)
        if(length(m)==0) {
          NA
        } else {
          if(length(m)>1) {
            m<-m[1]
          }
          gsub(tags[x], "", d[m])
        }
      }
      )
      out[j,]<-vals
      spec<-d[(grep("Num Peaks: ", d)+1):length(d)]
      if(any(nchar(spec) == 0)) { 
        spec <- spec[-(which(nchar(spec) == 0))]
      }
      if(length(spec) > 0) {
        spec<-data.frame(matrix(unlist(strsplit(spec, "\t")) , nrow = length(spec), byrow = TRUE), stringsAsFactors = FALSE)
        names(spec)<-c("mz", "int")
        for(k in 1:2) {
          spec[,k]<-as.numeric(spec[,k])
        }
        spectra[[j]]<-(spec)
      } else {
        spec<-data.frame("mz" = NA, "int" = NA)
        spec<-spec[0,]
        spectra[[j]]<-spec
      }
      
      
    }
    out<-data.frame(out, check.names = FALSE, stringsAsFactors = FALSE)
    if(any(out$name == "Spectral DB search")) {
      out <- out[-grep("Spectral DB search", out$name),]
    }
    for(k in 1:ncol(out)) {
      suppressWarnings(an<-as.numeric(out[,k]))
      if(is.na(an[1])) {next}
      if(is.numeric(an[1])) {
        out[,k]<-an
      }
    }
    msfinder.mssearch.details[[cmpd[i]]]$summary<-out
    msfinder.mssearch.details[[cmpd[i]]]$spectra<-spectra
  }
  
  
  for(i in 1:length(msfinder.mssearch.details ) ){
    if(is.null(nrow(msfinder.mssearch.details[[i]]$summary))) {
      msfinder.mssearch.details[[i]]$summary <- data.frame(t(tags))[0,]
    }
  }
  msfinder.mssearch.score<-as.numeric(sapply(1:length(msfinder.mssearch), FUN = function(x) {
    if(is.null(nrow(msfinder.mssearch.details[[x]]$summary))) {
      NA 
    } else {
      msfinder.mssearch.details[[x]]$summary[1,"totalscore"]
    }
  }
  )
  )
  
  ramclustObj$msfinder.mssearch.details<-msfinder.mssearch.details
  ramclustObj$msfinder.mssearch.score<-msfinder.mssearch.score
  
  setwd(home.dir)

  return(ramclustObj)
  
}
sneumann/RAMClustR documentation built on May 30, 2019, 5:05 a.m.