R/Functions_MF_prediction.R

Defines functions calcMF

Documented in calcMF

#' calcMF
#'
#' Calculate molecular formulas from a monoisotopic mass. This function uses \pkg{Rdisop} to generate molecular formulas, and can apply many of the filters described as the "seven golden rules"[1]
#' 
#' @references 
#' \enumerate{
#' \item Kind T, Fiehn O (2007) Seven Golden Rules for heuristic filtering of molecular formulas obtained by accurate mass spectrometry. BMC Bioinformatics 8:105. doi: 10.1186/1471-2105-8-105
#' \item Senior JK (1951) Partitions and Their Representative Graphs. Am J Math 73:663. doi: 10.2307/2372318
#' \item Morikawa T, Newbold BT (2003) Analogous odd-even parities in mathematics and chemistry. Chemistry 12:445–450
#'}
#'
#' @param mz monoisotopic mass or monoisotopic m/z value 
#' @param z charge (set 0 for uncharged)
#' @param ppm error tolerance in ppm
#' @param top if not NULL, maximum number of \code{top} formulas to be returned, as ranked by ppm error
#' @param elements either NULL or an element list for \pkg{Rdisop}, as generated by \link[Rdisop]{initializeCHNOPS} and related functions.
#' @param maxCounts check each predicted molecula formula for maximum number of elements expected in a natural product of its size ("Golden Rule #1")
#' @param SeniorRule for each predicted molecula formula, calculate the sum of valences minus twice (the number of atoms minus 1) (to check for Senior's third theorem, "Golden Rule #2")[1][2][3]
#' @param HCratio for each predicted molecula formula, calculate Hydrogen/ Carbon count ratio ("Golden Rule #4")
#' @param moreRatios for each predicted molecula formula, calculate Nitrogen/Carbon, Oxygen/Carbon, Phosphorus/Carbon and Sulfur/Carbon count ratios ("Golden Rule #5")
#' @param elementHeuristic apply a heuristic HNOPS probability check as proposed in "Golden Rule #6"
#' @param Filters which filters to apply, see Details
#' @param summarize if \code{TRUE}, will return a single character string with molecular formulas and charge instead of a data.table
#' @param BPPARAM parameters for parallel processing with \pkg{BiocParallel}. Can be \link[BiocParallel]{bpparam}(), or an object of these classes \link[BiocParallel]{SnowParam-class}, \link[BiocParallel]{MulticoreParam-class}. If NULL, \link[BiocParallel]{SerialParam}() is called (no multicore processing).
#'
#' @details 
#' \subsection{Filters}{
#' These filtering steps can be selected. Filter items that are NULL (or not in the list) will not be applied.
#' 
#' \describe{
#'    \item{DBErange}{numeric vector of length 2, defining lower and upper limit for double bond equivalents in the molecule}
#'    \item{minElements}{character(1) with a molecula formula defining the minimum number of atom counts for a set of elements}
#'    \item{maxElements}{character(1) with a molecula formula defining the maximum number of atom counts for a set of elements}
#'    \item{parity}{filter for parity. must be \code{"e"} (for even) or \code{"o"} (for odd). What to expect depends on ionization technique used.}
#'    \item{maxCounts}{if \code{TRUE}, uses a filter limiting atom counts for elements as described by Kind & Fiehn [1]}
#'    \item{SENIOR3}{If \code{TRUE}, the sum of valences has to be greater than or equal to twice the number of atoms minus 1, as Senior's third theorem suggests [1][2][3]}
#'    \item{HCratio}{If \code{TRUE}, molecular formulas must have a Hydrogen/ Carbon count ratio of >0.2 and <3.1 ("Golden Rule #4")[1]. Will be ignored if no Hydrogen or Carbon present in a molecule.}
#'    \item{moreRatios}{If \code{TRUE}, molecular formulas must have these atom count ratios: Nitrogen/Carbon <1.3, Oxygen/Carbon <1.2, Phosphorus/Carbon <0.3 and Sulfur/Carbon <0.8 ("Golden Rule #5")[1]. Will be ignored if no Carbon present in a molecule.}
#'    \item{elementHeuristic}{If \code{TRUE}, additional element ratio heuristic is applied ("Golden Rule #6") [1]}
#' }}
#' 
#' @importFrom Rdisop decomposeMass initializeCHNOPS initializeElements
#' @importFrom BiocParallel bplapply SerialParam
#' @import stats methods utils
#' 
#' @return a data.frame (or list of data.frames if multiple mz values are supplied) with molecular formulas generated with \link[Rdisop]{decomposeMass}() with additional information 
#' and optionally filtered as described. NOTE: If \code{summarize = TRUE}, results for each query mz value are summarized in a single character string with all molecular formulas matching filter criteria
#' 
#' @examples 
#' 
#' calcMF(mz = 200.000659, z = 1, ppm = 5)
#' 
#' @export
calcMF <- function(mz = 200.000659,
                   z = 1,
                   ppm = 5,
                   top = NULL,
                   elements = Rdisop::initializeCHNOPS(),
                   maxCounts = TRUE,
                   SeniorRule = TRUE,
                   HCratio = TRUE,
                   moreRatios = TRUE,
                   elementHeuristic = TRUE,
                   Filters = list(DBErange = c(-5,40),
                                  minElements = "C0",
                                  maxElements = "C99999",
                                  parity = "e",
                                  maxCounts = TRUE,
                                  SENIOR3 = 0,
                                  HCratio = TRUE,
                                  moreRatios = TRUE,
                                  elementHeuristic = TRUE),
                   summarize = FALSE,
                   BPPARAM = NULL
){
  
  if(is.null(Filters$minElements) || !is.character(Filters$minElements)
     || is.na(Filters$minElements) || Filters$minElements == ""){
    Filters$minElements <- "C0"
  }
  
  
  if(is.null(Filters$maxElements) || !is.character(Filters$maxElements)
     || is.na(Filters$maxElements) || Filters$maxElements == ""){
    Filters$maxElements <- "C99999"
  }
  
  
  
  if(is.null(elements)){
    
    
    elements = initializeCHNOPS()
    
    
  }else if(is.character(elements)){
    #if character formula is given, e.g. "CHOPS"
    
    elements <- makeMF(elements)
    
    elements <- initializeElements( names(elements)[elements>0])
    
  }
  
  if(length(mz)>1){
    
    
    rl <- bplapply(mz,calcMF,
                   z = z,
                   ppm = ppm,
                   top = top,
                   elements = elements,
                   maxCounts = maxCounts,
                   SeniorRule = SeniorRule,
                   HCratio = HCratio,
                   moreRatios = moreRatios,
                   elementHeuristic = elementHeuristic,
                   Filters = Filters,
                   summarize = summarize,
                   BPPARAM=if(is.null(BPPARAM)){SerialParam()}else{BPPARAM})
    
    if(summarize){
      return(unlist(rl))
    }else{
      return(rl)
    }
    
  }
  
  
  
  #how to handle it if there are no molecula formulas given filter conditions
  if(summarize){
    failReturn <- "" 
  }else{
    failReturn <- NULL
  }
  
  
  
   #have to add an electon mass to mz for each charge to get uncharged mass 
  #[needed for correct ppm calculation in decomposeMass; maybe breaks nitrogen rule calculation]
  mm <- decomposeMass(mz + z*5.48579909070e-4,   z = z, 
                      maxisotopes = 1, 
                      ppm = ppm, 
                      mzabs = 0,
                      elements = elements,
                      minElements = Filters$minElements,
                      maxElements = Filters$maxElements)
  
  if(is.null(mm)){return(failReturn)}
  
  f1 <- if(!is.null(Filters$DBErange)){mm$DBE >= Filters$DBErange[1] & mm$DBE<= Filters$DBErange[2]}else{rep(TRUE,length(mm$DBE))}
  
  if(!is.null(Filters$parity) && Filters$parity %in% c('e','o')){
    f1 <- f1 & mm$parity == Filters$parity
  }
  
  if(!any(f1)){return(failReturn)}
  
  
  f1 <- which(f1)
  
  if(length(f1) == 0){return(failReturn)}
  
  #now reorder by mass difference
  f2 <- f1[order(abs(mm$exactmass[f1] - z*5.48579909070e-4 - mz))]
  
  
  sfs <- makeMF(mm$formula[f2], forcelist = TRUE)
  
  # ret <- if(!is.null(Filters$minElements)){
  #   sapply(sfs,">=", Filters$minElements)
  # }else{
  #     T
  # } & if(!is.null(Filters$maxElements)){
  #     sapply(sfs,"<=", Filters$maxElements)
  # }else{
  #     T}
  # 
  # if(!any(ret)){return(failReturn)}
  # 
  # #remove formulas not meeting min and max expectations
  # f2 <- f2[ret]
  # sfs <- sfs[ret]
  
  res <- data.frame(mz = mm$exactmass[f2] - z*5.48579909070e-4,
                    MF = mm$formula[f2],
                    charge = z,
                    RdisopScore = mm$score[f2],
                    unsat = mm$DBE[f2],
                    parity = mm$parity[f2],
                    error = mm$exactmass[f2] - z*5.48579909070e-4 - mz,
                    nrule = mm$valid[f2],
                    stringsAsFactors = FALSE)
  
  res$ppm <- res$error/mz *1e6
  
  #Golden rule #1: Restriction for element numbers
  if(maxCounts){
    Da <- abs(mz/z)
    if(Da < 500){
      maxCountLimit <- consolidateMF(c(C = 29,H = 72, N= 10, O = 18, P= 4, S= 7, F =15, Cl = 8, Br = 5))
    }else if(Da <1000){
      maxCountLimit <- consolidateMF(c(C = 66,H = 126, N= 25, O = 27, P= 6, S= 8, F =16, Cl = 11, Br = 8))
    }else if(Da < 2000){
      maxCountLimit <- consolidateMF(c(C = 115,H = 236, N= 32, O = 63, P= 6, S= 8, F =16, Cl = 11, Br = 8))
    }else if(Da < 3000){
      maxCountLimit <- consolidateMF(c(C = 162,H = 208, N= 48, O = 78, P= 6, S= 9, F =16, Cl = 11, Br = 8))
    }else{
      maxCountLimit <- NULL
    }
    
    if(!is.null(maxCountLimit)){
      
      maxCountLimit[maxCountLimit == 0] <- Inf
      
      res$maxCounts <- sapply(sfs, "<=", maxCountLimit)
      
      
    }else{
      res$maxCounts <- TRUE
    }
    
    if(!is.null(Filters$maxCounts) && Filters$maxCounts){
      sel <- res$maxCounts
      if(!any(sel)){return(failReturn)}
      
      res <- res[sel,]
      sfs <- sfs[sel]
      
    }
    
  }
  
  #Golden rule #2: (Senior's third theorem): 
  
  if(SeniorRule){
    
    res$SENIOR3 <- sapply(sfs,getSenior3.MFobject)
    
    if(!is.null(Filters$SENIOR3) && is.numeric(Filters$SENIOR3)){
      
      sel <- res$SENIOR3 >= Filters$SENIOR3
      if(!any(sel)){return(failReturn)}
      
      
      res <- res[sel,]
      sfs <- sfs[sel]
      
    }
    
  }
  
  
  #Golden Rule #4: Hydrogen/Carbon element ratio check
  if(HCratio){
    
    res$HtoC <-  sapply(sfs,function(MF){
      
      MF["H"]/MF["C"]
      
    })
    
    if(!is.null(Filters$HCratio) && Filters$HCratio){
      
      sel <- is.na(res$HtoC) | (res$HtoC > 0.2 & res$HtoC < 3.1)
      if(!any(sel)){return(failReturn)}
      
      res <- res[sel,]
      sfs <- sfs[sel]
      
    }
  }
  
  #Golden Rule #5: Heteroatom check
  if(moreRatios){
    
    res$NtoC <-  sapply(sfs,function(MF){
      
      MF["N"]/MF["C"]
      
    })
    
    res$OtoC <-  sapply(sfs,function(MF){
      
      MF["O"]/MF["C"]
      
    })
    
    res$PtoC <-  sapply(sfs,function(MF){
      
      MF["P"]/MF["C"]
      
    })
    
    res$StoC <-  sapply(sfs,function(MF){
      
      MF["S"]/MF["C"]
      
    })
    
    if(!is.null(Filters$moreRatios) && Filters$moreRatios){
      
      sel <- ((is.na(res$NtoC) | res$NtoC < 1.3)
              & (is.na(res$OtoC) | res$OtoC < 1.2)
              & (is.na(res$PtoC) | res$PtoC < 0.3)
              & (is.na(res$StoC) | res$StoC < 0.8))
      
      if(!any(sel)){return(failReturn)}
      
      res <- res[sel,]
      sfs <- sfs[sel]
      
    }
    
  }
  
  # Golden rule #6: element probability check
  if(elementHeuristic){
    
    res$elementHeuristic = sapply(sfs,function(MF){
      
      r1 <- na.omit(MF[c("N","O","P","S")])
      
      r2 <- r1[r1>1]
      
      #if two of these elements are not in the molecule, don't apply restrictions
      if(length(r2) <= 2){return(TRUE)}
      
      if(length(r2) == 4 
         && (r2["N"] >=10
             || r2["O"] >=20
             ||r2["P"] >=4
             ||r2["S"] >=3)){return(FALSE)}
      
      switch(paste(names(r2), collapse = ""),
             NOP = {if(min(r2) > 3 && (r2["N"] >=11 || r2["O"] >=22 || r2["P"] >=7)){return(FALSE)}},
             OPS = {if(min(r2) > 1 && (r2["S"] >=3 || r2["O"] >=14 || r2["P"] >=3)){return(FALSE)}},                                                                            NPS = {if(min(r2) > 1 && (r2["N"] >=4 || r2["S"] >=3 || r2["P"] >=3)){return(F)}},
             NOS = {if(min(r2) > 6 && (r2["S"] >=8 || r2["O"] >=14 || r2["N"] >=19)){return(FALSE)}})
      
      return(TRUE)
      
      
    })
    
    
    if(!is.null(Filters$elementHeuristic) && Filters$elementHeuristic){
      
      sel <- res$elementHeuristic
      if(!any(sel)){return(failReturn)}
      
      res <- res[sel,]
      sfs <- sfs[sel]
      
    }
    
  }
  
  
  #final selection:
  
  if(!is.null(top) && is.numeric(top)){
    
    res <- res[seq(min(top,nrow(res))),]
    
  }
  
  if(summarize){
    
    return(paste(paste0(res$MF,
                        if(res$charge[1] > 0){paste0("(+",res$charge,")")}
                        else if(res$charge[1] < 0){paste0("(",res$charge,")")}
                        else{""}),
                 collapse = "|"))
  }
  
  return(res)
}
mjhelf/MassTools documentation built on Nov. 19, 2021, 2:38 a.m.