mzAnnotation: Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

Documented in CHOproportion elementCountCheck elementRatioCheck goldenRules goldenRulesScore lewis rdbe senior

#' Calculate the rings plus double bonds equivalent
#' @description Calculate the rings plus double bonds equivalent (RDBE) for molecular formulas.
#' @param element_frequencies table of element frequencies for a set of molecular formulas as returned by `elementFrequencies()`.
#' @param valences named list of element valences
#' @return A vector of RDBE values.
#' @examples 
#' element_frequencies <- elementFrequencies(c('C12H22O11','C12H22NO11'))
#' rdbe(element_frequencies)
#' @importFrom tibble as_tibble deframe
#' @importFrom dplyr distinct
#' @importFrom tidyr replace_na
#' @export

rdbe <- function(element_frequencies,
                 valences = list(C = 4,
                                 H = 1,
                                 N = 3,
                                 O = 2,
                                 P = 3,
                                 S = 4)){
  valences <- valences %>% 
    as_tibble() %>% 
    gather(element,valence) %>% 
    mutate(valence = valence - 2)
  
  element_frequencies <- element_frequencies %>% 
    gather(element,
           frequency,
           -MF) %>% 
    mutate(frequency = replace_na(frequency,0)) %>% 
    left_join(valences,by = 'element')
  
  rdbe_values <- element_frequencies %>% 
    group_by(MF,element) %>% 
    mutate(value = frequency * valence) %>%
    ungroup() %>% 
    group_by(MF) %>% 
    summarise(rdbe = sum(value) %>% 
                {. + 2} %>% 
                {./2}
    )
  element_frequencies %>% 
    select(MF) %>% 
    distinct() %>% 
    left_join(rdbe_values,by = 'MF') %>% 
    select(rdbe) %>% 
    deframe()
}

#' LEWIS and SENIOR checks
#' @rdname lewis_senior
#' @description LEWIS molecular formula valence test and SENIOR test for the existence of molecular graphs. 
#' @param element_frequencies table of element frequencies for a set of molecular formulas as returned by `elementFrequencies()`.
#' @param valences named list of element valences
#' @return Boolean vector of check results for each molecular formula.
#' @examples 
#' element_frequencies <- elementFrequencies(c('C12H22O11','C12H22NO11'))
#' lewis(element_frequencies)
#' senior(element_frequencies)
#' @export

lewis <- function(element_frequencies,
                  valences = list(C = 4,
                                  H = 1,
                                  N = 3,
                                  O = 2,
                                  P = 3,
                                  S = 4)){
  tibble(RDBE = rdbe(element_frequencies,
                     valences = valences)) %>% 
    rowwise() %>% 
    mutate(remainder = RDBE %% 1,
           LEWIS =  ifelse(
             all(
               RDBE >= 0,
               remainder != 0.5
             ),
             TRUE,
             FALSE)) %>% 
    .$LEWIS
}

#' @rdname lewis_senior
#' @export

senior <- function(element_frequencies,
                   valences = list(C = 4,
                                   H = 1,
                                   N = 3,
                                   O = 2,
                                   P = 3,
                                   S = 4)){
  valences <- valences %>% 
    as_tibble() %>% 
    gather(element,valence)
  
  mfs <- element_frequencies$MF
  
  element_frequencies <- element_frequencies %>%  
    gather(element,frequency,-MF) %>% 
    mutate(frequency = replace_na(frequency,0)) %>% 
    left_join(valences,by = 'element') %>% 
    mutate(total_valence = frequency * valence) %>% 
    group_by(MF)
  
  element_frequencies %>% 
    summarise(sum_valence = sum(total_valence)) %>% 
    select(MF,sum_valence) %>% 
    left_join(element_frequencies %>%
                filter((valence %% 2) != 0) %>% 
                summarise(odd_valence_total = sum(frequency)),
              by = 'MF') %>% 
    left_join(element_frequencies %>% 
                filter(frequency > 0) %>% 
                summarise(twice_maximum_valence = max(valence) * 2),
              by = 'MF') %>% 
    left_join(element_frequencies %>% 
                summarise(twice_atoms_minus_1 = sum(frequency) %>% 
                            {. * 2 - 1}),
              by = 'MF') %>% 
    rowwise() %>% 
    mutate(SENIOR = ifelse(
      all(
        (sum_valence %% 2) == 0 | (odd_valence_total %% 2) == 0,
        sum_valence >= twice_maximum_valence,
        sum_valence >= twice_atoms_minus_1
      ),
      TRUE,
      FALSE
    )) %>% 
    mutate(MF = factor(MF,levels = mfs)) %>% 
    arrange(MF) %>% 
    .$SENIOR
}

#' Element ratio checks
#' @description Element ratio checks based on Rules 4 and 5 of the Seven Golden Rules by Kind et al 2007.
#' @param element_ratios a tibble containing molecular formula elemental ratios
#' @param range the ratio threshold ranges as defined by Rules 4 and 5 of the Seven Golden Rules by Kind et al 2007
#' @return A tibble containing results of the element ratio checks. The column headers specify the check criteria. 
#' `TRUE` is returned where relevant checks are passed and `FALSE` where not. `NA` is returned where the check is 
#' not relevant.
#' @references 
#' Kind, T. and Fiehn, O., 2007. Seven Golden Rules for heuristic filtering of molecular formulas obtained by 
#' accurate mass spectrometry. *BMC bioinformatics*, *8*(1), pp.1-20.
#' @examples 
#' ## Using the 'common' ratio ranges
#' c('H2O','C12H22O11') %>% 
#'   elementFrequencies() %>% 
#'   elementRatios() %>% 
#'   elementRatioCheck(range = 'common')
#' 
#' ## Using the 'extreme' ratio ranges
#' c('H2O','C12H22O11') %>% 
#'   elementFrequencies() %>% 
#'   elementRatios() %>% 
#'   elementRatioCheck(range = 'extreme')
#' @importFrom purrr flatten_chr map
#' @importFrom dplyr group_split rowwise
#' @importFrom rlang parse_expr eval_tidy
#' @export

elementRatioCheck <- function(element_ratios, 
                                 range = c('common',
                                           'extended',
                                           'extreme')){
  range_types <- c('common',
                   'extended',
                   'extreme')
  
  range_type <- match.arg(range,
                          choices = range_types)
  
  thresholds <- tibble(range = range_types %>% 
                         map(rep,times = 6) %>% 
                         flatten_chr(),
                       ratio = rep(c(rep('H/C',2),'N/C','O/C','P/C','S/C'),3),
                       operator = c(rep(c('>',rep('<',5)),2),'<',rep('>',5)),
                       threshold = c(0.2,3.1,1.3,1.2,0.3,0.8,
                                     0.1,6,4,3,2,3,
                                     0.1,6,1.3,1.2,0.3,0.8)) %>% 
    filter(range == range_type) %>%
    mutate(name = paste0(ratio,' ',
                         operator,
                         ' ',threshold),
           expr = paste0('element_ratios[["',ratio,'"]] ',
                         operator,
                         ' ',threshold)
    )
  
  ratio_checks <- thresholds %>% 
    rowwise() %>% 
    group_split() %>% 
    map_dfc(~{
      result <- .x$expr %>% 
        parse_expr() %>% 
        eval_tidy()
      
      if (length(result > 0)) result <- result else result <- NA
      
      checks <- tibble(
        !!.x$name := result
      )
      
      return(checks)
    }) %>% 
    bind_cols(select(element_ratios,MF)) %>% 
    select(MF,everything())
  
  return(ratio_checks)
}

#' Element count checks
#' @description Element count checks based on Rule 6 of the Seven Golden Rules by Kind et al 2007.
#' @param element_frequencies a tibble containing element frequencies as returned by `elementFrequencies()`
#' @return A tibble containing results of the element count checks. The column headers specify the check criteria. `TRUE` is returned where relevant checks 
#' are passed and `FALSE` where not. `NA` is returned where the check is not relevant.
#' @references 
#' Kind, T. and Fiehn, O., 2007. Seven Golden Rules for heuristic filtering of molecular formulas obtained by 
#' accurate mass spectrometry. *BMC bioinformatics*, *8*(1), pp.1-20.
#' @examples 
#' elementFrequencies(c('H2O','C12H22O11')) %>% 
#'   elementCountCheck()
#' @importFrom rlang .data
#' @importFrom magrittr set_colnames
#' @importFrom dplyr everything
#' @export

elementCountCheck <- function(element_frequencies){
  
  check_names <- c(rep('NOPS all >= 1',4),
                   rep('NOP all >= 3',3),
                   rep('OPS all >= 1',3),
                   rep('PSN all >= 1',3),
                   rep('NOS all >= 6',3))
  
  element_checks <- tibble(
    name = check_names,
    element = c('N','O','P','S',
                'N','O','P',
                'O','P','S',
                'P','S','N',
                'N','O','S'),
    operator = '>=',
    count = c(rep(1,4),
              rep(3,3),
              rep(1,3),
              rep(1,3),
              rep(6,3))
  ) %>% 
    dplyr::mutate(check = paste0(.data$element,' ',
                                 .data$operator,' ',
                                 .data$count),
                  expr = paste0('element_frequencies[["',.data$element,'"]]',
                                ' ',.data$operator, ' ',
                                .data$count))
  
  probability_checks <- check_names %>% 
    unique() %>% 
    map_dfc(~{
      element_freq_checks <- element_checks %>% 
        filter(name == .x) %>% 
        rowwise() %>% 
        group_split() %>% 
        map_dfc(~{
          result <- .x$expr %>% 
            parse_expr() %>% 
            eval_tidy()
          
          if (length(result) > 0) result <- result 
          else result <- NA
          
          checks <- tibble(
            !!.x$check := result)
          
        }) 
      
      if (nrow(element_freq_checks) > 0){
        element_freq_checks %>% 
          rowid_to_column(var = 'row') %>% 
          gather(check,result,-row) %>% 
          group_by(row) %>% 
          summarise(!!.x := all(result)) %>% 
          select(-row)
      } else {
        NULL
      }
    }) %>% 
    rowid_to_column(var = 'row') %>% 
    gather(name,probability_check,-row)
  
  heuristics <- element_checks %>% 
    mutate(
      operator = '<',
      count = c(10,20,4,3,
                11,22,6,
                14,3,3,
                3,3,4,
                19,14,8
      ),
      check = paste0(.data$element,' ',
                     .data$operator,' ',
                     .data$count),
      expr = paste0('element_frequencies[["',.data$element,'"]]',
                    ' ',.data$operator, ' ',
                    .data$count)
    )
  
  heuristic_checks <- check_names %>% 
    unique() %>% 
    map_dfc(~{
      heuristic_checks <- heuristics %>% 
        filter(name == .x) %>% 
        rowwise() %>% 
        group_split() %>% 
        map_dfc(~{
          checks <- tibble(
            !!.x$check := .x$expr %>% 
              parse_expr() %>% 
              eval_tidy()
          )    
          
          if (nrow(checks) == 0) NULL
          else checks
        }) 
      
      if (nrow(heuristic_checks) > 0){
        heuristic_checks  %>% 
          rowid_to_column(var = 'row') %>% 
          gather(check,result,-row) %>% 
          group_by(row) %>% 
          summarise(!!.x := all(result)) %>% 
          select(-row)
      } else {
        tibble(!!.x := NA)
      }
    }) %>% 
    rowid_to_column(var = 'row') %>% 
    gather(name,heuristic_check,-row)
  
  check_results <- left_join(probability_checks,
                             heuristic_checks, 
                             by = c("row", "name")) %>% 
    mutate(heuristic_check = replace(.data$heuristic_check,
                                     .data$probability_check == FALSE,
                                     NA) %>% 
             replace(is.na(.data$probability_check),
                     NA)) %>% 
    group_by(row,.data$name) %>% 
    summarise(result = all(.data$heuristic_check),.groups = 'drop') %>% 
    spread(name,result) %>% 
    select(-row) %>% 
    bind_cols(select(element_frequencies,MF)) %>% 
    select(MF,everything())
  
  heuristic_names <- heuristics %>% 
    select(name,check) %>% 
    group_by(name) %>% 
    summarise(label = paste(check,collapse = ', ')) %>% 
    mutate(label = paste(name,label,sep = '; ')) %>% 
    .$label
  
  check_results <- check_results %>% 
    set_colnames(c('MF',heuristic_names))
  
  return(check_results)
}

#' The proportion of C, H and O atoms in molecular formulas
#' @description Calculate the proportion of C, H and O in specified molecular formulas.
#' @param element_frequencies a tibble containing element frequencies as returned by `elementFrequencies()`
#' @return A tibble containing the CHO proportions for the specified element frequencies. 
#' @examples
#' c('H2O','C12H22O11') %>% 
#'   elementFrequencies() %>% 
#'   CHOproportion()
#' @export

CHOproportion <- function(element_frequencies){
  element_totals <- element_frequencies %>% 
    gather(element,
           count,
           -MF) %>% 
    group_by(.data$MF) %>% 
    summarise(total_atoms = sum(.data$count,na.rm = TRUE))
  
  CHO_counts <- element_frequencies %>% 
    gather(element,
           count,
           -MF) %>% 
    filter(.data$element %in% c('C','H','O'))
  
  
  if (nrow(CHO_counts) == 0) {
    CHO_counts <- tibble(
      MF = element_frequencies$MF,
      CHO = 0
    )
    } else {
      CHO_counts <- CHO_counts %>% 
        group_by(MF) %>% 
        summarise(CHO = sum(.data$count,na.rm = TRUE))
    }
   CHO_counts %>% 
    left_join(element_totals,by = 'MF') %>% 
    mutate(`CHO proportion` = CHO/.data$total_atoms)
}

#' Golden rule tests for molecular formulas
#' @description Heuristic tests for molecular formulas based on the Golden Rules 2, 4, 5 and 6 from Kind et al 2007.
#' @param MF a vector of molecular formulas
#' @return A tibble containing golden rule heuristic check results. `TRUE` is returned where relevant the Golden Rule 
#' is satisfied and `FALSE` where not. `NA` is returned where the rule is not relevant for a molecular formula.
#' @references 
#' Kind, T. and Fiehn, O., 2007. Seven Golden Rules for heuristic filtering of molecular formulas obtained by 
#' accurate mass spectrometry. *BMC bioinformatics*, *8*(1), pp.1-20.
#' @examples 
#' goldenRules(c('H2O','C12H22O11'))
#' @export

goldenRules <- function(MF){
  element_frequencies <- elementFrequencies(MF)
  element_ratios <- elementRatios(element_frequencies)
  
  golden_rules <- tibble(
    MF = MF,
    LEWIS = lewis(element_frequencies),
    SENIOR = senior(element_frequencies),
  ) %>% 
    left_join(elementRatioCheck(element_ratios),
              by = 'MF') %>% 
    left_join(elementCountCheck(element_frequencies),
              by = 'MF') %>% 
    left_join(CHOproportion(element_frequencies) %>% 
                select(MF,`CHO proportion`),
              by = 'MF')
  
  return(golden_rules)
}

#' Molecular formula plausibility scores
#' @description Percentage scores for molecular formula plausibility based on Rules 2, 4, 5 and 6 of Kind et al 2007.
#' @param golden_rules a tibble containing golden rule heuristic checks results as from `goldenRules()`
#' @return A tibble containing golden rules plausibility percentage scores.
#' @references 
#' Kind, T. and Fiehn, O., 2007. Seven Golden Rules for heuristic filtering of molecular formulas obtained by 
#' accurate mass spectrometry. *BMC bioinformatics*, *8*(1), pp.1-20.
#' @examples 
#' c('H2O','C12H22O11') %>% 
#'   goldenRules() %>% 
#'   goldenRulesScore()
#' @export

goldenRulesScore <- function(golden_rules){
  
  rule_types <- tibble(
    check = colnames(golden_rules)[-1]
  ) %>% 
    mutate(rule = .data$check,
           rule = replace(.data$rule,.data$rule == 'LEWIS','LEWIS and SENIOR'),
           rule = replace(.data$rule,.data$rule == 'SENIOR','LEWIS and SENIOR'),
           rule = replace(.data$rule,grepl('/',.data$rule),'Element ratios'),
           rule = replace(.data$rule,grepl('all',.data$rule),'Element counts'))
  
  golden_rules %>% 
    gather(check,result,-MF) %>% 
    left_join(rule_types, 
              by = "check") %>% 
    mutate(result = replace(result,
                            result == TRUE,
                            1) %>% 
             replace(is.na(result),
                     1)) %>%
    group_by(MF,rule) %>% 
    summarise(score = sum(result)/dplyr::n(),
              .groups = 'drop') %>% 
    spread(rule,score) %>% 
    mutate(`Plausibility (%)` = (`LEWIS and SENIOR` +
                                   `Element ratios` +
                                   `Element counts` +
                                   `CHO proportion`) / 4 * 100) %>% 
    select(MF,
           `LEWIS and SENIOR`,
           `Element ratios`,
           `Element counts`,
           `CHO proportion`,
           `Plausibility (%)`)
}

jasenfinch/mzAnnotation documentation built on Feb. 25, 2023, 1:27 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jasenfinch/mzAnnotation
Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

R/golden_rules.R
In jasenfinch/mzAnnotation: Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

Defines functions goldenRulesScore goldenRules CHOproportion elementCountCheck elementRatioCheck senior lewis rdbe

Documented in CHOproportion elementCountCheck elementRatioCheck goldenRules goldenRulesScore lewis rdbe senior

R Package Documentation

Browse R Packages

We want your feedback!

jasenfinch/mzAnnotation Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

R/golden_rules.R In jasenfinch/mzAnnotation: Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

Defines functions goldenRulesScore goldenRules CHOproportion elementCountCheck elementRatioCheck senior lewis rdbe

Documented in CHOproportion elementCountCheck elementRatioCheck goldenRules goldenRulesScore lewis rdbe senior

R Package Documentation

Browse R Packages

We want your feedback!

jasenfinch/mzAnnotation
Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data

R/golden_rules.R
In jasenfinch/mzAnnotation: Tools for Putative Annotation of Electrospray Ionisation High Resolution Mass Spectrometry Data