mspcompiler: Compile Mass Spectral Libraries from Various Sources

Documented in assign_ri clean_ri_dat clean_user_dbu extract_ri remove_ri

#' Strip retention indices from an EI library
#'
#' \code{remove_ri} overwrites the \code{RI} field of every entry in an EI
#' library with \code{NA}; all other fields of each entry are left untouched.
#'
#' Entries are processed with \code{future.apply::future_lapply}, so this
#' function supports parallel computing.
#'
#' @param lib The \code{list} generated by \code{read_lib}.
#'
#' @return A \code{list} with one element per entry of \code{lib}, each with
#'   its \code{RI} set to \code{NA}.
#' @export
#'
#' @import future.apply
remove_ri <- function(lib) {
  # Per-entry worker: blank out the RI field and hand the entry back.
  blank_ri <- function(entry) {
    entry$RI <- NA
    entry
  }

  future.apply::future_lapply(lib, blank_ri)
}


# Whitelist of bytes that are kept when the binary NIST files are cleaned up;
# every byte outside this set is dropped before the data is re-read as text.
# Stored as a named raw vector (one byte per character, names are the
# characters themselves) so it can be used directly with `%in%` on raw input.
# `vapply(..., raw(1))` guarantees a raw vector and fails loudly if an entry
# is ever not exactly one byte, instead of silently degrading to a list.
keep_char <- vapply(
  c(
    letters, LETTERS, 0:9, "*", ".", ",", ";", '"', "'", "\\",
    "/", ":", "_", "^", "%", "&", "{", "}", "[", "]", "(", ")",
    "+", "-", "|", "=", "@", "#", "!", "$", "\n", "\t", " "
  ),
  charToRaw,
  raw(1)
)


#' Extract experimental RI from NIST
#'
#' \code{clean_ri_dat}, an internal function, offers a way to extract RI from
#' the "ri.dat" file.
#'
#' Once you have NIST library installed, there will be a "ri.dat" file in the
#' installation path (e.g., "~/Programs/nist17/mssearch"). This file
#' contains all experimental RI in the NIST library but it is not human
#' readable. This function converts the "ri.dat" file into a data.frame, so
#' that the RI information present in the NIST library can be incorporated
#' into the msp file.
#'
#' Intermediate text is written to a session temporary file that is removed
#' when the function exits, whether it succeeds or fails; nothing is written
#' to the working directory.
#'
#' @param file The "ri.dat" file in the installation path
#'   (e.g., "~/Programs/nist17/mssearch").
#'
#' @return A cleaned data.frame containing experimental RI from NIST, with
#'   one row per RI record and 21 named columns (\code{ID}, \code{Name},
#'   \code{Molecular_Formula}, \code{RI}, \code{Column_Type}, ...).
#'
#' @import readr
#' @import rio
#' @export
clean_ri_dat <- function(file) {
  # Use a private temporary file instead of "tmp.txt" in the working
  # directory: this cannot overwrite a user's file, works when the working
  # directory is read-only, and is cleaned up even if a later step errors.
  tmp_path <- tempfile(fileext = ".txt")
  on.exit(unlink(tmp_path), add = TRUE)

  # Read the file in binary.
  tmp <- readr::read_file_raw(file)
  # Convert all NUL bytes to \n so that records end up on separate lines.
  tmp[tmp == as.raw(0L)] <- charToRaw("\n")
  # Keep only pre-defined characters.
  tmp <- tmp[tmp %in% keep_char]

  # Write the bytes out so that they can be re-read in text form.
  readr::write_file(tmp, tmp_path)
  tmp <- readLines(tmp_path, warn = FALSE)
  # Every useful entry starts with C/R/U following a number, which is the ID
  # of the compound. So, only keep these elements.
  tmp <- tmp[grepl("^[A-Z]+[0-9]+", tmp)]

  # Write the kept lines out so that they can be read in tab delimited form.
  writeLines(tmp, tmp_path)
  tmp <- read.delim(tmp_path, header = FALSE)
  # Set column names.
  names(tmp) <- c(
    "ID", "Name", "Molecular_Formula", "RI", "Column_Type",
    "Column_Polarity", "Column", "Column_Length", "Carrier_Gas",
    "Substrate", "Column_Diameter", "Phase_Thickness", "RI_Type",
    "Ramp_Type", "Temperature1", "Temperature2",
    "Temperature_Increment", "Time1", "Time2", "Ramp_Detail", "Note"
  )

  tmp
}


#' Extract InChIKey for compounds that have experimental RI
#'
#' \code{clean_user_dbu}, an internal function, offers a way to extract InChIKey
#' for compounds that have experimental RI from the "USER.DBU" file.
#'
#' RI values in the cleaned RI table obtained by \code{\link{clean_ri_dat}}
#' cannot be linked directly to compounds in the msp file. The "USER.DBU"
#' file in the installation path (e.g., "~/Programs/nist17/mssearch") contains
#' the InChIKey of each compound in the RI table, but it is not human
#' readable. This function cleans the "USER.DBU" file so that experimental RI
#' values can be linked to the compounds in the msp file.
#'
#' Intermediate text is written to a session temporary file that is removed
#' when the function exits, whether it succeeds or fails; nothing is written
#' to the working directory.
#'
#' @param file The "USER.DBU" file in the installation path
#'   (e.g., "~/Programs/nist17/mssearch")
#'
#' @return A data.frame containing four variables, Name, InChIKey, ID,
#'   and "Formula"
#'
#' @import readr
#' @import stringr
#' @import rio
#' @export
clean_user_dbu <- function(file) {
  # Use a private temporary file instead of "tmp.txt" in the working
  # directory: this cannot overwrite a user's file, works when the working
  # directory is read-only, and is cleaned up even if a later step errors.
  # The ".txt" extension is kept because rio::import() picks the reader from
  # the file extension.
  tmp_path <- tempfile(fileext = ".txt")
  on.exit(unlink(tmp_path), add = TRUE)

  tmp <- readr::read_file_raw(file)
  # Convert all SOH bytes to \n.
  tmp[tmp == as.raw(1L)] <- charToRaw("\n")
  # Change all NUL bytes to \t.
  tmp[tmp == as.raw(0L)] <- charToRaw("\t")
  # Keep only pre-defined characters.
  tmp <- tmp[tmp %in% keep_char]
  # Write the bytes out so that they can be re-read in text form.
  readr::write_file(tmp, tmp_path)

  tmp <- readLines(tmp_path, warn = FALSE)
  # Drop everything up to (and including) the first tab of the last pair of
  # consecutive tabs on the line.
  tmp <- str_remove(tmp, "^.*\t(?=\t)")
  # Blank out lines that consist only of a leading \t plus 1-2 characters.
  tmp <- str_remove_all(tmp, "^\t.{1,2}$")
  tmp <- str_trim(tmp, side = "both")
  # Discard empty lines and lines too short to be a record.
  tmp <- tmp[str_detect(tmp, "^.+")]
  tmp <- tmp[str_count(tmp) > 5] # previously 4, but in nist23, must be 5
  # Turn the " $$ $:28" marker into a field separator, then strip any
  # remaining "$$..." run up to the next tab.
  tmp <- str_replace(tmp, " \\${2} \\$:28", "\t")
  tmp <- str_replace(tmp, "\\$\\$\\s*[^\\t]*", "")

  # Write the cleaned lines out so that they can be read in tab delimited
  # form.
  writeLines(tmp, tmp_path)

  tmp <- rio::import(
    tmp_path, fill = TRUE, comment.char = "",
    header = FALSE, quote = "", sep = "\t"
  )
  colnames(tmp) <- c("Name", "InChIKey", "ID", "Formula")

  tmp
}


#' Extract experimental RI from NIST library
#'
#' \code{extract_ri} offers a way to extract experimental RI from the NIST
#' library if you have it installed.
#'
#' Once you have NIST library installed, there will be a "ri.dat" file in the
#' installation path (e.g., "~/Programs/nist17/mssearch"). This file
#' contains all experimental RI in the NIST library but it is not human
#' readable. This function first converts the "ri.dat" file into a
#' data.frame. However, it is tricky to link RI values in the cleaned RI table
#' to compounds in the msp file. The "USER.DBU" file in the installation path
#' (e.g., "~/Programs/nist17/mssearch") contains the InChIKey of each compound
#' in the RI table, but it is not human readable either. Therefore, this
#' function next cleans the "USER.DBU" file and then assigns the corresponding
#' InChIKey to each row of the RI table.
#'
#' The link between the two files is positional: the n-th distinct compound
#' ID of the RI table (after sorting by ID) is paired with the n-th record of
#' the cleaned "USER.DBU" file. Rows whose position has no counterpart in
#' "USER.DBU" get \code{NA} as InChIKey.
#'
#' @param ri_dat The "ri.dat" file in the installation path
#'   (e.g., "~/Programs/nist17/mssearch/nist_ri").
#' @param user_dbu The "USER.DBU" file in the installation path
#'   (e.g., "~/Programs/nist17/mssearch/nist_ri")
#'
#' @return A \code{data.frame} containing experimental RI and InChIKey assigned.
#' @export
#'
#' @import dplyr
#' @importFrom rlang .data
extract_ri <- function(ri_dat, user_dbu) {
  # First, clean ri.dat and re-order it based on ID, which is important to
  # assign the corresponding ID.
  # correspond_ID is the 1-based rank of each distinct ID in that sorted
  # order. It is derived from the arranged rows themselves, so it cannot get
  # out of step with the sort (as a separately sorted frequency table could)
  # and it also copes with missing IDs.
  nist_ri <-
    clean_ri_dat(ri_dat) %>%
    arrange(.data$ID) %>%
    mutate(correspond_ID = match(.data$ID, unique(.data$ID))) %>%
    # Column names are given as strings: `.data$` is deprecated in
    # tidyselect selection contexts such as relocate().
    relocate("correspond_ID", .before = "ID")

  # Second, clean the USER.DBU file and assign the corresponding ID.
  # !!! The appearance of a compound in this list is the same as that in
  # the nist_ri after re-ordering, so the row number is the key.
  nist_ri_inchikey <-
    clean_user_dbu(user_dbu) %>%
    as_tibble() %>%
    mutate(
      ID = as.numeric(str_remove(.data$ID, "@")),
      correspond_ID = row_number()
    )

  # Assign InChIKey to nist_ri; positions without a USER.DBU record give NA.
  nist_ri %>%
    mutate(InChIKey = nist_ri_inchikey$InChIKey[
      match(.data$correspond_ID, nist_ri_inchikey$correspond_ID)
    ]) %>%
    relocate("InChIKey", .before = "Molecular_Formula")
}


#' Assign experimental RI to compounds in the msp file
#'
#' \code{assign_ri} offers a way to assign experimental RI to the msp file if
#' you have NIST library installed.
#'
#' Depending on the column polarity, experimental RI can be assigned to
#' compounds in the msp file. Because "capillary" GC columns are the ones
#' commonly used, this function only keeps RI records from "capillary"
#' columns, and "Lee RI" records are removed. RI records without an InChIKey
#' are ignored, since they cannot be linked to a compound. When there are
#' multiple records for a single compound, the median RI is used, and if the
#' standard deviation is higher than 30 the value is discarded. Library
#' entries that already have an RI keep it; entries without an InChIKey get
#' \code{NA}. This function supports parallel computing.
#'
#' @param lib The EI library generated by \code{read_lib}.
#' @param ri_table The RI table cleaned up by \code{extract_ri}.
#' @param polarity The polarity of the column. Can be either "semi-polar",
#'   "non-polar", or "polar". Any other value is an error.
#'
#' @return A \code{list} with experimental RI assigned.
#' @export
#'
#' @import dplyr
#' @import tibble
#' @import future.apply
#' @importFrom rlang .data
#' @rawNamespace import(stats, except = c(filter, lag))
assign_ri <-
  function(lib, ri_table, polarity = "semi-polar") {
    # Validate the polarity up front: an unrecognised value (e.g. a typo)
    # must not silently fall through to the "polar" subset.
    polarity <- match.arg(polarity, c("semi-polar", "non-polar", "polar"))
    # Map the user-facing polarity to the label used in the RI table.
    target_polarity <- switch(polarity,
      "semi-polar" = "Semi-standard non-polar",
      "non-polar" = "Standard non-polar",
      "polar" = "Standard polar"
    )

    exp_ri <- ri_table %>%
      # Subset RI based on polarity provided.
      filter(.data$Column_Polarity == target_polarity) %>%
      # Remove Lee RI and only keep Capillary RI.
      filter(.data$RI_Type != "Lee RI" & .data$Column_Type == "Capillary") %>%
      # Drop records without a usable InChIKey. Otherwise they would form
      # one pooled group, and since match() treats NA as equal to NA, a
      # library entry with a missing key could be given that group's RI.
      filter(!is.na(.data$InChIKey) & .data$InChIKey != "") %>%
      group_by(.data$InChIKey) %>%
      # Experimental RI will be rounded to integer while predicted RI will have
      # two digit numbers. This tiny distinction can be easily differentiated
      # in MS-DIAL to help people understand how well the match is.
      summarise(
        SD = round(sd(.data$RI)),
        RI = round(median(.data$RI)),
        number = n()
      ) %>%
      # sd() of a single record is NA; treat it as 0 so the record is kept.
      mutate(SD = case_when(
        is.na(.data$SD) ~ 0,
        TRUE ~ .data$SD
      )) %>%
      # In the case of multiple records, SD higher than 30 will be removed.
      filter(.data$SD <= 30)

    # Finally, assign experimental RI to the msp file.
    future.apply::future_lapply(lib, function(x) {
      if (length(x$InChIKey) != 0) {
        # Only fill in RI when the entry has none yet. The is.null() guard
        # covers entries that lack an RI field altogether, for which a bare
        # `if (is.na(x$RI))` would fail with a zero-length condition.
        if (is.null(x$RI) || all(is.na(x$RI))) {
          x$RI <- exp_ri$RI[match(x$InChIKey, exp_ri$InChIKey)]
        }
      } else {
        x$RI <- NA
      }

      x
    })
  }