#' Remove retention index for EI libraries
#'
#' \code{remove_ri} resets the \code{RI} element of every entry of an EI
#' library to \code{NA}.
#'
#' This function supports parallel computing: the entries are processed with
#' \code{future.apply::future_lapply}.
#'
#' @param lib The \code{list} generated by \code{read_lib}.
#'
#' @return A \code{list} with one element per element of \code{lib}, in which
#'   the \code{RI} of every entry is \code{NA}.
#' @export
#'
#' @import future.apply
remove_ri <- function(lib) {
  # Blank out the RI of a single library entry and hand the entry back.
  strip_ri <- function(entry) {
    entry$RI <- NA
    entry
  }
  future.apply::future_lapply(lib, strip_ri)
}
# Bytes that are allowed to survive the cleaning of the NIST binary files.
# Every character listed below is converted to its single-byte raw value so
# that raw vectors read from disk can be filtered with `%in%`.
# `vapply(..., raw(1))` guarantees the result is a raw vector (named by the
# characters) and fails loudly if an entry ever expands to more than one byte,
# instead of silently turning the constant into a list.
keep_char <- vapply(
  c(
    letters, LETTERS, 0:9, "*", ".", ",", ";", '"', "'", "\\",
    "/", ":", "_", "^", "%", "&", "{", "}", "[", "]", "(", ")",
    "+", "-", "|", "=", "@", "#", "!", "$", "\n", "\t", " "
  ),
  charToRaw,
  raw(1)
)
#' Extract experimental RI from NIST
#'
#' \code{clean_ri_dat}, an internal function, offers a way to extract RI from
#' the "ri.dat" file.
#'
#' Once you have NIST library installed, there will be a "ri.dat" file in the
#' installation path (e.g., "~/Programs/nist17/mssearch"). This file
#' contains all experimental RI in the NIST library but it is not human
#' readable. This function converts the "ri.dat" file into a data.frame,
#' so that the RI information present in the NIST library can be leveraged
#' and incorporated into the msp file.
#'
#' Intermediate text is written to a temporary file in the session temp
#' directory, which is removed when the function exits (also on error). The
#' working directory is not touched.
#'
#' @param file The "ri.dat" file in the installation path
#' (e.g., "~/Programs/nist17/mssearch").
#'
#' @return A cleaned data.frame containing experimental RI from NIST, with
#'   the 21 columns \code{ID}, \code{Name}, \code{Molecular_Formula},
#'   \code{RI}, ..., \code{Ramp_Detail}, \code{Note}.
#'
#' @import readr
#' @import rio
#' @export
clean_ri_dat <- function(file) {
  # Scratch file used to round-trip the cleaned bytes through the text
  # readers. Registered for removal right away so it is cleaned up even if a
  # later step fails.
  scratch <- tempfile(fileext = ".txt")
  on.exit(unlink(scratch), add = TRUE)

  # Read the file in binary.
  tmp <- readr::read_file_raw(file)
  # Convert all NUL bytes to \n
  tmp[tmp == as.raw(0)] <- charToRaw("\n")
  # Keep only pre-defined characters
  tmp <- tmp[tmp %in% keep_char]
  # Write the bytes out so they can be re-read in text form
  readr::write_file(tmp, scratch)
  tmp <- readLines(scratch)
  # Every useful entry starts with C/R/U following a number, which is the ID of
  # the compound. So, only keep these elements
  tmp <- tmp[grepl("^[A-Z]+[0-9]+", tmp)]
  # Write the kept lines out so they can be read in tab delimited form
  writeLines(tmp, scratch)
  tmp <- read.delim(scratch, header = FALSE)
  # Set column names
  names(tmp) <- c(
    "ID", "Name", "Molecular_Formula", "RI", "Column_Type",
    "Column_Polarity", "Column", "Column_Length", "Carrier_Gas",
    "Substrate", "Column_Diameter", "Phase_Thickness", "RI_Type",
    "Ramp_Type", "Temperature1", "Temperature2",
    "Temperature_Increment", "Time1", "Time2", "Ramp_Detail", "Note"
  )
  tmp
}
#' Extract InChIKey for compounds that have experimental RI
#'
#' \code{clean_user_dbu}, an internal function, offers a way to extract InChIKey
#' for compounds that have experimental RI from the "USER.DBU" file.
#'
#' RI values in the cleaned RI table obtained by \code{\link{clean_ri_dat}}
#' cannot be linked directly to compounds in the msp file. The "USER.DBU"
#' file in the installation path (e.g., "~/Programs/nist17/mssearch") contains
#' the InChIKey of each compound in the RI table, but this file is not human
#' readable. This function therefore cleans the "USER.DBU" file so that
#' experimental RI values can be linked to the compounds in the msp file.
#'
#' Intermediate text is written to a temporary file in the session temp
#' directory, which is removed when the function exits (also on error). The
#' working directory is not touched.
#'
#' @param file The "USER.DBU" file in the installation path
#' (e.g., "~/Programs/nist17/mssearch")
#'
#' @return A data.frame containing four variables: \code{Name},
#'   \code{InChIKey}, \code{ID}, and \code{Formula}.
#'
#' @import readr
#' @import stringr
#' @import rio
#' @export
clean_user_dbu <- function(file) {
  # Scratch file for the text round-trips. The ".txt" extension is kept
  # because rio::import() picks its reader from the file extension.
  scratch <- tempfile(fileext = ".txt")
  on.exit(unlink(scratch), add = TRUE)

  tmp <- readr::read_file_raw(file)
  # Convert all SOH bytes to \n
  tmp[tmp == as.raw(1)] <- charToRaw("\n")
  # Change all NUL bytes to \t
  tmp[tmp == as.raw(0)] <- charToRaw("\t")
  # Keep only pre-defined characters
  tmp <- tmp[tmp %in% keep_char]
  # Write the bytes out so they can be re-read in text form
  readr::write_file(tmp, scratch)
  tmp <- readLines(scratch, warn = FALSE)
  # remove everything before the second continuous \t from last
  tmp <- stringr::str_remove(tmp, "^.*\t(?=\t)")
  # remove the remaining starting \t
  tmp <- stringr::str_remove_all(tmp, "^\t.{1,2}$")
  tmp <- stringr::str_trim(tmp, side = "both")
  # Drop empty lines
  tmp <- tmp[stringr::str_detect(tmp, "^.+")]
  # Drop very short lines; previously 4, but in nist23, must be 5
  tmp <- tmp[stringr::str_count(tmp) > 5]
  tmp <- stringr::str_replace(tmp, " \\${2} \\$:28", "\t")
  tmp <- stringr::str_replace(tmp, "\\$\\$\\s*[^\\t]*", "")
  # Write the cleaned lines out so they can be read in tab delimited form
  writeLines(tmp, scratch)
  tmp <- rio::import(
    scratch,
    fill = TRUE, comment.char = "",
    header = FALSE, quote = "", sep = "\t"
  )
  colnames(tmp) <- c("Name", "InChIKey", "ID", "Formula")
  tmp
}
#' Extract experimental RI from NIST library
#'
#' \code{extract_ri} offers a way to extract experimental RI from the NIST
#' library if you have it installed.
#'
#' Once you have NIST library installed, there will be a "ri.dat" file in the
#' installation path (e.g., "~/Programs/nist17/mssearch"). This file
#' contains all experimental RI in the NIST library but it is not human
#' readable. This function first converts the "ri.dat" file into a data.frame
#' (see \code{\link{clean_ri_dat}}). The RI values in that table cannot be
#' linked directly to compounds in the msp file. The "USER.DBU" file in the
#' installation path (e.g., "~/Programs/nist17/mssearch") contains the
#' InChIKey of each compound in the RI table but is not human readable either,
#' so this function then cleans the "USER.DBU" file (see
#' \code{\link{clean_user_dbu}}) and assigns the corresponding InChIKey to
#' the RI table.
#'
#' The two tables are linked by position: the n-th distinct compound ID of the
#' RI table (after sorting by ID) is paired with the n-th row of the cleaned
#' "USER.DBU" table.
#'
#' @param ri_dat The "ri.dat" file in the installation path
#' (e.g., "~/Programs/nist17/mssearch/nist_ri").
#' @param user_dbu The "USER.DBU" file in the installation path
#' (e.g., "~/Programs/nist17/mssearch/nist_ri")
#'
#' @return A \code{data.frame} containing experimental RI and InChIKey assigned.
#' @export
#'
#' @import dplyr
#' @importFrom rlang .data
extract_ri <- function(ri_dat, user_dbu) {
  # First, clean ri.dat and re-order it based on ID, then number the compounds
  # in that order. After sorting, records of one compound are contiguous, so
  # the position of an ID among the unique IDs is its running compound number.
  # Deriving the number from the arranged column itself (instead of from
  # table(), which sorts with the session locale and drops NA) keeps the
  # numbering aligned with the row order by construction.
  nist_ri <-
    clean_ri_dat(ri_dat) %>%
    arrange(.data$ID) %>%
    mutate(correspond_ID = match(.data$ID, unique(.data$ID))) %>%
    relocate("correspond_ID", .before = "ID")
  # Second, clean the USER.DBU file and assign the corresponding ID.
  # !!! The appearance of a compound in this list is the same as that in
  # the nist_ri after re-ordering, so the row number is the compound number.
  nist_ri_inchikey <-
    clean_user_dbu(user_dbu) %>%
    as_tibble() %>%
    mutate(
      ID = as.numeric(stringr::str_remove(.data$ID, "@")),
      correspond_ID = row_number()
    )
  # Assign inchikey to nist_ri; compounds without a partner row get NA.
  nist_ri %>%
    mutate(InChIKey = nist_ri_inchikey$InChIKey[
      match(.data$correspond_ID, nist_ri_inchikey$correspond_ID)
    ]) %>%
    relocate("InChIKey", .before = "Molecular_Formula")
}
#' Assign experimental RI to compounds in the msp file
#'
#' \code{assign_ri} offers a way to assign experimental RI to the msp file if
#' you have NIST library installed.
#'
#' Depending on the column polarity, experimental RI can be assigned to
#' compounds in the msp file. Because "capillary" GC columns are commonly
#' used, this function only keeps RI records from "Capillary" columns and
#' removes "Lee RI" records. RI records without an InChIKey are ignored, since
#' they cannot be attributed to a compound. When there are multiple records
#' for a single compound, the median RI is used, and if the standard deviation
#' is higher than 30 the value is discarded.
#'
#' Only entries whose \code{RI} is missing (\code{NA} or absent) receive an
#' experimental RI; an existing RI is kept. Entries without an
#' \code{InChIKey} get \code{RI = NA}. This function supports parallel
#' computing.
#'
#' @param lib The EI library generated by \code{read_lib}.
#' @param ri_table The RI table cleaned up by \code{extract_ri}.
#' @param polarity The polarity of the column. Can be either "semi-polar",
#' "non-polar", or "polar". Any other value is treated as "polar", with a
#' warning.
#'
#' @return A \code{list} with experimental RI assigned.
#' @export
#'
#' @import dplyr
#' @import tibble
#' @import future.apply
#' @importFrom rlang .data
#' @rawNamespace import(stats, except = c(filter, lag))
assign_ri <-
  function(lib, ri_table, polarity = "semi-polar") {
    # Translate the user-facing polarity into the label used in the RI table.
    polarity_labels <- c(
      "semi-polar" = "Semi-standard non-polar",
      "non-polar" = "Standard non-polar",
      "polar" = "Standard polar"
    )
    if (polarity %in% names(polarity_labels)) {
      column_polarity <- polarity_labels[[polarity]]
    } else {
      # Historic behaviour: anything unrecognised means "polar". Keep it, but
      # tell the user instead of silently picking a column type.
      warning(
        "Unrecognised `polarity`; falling back to \"polar\".",
        call. = FALSE
      )
      column_polarity <- "Standard polar"
    }
    exp_ri <- ri_table %>%
      filter(
        # Subset RI based on polarity provided.
        .data$Column_Polarity == !!column_polarity,
        # Remove Lee RI and only keep Capillary RI
        .data$RI_Type != "Lee RI",
        .data$Column_Type == "Capillary",
        # Records without an InChIKey would otherwise be pooled into a single
        # group and could be matched by library entries whose key is NA.
        !is.na(.data$InChIKey),
        .data$InChIKey != ""
      ) %>%
      group_by(.data$InChIKey) %>%
      # Experimental RI will be rounded to integer while predicted RI will have
      # two digit numbers. This tiny distinction can be easily differentiated
      # in MS-DIAL to help people understand how well the match is.
      # NOTE: SD must be computed before RI, because the RI created here
      # replaces the per-record RI for the expressions that follow it.
      summarise(
        SD = round(sd(.data$RI)),
        RI = round(median(.data$RI)),
        number = n()
      ) %>%
      # Change SD values of only one replicate to 0
      mutate(SD = case_when(
        is.na(.data$SD) ~ 0,
        TRUE ~ .data$SD
      )) %>%
      # In the case of multiple records, SD higher than 30 will be removed.
      filter(.data$SD <= 30)
    # Finally, assign experimental RI to the msp file.
    future.apply::future_lapply(lib, function(x) {
      if (length(x$InChIKey) != 0) {
        # all(is.na(NULL)) is TRUE, so an entry without an RI field is
        # treated like one whose RI is NA instead of raising an error.
        if (all(is.na(x$RI))) {
          x$RI <- exp_ri$RI[match(x$InChIKey, exp_ri$InChIKey)]
        }
      } else {
        x$RI <- NA
      }
      x
    })
  }
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.