Nothing
#==============================================================================#
#' Parse a single mass spectrum read from an msp-file
#'
#' @description
#' Parse a single mass spectrum read from an msp-file.
#'
#' @param input_lines
#' A character vector. Each element of the character vector is a line from an
#' msp-file.
#' @param line_no
#' An integer. The line in the original file containing the 'Name' field. It
#' is used only to identify the position of a problematic mass spectrum in
#' the original msp-file.
#'
#' @details
#' It is a hidden helper, which is called from the \code{\link{ReadMsp}}
#' function.
#'
#' @return
#' Return a named list. Most elements of the list are strings containing some
#' metadata (e.g., \code{name}, \code{cas_no}, \code{formula}, etc.). Mass
#' spectral data are stored in two numeric vectors (\code{mz} and
#' \code{intst}) containing m/z values and intensities of mass spectral
#' peaks. Synonyms (i.e., \code{synon}) are stored as a character vector.
#'
#' @noRd
#'
#==============================================================================#
.ParseSingleSpectrum <- function(input_lines,
line_no) {
#--[ Input check ]------------------------------------------------------------
# It is a hidden function. Any check is not performed.
#--[ Metadata and mass spectral data ]----------------------------------------
mask_metadata <- grepl("^[a-zA-Z].*?:", input_lines)
mask_ms <- (input_lines != "" & !mask_metadata)
if (max(which(mask_metadata)) > min(which(mask_ms))) {
stop("Incorrect msp format.")
}
#--[ Parsing metadata ]-------------------------------------------------------
ptrn <- "^([a-zA-Z][^:]*):\\s(.*)$"
field_values <- trimws(gsub(ptrn, "\\2", input_lines[mask_metadata]))
temp <- trimws(gsub(ptrn, "\\1", input_lines[mask_metadata]))
field_names <- gsub("\\W+", "_", gsub("#", "_no", tolower(temp)))
# Checking that the 'Name' field is present.
if (!is.element("name", field_names)) {
stop("The 'name' field is missing (line_no: ", line_no, ").")
}
# Checking that all field names (except 'Synon') are unique.
if (length(unique(field_names[field_names != "synon"])) !=
sum(field_names != "synon")) {
stop("Field names are not unique (line_no: ", line_no, ").")
}
# Checking that 'MZ' and 'Intst' fields are missing.
if (is.element("mz", field_names)) {
stop("The 'mz' field is present (line_no: ", line_no, ").")
}
if (is.element("intst", field_names)) {
stop("The 'intst' field is present (line_no: ", line_no, ").")
}
idxs <- which(!(field_names %in% c("synon", "num_peaks")))
msp_obj <- as.list(field_values[idxs])
names(msp_obj) <- field_names[idxs]
# Processing the 'NIST#' field.
if (is.null(msp_obj$nist_no) &&
(!is.null(msp_obj$cas_no) || !is.null(msp_obj$compound_rep))) {
if (!is.null(msp_obj$cas_no)) {
fied_preceding_nist_no <- "cas_no"
} else { # i.e., '!is.null(msp_obj$compound_rep)'
fied_preceding_nist_no <- "compound_rep"
}
if (grepl("NIST#:", msp_obj[[fied_preceding_nist_no]])) {
# According to the format specification, the 'NIST#' filed is present on
# the same line with the 'CAS#' field:
# CAS#: 00-00-0; NIST#: 000000
temp <- sub(";\\s*nist#:\\s+", ";", msp_obj[[fied_preceding_nist_no]],
ignore.case = TRUE)
temp <- unlist(strsplit(temp, ";", fixed = TRUE))
# The 'ignore.case' argument is not available for 'strsplit()'.
msp_obj[[fied_preceding_nist_no]] <- temp[[1]]
msp_obj$nist_no <- temp[[2]]
}
}
# Processing the 'Synon' fields.
if (is.element("synon", field_names)) {
msp_obj$synon <- field_values[field_names == "synon"]
}
#--[ Parsing mass spectral data ]---------------------------------------------
temp <- paste(input_lines[mask_ms], collapse = " ")
temp <- trimws(gsub("[ \\t,;:\\[\\]{}()]+", " ", temp, perl = TRUE))
raw_ms <- unlist(strsplit(temp, " ", fixed = TRUE))
msp_obj$mz <- as.numeric(raw_ms[c(TRUE, FALSE)])
# if (max(abs(msp_obj$mz - as.integer(msp_obj$mz))) < 1e-10) {
# msp_obj$mz <- as.integer(msp_obj$mz)
# }
msp_obj$intst <- as.numeric(raw_ms[c(FALSE, TRUE)])
# Checking that the 'Num Peaks' field is present and the specified number of
# peaks is correct.
if (!is.element("num_peaks", field_names)) {
warning("The 'Num Peaks' field is missing (line_no: ", line_no, ").")
} else {
num_peaks <- as.numeric(field_values[[match("num_peaks", field_names)]])
if (length(msp_obj$mz) != num_peaks) {
warning("The 'Num Peaks' field is incorrect (line_no: ", line_no, ").")
}
}
#--[ Output ]-----------------------------------------------------------------
return(msp_obj)
}
#==============================================================================#
#' Read mass spectra from an msp-file (NIST format)
#'
#' @description
#' Read an msp-file containing mass spectra in the NIST format. The complete
#' description of the format can be found in the NIST Mass Spectral Search
#' Program manual. A summary is presented below in the "Description of the
#' NIST format" section.
#'
#' @param input_file
#' A string. The name of a file.
#'
#' @details
#' Data from an msp-file are read without any modification (e.g., the order of
#' mass values is not changed, zero-intensity peaks are preserved, etc.).
#'
#' @return
#' Return a list of nested lists. Each nested list is a mass spectrum. Almost
#' all metadata fields (e.g., "Name", "CAS#", "Formula", "MW", etc.) are
#' represented as strings. All "Synon" fields are merged into a single
#' character vector. Mass values and intensities are represented as numeric
#' vectors (\code{mz} and \code{intst}). Names of fields are slightly
#' modified:
#' \itemize{
#' \item names are converted to lowercase;
#' \item hash symbols are replaced with \code{_no};
#' \item any other special character is replaced with an underscore
#' character.
#' }
#'
#' @section Description of the NIST format:
#' The summary was prepared using the NIST Mass Spectral Search Program manual
#' v.2.4 (2020).
#' \itemize{
#' \item An msp-file can contain as many spectra as wanted.
#' \item Each spectrum must start with the "Name" field. There must be
#' something in this field.
#' \item The "Num Peaks" field is also required. It must contain the number
#' of mass/intensity pairs.
#' \item Some optional fields (e.g. "Comments", "Formula", "MW") can be
#' between the "Name" and "Num Peaks" fields.
#' \item When a spectrum is exported from the NIST library it also
#' contains the "NIST#" and "DB#" fields. The "NIST#" field is on the same
#' line as the "CAS#" field and separated by a semicolon.
#' \item Each field should be on a separate line (the "NIST#" field is an
#' exception from this rule)
#' \item The mass/intensity list begins on the line following the "Num
#' Peaks" field. The peaks need not be normalized, and the masses need not
#' be ordered. The exact spacing and delimiters used for the mass/intensity
#' pairs are unimportant. The following characters are accepted as
#' delimiters: '\code{space}', '\code{tab}', '\code{,}', '\code{;}',
#' '\code{:}'. Parentheses, square brackets and curly braces ('\code{(}',
#' '\code{(}', '\code{[}', '\code{]}', '\code{\{}', and '\code{\}}') are
#' also allowed.
#' \item The "Name" field can be up to 511 characters.
#' \item The "Comments" field can be up to 1023 characters.
#' \item The "Formula" field can be up to 23 characters.
#' \item The "Synon" field may be repeated.
#' }
#'
#' @examples
#' # Reading the 'alkanes.msp' file
#' msp_file <- system.file("extdata", "alkanes.msp", package = "mssearchr")
#' msp_objs <- ReadMsp(msp_file)
#'
#' # Plotting the first mass spectrum from the 'msp_objs' list
#' par_old <- par(yaxs = "i")
#' plot(msp_objs[[1]]$mz, msp_objs[[1]]$intst,
#' ylim = c(0, 1000), main = msp_objs[[1]]$name,
#' type = "h", xlab = "m/z", ylab = "Intensity", bty = "l")
#' par(par_old)
#'
#' @export
#'
#==============================================================================#
ReadMsp <- function(input_file) {
#--[ Input check ]------------------------------------------------------------
# 'input_file'
if (!is.character(input_file) || length(input_file) != 1) {
stop("'input_file' must be a string.")
}
if (!file.exists(input_file)) {
stop("The file ", "'", basename(input_file), "'", " does not exist.")
}
#--[ Reading and pre-processing ]---------------------------------------------
all_lines <- readLines(input_file)
first_line_idxs <- grep("^name: ", all_lines, ignore.case = TRUE)
if (length(first_line_idxs) == 0L) {
stop("The msp-file is invalid (the 'Name' field is missing).")
}
last_line_idxs <- c(first_line_idxs[-1] - 1L, length(all_lines))
#--[ Parsing ]----------------------------------------------------------------
msp_objs <- lapply(seq_along(first_line_idxs), function(ms_no) {
idxs <- seq(first_line_idxs[[ms_no]], last_line_idxs[[ms_no]])
out <- .ParseSingleSpectrum(all_lines[idxs], first_line_idxs[[ms_no]])
return(out)
})
#--[ Output ]----------------------------------------------------------------
return(invisible(msp_objs))
}
#==============================================================================#
#' Write mass spectra in an msp-file (NIST format)
#'
#' @description
#' Write mass spectra in an msp-file (NIST format).
#'
#' @param msp_objs
#' A list of nested lists. Each nested list is a mass spectrum. Each nested
#' list must contain at least three elements: (1) \code{name} (a string) -
#' compound name (or short description); (2) \code{mz} (a numeric/integer
#' vector) - m/z values of mass spectral peaks; (3) \code{intst} (a
#' numeric/integer vector) - intensities of mass spectral peaks.
#' @param output_file
#' A string. The name of a file.
#' @param fields
#' A character vector. Names of elements in an R list (not the original field
#' names from an msp-file) to be exported. For example, if only CAS number is
#' needed to be exported, the \code{'cas_no'} (not \code{'cas#'}) should be
#' passed. If \code{NULL}, all fields are exported. The output file always
#' contains the 'Name' field, the 'Num Peaks' field, and the mass/intensity
#' list.
#'
#' @details
#' Names of all fields are exported in lower case. It does not cause any
#' problem in the case of the MS Search (NIST) software (however correct
#' operation with other software products has not been tested). Only in a few
#' cases hash symbols and spaces are restored:
#' \itemize{
#' \item the \code{cas_no} element is exported as the 'cas#' field;
#' \item the \code{nist_no} element is exported as the 'nist#' field;
#' \item the \code{num_peaks} element is exported as the 'num peaks' field.
#' }
#'
#' @return
#' \code{NULL} is returned.
#'
#' @examples
#'
#' \dontshow{
#' .old_wd <- setwd(tempdir())
#' }
#' # Exporting mass spectra
#' # Only 'Name', 'SMILES', 'Formula', and 'Num Peaks' fields are exported.
#' WriteMsp(massbank_alkanes[1:3], "test.msp", fields = c("smiles", "formula"))
#' \dontshow{
#' setwd(.old_wd)
#' }
#'
#' @export
#'
#==============================================================================#
WriteMsp <- function(msp_objs,
output_file,
fields = NULL) {
#--[ Input check ]------------------------------------------------------------
# 'msp_objs'
if (!is.list(msp_objs) || !is.list(msp_objs[[1]]) ||
is.null(msp_objs[[1]]$mz) || is.null(msp_objs[[1]]$intst) ||
is.null(msp_objs[[1]]$name)) {
stop("'msp_objs' is invalid.")
}
# 'output_file'
if (!is.character(output_file) || length(output_file) != 1) {
stop("'output_file' must be a string.")
}
# 'fields'
if (!is.null(fields)) {
if (!is.character(fields) || length(fields) == 0L) {
stop("'temp' must be a character vector.")
}
}
#--[ 'fields' ]---------------------------------------------------------------
if (!is.null(fields)) {
mask <- !(fields %in% c("name", "mz", "intst"))
fields <- unique(fields[mask])
}
#--[ Writing data to msp-file ]-----------------------------------------------
all_lines <- unlist(lapply(msp_objs, function(msp) {
if (is.null(msp$name)) {
stop("The '...$name' element is missing.")
}
# 'fields_to_export'
all_fields <- names(msp)
if (is.null(fields)) {
mask <- !(all_fields %in% c("name", "mz", "intst"))
} else {
mask <- all_fields %in% fields
}
fields_to_export <- all_fields[mask]
# Treating 'irregular' fields ('synon', 'db#', 'cas#', and 'nist#')
irreg_lines <- list()
if ("synon" %in% fields_to_export) {
irreg_lines[[length(irreg_lines) + 1L]] <- paste0("synon: ", msp$synon)
}
if ("db_no" %in% fields_to_export) {
irreg_lines[[length(irreg_lines) + 1L]] <- paste0("db#: ", msp$db_no)
}
if (("cas_no" %in% fields_to_export) && ("nist_no" %in% fields_to_export)) {
irreg_lines[[length(irreg_lines) + 1L]] <-
paste0("cas#: ", msp$cas_no, "; ", "nist#: ", msp$nist_no)
} else if ("cas_no" %in% fields_to_export) {
irreg_lines[[length(irreg_lines) + 1L]] <- paste0("cas#: ", msp$cas_no)
} else if ("nist_no" %in% fields_to_export) {
irreg_lines[[length(irreg_lines) + 1L]] <- paste0("nist#: ", msp$nist_no)
}
# Treating 'regular' fields
reg_fields_to_export <-
setdiff(fields_to_export, c("synon", "db_no", "cas_no", "nist_no"))
if (length(reg_fields_to_export) > 0L) {
reg_lines <- paste0(reg_fields_to_export, ": ", msp[reg_fields_to_export])
} else {
reg_lines <- NULL
}
# Preparing text lines for exporting
out <- c(paste0("name: ", msp$name), # 'name'
unlist(irreg_lines), # 'synon', 'db#', 'cas#', and 'nist#'
reg_lines, # all regular fields (e.g., 'formula', 'inchikey', 'mw')
paste0("num peaks: ", length(msp$mz)), # 'num peaks'
paste0(msp$mz, " ", msp$intst), # mass spectral data
"")
return(out)
}))
writeLines(all_lines, output_file)
#--[ Output ]-----------------------------------------------------------------
return(invisible(NULL))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.