#' Combine multiple mol files into a single sdf file and remove duplicates
#'
#' \code{combine_mol2sdf} offers a way to combine multiple mol files into
#' a single sdf file, removing duplicates.
#'
#' The msp file exported from the NIST format by the \strong{Lib2NIST} software
#' has no SMILES, which is used for viewing the structure in MS-DIAL and is
#' also crucial if you want to predict the retention index (RI) for compounds
#' with no experimental RI (in the case of an EI library). The chemical
#' structure of each compound in the NIST library can be exported as a mol
#' file, so every library entry has one corresponding mol file. All these mol
#' files are stored in a folder with a ".MOL" suffix. This function combines
#' all these mol files into a single sdf file, which can then be used to
#' retrieve the SMILES of each entry. Entries are considered duplicates when
#' they share the same Molecule_Name; only the first one is kept. The mol files
#' are read with \code{future.apply::future_lapply}, so reading runs in parallel
#' when a parallel \code{future} plan has been set.
#'
#' @param input The location of the exported *.MOL folder from Lib2NIST,
#' e.g., "/home/nist.MOL". Only files whose extension is ".mol"
#' (case-insensitive) are read.
#' @param output The location where the sdf file will be stored and its name,
#' e.g., "/home/exported.sdf".
#' @param use_filename If you want to use the file name as the Molecule_Name
#' in the sdf file, use \code{use_filename = TRUE}. This is useful when
#' you draw your own chemicals, which might not have a Molecule_Name in the
#' .MOL files. With this option, the name of each .MOL file (without its
#' extension) is used instead.
#'
#' @return No meaningful value; the function is called for its side effect of
#' creating the sdf file. An error is raised when \code{input} contains no
#' .mol file or when no molecule could be read from any of them.
#' @export
#'
#' @import future.apply
#' @rawNamespace import(ChemmineR, except = c(groups, view))
#' @import rlist
combine_mol2sdf <- function(input, output, use_filename = FALSE) {
  # `pattern` is a regular expression (not a glob), so anchor it on the
  # extension; otherwise any name containing "<char>mol" would be picked up.
  mols <- list.files(
    path = input, pattern = "\\.mol$",
    full.names = TRUE, ignore.case = TRUE
  )
  if (length(mols) == 0L) {
    stop("No .mol files found in '", input, "'.", call. = FALSE)
  }

  # Read every mol file into its own SDFset (in parallel if a plan is set).
  sdfset <- future.apply::future_lapply(
    mols, function(mol) {
      tmp <- read.SDFset(mol, skipErrors = TRUE)
      # Optionally use the file name as the compound name. It is useful for
      # home-drawn chemicals, which might not have a Molecule_Name.
      if (use_filename && length(tmp) > 0L) {
        # Strip the directory part (both "\" and "/" separators) ...
        name <- gsub(".*\\\\|.*/", "", mol)
        # ... and only the trailing extension, not every ".mol" in the name.
        name <- sub("\\.mol$", "", name, ignore.case = TRUE)
        tmp@SDF[[1]]@header[["Molecule_Name"]] <- name
      }
      tmp
    }
  )

  # With skipErrors = TRUE a broken file can yield no molecule at all; drop
  # those so that extracting the first molecule below cannot fail.
  sdfset <- Filter(function(x) length(x) > 0L, sdfset)
  if (length(sdfset) == 0L) {
    stop(
      "No molecule could be read from the .mol files in '", input, "'.",
      call. = FALSE
    )
  }

  # Keep the first molecule of each file and remove duplicates based on the
  # Molecule_Name (first element of the header block).
  sdfset <- lapply(sdfset, "[[", 1)
  name <- vapply(sdfset, function(x) x[[1]][[1]], character(1))
  sdfset <- sdfset[!duplicated(name)]

  sdfset <- SDFset(SDFlist = sdfset) # Turn it back to SDFset class
  # Assign cid for each SDF. It is important! Otherwise no information
  # will be converted
  cid(sdfset) <- paste0("CMP", seq_along(sdfset))
  sdfset <- sdfset[validSDF(sdfset)] # Important to remove invalid sdf
  write.SDF(sdfset, output)
}
#' Extract SMILES from the sdf file generated by \code{\link{combine_mol2sdf}}
#'
#' \code{extract_structure} retrieves structure information (SMILES, and where
#' available InChI and InChIKey) from an sdf file.
#'
#' The function is a wrapper around the \code{convertFormatFile} function from
#' the \pkg{ChemmineOB} package. As InChI and InChIKey are not supported on
#' Windows-based systems, the function checks which operating system it is
#' running on. On Windows only \strong{name} and \strong{SMILES} are retrieved,
#' while on Linux-based or Mac OS systems \strong{InChI} and \strong{InChIKey}
#' are exported as well. Compound names are converted to lower case.
#'
#' @param input The sdf file generated by \code{\link{combine_mol2sdf}}, e.g.,
#' "/home/exported.sdf".
#' @param output The location where the structure information will be stored
#' and its name, e.g., "/home/exported.txt".
#'
#' @return A data.frame with the structure information; the same table is
#' also written to the *.txt file given by \code{output}.
#' @export
#'
#' @import ChemmineOB
#' @import rio
extract_structure <- function(input, output) {
  on_windows <- grepl("windows", Sys.info()[1], ignore.case = TRUE)

  # The two platforms differ only in the conversion options and, as a
  # consequence, in the columns of the converted file.
  if (on_windows) {
    # Only extract SMILES on Windows, as InChI is not supported there.
    # Including InChI and InChIKey also takes longer.
    conversion_options <- data.frame(
      names = "e",
      args = ""
    )
    column_names <- c("Smiles", "Name")
  } else {
    # Append InChI and InChIKey; the fields in "args" are tab-delimited (\t).
    conversion_options <- data.frame(
      names = "append",
      args = "\tinchi\tinchikey"
    )
    column_names <- c("Smiles", "Name", "InChI", "InChIKey")
  }

  ChemmineOB::convertFormatFile(
    "SDF",
    "SMI",
    input,
    output,
    options = conversion_options
  )

  # Read the converted data back into R, label the columns, normalise the
  # names to lower case and overwrite the output file with the tidy table.
  converted <- rio::import(output, header = FALSE)
  colnames(converted) <- column_names
  converted$Name <- tolower(converted$Name)
  rio::export(converted, output)
  converted
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.