# R/read_structures.R

# Defines functions extract_structure combine_mol2sdf

# Documented in combine_mol2sdf extract_structure

#' Combine multiple mol files into a single sdf file and remove duplicates
#'
#' \code{combine_mol2sdf} offers a way to combine multiple mol files into
#' a single sdf file removing duplicates.
#'
#' The msp file exported from NIST format by the \strong{Lib2NIST} software
#' has no SMILES which is used for viewing the structure in MS-DIAL and is
#' crucial as well if you want to predict retention index (RI) for compounds
#' with no experimental RI (in the case of EI library). Chemical structure of
#' each compound in the NIST library can be exported as a mol file. Every
#' library entry will have one corresponding mol file. All these mol files will
#' be stored in a folder with ".MOL" suffix. This function aims to combine all
#' these mol files into a single sdf file which can then be used to retrieve
#' SMILES for each entry. This function supports parallel computing.
#'
#' Only files whose name ends in ".mol" (case-insensitive) are read. Entries
#' are de-duplicated on their Molecule_Name, keeping the first occurrence.
#' An error is raised when \code{input} contains no such file.
#'
#' @param input The location of the exported *.MOL folder from Lib2NIST,
#'   e.g., "/home/nist.MOL".
#' @param output The location where the sdf file will be stored and its name,
#'   e.g., "/home/exported.sdf".
#' @param use_filename In case you want to use the file name as the
#'   Molecule_Name in the sdf file, please use \code{use_filename = TRUE}.
#'   This is useful when you draw your own chemicals which might not have
#'   Molecule_Name in the .MOL files. With this option, the name of each .MOL
#'   file (without its extension) is used instead.
#'
#' @return It will return no value but only creates a sdf file.
#' @export
#'
#' @import future.apply
#' @rawNamespace import(ChemmineR, except = c(groups, view))
#' @import rlist
combine_mol2sdf <- function(input, output, use_filename = FALSE) {
  # `pattern` is a regular expression, not a glob: anchor on the literal
  # ".mol" extension so that names merely containing "mol" are not picked up.
  mols <- list.files(
    path = input, pattern = "\\.MOL$",
    full.names = TRUE, ignore.case = TRUE
  )
  # Fail early with a clear message instead of an obscure downstream error.
  if (length(mols) == 0L) {
    stop("No .MOL files found in: ", input, call. = FALSE)
  }

  # Read every mol file into its own SDFset.
  # Optionally use the file name as the compound name; it is useful for
  # home-drawn chemicals which might not have a Molecule_Name.
  if (use_filename) {
    sdfset <- future.apply::future_lapply(
      mols, function(mol) {
        tmp <- read.SDFset(mol, skipErrors = TRUE)
        # File name without directory and without the trailing extension
        name <- sub("\\.MOL$", "", basename(mol), ignore.case = TRUE)
        tmp@SDF[[1]]@header[["Molecule_Name"]] <- name

        tmp
      }
    )
  } else {
    sdfset <- future.apply::future_lapply(
      mols, function(mol) read.SDFset(mol, skipErrors = TRUE)
    )
  }

  # Remove duplicates based on "name" to save time and export the sdf file
  sdfset <- lapply(sdfset, "[[", 1) # First SDF of each single-file SDFset
  # Extract Molecule_Name (first field of the header block)
  name <- vapply(sdfset, function(x) x[[1]][[1]], character(1))
  sdfset <- sdfset[!duplicated(name)] # Keep first occurrence of each name
  sdfset <- SDFset(SDFlist = sdfset) # Turn it back to SDFset class
  # Assign cid for each SDF. It is important! Otherwise no information
  # will be converted
  cid(sdfset) <- paste0("CMP", seq_along(sdfset))
  sdfset <- sdfset[validSDF(sdfset)] # Important to remove invalid sdf

  write.SDF(sdfset, output)
}


#' Extract SMILES from the sdf file generated by \code{\link{combine_mol2sdf}}
#'
#' \code{extract_structure} converts an sdf file into a table of structure
#' identifiers, writes that table to \code{output} and returns it.
#'
#' The conversion itself is delegated to \code{convertFormatFile} from the
#' \pkg{ChemmineOB} package. As InChI and InChIKey are not supported in
#' Windows-based systems, the operating system is detected automatically:
#' on Windows only \strong{name} and \strong{SMILES} are retrieved, while on
#' Linux-based or Mac OS systems \strong{InChI} and \strong{InChIKey} are
#' exported as well. Compound names are converted to lower case.
#'
#' @param input The sdf file generated by \code{\link{combine_mol2sdf}}, e.g.,
#'   "/home/exported.sdf".
#' @param output The location where the structure information will be stored
#' and its name, e.g., "/home/exported.txt".
#'
#' @return A data.frame and creates a *.txt file.
#' @export
#'
#' @import ChemmineOB
#' @import rio
extract_structure <- function(input, output) {
  on_windows <- grepl("windows", Sys.info()[1], ignore.case = TRUE)

  # Pick the converter options and the resulting column layout per OS;
  # everything after that is shared between both cases.
  if (on_windows) {
    # Only extract smiles in windows system, as it does not support inchi;
    # with inchi and inchikey, it also takes longer time.
    converter_options <- data.frame(names = "e", args = "")
    column_names <- c("Smiles", "Name")
  } else {
    # Include inchi and inchikey; "args" is \t (tab-delimited)
    converter_options <- data.frame(
      names = "append",
      args = "\tinchi\tinchikey"
    )
    column_names <- c("Smiles", "Name", "InChI", "InChIKey")
  }

  ChemmineOB::convertFormatFile(
    "SDF", "SMI", input, output,
    options = converter_options
  )

  # Read back the converted data into R, label it, normalise the names and
  # overwrite the converter output with the labelled table.
  converted <- rio::import(output, header = FALSE)
  colnames(converted) <- column_names
  converted$Name <- tolower(converted$Name)
  rio::export(converted, output)

  converted
}
# QizhiSu/mspcompiler documentation built on May 7, 2024, 4:25 a.m.