R/oxy-db_base.R

Defines functions buildBaseDB cleanDB iatom.to.formula iatom.to.charge iatom.to.smiles smiles.to.iatom

#' @title Get iatom containers from SMILES
#' @description FUNCTION_DESCRIPTION
#' @param smiles character vector of smiles
#' @param silent suppress warnings?, Default: TRUE
#' @param cl parallel::makeCluster object for multithreading, Default: 0
#' @return iatom containers for use in rcdk package
#' @examples
#'  smiles.to.iatom(c('OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'))
#' @seealso
#'  \code{\link[rJava]{jcall}}
#' @rdname smiles.to.iatom
#' @export
#' @importFrom rcdk parse.smiles do.aromaticity do.typing do.isotopes
#' @importFrom rJava .jcall
smiles.to.iatom <- function(smiles, silent = TRUE, cl = 0) {
  iatoms <- sapply(smiles, function(x, silent) {
    #print(x)
    mol <- NULL
    try(
      {
        fun <- function() {
          mol <- NULL
          try({
            mol <- rcdk::parse.smiles(x)[[1]]
          })
          if (is.null(mol)) {
            mol <- rcdk::parse.smiles(x, kekulise = FALSE)[[1]]
          }
          rcdk::do.aromaticity(mol)
          rcdk::do.typing(mol)
          rcdk::do.isotopes(mol)
          mol
        }
        mol <- if (silent) {
          suppressWarnings(fun())
        } else {
          fun()
        }
      }
    )
    mol
  }, silent = silent)
  try({
    rJava::.jcall("java/lang/System", "V", "gc")
    gc()
  })
  return(iatoms)
}

#' @title Get SMILES from iatom container
#' @description This function takes an rcdk iatomcontainer and returns SMILES
#' @param iatoms list of iatomcontainers
#' @param smitype Which type of SMILES to export?, Default: 'Canonical'
#' @param silent Suppress warnings?, Default: TRUE
#' @return character vector of SMILES in the chosen format
#' @seealso
#'  \code{\link[rcdk]{get.smiles}},\code{\link[rcdk]{smiles.flavors}}
#'  \code{\link[rJava]{jcall}}
#' @rdname iatom.to.smiles
#' @examples
#'  iatom = smiles.to.iatom(c('OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'))
#'  smi = iatom.to.smiles(iatom)
#' @export
#' @importFrom rcdk get.smiles smiles.flavors
#' @importFrom rJava .jcall
iatom.to.smiles <- function(iatoms, smitype = "Canonical", silent = TRUE) {
  if (!silent) {
    cat("Valid SMILES output options include:\n\n")
    cat(c(" - - - \n", "Absolute", "AtomAtomMap", "AtomicMass", "AtomicMassStrict", "Canonical", "Cx2dCoordinates", "Cx3dCoordinates", "CxAtomLabel", "CxAtomValue", "CxCoordinates", "CxFragmentGroup", "CxMulticenter", "CxPolymer", "CxRadical", "CxSmiles", "CxSmilesWithCoords", "Default", "Generic", "InChILabelling", "Isomeric", "Stereo", "StereoCisTrans", "StereoExTetrahedral", "StereoTetrahedral", "Unique", "UniversalSmiles", "UseAromaticSymbols\n", "- - - \n\n"))
    cat("Defaulting to 'Canonical'.")
  }
  new.smiles <- sapply(iatoms, function(mol, silent) {
    smi <- ""
    try(
      {
        fun <- function() {
          if (is.null(mol)) {
            ""
          } else {
            rcdk::get.smiles(molecule = mol, flavor = rcdk::smiles.flavors(smitype))
          }
        }
        smi <- if (silent) {
          suppressWarnings(fun())
        } else {
          fun()
        }
      },
      silent = silent
    )
    smi
  }, silent = silent)
  try({
    rJava::.jcall("java/lang/System", "V", "gc")
    gc()
  })
  return(new.smiles)
}

#' @title Get formal charge from iatomcontainer
#' @description This function takes iatomcontainer object and returns the formal charge.
#' @param iatoms list of rcdk iatomcontainers
#' @param silent suppress warnings?, Default: TRUE
#' @return Character vector of formal charges per iatomcontainer.
#' @seealso
#'  \code{\link[rcdk]{get.total.formal.charge}}
#'  \code{\link[rJava]{jcall}}
#' @rdname iatom.to.charge
#' @export
#' @examples
#'  iatom = smiles.to.iatom(c('OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'))
#'  ch = iatom.to.charge(iatom)
#' @importFrom rcdk get.total.formal.charge
#' @importFrom rJava .jcall
iatom.to.charge <- function(iatoms, silent = TRUE) {
  new.charges <- sapply(iatoms, function(mol, silent) {
    ch <- 0
    try(
      {
        fun <- function() rcdk::get.total.formal.charge(mol = mol)
        ch <- if (silent) {
          suppressWarnings(fun())
        } else {
          fun()
        }
      },
      silent = silent
    )
    ch
  }, silent = silent)
  try({
    rJava::.jcall("java/lang/System", "V", "gc")
    gc()
  })
  return(new.charges)
}

#' @title Get molecular formula from iatomcontainer
#' @description This function takes iatomcontainer object and returns the molecular formula.
#' @param iatoms list of rcdk iatomcontainers
#' @param silent suppress warnings?, Default: TRUE
#' @return Character vector of formulas per iatomcontainer.
#' @seealso
#'  \code{\link[rcdk]{get.mol2formula}}
#'  \code{\link[rJava]{jcall}}
#' @rdname iatom.to.formula
#' @export
#' @examples
#'  iatom = smiles.to.iatom(c('OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'))
#'  form = iatom.to.formula(iatom)
#' @importFrom rcdk get.mol2formula
#' @importFrom rJava .jcall
iatom.to.formula <- function(iatoms, silent = TRUE) {
  new.formulas <- sapply(iatoms, function(mol, silent) {
    form <- list(a = NULL)
    try(
      {
        fun <- function() rcdk::get.mol2formula(molecule = mol)@string
        form <- if (silent) {
          suppressWarnings(fun())
        } else {
          fun()
        }
      },
      silent = silent
    )
    form[[1]]
  }, silent = silent)
  try({
    rJava::.jcall("java/lang/System", "V", "gc")
    gc()
  })
  return(new.formulas)
}

#' @title Uniformize database and remove invalid formulas/SMILES
#' @description This is a wrapper function to take a 'raw' input data table with compound information, uniformize the SMILES
#' @param db.formatted Data table with columns 'compoundname, structure, baseformula, charge, description'
#' @param cl parallel::makeCluster object for multithreading
#' @param silent Suppress warnings?
#' @param blocksize How many compounds to process per 'block'? Higher number means bigger memory spikes, but faster processing time.
#' @param smitype SMILES format, Default: 'Canonical'
#' @return Data table with SMILES in the correct format, and charge/formula re-generated from said SMILES if available.
#' @seealso
#'  \code{\link[parallel]{clusterApply}}
#'  \code{\link[pbapply]{pbapply}}
#'  \code{\link[enviPat]{check_chemform}}
#'  \code{\link[data.table]{rbindlist}}
#' @rdname cleanDB
#' @export
#' @examples
#' \dontrun{myDB = build.LMDB(tempdir())}
#' \dontrun{cleanedDB = cleanDB(myDB$db, cl = 0, blocksize = 10)}
#' @importFrom parallel clusterExport
#' @importFrom pbapply pblapply
#' @importFrom enviPat check_chemform
#' @importFrom data.table rbindlist
cleanDB <- function(db.formatted, cl, silent = TRUE, blocksize, smitype = "Canonical") {
  blocks <- split(1:nrow(db.formatted), ceiling(seq_along(1:nrow(db.formatted)) / blocksize))
  if (is.list(cl)) {
    parallel::clusterExport(cl, varlist = c("db.formatted", "iatom.to.smiles",
                                            "smiles.to.iatom", "iatom.to.formula",
                                            "iatom.to.charge", "silent", "isotopes"), envir = environment())
  }
  print("Uniformizing formulas/SMILES/charges and checking for mistakes...")
  db.fixed.rows <- pbapply::pblapply(blocks, cl = cl, function(block) {
    db.form.block <- db.formatted[block, ]
    iats <- smiles.to.iatom(db.form.block$structure, silent = silent)
    valid.struct <- unlist(lapply(iats, function(x) !is.null(x)))
    new.smiles <- iatom.to.smiles(iats[valid.struct], smitype = smitype, silent = silent)
    new.charge <- iatom.to.charge(iats[valid.struct], silent = silent)
    new.formula <- iatom.to.formula(iats[valid.struct], silent = silent)
    db.redone.struct <- db.form.block
    db.redone.struct$structure[valid.struct] <- new.smiles
    db.redone.struct$baseformula[valid.struct] <- new.formula
    db.redone.struct$charge[valid.struct] <- new.charge
    db.removed.invalid <- db.redone.struct
    formulas <- as.character(db.redone.struct$baseformula)
    null.or.na <- which(is.null(formulas) | is.na(formulas) | formulas == "NULL")
    if (length(null.or.na) > 0) {
      db.removed.invalid <- db.removed.invalid[-null.or.na, ]
      valid.struct <- valid.struct[-null.or.na]
    }
    checked <- enviPat::check_chemform(isotopes, chemforms = as.character(db.removed.invalid$baseformula))
    db.removed.invalid$baseformula <- checked$new_formula
    invalid.struct <- !valid.struct
    db.removed.invalid$structure[which(invalid.struct)] <- paste0("[", db.removed.invalid$baseformula[invalid.struct], "]", db.removed.invalid$charge[invalid.struct])
    invalid.formula <- which(checked$warning)
    if (length(invalid.formula) > 0) {
      db.removed.invalid <- db.removed.invalid[-invalid.formula, ]
    }
    return(db.removed.invalid)
  })
  return(data.table::rbindlist(db.fixed.rows))
}

#' @title Build the base database
#' @description This is a large wrapper function that calls upon all individual database parsers, cleans the resulting database and saves it in a SQLite database.
#' @param outfolder In which folder are you building your databases? Temp folders etc. will be put here.
#' @param dbname Which database do you want to build? Options: chebi,maconda,kegg,bloodexposome,dimedb,expoexplorer, foodb, drugbank, lipidmaps, massbank, metabolights, metacyc, phenolexplorer, respect, wikidata, wikipathways, t3db, vmh, hmdb, smpdb, lmdb, ymdb, ecmdb, bmdb, rmdb, stoff, anpdb, mcdb, mvoc, pamdb
#' @param custom_csv_path PARAM_DESCRIPTION, Default: NULL
#' @param smitype Which SMILES format do you want?, Default: 'Canonical'
#' @param silent Suppress warnings?, Default: TRUE
#' @param cl parallel::makeCluster object for multithreading, Default: 0
#' @param test Run in test mode? Makes an incomplete ver of db, but is faster.
#' @param doBT Do a biotransformation step using Biotransformer?
#' @param btOpts Biotransfomer -q options. Defaults to phase II transformation only.
#' @param btLoc Location of Biotransformer JAR file. Needs to be executable!
#' @param skipClean Skip cleaning step? Cleaning step uses SMILES to acquire formula, charge, and transforms SMILES into 'smitype' dialect.
#' @return Nothing, writes SQLite database to 'outfolder'.
#' @seealso
#'  \code{\link[data.table]{fread}},\code{\link[data.table]{as.data.table}}
#'  \code{\link[DBI]{dbDisconnect}}
#' @rdname buildBaseDB
#' @export
#' @examples
#'  \dontrun{buildBaseDB(outfolder = tempdir(), "lmdb", test=TRUE)}
#' @importFrom data.table fread as.data.table data.table
#' @importFrom RSQLite dbExecute
#' @importFrom DBI dbDisconnect
buildBaseDB <- function(outfolder, dbname, custom_csv_path = NULL, smitype = "Canonical",
                        silent = TRUE, cl = 0, test = FALSE, doBT = FALSE, btOpts = "phaseII:1", btLoc, skipClean=F) {
  httr::set_config(httr::config(ssl_verifypeer = 0))
  oldpar <- options()
  options(stringsAsFactors = FALSE, timeout=1000)
  on.exit(options(oldpar))
  removeDB(outfolder, paste0(dbname, ".db"))
  conn <- openBaseDB(outfolder, paste0(dbname, ".db"))
  if (is.null(custom_csv_path)) {
    db.error = simpleError(paste(toupper(dbname), "not available. Please submit an issue to the joannawolthuis/MetaDBparse GitHub repository."))
    db.formatted.all <- tryCatch(switch(dbname,
                                        chebi = build.CHEBI(outfolder),
                                        maconda = build.MACONDA(outfolder, conn), kegg = build.KEGG(outfolder),
                                        bloodexposome = build.BLOODEXPOSOME(outfolder), dimedb = build.DIMEDB(outfolder),
                                        expoexplorer = build.EXPOSOMEEXPLORER(outfolder), foodb = build.FOODB(outfolder),
                                        drugbank = build.DRUGBANK(outfolder), lipidmaps = build.LIPIDMAPS(outfolder),
                                        massbank = build.MASSBANK(outfolder), metabolights = build.METABOLIGHTS(outfolder),
                                        metacyc = build.METACYC(outfolder),
                                        phenolexplorer = build.PHENOLEXPLORER(outfolder),
                                        respect = build.RESPECT(outfolder), wikidata = build.WIKIDATA(outfolder), wikipathways = build.WIKIPATHWAYS(outfolder), t3db = build.T3DB(outfolder), vmh = build.VMH(outfolder), hmdb = build.HMDB(outfolder), smpdb = build.SMPDB(outfolder), lmdb = build.LMDB(outfolder), ymdb = build.YMDB(outfolder), ecmdb = build.ECMDB(outfolder), bmdb = build.BMDB(outfolder), rmdb = build.RMDB(outfolder), stoff = build.STOFF(outfolder), anpdb = build.ANPDB(outfolder),
                                        mcdb = build.MCDB(outfolder), mvoc = build.mVOC(outfolder),
                                        pamdb = build.PAMDB(outfolder), pharmgkb = build.PHARMGKB(outfolder),
                                        reactome = build.REACTOME(outfolder),
                                        metabolomicsworkbench = build.METABOLOMICSWORKBENCH(outfolder),
                                        npa = build.NPA(outfolder),
                                        markerdb = build.MARKERDB(outfolder),
                                 error = function(e) e,
                                 finally = data.table::data.table())
    )
    if(is.null(db.formatted.all)){
      stop("This DB is currently not included in MetaDBparse.")
    }
  } else {
    db.formatted.all <- list(db = data.table::fread(custom_csv_path, header = TRUE), version = Sys.time())
  }
  if (dbname == "maconda") {
    return(NA)
  }

  db.formatted <- data.table::as.data.table(db.formatted.all$db)
  db.formatted <- data.frame(lapply(db.formatted, as.character), stringsAsFactors = FALSE)
  if (test & nrow(db.formatted > 10)) {
    db.formatted <- db.formatted[1:10, ]
  }
  if (doBT) {
    db.biotrans <- doBT(smis = db.formatted$structure, opts = btOpts, jarloc = btLoc)
    if (nrow(db.biotrans) > 0) {
      db.biotrans.fin <- data.table::rbindlist(lapply(1:nrow(db.biotrans), function(i) {
        newInfo <- db.biotrans[i, ]
        oldIdx <- which(db.formatted$structure == newInfo$structure)
        oldInfo <- db.formatted[oldIdx, ]
        oldInfo$structure <- newInfo$SMILES
        oldInfo$description <- paste0("Modified through Biotransformer: ", newInfo$Reaction, ". ", oldInfo$description)
        reacShorthand <- if (grepl("glucuronidation", tolower(newInfo$Reaction))) {
          "glucuronide"
        }
        else if (grepl("sulfonation", tolower(newInfo$Reaction))) {
          "sulfone"
        }
        else if (grepl("sulfation", tolower(newInfo$Reaction))) {
          "sulphate"
        }
        oldInfo$compoundname <- paste(oldInfo$compoundname, reacShorthand)
        oldInfo$identifier <- paste0(oldInfo$identifier, "_", newInfo$`Reaction ID`)
        oldInfo
      }))
      db.formatted <- data.table::rbindlist(list(db.formatted, db.biotrans.fin), use.names = TRUE)
    }
  }

  if(all(is.na(db.formatted$structure))) skipClean = T

  if(!skipClean){
    db.formatted <- db.formatted[!(db.formatted$structure %in% c(NA,"")),]
    db.final <- data.table::as.data.table(cleanDB(db.formatted,
                                                  cl = cl,
                                                  silent = silent,
                                                  blocksize = 400))
  }else{
    db.final = data.table::as.data.table(db.formatted)
    db.final$structure = paste0("[", db.final$baseformula, "]", db.final$charge)
  }

  print("Preview:")
  print(head(db.final))

  db.final <- db.final[, lapply(.SD, as.character)]
  writeDB(conn, data.table::data.table(date = Sys.Date(), version = db.formatted.all$version),
          "metadata")
  writeDB(conn, table = db.final, "base")
  if("path" %in% names(db.formatted.all)){
    writeDB(conn, table = db.formatted.all$path, "pathways")
  }
  RSQLite::dbExecute(conn, "CREATE INDEX b_idx1 ON base(structure)")
  DBI::dbDisconnect(conn)
}
joannawolthuis/MetaDBparse documentation built on June 20, 2022, 9:07 p.m.