R/prepare_libraries_sop_hmdb.R

Defines functions prepare_libraries_sop_hmdb

Documented in prepare_libraries_sop_hmdb

#' @title Prepare libraries of structure organism pairs HMDB
#'
#' @description This function prepares the HMDB structure-organism pairs
#'
#' @include fake_sop_columns.R
#' @include get_params.R
#' @include round_reals.R
#' @include select_sop_columns.R
#'
#' @param input Input file
#' @param output Output file
#'
#' @return The path to the prepared structure-organism pairs library HMDB
#'
#' @export
#'
#' @examples
#' \dontrun{
#' copy_backbone()
#' go_to_cache()
#' prepare_libraries_sop_hmdb()
#' unlink("data", recursive = TRUE)
#' }
prepare_libraries_sop_hmdb <-
  function(
    input = get_params(
      step = "prepare_libraries_sop_hmdb"
    )$files$libraries$sop$raw$hmdb,
    output = get_params(
      step = "prepare_libraries_sop_hmdb"
    )$files$libraries$sop$prepared$hmdb
  ) {
    if (!file.exists(output) || file.size(output) < 100000) {
      if (file.exists(input)) {
        logger::log_trace("Unzipping HMDB")
        hmdb_prepared <- tryCatch(
          expr = {
            utils::unzip(zipfile = input, exdir = dirname(input))
            hmdb_structures <- input |>
              gsub(
                pattern = ".zip",
                replacement = ".sdf",
                fixed = TRUE
              )
            logger::log_trace("Loading HMDB")
            sdf_data <- readLines(con = hmdb_structures, warn = FALSE)

            find_fixed_pattern_line_in_file <- function(file, pattern) {
              return(
                file |>
                  stringi::stri_detect_fixed(pattern = pattern) |>
                  which()
              )
            }

            return_next_line <- function(x, file) {
              file[x + 1]
            }

            patterns <- list(
              "id" = "> <DATABASE_ID>",
              "smiles" = "> <SMILES>",
              ## Not needed
              # "inchi" = "> <INCHI_IDENTIFIER>",
              "inchikey" = "> <INCHI_KEY>",
              "formula" = "> <FORMULA>",
              "mass" = "> <EXACT_MASS>",
              "logp" = "> <JCHEM_LOGP>",
              "name" = "> <GENERIC_NAME>"
            )

            hmdb_list <- patterns |>
              purrr::map(.f = find_fixed_pattern_line_in_file, file = sdf_data)

            # Function to realign vectors with missing values
            realign_vectors <- function(vec1, vec2) {
              result <- numeric(length(vec1))
              result[] <- NA
              i <- 1
              j <- 1
              while (i <= length(vec1) && j <= length(vec2)) {
                if (i < length(vec1) && vec2[j] > vec1[i + 1]) {
                  i <- i + 1
                } else {
                  result[i] <- vec2[j]
                  i <- i + 1
                  j <- j + 1
                }
              }

              return(result)
            }
            hmdb_list$mass <- realign_vectors(hmdb_list$formula, hmdb_list$mass)
            hmdb_list$logp <- realign_vectors(hmdb_list$formula, hmdb_list$logp)

            hmdb_list <- hmdb_list |>
              purrr::map(.f = return_next_line, file = sdf_data)

            hmdb_df <- data.frame(
              id = hmdb_list$id,
              smiles = hmdb_list$smiles,
              inchikey = hmdb_list$inchikey,
              formula = hmdb_list$formula,
              mass = hmdb_list$mass,
              logp = hmdb_list$logp,
              name = hmdb_list$name
            )

            logger::log_trace("Formatting HMDB")
            hmdb_prepared <- hmdb_df |>
              tidytable::mutate(tidytable::across(
                .cols = tidyselect::everything(),
                .fns = tidytable::na_if,
                ""
              )) |>
              tidytable::filter(!is.na(inchikey)) |>
              tidytable::mutate(
                structure_inchikey_2D = stringi::stri_sub(
                  str = inchikey,
                  from = 1,
                  to = 14
                ),
                structure_smiles_2D = NA_character_,
              ) |>
              tidytable::select(
                structure_name = name,
                structure_inchikey = inchikey,
                structure_smiles = smiles,
                structure_inchikey_2D,
                structure_smiles_2D,
                structure_molecular_formula = formula,
                structure_exact_mass = mass,
                structure_xlogp = logp
              ) |>
              tidytable::mutate(
                structure_taxonomy_npclassifier_01pathway = NA_character_,
                structure_taxonomy_npclassifier_02superclass = NA_character_,
                structure_taxonomy_npclassifier_03class = NA_character_,
                structure_taxonomy_classyfire_chemontid = NA_character_,
                structure_taxonomy_classyfire_01kingdom = NA_character_,
                structure_taxonomy_classyfire_02superclass = NA_character_,
                structure_taxonomy_classyfire_03class = NA_character_,
                structure_taxonomy_classyfire_04directparent = NA_character_,
              ) |>
              tidytable::mutate(
                organism_name = "Homo sapiens",
                organism_taxonomy_ottid = 770315,
                organism_taxonomy_01domain = "Eukaryota",
                organism_taxonomy_02kingdom = "Metazoa",
                organism_taxonomy_03phylum = "Chordata",
                organism_taxonomy_04class = "Mammalia",
                organism_taxonomy_05order = "Primates",
                organism_taxonomy_06family = "Hominidae",
                organism_taxonomy_07tribe = NA_character_,
                organism_taxonomy_08genus = "Homo",
                organism_taxonomy_09species = "Homo sapiens",
                organism_taxonomy_10varietas = NA_character_,
                reference_doi = NA_character_
              ) |>
              select_sop_columns() |>
              round_reals() |>
              tidytable::distinct()

            logger::log_trace("Deleting unzipped file")
            file.remove(hmdb_structures)
            hmdb_prepared
          },
          error = function(e) {
            logger::log_error(
              "Something went wrong, see original error message:"
            )
            logger::log_error(e)
            hmdb_prepared <- fake_sop_columns()
          }
        )
      } else {
        logger::log_warn(
          "Sorry, HMDB not found, returning an empty file instead"
        )
        hmdb_prepared <- fake_sop_columns()
      }
      export_output(x = hmdb_prepared, file = output)
    }
    return(output)
  }
taxonomicallyinformedannotation/tima-r documentation built on June 1, 2025, 8:10 p.m.