R/prepare_libraries_rt.R

Defines functions prepare_libraries_rt

Documented in prepare_libraries_rt

#' @title Prepare libraries of retention times
#'
#' @description This function prepares retention times libraries
#'    to be used for later
#'
#' @include get_params.R
#' @include import_spectra.R
#'
#' @param mgf_exp MGF containing experimental retention times
#' @param mgf_is MGF containing in silico predicted retention times
#' @param temp_exp File containing experimental retention times
#' @param temp_is File containing in silico predicted retention times
#' @param output_rt Output retention time file
#' @param output_sop Output pseudo sop file
#' @param col_ik Name of the InChIKey in mgf
#' @param col_rt Name of the retention time in mgf
#' @param col_sm Name of the SMILES in mgf
#' @param name_inchikey Name of the InChIKey in file
#' @param name_rt Name of the retention time in file
#' @param name_smiles Name of the SMILES in file
#' @param unit_rt Unit of the retention time. Must be "seconds" or "minutes"
#'
#' @return The path to the prepared retention time library
#'
#' @export
#'
#' @examples
#' \dontrun{
#' copy_backbone()
#' go_to_cache()
#' prepare_libraries_rt()
#' unlink("data", recursive = TRUE)
#' }
prepare_libraries_rt <-
  function(
    mgf_exp = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$temporal$exp$mgf,
    mgf_is = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$temporal$is$mgf,
    temp_exp = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$temporal$exp$csv,
    temp_is = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$temporal$is$csv,
    output_rt = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$temporal$prepared,
    output_sop = get_params(
      step = "prepare_libraries_rt"
    )$files$libraries$sop$prepared$rt,
    col_ik = get_params(step = "prepare_libraries_rt")$names$mgf$inchikey,
    col_rt = get_params(step = "prepare_libraries_rt")$names$mgf$retention_time,
    col_sm = get_params(step = "prepare_libraries_rt")$names$mgf$smiles,
    name_inchikey = get_params(step = "prepare_libraries_rt")$names$inchikey,
    name_rt = get_params(step = "prepare_libraries_rt")$names$rt$library,
    name_smiles = get_params(step = "prepare_libraries_rt")$names$smiles,
    unit_rt = get_params(step = "prepare_libraries_rt")$units$rt
  ) {
    ## default transforms from `Spectra`
    if (col_rt == "RTINSECONDS") {
      col_rt <- "rtime"
    }
    if (col_sm == "SMILES") {
      col_sm <- "smiles"
    }

    full_or_null <- function(x) {
      if (length(x) == 0) {
        NULL
      } else {
        x
      }
    }
    mgf_exp <- mgf_exp |>
      purrr::map(.f = full_or_null)
    mgf_exp <- mgf_exp[
      mgf_exp |>
        purrr::map(
          .f = function(x) {
            !is.null(x)
          }
        ) |>
        unlist()
    ]
    mgf_is <- mgf_is |>
      purrr::map(.f = full_or_null)
    mgf_is <- mgf_is[
      mgf_is |>
        purrr::map(
          .f = function(x) {
            !is.null(x)
          }
        ) |>
        unlist()
    ]
    mgf_exp <- mgf_exp |>
      purrr::map(.f = full_or_null)
    temp_exp <- temp_exp[
      temp_exp |>
        purrr::map(
          .f = function(x) {
            !is.null(x)
          }
        ) |>
        unlist()
    ]
    mgf_is <- mgf_is |>
      purrr::map(.f = full_or_null)
    temp_is <- temp_is[
      temp_is |>
        purrr::map(
          .f = function(x) {
            !is.null(x)
          }
        ) |>
        unlist()
    ]
    if (length(mgf_exp) == 0) {
      mgf_exp <- NULL
    }
    if (length(mgf_is) == 0) {
      mgf_is <- NULL
    }
    if (length(temp_exp) == 0) {
      temp_exp <- NULL
    }
    if (length(temp_is) == 0) {
      temp_is <- NULL
    }

    rts_from_mgf <-
      function(mgf) {
        logger::log_trace("Importing spectra")
        spectra <- mgf |>
          purrr::map(.f = import_spectra)
        logger::log_trace("Extracting retention times")
        rts <- spectra |>
          purrr::map(.f = function(x) {
            x@backend@spectraData |>
              data.frame() |>
              tidytable::as_tidytable()
          }) |>
          tidytable::bind_rows() |>
          tidytable::select(tidyselect::any_of(c(
            rt = col_rt,
            inchikey = col_ik,
            smiles = col_sm
          )))
        return(rts)
      }

    rts_from_tab <-
      function(tab) {
        logger::log_trace("Importing file")
        rts <- tab |>
          purrr::map(.f = tidytable::fread) |>
          tidytable::bind_rows() |>
          tidytable::select(tidyselect::any_of(
            c(
              rt = name_rt,
              inchikey = name_inchikey,
              smiles = name_smiles
            )
          ))
        return(rts)
      }

    polish_df <-
      function(df, type = "experimental", unit = unit_rt) {
        df_polished <- df |>
          data.frame() |>
          tidytable::mutate(type = type) |>
          tidytable::rowwise() |>
          tidytable::mutate(
            rt = tidytable::if_else(
              condition = unit == "seconds",
              true = as.numeric(rt) / 60L,
              false = as.numeric(rt)
            )
          ) |>
          tidytable::bind_rows(data.frame(
            inchikey = NA_character_,
            smiles = NA_character_
          )) |>
          tidytable::filter(!is.na(rt)) |>
          tidytable::filter(!is.na(smiles)) |>
          tidytable::distinct()
        return(df_polished)
      }

    complete_df <- function(df) {
      df_full <- df |>
        tidytable::filter(!is.na(inchikey))
      df_missing <- df |>
        tidytable::filter(is.na(inchikey))
      logger::log_warn(
        "There are ",
        nrow(df_missing),
        " entries without InChIKey.",
        " We would recommend you adding them but will try completing.",
        " We will query them on the fly, this might take some time."
      )
      smiles <- unique(df_missing$smiles)

      ## TODO replace with process_smiles
      get_inchikey <- function(smiles, toolkit = "rdkit") {
        url <- paste0(
          "https://api.naturalproducts.net/latest/convert/inchikey?smiles=",
          utils::URLencode(smiles),
          "&toolkit=",
          toolkit
        )
        tryCatch(
          expr = jsonlite::fromJSON(txt = url),
          error = function(e) {
            return(NA_character_)
          }
        )
      }

      inchikey <- purrr::map(.x = smiles, .f = get_inchikey) |>
        as.character()
      df_missing <- df_missing |>
        tidytable::select(-inchikey) |>
        tidytable::left_join(tidytable::tidytable(smiles, inchikey))
      rm(smiles)

      df_completed <- df_full |>
        tidytable::bind_rows(df_missing) |>
        tidytable::mutate(tidytable::across(
          .cols = tidyselect::where(is.character),
          .fns = function(x) {
            tidytable::na_if(x, "NA")
          }
        ))
      rm(df_full, df_missing)
      df_completed <- df_completed |>
        tidytable::select(
          rt,
          structure_smiles = smiles,
          structure_inchikey = inchikey,
          type
        ) |>
        tidytable::distinct()
      logger::log_warn(
        "There were still",
        nrow(
          df_completed |>
            tidytable::filter(is.na(
              structure_inchikey
            ))
        ),
        "entries for which no InChIKey could not be found in the end."
      )
      return(df_completed)
    }

    empty_df <- tidytable::tidytable(
      rt = NA_real_,
      structure_smiles = NA_character_,
      structure_inchikey = NA_character_,
      type = NA_character_
    )

    ## from mgf
    if (!is.null(mgf_exp)) {
      rts_exp_1 <- mgf_exp |>
        rts_from_mgf() |>
        polish_df() |>
        complete_df()
    } else {
      rts_exp_1 <- empty_df
    }
    if (!is.null(mgf_is)) {
      rts_is_1 <- mgf_is |>
        rts_from_mgf() |>
        polish_df(type = "predicted") |>
        complete_df()
    } else {
      rts_is_1 <- empty_df
    }

    ## from csv
    if (!is.null(temp_exp)) {
      rts_exp_2 <- temp_exp |>
        rts_from_tab() |>
        polish_df() |>
        complete_df()
    } else {
      rts_exp_2 <- empty_df
    }
    if (!is.null(temp_is)) {
      rts_is_2 <- temp_is |>
        rts_from_tab() |>
        polish_df(type = "predicted") |>
        complete_df()
    } else {
      rts_is_2 <- empty_df
    }

    df_rts <- tidytable::bind_rows(rts_exp_1, rts_exp_2, rts_is_1, rts_is_2) |>
      tidytable::filter(!is.na(as.numeric(rt))) |>
      tidytable::filter(!is.na((structure_inchikey)))
    rm(rts_exp_1, rts_exp_2, rts_is_1, rts_is_2)

    sop <- df_rts |>
      tidytable::select(structure_smiles, structure_inchikey) |>
      tidytable::distinct() |>
      tidytable::mutate(
        structure_inchikey_connectivity_layer = stringi::stri_sub(
          str = structure_inchikey,
          from = 1L,
          to = 14L
        ),
        organism_name = NA_character_
      )

    rts <- df_rts |>
      tidytable::select(-structure_smiles) |>
      tidytable::distinct() |>
      tidytable::mutate(
        candidate_structure_inchikey_connectivity_layer = gsub(
          pattern = "-.*",
          replacement = "",
          x = structure_inchikey,
          perl = TRUE
        )
      ) |>
      ## TODO REMINDER FOR NOW
      tidytable::distinct(
        rt,
        candidate_structure_inchikey_connectivity_layer,
        type
      )
    rm(df_rts)

    if (nrow(rts) == 0) {
      logger::log_warn(
        "No retention time library found, returning empty sop table."
      )
      sop <- tidytable::tidytable(
        structure_smiles = NA_character_,
        structure_inchikey = NA_character_,
        structure_inchikey_connectivity_layer = NA_character_,
        organism_name = NA_character_
      )
    }

    if (nrow(rts) == 0) {
      logger::log_warn(
        "No retention time library found, returning empty retention time table."
      )
      rts <- tidytable::tidytable(
        rt = NA_real_,
        candidate_structure_inchikey_connectivity_layer = NA_character_,
        type = NA_character_
      )
    }

    export_params(
      parameters = get_params(step = "prepare_libraries_rt"),
      step = "prepare_libraries_rt"
    )
    export_output(x = rts, file = output_rt)
    export_output(x = sop, file = output_sop)
    rm(rts, sop)
    return(c("rt" = output_rt, "sop" = output_sop))
  }
taxonomicallyinformedannotation/tima-r documentation built on June 1, 2025, 8:10 p.m.