R/prepare_taxa.R

Defines functions prepare_taxa

Documented in prepare_taxa

#' @title Prepare taxa
#'
#' @description This function performs taxon name preparation
#'    to match the Open Tree of Life taxonomy
#'
#' @details Depending if the features are aligned between samples
#'    originating from various organisms or not,
#'    It can either attribute all features to a single organism,
#'    or attribute them to multiple ones,
#'    according to their relative intensities among the samples.
#'
#' @include clean_collapse.R
#' @include get_organism_taxonomy_ott.R
#'
#' @param input File containing your features intensities
#' @param extension Does your column names contain the file extension?
#'    (MZmine mainly)
#' @param name_features Name of the features column in the features file
#' @param name_filename Name of the file name column in the metadata file
#' @param colname Name of the column containing biological source information
#' @param metadata File containing your metadata including biological source
#' @param top_k Number of organisms to be retained per feature top intensities
#' @param org_tax_ott File containing Open Tree of Life Taxonomy
#' @param output Output file
#' @param taxon If you want to enforce all features to a given taxon,
#'    put its name here.
#'
#' @return NULL
#'
#' @export
#'
#' @examples NULL
prepare_taxa <-
  function(input = get_params(step = "prepare_taxa")$files$features$raw,
           extension = get_params(step = "prepare_taxa")$names$extension,
           name_features = get_params(step = "prepare_taxa")$names$features,
           name_filename = get_params(step = "prepare_taxa")$names$filename,
           colname = get_params(step = "prepare_taxa")$names$taxon,
           metadata = get_params(step = "prepare_taxa")$files$metadata$raw,
           top_k = get_params(step = "prepare_taxa")$organisms$candidates,
           org_tax_ott = get_params(step = "prepare_taxa")$files$libraries$sop$merged$organisms$taxonomies$ott,
           output = get_params(step = "prepare_taxa")$files$metadata$prepared,
           taxon = get_params(step = "prepare_taxa")$organisms$taxon) {
    if (!is.null(taxon)) {
      if (taxon == "") {
        taxon <- NULL
      }
    }
    if (is.null(taxon)) {
      stopifnot("Your metadata file does not exist" = file.exists(metadata))
      stopifnot("Your input file does not exist" = file.exists(input))
    }

    stopifnot(
      "Your top k organisms parameter should be lower or equal to 5" =
        top_k <= 5
    )

    log_debug(x = "Loading feature table")
    feature_table_0 <- tidytable::fread(
      file = input,
      na.strings = c("", "NA"),
      colClasses = "character"
    )

    if (is.null(taxon)) {
      log_debug(x = "Loading metadata table")
      metadata_table <- tidytable::fread(
        file = metadata,
        na.strings = c("", "NA"),
        colClasses = "character"
      )
    }

    log_debug(x = "Formatting feature table ...")
    log_debug(x = "... WARNING: requires 'Peak area'
              in columns (MZmine format)")
    log_debug(x = "... WARNING: or 'quant_' in columns (SLAW format)")
    feature_table <- feature_table_0 |>
      tidytable::select(
        tidytable::all_of(c(name_features)),
        tidytable::matches(" Peak area"),
        tidytable::matches("quant_"),
      ) |>
      tidytable::select(-tidytable::matches("quant_peaktable")) |>
      tidyfst::col_rn(var = name_features)
    colnames(feature_table) <- colnames(feature_table) |>
      stringi::stri_replace_all_fixed(
        pattern = " Peak area",
        replacement = "",
        vectorize_all = FALSE
      )
    colnames(feature_table) <- colnames(feature_table) |>
      stringi::stri_replace_all_fixed(
        pattern = "quant_",
        replacement = "",
        vectorize_all = FALSE
      )
    log_debug(x = "... filtering top K intensities per feature")
    top_n <- feature_table |>
      tidyfst::rn_col() |>
      tidytable::pivot_longer(cols = seq_len(ncol(feature_table)) + 1) |>
      tidytable::filter(value != 0) |>
      tidytable::mutate(rank = rank(-as.numeric(value)), .by = c(rowname)) |>
      tidytable::filter(rank <= top_k) |>
      tidytable::arrange(rowname, rank)
    rm(feature_table)

    if (!is.null(taxon)) {
      log_debug(x = "Forcing all features to given organism")
      metadata_table <- data.frame(taxon)
      colnames(metadata_table) <- colname
    }

    log_debug(x = "Preparing organisms names")
    organism_table <- metadata_table |>
      tidytable::filter(!is.na(!!as.name(colname))) |>
      tidytable::distinct(!!as.name(colname)) |>
      tidytable::select(organism = !!as.name(colname)) |>
      tidytable::separate_rows(organism,
        sep = "\\|",
      )

    log_debug(x = "Retrieving already computed Open Tree of Life Taxonomy")
    organism_table_filled <- organism_table |>
      tidytable::left_join(
        tidytable::fread(org_tax_ott,
          na.strings = c("", "NA"),
          colClasses = "character"
        ),
        by = c("organism" = "organism_name")
      )
    rm(organism_table)

    log_debug(x = "Submitting the rest to OTL")
    organism_table_missing <- organism_table_filled |>
      tidytable::filter(is.na(organism_taxonomy_ottid))

    if (nrow(organism_table_missing) != 0) {
      biological_metadata_1 <- organism_table_missing |>
        get_organism_taxonomy_ott()

      log_debug(x = "Joining all results")
      biological_metadata <- organism_table_filled |>
        tidytable::filter(!is.na(organism_taxonomy_ottid)) |>
        tidytable::rename(organism_name = organism) |>
        tidytable::bind_rows(biological_metadata_1)
      rm(biological_metadata_1)
    } else {
      biological_metadata <- organism_table_filled |>
        tidytable::filter(!is.na(organism_taxonomy_ottid)) |>
        tidytable::rename(organism_name = organism)
    }
    rm(organism_table_filled, organism_table_missing)

    if (is.null(taxon)) {
      if (extension == FALSE) {
        log_debug("Removing filename extensions")
        metadata_table <- metadata_table |>
          tidytable::mutate(
            filename = stringi::stri_replace_all_fixed(
              str = filename,
              pattern = ".mzML",
              replacement = "",
              vectorize_all = FALSE
            )
          ) |>
          tidytable::mutate(
            filename = stringi::stri_replace_all_fixed(
              str = filename,
              pattern = ".mzxML",
              replacement = "",
              vectorize_all = FALSE
            )
          )
      }
    }
    log_debug(x = "Joining top K with metadata table")
    if (!is.null(taxon)) {
      metadata_table_joined <- cbind(
        feature_table_0 |>
          tidytable::mutate(feature_id = !!as.name(name_features)),
        biological_metadata |>
          tidytable::select(organismOriginal = organism_name)
      )
    } else {
      metadata_table_joined <-
        tidytable::left_join(top_n,
          metadata_table,
          by = c("name" = name_filename)
        ) |>
        tidytable::select(
          feature_id := rowname,
          organismOriginal = tidytable::all_of(colname),
          tidytable::everything()
        )
    }
    rm(feature_table_0, metadata_table)

    log_debug(x = "Joining with cleaned taxonomy table")
    taxed_features_table <-
      tidytable::left_join(
        metadata_table_joined,
        biological_metadata,
        by = c("organismOriginal" = "organism_name")
      ) |>
      tidytable::distinct() |>
      tidytable::select(
        feature_id,
        sample_organism_01_domain = organism_taxonomy_01domain,
        sample_organism_02_kingdom = organism_taxonomy_02kingdom,
        sample_organism_03_phylum = organism_taxonomy_03phylum,
        sample_organism_04_class = organism_taxonomy_04class,
        sample_organism_05_order = organism_taxonomy_05order,
        sample_organism_06_family = organism_taxonomy_06family,
        sample_organism_07_tribe = organism_taxonomy_07tribe,
        sample_organism_08_genus = organism_taxonomy_08genus,
        sample_organism_09_species = organism_taxonomy_09species,
        sample_organism_10_varietas = organism_taxonomy_10varietas
      ) |>
      tidytable::mutate(
        tidytable::across(
          .cols = tidytable::everything(),
          .fns = as.character
        )
      ) |>
      tidytable::group_by(feature_id) |>
      clean_collapse(
        cols = c(
          "sample_organism_01_domain",
          "sample_organism_02_kingdom",
          "sample_organism_03_phylum",
          "sample_organism_04_class",
          "sample_organism_05_order",
          "sample_organism_06_family",
          "sample_organism_07_tribe",
          "sample_organism_08_genus",
          "sample_organism_09_species",
          "sample_organism_10_varietas"
        )
      ) |>
      tidytable::mutate(
        tidytable::across(
          .cols = tidytable::where(is.character),
          .fns = function(x) {
            tidytable::replace_na(x, "ND")
          }
        )
      )
    rm(biological_metadata, metadata_table_joined)

    log_debug(x = "Exporting ...")
    export_params(parameters = get_params(step = "prepare_taxa"), step = "prepare_taxa")
    export_output(x = taxed_features_table, file = output)
    rm(taxed_features_table)
    return(output)
  }
taxonomicallyinformedannotation/tima-r documentation built on July 17, 2024, 7:11 p.m.