R/ko2kegg_abundance.R

Defines functions filter_kegg_reference_for_prokaryotes filter_kegg_reference_to_pathways

#' Convert KO abundance in picrust2 export files to KEGG pathway abundance
#'
#' This function takes a file containing KO (KEGG Orthology) abundance data in picrust2 export format and converts it to KEGG pathway abundance data.
#' The input file should be in .tsv, .txt, or .csv format.
#'
#' @param file A character string representing the file path of the input file containing KO abundance data in picrust2 export format. The input file should have KO identifiers in the first column and sample identifiers in the first row. The remaining cells should contain the abundance values for each KO-sample pair.
#' @param data An optional data.frame containing KO abundance data in the same format as the input file. If provided, the function will use this data instead of reading from the file. By default, this parameter is set to NULL.
#' @param method Method for calculating pathway abundance. One of:
#'   \itemize{
#'     \item \code{"abundance"}: (Default) PICRUSt2-style calculation using the mean of upper-half sorted KO abundances. This method is more robust and avoids inflating abundances for pathways with more KOs.
#'     \item \code{"sum"}: Simple summation of all KO abundances. This is the legacy method and may double-count KOs belonging to multiple pathways.
#'   }
#' @param filter_for_prokaryotes Logical. If TRUE (default), filters out KEGG pathways
#'   that are not relevant to prokaryotic (bacterial/archaeal) analysis. The function
#'   always removes non-pathway KEGG buckets before this filter is applied. The
#'   prokaryote filter removes pathways in categories such as:
#'   \itemize{
#'     \item Human diseases (cancer, neurodegenerative diseases, addiction, etc.)
#'     \item Organismal systems (immune system, nervous system, endocrine system, etc.)
#'   }
#'   Bacterial infection pathways and antimicrobial resistance pathways are retained.
#'   Set to FALSE to include all KEGG pathways (for eukaryotic analysis or custom filtering).
#' @param progress Logical. Whether to show a progress bar while aggregating
#'   pathways. Defaults to \code{interactive()} so non-interactive scripts and
#'   tests stay quiet.
#'
#' @return
#' A data frame with KEGG pathway abundance values. Rows represent KEGG pathways, identified by their KEGG pathway IDs. Columns represent samples, identified by their sample IDs from the input file.
#'
#' @details
#' The default \code{"abundance"} method follows PICRUSt2's approach for calculating pathway abundance:
#' \enumerate{
#'   \item For each pathway, collect abundances of all associated KOs present in the data
#'   \item Sort the abundances in ascending order
#'   \item Take the upper half of the sorted values
#'   \item Calculate the mean as the pathway abundance
#' }
#'
#' This approach has several advantages over simple summation:
#' \itemize{
#'   \item Does not inflate abundances for pathways containing more KOs
#'   \item More robust to missing or low-abundance KOs
#'   \item Provides a more accurate representation of pathway activity
#' }
#'
#' The \code{"sum"} method is provided for backward compatibility and simply sums all KO abundances for each pathway.
#'
#' @section Pathway Filtering:
#' Before abundance calculation, KEGG BRITE hierarchies and "Not Included in Pathway or
#' Brite" pseudo-pathways are removed because they are not KEGG pathway maps and cannot
#' be consistently annotated as pathways (for example, \code{ko99980}).
#'
#' When \code{filter_for_prokaryotes = TRUE}, the function excludes KEGG pathways that are
#' biologically irrelevant to prokaryotic organisms. KEGG reference pathways include pathways
#' from all domains of life, and many human/animal-specific pathways would appear in bacterial
#' analysis simply because some KOs are shared across organisms.
#'
#' The following KEGG Level 2 categories are excluded:
#' \itemize{
#'   \item Cancer pathways (overview and specific types)
#'   \item Neurodegenerative diseases (Alzheimer's, Parkinson's, etc.)
#'   \item Substance dependence (addiction pathways)
#'   \item Cardiovascular diseases
#'   \item Endocrine and metabolic diseases
#'   \item Immune diseases
#'   \item Organismal systems (immune, nervous, endocrine, digestive, etc.)
#' }
#'
#' The following are RETAINED even with filtering:
#' \itemize{
#'   \item Infectious disease: bacterial (Salmonella, E. coli, Tuberculosis, etc.)
#'   \item Drug resistance: antimicrobial (antibiotic resistance)
#'   \item All Metabolism pathways
#'   \item Genetic/Environmental Information Processing
#'   \item Cellular Processes
#' }
#'
#' @examples
#' \dontrun{
#' library(ggpicrust2)
#' library(readr)
#'
#' # Example 1: Default - filtered for prokaryotic analysis
#' data(ko_abundance)
#' kegg_abundance <- ko2kegg_abundance(data = ko_abundance)
#'
#' # Example 2: Include all pathways (for eukaryotic analysis)
#' kegg_abundance_all <- ko2kegg_abundance(data = ko_abundance, filter_for_prokaryotes = FALSE)
#'
#' # Example 3: Using legacy sum method with filtering
#' kegg_abundance_sum <- ko2kegg_abundance(data = ko_abundance, method = "sum")
#'
#' # Example 4: From file
#' input_file <- "path/to/your/picrust2/results/pred_metagenome_unstrat.tsv"
#' kegg_abundance <- ko2kegg_abundance(file = input_file)
#' }
#' @export
ko2kegg_abundance <- function (file = NULL, data = NULL, method = c("abundance", "sum"),
                               filter_for_prokaryotes = TRUE,
                               progress = interactive()) {
  method <- match.arg(method)
  filter_for_prokaryotes <- normalize_logical_flag(filter_for_prokaryotes, "filter_for_prokaryotes")
  progress <- normalize_logical_flag(progress, "progress")

  # Basic parameter validation
  if (is.null(file) & is.null(data)) {
    stop("Error: Please provide either a file or a data.frame.")
  }

  if (!is.null(file) && !is.null(data)) {
    warning("Both file and data provided. Using data and ignoring file.")
    file <- NULL
  }

  # Load data from file or use provided data
  if (!is.null(file)) {
    abundance <- read_abundance_file(file)
  } else {
    if (!is.data.frame(data)) {
      stop("'data' must be a data.frame")
    }
    abundance <- data
    if (ncol(abundance) < 2) {
      stop("Data must have at least 2 columns (KO IDs and samples)")
    }
  }

  # Standardize first column name (PICRUSt2 uses various formats)
  valid_ko_column_names <- c("function.", "function", "sequence", "#NAME")
  first_col_name <- colnames(abundance)[1]

  if (first_col_name %in% valid_ko_column_names) {
    colnames(abundance)[1] <- "function."
  } else {
    # Check if first column contains KO IDs
    first_col_values <- as.character(abundance[[1]])
    ko_pattern <- "(^K\\d{5}$)|(^ko:K\\d{5}$)"
    if (any(grepl(ko_pattern, first_col_values))) {
      colnames(abundance)[1] <- "function."
    } else {
      stop(sprintf("First column '%s' does not contain KO IDs (expected: K##### or ko:K#####)", first_col_name))
    }
  }

  if (nrow(abundance) == 0) {
    stop("Data contains no rows")
  }

  # Standardize KO ID format and validate
 abundance <- clean_ko_abundance(abundance, verbose = FALSE)

  # Validate numeric matrix (negative values, NA, duplicates)
  numeric_mat <- as.matrix(abundance[, -1, drop = FALSE])
  validate_numeric_matrix(numeric_mat)
  validate_feature_ids(abundance[[1]], type = "KO")
  
  # Load KEGG reference data using unified loader
  ko_to_kegg_reference <- load_reference_data("ko_to_kegg")

  # Keep only KEGG pathway maps. The bundled KO-to-KEGG reference also contains
  # BRITE hierarchies and "Not Included in Pathway or Brite" buckets such as
  # ko99980; those are KO groupings, not pathways, and downstream pathway
  # annotation should not be asked to treat them as pathway IDs.
  ko_to_kegg_reference <- filter_kegg_reference_to_pathways(ko_to_kegg_reference)

  # Filter for prokaryotic pathways if requested
  if (filter_for_prokaryotes) {
    ko_to_kegg_reference <- filter_kegg_reference_for_prokaryotes(ko_to_kegg_reference)
  }

  # Get all unique pathway IDs
  all_pathways <- unique(ko_to_kegg_reference$pathway_id)

  # Create fast pathway → KOs lookup mapping
  pathway_to_ko <- split(ko_to_kegg_reference$ko_id, ko_to_kegg_reference$pathway_id)

  # Create result data frame
  kegg_abundance <- data.frame(
    row.names = all_pathways  # Set pathways as row names
  )

  sample_names <- colnames(abundance)[-1]
  kegg_abundance[sample_names] <- 0

  # Calculate pathway abundances. Progress is interactive-only by default so
  # scripts and tests do not get flooded with carriage-return progress output.
  pb <- NULL
  if (progress) {
    pb <- utils::txtProgressBar(min = 0, max = nrow(kegg_abundance), style = 3)
    on.exit({
      if (!is.null(pb)) close(pb)
    }, add = TRUE)
  }

  for (i in seq_len(nrow(kegg_abundance))) {
    if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
    current_kegg <- rownames(kegg_abundance)[i]
    relevant_kos <- pathway_to_ko[[current_kegg]]
    matching_rows <- abundance[[1]] %in% relevant_kos

    if (any(matching_rows)) {
      if (method == "sum") {
        kegg_abundance[i, ] <- colSums(abundance[matching_rows, -1, drop = FALSE])
      } else {
        # PICRUSt2-style: upper-half mean
        ko_abundances <- as.matrix(abundance[matching_rows, -1, drop = FALSE])
        kegg_abundance[i, ] <- apply(ko_abundances, 2, function(col) {
          sorted_col <- sort(col)
          mean(sorted_col[ceiling(length(sorted_col) / 2):length(sorted_col)])
        })
      }
    }
  }
  if (!is.null(pb)) {
    close(pb)
    pb <- NULL
  }

  # Remove zero-abundance pathways. Reaching all-zero here means the IDs
  # passed format validation but none are present in the KEGG reference
  # (e.g. outdated or non-standard KO identifiers). Continuing would just
  # hand an empty matrix to downstream DAA and produce a misleading error.
  zero_rows <- rowSums(kegg_abundance, na.rm = TRUE) == 0
  if (all(zero_rows)) {
    stop("No KO IDs in the input matched any KEGG pathway. ",
         "This usually means the KO IDs are outdated or not in the KEGG reference. ",
         "Expected format: K##### (e.g., K00001).")
  }
  kegg_abundance <- kegg_abundance[!zero_rows, , drop = FALSE]

  kegg_abundance
}

#' Filter KO-to-KEGG reference data to true KEGG pathway maps
#'
#' @param ko_to_kegg_reference KO-to-KEGG reference data frame
#' @return Filtered data frame
#' @noRd
filter_kegg_reference_to_pathways <- function(ko_to_kegg_reference) {
  level1 <- as.character(ko_to_kegg_reference$level1)
  is_non_pathway <- grepl("^(09180|09190)\\b", level1)

  ko_to_kegg_reference[!is_non_pathway, , drop = FALSE]
}

#' Filter KO-to-KEGG reference data for prokaryotic analyses
#'
#' KEGG hierarchy labels in the bundled reference include BRITE numeric
#' prefixes (for example, "09160 Human Diseases"). Match on those stable
#' hierarchy IDs instead of exact free-text labels.
#'
#' @param ko_to_kegg_reference KO-to-KEGG reference data frame
#' @return Filtered data frame
#' @noRd
filter_kegg_reference_for_prokaryotes <- function(ko_to_kegg_reference) {
  level1 <- as.character(ko_to_kegg_reference$level1)
  level2 <- as.character(ko_to_kegg_reference$level2)

  is_organismal_system <- grepl("^09150\\b", level1)
  is_human_disease <- grepl("^09160\\b", level1)
  retained_human_disease <- grepl("^(09171|09175)\\b", level2)
  excluded_human_disease <- is_human_disease & !retained_human_disease

  keep <- !(is_organismal_system |
              excluded_human_disease)

  ko_to_kegg_reference[keep, , drop = FALSE]
}

Try the ggpicrust2 package in your browser

Any scripts or data that you put into this service are public.

ggpicrust2 documentation built on May 20, 2026, 5:07 p.m.